├── API_Load_Testing_using_Cloudwatch_Custom_Metric.ipynb
├── AWS Bedrock Gen AI Notes.pdf
├── AWS ElastiCache with Python.txt
├── AWS Glue Data Quality Check.txt
├── AWS Glue Job trigger from Lambda.JPG
├── AWS Lambda Trigger from PostgreSQL.txt
├── AWS MSK Connector for Snowflake.txt
├── AWS Pinpoint using Boto3.ipynb
├── AWS Serverless Data Analytics Pipeline for File Processing .txt
├── AWSS3ToSnowflake.zip
├── AWS_Bedrock_Text_Summarization.ipynb
├── Access_s3_data_in_Spark_outside_AWS_Account.ipynb
├── Analyze Apache Parquet optimized data using Amazon Kinesis Data Firehose, Amazon Athena.png
├── Approximate Duplicate Detection using Weighted Fuzzy.ipynb
├── Architecture of the generic Job to ingest data from s3 to Snowflake.png
├── Artilary Demystified.txt
├── Athena_Table_as_source_for_Apache_Spark.ipynb
├── Athena_parameterized_queries_using_Python.ipynb
├── Automatic Mean Imputation.ipynb
├── Automatic Mode Imputation.ipynb
├── Bloom Filter Demo.ipynb
├── Bloom Filter Theory.pdf
├── Build & Push Docker Image to AWS ECR using GitHub Actions.txt
├── Building_RAG_Application_using_Textract_&_Bedrock.ipynb
├── CSV_Sentiment.ipynb
├── Cake preparation-Detail Architecture (1).jpg
├── Classification_with_GenAI.ipynb
├── Cloudy-Snow.ipynb
├── Consumer_File SQS.ipynb
├── Create_,_Update_,_Delete_Athena_Prepared_Statement_using_boto3.ipynb
├── DLQ Redrive.drawio.png
├── Data Analysis Practice.ipynb
├── Data Warehouse in Snowflake.png
├── DataBricks Quickstart Guide with AWS.ipynb
├── Data_Analysis_Play.tsv
├── Data_Analysis_Practice.ipynb
├── Data_Engineering_Essentials(news sentiment analysis).ipynb
├── Data_Ingestion_from_GSheet_to_s3.ipynb
├── Data_Quality_Check_using_pydeequ.ipynb
├── Databricks column description addition using Gen AI.ipynb
├── Databricks workflow demo.txt
├── Databricks_SQL_Connector_for_Python.ipynb
├── Delta Sharing Databricks Notebook.ipynb
├── DeltaLake AWS Foundation from scratch.ipynb
├── DeltaLake Handwritten Notes.pdf
├── Delta_Lake in AWS Lambda.txt
├── Delta_Lake_using_Python.ipynb
├── Delta_Sharing_Python.ipynb
├── Delta_Sharing_Spark.ipynb
├── Demo Data used in Generic Framework for External Table.zip
├── Docker Build Args in-depth intuition.txt
├── ETL Automation with Metadata Management and Job Tracking.txt
├── End-to-End Project on Location Based Service.txt
├── Error Notifications for Snowflake Tasks using SNS.png
├── Error_Handler_Python_File SQS.ipynb
├── FAISS_Similarity_Search.ipynb
├── Feature Scaling.ipynb
├── Feature Selection based on correlation.ipynb
├── Flat Any JSON File PySpark.ipynb
├── Flat any Json Using_Recursion,Filter & Map.ipynb
├── Flat_DataFrame.ipynb
├── Flatten JSON & Harmonize Schema.ipynb
├── Fundamentals of Data Preprocessing using Python.ipynb
├── Generative AI on AWS with AWS Bedrock.ipynb
├── Geolocation_from_IP.ipynb
├── Get_Data_Lineage_using_Rest_API.ipynb
├── Github Action with AWS.txt
├── Glue Workflow Lab.txt
├── Go Deeper in Partitioning.ipynb
├── HTTP Sink Connector.txt
├── Incremental Crawling , Incremental ETL with Glue Workflow.png
├── Intelligent Text Classification using Databricks Gen AI (1).ipynb
├── Kafka , Faust & Snowflake.txt
├── Kafka Consumer Internals with Auto Offset Commits and At Least Once Processing.png
├── Kafka Consumer Lag Monitoring.txt
├── Kafka Faust Snowflake Python.PNG
├── Kafka Producer Internals.png
├── Kafka Retry Architecture.png
├── Kmeans.ipynb
├── Leverage_pgvector_and_Amazon_PostgreSQL_for_Natural_Language_Processing.ipynb
├── Limit_and_offset_concept_in_API.ipynb
├── Long-Running Workflows with Human Interactions using Step Functions.txt
├── MSK Lab 1.txt
├── MSK To AWS Lambda.txt
├── MSK project with serverless Producer & Consumer.txt
├── MSK with s3 sink connector and Snowflake.txt
├── Manual Offset Commits and At-most Once Processing.png
├── Manual Offset Commits and Exactly Once Processing.png
├── Master Containerization with AWS.txt
├── Monitor_Data_Pipeline_using_CloudWatch_Custom_Metrics.ipynb
├── Multicollinearity Effect in Regression.ipynb
├── Multimodal Embedding using BedRock.zip
├── Multiple Linear Regression Code.ipynb
├── Null_Value_Handling_Pyspark.ipynb
├── One Hot Encoding.ipynb
├── Outliers_&_Skewness_Handling.ipynb
├── Parallel_processing_in_Pandas_using_pandarallel.ipynb
├── Partitioning in Athena using AWS Lambda.txt
├── Percentage_of_Null_values_in_each_column_of_Dataframe.ipynb
├── Pivoting.ipynb
├── Practice Session Data Analysis.ipynb
├── Publish Message in MSK Cluster from AWS Lambda.txt
├── Publisher SQS.ipynb
├── PySpark5.ipynb
├── PySpark6.ipynb
├── Pyspark1.ipynb
├── Pyspark2.ipynb
├── Pyspark3.ipynb
├── Pyspark4.ipynb
├── Python_&_Gsheet.ipynb
├── RAG using Kendra & Langchain AWS.ipynb
├── RAG using Snowflake.sql
├── Real-Time Streaming Project with Smartphone Data.txt
├── Receive_message_from_SQS_Queue.ipynb
├── Recursion Pattern with AWS Step Funciton & Lambda.png
├── Run Batch job Using AWS Lambda.txt
├── SNS_Message_Publish.ipynb
├── Semantic_clustering.ipynb
├── Send_message_SQS_Queue.ipynb
├── Serverless Manual Approval Steps in AWS Step Functions and Amazon API Gateway.png
├── Setup PySpark in ec2 using conda.txt
├── Shake detection using Accelerometer , Kafka & Python.txt
├── Simple Linear Regression (ML).ipynb
├── Simple OTP System using AWS Serverless.txt
├── Snowflake Codes for generic s3 to snowflake loader.txt
├── Snowflake External Table Partitioning.txt
├── Snowflake Parallel Processing using Python Lab.txt
├── Snowflake Row Level Security.sql
├── Snowflake Schema Detection.txt
├── Snowflake Stored Porcedure Parallel execution (Part 1).txt
├── Snowflake Stored Porcedure Parallel execution (Part 2).txt
├── Snowflake code for External Table refresh framework.txt
├── Snowflake logging (1).txt
├── Snowflake_SP_Util.py
├── Snyk Code for Github Action.yml
├── Sorting.ipynb
├── Spark Caching In-Depth.ipynb
├── String Functions.ipynb
├── String_similarity_using_Fuzzy.ipynb
├── Success File in PySpark.txt
├── Talend with EMR & Snowflake.png
├── Time Traven in Snowflake.txt
├── Trigger Airflow code via rest api.txt
├── Unstructured Data processing with Snowflake.txt
├── Untitled7.ipynb
├── Unusual Usecases of Time Travel.ipynb
├── Updated GenAI Notes.pdf
├── Using Ephemeral Storage for AWS Lambda.ipynb
├── Using KMS for Client Side Encryption.ipynb
├── _Manual Offset Commits & At Least Once Processing in Kafka Consumer.drawio.png
├── airflow_emr_s3_snowflake_setup.txt
├── airflow_emr_spark_s3_snowflake.py
├── airflow_install.sh
├── airflow_news_data_pipeline.py
├── airflow_talend_runner.py
├── airflow_talend_success_file_snesor.bash
├── algolia_layer1.zip
├── aws-eventbridge-kinesisfirehose-s3.drawio.png
├── bronze_to_silver_data_lineage.ipynb
├── context_aware_rag.sql
├── convert_json_to_csv_in_kinesis_firehose_transformation.ipynb
├── dbscan_visualization.txt
├── discoverx Lab 1.ipynb
├── dynamic_compaction.ipynb
├── entity_extraction_templating_using_aws_bedrock.py
├── generate_smiling_face_cluster.txt
├── generic lambda_layer_creation_framework.txt
├── generic_job_s3_to_snowflake_using_copy_command.py
├── incremental_etl.zip
├── ingest.sh
├── iris_partitioned_Data.zip
├── isNull,isNotNull,case.ipynb
├── isin,sample,limit.ipynb
├── kafka snowflake integration.txt
├── kafka source rest project.txt
├── kafka_producer_with_topic_partitioning.py
├── kafka_yt_demo.zip
├── key monitor.txt
├── lambda_powertools.py
├── mysql_cdc_fetcher_runner.py
├── news_fetcher.py
├── news_fetcher_etl.py
├── otp system.drawio.png
├── scd type 1 Snowflake.txt
├── scd_type_2_snowflake.py
├── scd_type_2_snowflake_queries.sql
├── scd_type_2_snowflake_version_2.py
├── snowflake cortex fine tuning.txt
├── snowflake_connector_python-2.3.8-py3-none-any.whl
├── snowflake_dq_framework.py
├── snowflake_elt_talend_lab_24_08_2022.txt
├── source_to_bronze_data_lineage.ipynb
├── study_data.csv
├── test123.txt
├── testa
├── transform.py
├── transientcluster.ipynb
├── translator_with_polly.py
└── user_data_yt.sh
/API_Load_Testing_using_Cloudwatch_Custom_Metric.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "nbformat": 4,
3 | "nbformat_minor": 0,
4 | "metadata": {
5 | "colab": {
6 | "provenance": [],
7 | "authorship_tag": "ABX9TyPx14exc0tt0xFeYnpWFa2c",
8 | "include_colab_link": true
9 | },
10 | "kernelspec": {
11 | "name": "python3",
12 | "display_name": "Python 3"
13 | },
14 | "language_info": {
15 | "name": "python"
16 | }
17 | },
18 | "cells": [
19 | {
20 | "cell_type": "markdown",
21 | "metadata": {
22 | "id": "view-in-github",
23 | "colab_type": "text"
24 | },
25 | "source": [
26 | "
"
27 | ]
28 | },
29 | {
30 | "cell_type": "code",
31 | "execution_count": null,
32 | "metadata": {
33 | "id": "DKGsa9g-0rSj"
34 | },
35 | "outputs": [],
36 | "source": [
37 | "!pip3 install boto3 requests pandas"
38 | ]
39 | },
40 | {
41 | "cell_type": "code",
42 | "source": [
43 | "import time\n",
44 | "import json\n",
45 | "import pandas as pd\n",
46 | "import requests\n",
47 | "import boto3"
48 | ],
49 | "metadata": {
50 | "id": "z3IlVwZXAYES"
51 | },
52 | "execution_count": null,
53 | "outputs": []
54 | },
55 | {
56 | "cell_type": "code",
57 | "source": [
58 | "\n",
59 | "# Initialize CloudWatch client\n",
60 | "cloudwatch = boto3.client('cloudwatch', aws_access_key_id='', aws_secret_access_key='', region_name='us-east-1')\n",
61 | "\n",
62 | "# Read the CSV file\n",
63 | "csv_file = \"/content/extarcted_data_for_testing30k.csv\" # Replace with the path to your CSV file\n",
64 | "data = pd.read_csv(csv_file)\n",
65 | "data = data.head(300)\n",
66 | "\n",
67 | "# API URL and headers\n",
68 | "api_url = \"https://hccreference.com/api/search\"\n",
69 | "headers = {\n",
70 | " 'accept': 'application/json, text/plain, */*',\n",
71 | " 'accept-language': 'en-US,en;q=0.9,bn;q=0.8,hi;q=0.7',\n",
72 | " 'cache-control': 'no-cache',\n",
73 | " 'content-type': 'application/json',\n",
74 | " 'origin': 'https://hccreference.com',\n",
75 | " 'pragma': 'no-cache',\n",
76 | " 'referer': 'https://hccreference.com/',\n",
77 | " 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36',\n",
78 | "}\n",
79 | "\n",
80 | "def send_metrics_to_cloudwatch(metric_name, value, unit='Milliseconds', namespace='API_RESPONSE_DATA_CUSTOM_METRIC'):\n",
81 | " try:\n",
82 | " cloudwatch.put_metric_data(\n",
83 | " MetricData=[\n",
84 | " {\n",
85 | " 'MetricName': metric_name,\n",
86 | " 'Unit': unit,\n",
87 | " 'Value': value\n",
88 | " },\n",
89 | " ],\n",
90 | " Namespace=namespace\n",
91 | " )\n",
92 | " print(f\"Metric {metric_name} sent successfully.\")\n",
93 | " except Exception as e:\n",
94 | " print(f\"Error sending metric {metric_name}: {e}\")\n",
95 | "\n",
96 | "def make_api_requests(data):\n",
97 | " for index, row in data.iterrows():\n",
98 | " payload = {\n",
99 | " \"dx_hcc\": row['dx_hcc'],\n",
100 | " \"dos_year\": row['dos_year'],\n",
101 | " \"drf\": row['drf'],\n",
102 | " \"search\": row['search']\n",
103 | " }\n",
104 | " try:\n",
105 | " start_time = time.perf_counter()\n",
106 | " response = requests.post(api_url, headers=headers, json=payload)\n",
107 | " end_time = time.perf_counter()\n",
108 | "\n",
109 | " response_time = (end_time - start_time) * 1000 # Convert to milliseconds\n",
110 | " print(f\"Row {index + 1}: Response time {response_time:.2f} ms\")\n",
111 | "\n",
112 | " send_metrics_to_cloudwatch('API_Response_Time', response_time)\n",
113 | "\n",
114 | " if response.status_code != 200:\n",
115 | " print(f\"Row {index + 1}: API error - {response.status_code}\")\n",
116 | " send_metrics_to_cloudwatch('API_Errors', 1, unit='Count')\n",
117 | " except Exception as e:\n",
118 | " print(f\"Error processing row {index + 1}: {e}\")\n",
119 | " send_metrics_to_cloudwatch('API_Errors', 1, unit='Count')\n",
120 | "\n",
121 | "# Run the API requests\n",
122 | "make_api_requests(data)\n"
123 | ],
124 | "metadata": {
125 | "id": "I25hiqiz02Aj"
126 | },
127 | "execution_count": null,
128 | "outputs": []
129 | }
130 | ]
131 | }
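To read the published datapoints back after a run, a minimal sketch using get_metric_statistics (the 15-minute look-back window and the chosen statistics are arbitrary assumptions; namespace and metric name match send_metrics_to_cloudwatch above):

from datetime import datetime, timedelta, timezone
import boto3

# Assumes credentials are available in the environment.
cloudwatch = boto3.client('cloudwatch', region_name='us-east-1')
stats = cloudwatch.get_metric_statistics(
    Namespace='API_RESPONSE_DATA_CUSTOM_METRIC',
    MetricName='API_Response_Time',
    StartTime=datetime.now(timezone.utc) - timedelta(minutes=15),  # assumed window
    EndTime=datetime.now(timezone.utc),
    Period=60,                                   # one datapoint per minute
    Statistics=['Average', 'Maximum'],
    Unit='Milliseconds'
)
for dp in sorted(stats['Datapoints'], key=lambda d: d['Timestamp']):
    print(dp['Timestamp'], round(dp['Average'], 2), round(dp['Maximum'], 2))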
--------------------------------------------------------------------------------
/AWS Bedrock Gen AI Notes.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SatadruMukherjee/Data-Preprocessing-Models/3aeeda4e68aafd47fe784a71c4aa02a34be76820/AWS Bedrock Gen AI Notes.pdf
--------------------------------------------------------------------------------
/AWS Glue Job trigger from Lambda.JPG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SatadruMukherjee/Data-Preprocessing-Models/3aeeda4e68aafd47fe784a71c4aa02a34be76820/AWS Glue Job trigger from Lambda.JPG
--------------------------------------------------------------------------------
/AWS Lambda Trigger from PostgreSQL.txt:
--------------------------------------------------------------------------------
1 | Step 1: Create Postgres Instance
2 |
3 | Instance identifier:
4 | Master User Name:
5 | Master Password:
6 | Security Group :
7 | Database Port:
8 | DB Name:
9 |
10 | Step 2: Create the Lambda Function
11 |
12 | import json
13 | from time import sleep
14 |
15 | def lambda_handler(event, context):
16 | print(event)
17 | sleep(2)
18 | # TODO implement
19 | return {
20 | 'statusCode': 200,
21 | 'body': json.dumps('Hello from Lambda!')
22 | }
23 |
24 |
25 | Step 3: Create IAM Role with Lambda Access & Assign to AWS RDS
26 |
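A minimal boto3 sketch of what Step 3 involves; the role name, instance identifier, and the AWSLambdaRole managed policy below are assumptions, not values from this note:

import json
import boto3

iam = boto3.client('iam')
rds = boto3.client('rds')

# Trust policy so that RDS can assume the role
trust_policy = {
    "Version": "2012-10-17",
    "Statement": [{
        "Effect": "Allow",
        "Principal": {"Service": "rds.amazonaws.com"},
        "Action": "sts:AssumeRole"
    }]
}

role = iam.create_role(
    RoleName='rds-lambda-invoke-role',                 # assumed name
    AssumeRolePolicyDocument=json.dumps(trust_policy)
)
iam.attach_role_policy(
    RoleName='rds-lambda-invoke-role',
    PolicyArn='arn:aws:iam::aws:policy/service-role/AWSLambdaRole'  # grants lambda:InvokeFunction
)
# Associate the role with the Postgres instance for the Lambda feature
rds.add_role_to_db_instance(
    DBInstanceIdentifier='my-postgres-instance',       # assumed identifier
    RoleArn=role['Role']['Arn'],
    FeatureName='Lambda'
)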
27 | Step 4: Connect to DB
28 |
29 | Driver:
30 |
31 | URL: jdbc:postgresql://host:port/name_of_database
32 | Username:
33 | Password:
34 |
35 |
36 | Step 5: Run SQL Query:
37 |
38 | SELECT current_database();
39 |
40 | CREATE EXTENSION IF NOT EXISTS aws_lambda CASCADE;
41 |
42 | --sync
43 | SELECT * from aws_lambda.invoke(aws_commons.create_lambda_function_arn('arn:aws:lambda:us-east-1:825865577047:function:triggerfromdbrds', 'us-east-1'),
44 | '{"body": "Hello from Postgres second time!"}'::json );
45 |
46 | --async
47 | SELECT * FROM aws_lambda.invoke(aws_commons.create_lambda_function_arn('arn:aws:lambda:us-east-1:825865577047:function:triggerfromdbrds', 'us-east-1'),
48 | '{"body": "Hello from Postgres async!"}'::json, 'Event');
49 |
50 | --create table
51 | -- Create the inventory table
52 | CREATE TABLE inventory (
53 | id SERIAL PRIMARY KEY,
54 | product_name VARCHAR(100) NOT NULL,
55 | quantity INTEGER NOT NULL,
56 | price DECIMAL(10, 2) NOT NULL
57 | );
58 |
59 | -- Insert some sample data into the inventory table
60 | INSERT INTO inventory (product_name, quantity, price) VALUES ('Product A', 10, 50.00);
61 | INSERT INTO inventory (product_name, quantity, price) VALUES ('Product B', 5, 40.00);
62 | INSERT INTO inventory (product_name, quantity, price) VALUES ('Product C', 0, 30.00);
63 |
64 | --create trigger
65 | -- Create or replace the function to invoke Lambda with inserted row as input
66 | CREATE OR REPLACE FUNCTION invoke_lambda_on_insert()
67 | RETURNS TRIGGER AS $$
68 | BEGIN
69 | PERFORM aws_lambda.invoke(
70 | aws_commons.create_lambda_function_arn('arn:aws:lambda:us-east-1:825865577047:function:triggerfromdbrds'),
71 | row_to_json(NEW),
72 | 'Event'
73 | );
74 | RETURN NEW;
75 | END;
76 | $$ LANGUAGE plpgsql;
77 |
78 | -- Create or replace the trigger to call the function on insert
79 | CREATE TRIGGER call_lambda_on_insert
80 | AFTER INSERT ON inventory
81 | FOR EACH ROW
82 | EXECUTE FUNCTION invoke_lambda_on_insert();
83 |
84 | INSERT INTO inventory (product_name, quantity, price) VALUES ('Product E', 0, 30.00);
85 | INSERT INTO inventory (product_name, quantity, price) VALUES ('Product F', 0, 30.00);
86 | INSERT INTO inventory (product_name, quantity, price) VALUES ('Product G', 0, 30.00);
87 | ;
88 |
89 | INSERT INTO inventory (product_name, quantity, price) VALUES ('Product H', 0, 30.00),('Product I', 0, 30.00);
90 |
91 | SELECT * FROM inventory;
--------------------------------------------------------------------------------
/AWS MSK Connector for Snowflake.txt:
--------------------------------------------------------------------------------
1 | Launch MSK Cluster:
2 | ----------------------
3 | Configure NAT Gateway & launch MSK Cluster in Private Subnet
4 |
5 |
6 | openssl genrsa -out rsa_key.pem 2048
7 | openssl rsa -in rsa_key.pem -pubout -out rsa_key.pub
8 |
9 | These key files contain PEM header/footer lines, spaces, and newline characters that need to be removed before use--
10 | export SNOWFLAKE_PVT_KEY=$(echo `sed -e '2,$!d' -e '$d' -e 's/\n/ /g' rsa_key.pem`|tr -d ' ')
11 | echo $SNOWFLAKE_PVT_KEY > rsa_key_p8.out
12 |
13 | Configure Snowflake:
14 | --------------------------
15 | cat rsa_key.pub
16 |
17 | DROP DATABASE IF EXISTS RAMU;
18 | Create database ramu;
19 | alter user Satadru set rsa_public_key='';
20 |
21 | desc user satadru;
22 | use ramu;
23 | show tables;
24 |
25 |
26 | In EC2 Client Machine:
27 | -----------------------------
28 | sudo yum install java-1.8.0-openjdk
29 | wget https://archive.apache.org/dist/kafka/2.8.1/kafka_2.12-2.8.1.tgz
30 | tar -xvf kafka_2.12-2.8.1.tgz
31 | cd kafka_2.12-2.8.1
32 |
33 | bin/kafka-topics.sh --create --topic demo_testing2 --bootstrap-server {} --replication-factor 1 --partitions 2
34 |
35 |
36 | Create Custom plugins:
37 | -------------------------
38 | https://mvnrepository.com/artifact/com.snowflake/snowflake-kafka-connector/1.5.0
39 |
40 |
41 | For Kafka Connect Config:
42 | ---------------------------
43 | IAM Role:s3--give s3 full access
44 |
45 | Trust Relationship--
46 |
47 | {
48 | "Version": "2012-10-17",
49 | "Statement": [
50 | {
51 | "Effect": "Allow",
52 | "Principal": {
53 | "Service": "kafkaconnect.amazonaws.com"
54 | },
55 | "Action": "sts:AssumeRole",
56 | "Condition": {
57 | "StringEquals": {
58 | "aws:SourceAccount": "Account ID"
59 | }
60 | }
61 | }
62 | ]
63 | }
64 |
65 | Create a cloudwatch log group
66 |
67 |
68 | Connector Config:
69 | -------------------------
70 |
71 | connector.class=com.snowflake.kafka.connector.SnowflakeSinkConnector
72 | tasks.max=8
73 | topics=demo_testing2
74 | snowflake.topic2table.map=demo_testing2:fake_data_real_time_demo
75 | buffer.count.records=10000
76 | buffer.flush.time=60
77 | buffer.size.bytes=5000000
78 | snowflake.url.name=
79 | snowflake.user.name=
80 | snowflake.private.key=
81 | snowflake.database.name=RAMU
82 | snowflake.schema.name=PUBLIC
83 | key.converter=com.snowflake.kafka.connector.records.SnowflakeJsonConverter
84 | value.converter=com.snowflake.kafka.connector.records.SnowflakeJsonConverter
85 |
86 |
87 | Test:
88 | -------
89 | Produce messages:
90 | ---------------------
91 | bin/kafka-console-producer.sh --topic demo_testing2 --bootstrap-server {}
92 |
93 | Consume messages:
94 | ---------------------
95 | bin/kafka-console-consumer.sh --topic demo_testing2 --bootstrap-server {}
96 |
97 | Destination:
98 | -----------------
99 | select * from ramu.public.fake_data_real_time_demo;
100 |
101 | Sample Data to Publish:
102 | ------------------------------
103 | {"email":"wzanettinirp@stanford.edu","timestamp":1663420415,"event":"spamreport","gender":"Female","ip_address":"8.166.173.156"}
104 | {"email":"pstegersrq@reddit.com","timestamp":1664321942,"event":"spamreport","gender":"Female","ip_address":"128.214.160.228"}
105 | {"email":"avlahosrr@posterous.com","timestamp":1646024825,"event":"bounce","gender":"Female","ip_address":"147.51.176.231"}
--------------------------------------------------------------------------------
/AWSS3ToSnowflake.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SatadruMukherjee/Data-Preprocessing-Models/3aeeda4e68aafd47fe784a71c4aa02a34be76820/AWSS3ToSnowflake.zip
--------------------------------------------------------------------------------
/AWS_Bedrock_Text_Summarization.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {
7 | "id": "x5HdV-yvIty2"
8 | },
9 | "outputs": [],
10 | "source": [
11 | "!pip install boto3 langchain pypdf unstructured[pdf] langchain-community"
12 | ]
13 | },
14 | {
15 | "cell_type": "code",
16 | "execution_count": null,
17 | "metadata": {
18 | "id": "5sqr0kkWTAaa"
19 | },
20 | "outputs": [],
21 | "source": [
22 | "from langchain_community.document_loaders import UnstructuredPDFLoader\n",
23 | "from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter\n",
24 | "import json\n",
25 | "import boto3"
26 | ]
27 | },
28 | {
29 | "cell_type": "code",
30 | "execution_count": null,
31 | "metadata": {
32 | "id": "ehFtAxHGUy6j"
33 | },
34 | "outputs": [],
35 | "source": [
36 | "boto3_bedrock = boto3.client('bedrock-runtime',region_name='us-east-1',aws_access_key_id='{}',aws_secret_access_key='{}')"
37 | ]
38 | },
39 | {
40 | "cell_type": "code",
41 | "execution_count": null,
42 | "metadata": {
43 | "id": "CXLmd31WUfvZ"
44 | },
45 | "outputs": [],
46 | "source": [
47 | "def summarizer(prompt_data):\n",
48 | " inputText=prompt_data\n",
49 | " body_part=json.dumps({'inputText': inputText,\n",
50 | " 'textGenerationConfig': {'maxTokenCount': 8192,\n",
51 | " 'stopSequences': [],\n",
52 | " 'temperature': 0,\n",
53 | " 'topP': 1}})\n",
54 | " response = boto3_bedrock.invoke_model(\n",
55 | " body=body_part,\n",
56 | " contentType=\"application/json\",\n",
57 | " accept=\"application/json\",\n",
58 | " modelId='amazon.titan-text-express-v1'\n",
59 | " )\n",
60 | " output_text=json.loads(response['body'].read())['results'][0]['outputText']\n",
61 | " return output_text"
62 | ]
63 | },
64 | {
65 | "cell_type": "code",
66 | "execution_count": null,
67 | "metadata": {
68 | "id": "oJgKzeNvSiSj"
69 | },
70 | "outputs": [],
71 | "source": [
72 | "def read_pdf_and_split(filename):\n",
73 | " loader = UnstructuredPDFLoader(filename)\n",
74 | " data = loader.load()\n",
75 | " print(data)\n",
76 | " splitter = RecursiveCharacterTextSplitter(\n",
77 | " chunk_size=1000,\n",
78 | " chunk_overlap=100,\n",
79 | " length_function=len,\n",
80 | " add_start_index=True\n",
81 | " )\n",
82 | " splitted_text = splitter.split_documents(data)\n",
83 | "\n",
84 | " return splitted_text\n"
85 | ]
86 | },
87 | {
88 | "cell_type": "code",
89 | "execution_count": null,
90 | "metadata": {
91 | "id": "Kbw4ars1UBFI"
92 | },
93 | "outputs": [],
94 | "source": [
95 | "pdf_document = read_pdf_and_split('/content/YOGI_2_0.pdf')"
96 | ]
97 | },
98 | {
99 | "cell_type": "code",
100 | "source": [
101 | "pdf_document"
102 | ],
103 | "metadata": {
104 | "id": "1Ygmv_JZQZh4"
105 | },
106 | "execution_count": null,
107 | "outputs": []
108 | },
109 | {
110 | "cell_type": "code",
111 | "execution_count": null,
112 | "metadata": {
113 | "id": "n10KYlgRU27h"
114 | },
115 | "outputs": [],
116 | "source": [
117 | "len(pdf_document)"
118 | ]
119 | },
120 | {
121 | "cell_type": "code",
122 | "execution_count": null,
123 | "metadata": {
124 | "id": "47e8ZSV4YEwe"
125 | },
126 | "outputs": [],
127 | "source": [
128 | "summary= \"\"\n",
129 | "for i in pdf_document:\n",
130 | " # gathering the text content of that specific chunk\n",
131 | " chunk_content = i.page_content\n",
132 | " # creating the prompt that will be passed into Bedrock with the text content of the chunk\n",
133 | " prompt = f\"\"\"Human: Provide a detailed summary for the chunk of text provided to you:\n",
134 | " Text: {chunk_content}\"\"\"\n",
135 | " # passing the prompt into the summarizer function to generate the summary of that chunk, and appending it to\n",
136 | " # the summary string\n",
137 | " summary += summarizer(prompt)\n",
138 | "\n",
139 | "final_summary_prompt = f\"\"\"Human: You will be given a set of summaries from a document. Create a cohesive\n",
140 | "summary from the provided individual summaries. The summary should very detailed.\n",
141 | "Summaries: {summary}\"\"\"\n",
142 | "# generating the final summary of all the summaries we have previously generated.\n",
143 | "print(summarizer(final_summary_prompt))"
144 | ]
145 | }
146 | ],
147 | "metadata": {
148 | "colab": {
149 | "provenance": [],
150 | "authorship_tag": "ABX9TyNlKIthr1rY+Vrj0gHzReFL"
151 | },
152 | "kernelspec": {
153 | "display_name": "Python 3",
154 | "name": "python3"
155 | },
156 | "language_info": {
157 | "name": "python"
158 | }
159 | },
160 | "nbformat": 4,
161 | "nbformat_minor": 0
162 | }
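A quick standalone check of the summarizer() helper defined above, using a short inline prompt instead of a PDF chunk (the sample text is only an illustration):

# Assumes the summarizer() cell above has been executed.
sample_prompt = """Human: Provide a detailed summary for the chunk of text provided to you:
Text: Amazon Bedrock is a managed service that exposes foundation models through a single API."""
print(summarizer(sample_prompt))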
--------------------------------------------------------------------------------
/Analyze Apache Parquet optimized data using Amazon Kinesis Data Firehose, Amazon Athena.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SatadruMukherjee/Data-Preprocessing-Models/3aeeda4e68aafd47fe784a71c4aa02a34be76820/Analyze Apache Parquet optimized data using Amazon Kinesis Data Firehose, Amazon Athena.png
--------------------------------------------------------------------------------
/Architecture of the generic Job to ingest data from s3 to Snowflake.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SatadruMukherjee/Data-Preprocessing-Models/3aeeda4e68aafd47fe784a71c4aa02a34be76820/Architecture of the generic Job to ingest data from s3 to Snowflake.png
--------------------------------------------------------------------------------
/Artilary Demystified.txt:
--------------------------------------------------------------------------------
1 | VM Launch:
2 | ----
3 | ec2-ubuntu machine
4 |
5 | Installation:
6 | -------------
7 | sudo su
8 | curl -o- https://raw.githubusercontent.com/nvm-sh/nvm/v0.34.0/install.sh | bash
9 | . ~/.nvm/nvm.sh
10 | nvm install node
11 | nvm --version
12 | npm install -g artillery@latest
13 | artillery version
14 | artillery dino -m "Knowledge Amplifier" -r
15 |
16 |
17 |
18 | Case 1:
19 | --------
20 | {API (Base URL)}
21 |
22 | /firstdemo
23 |
24 | YAML File:
25 | -----------
26 | firstdemo.yml
27 |
28 | config:
29 | target: 'https://httpbin.org/'
30 | phases:
31 | - duration: 5
32 | arrivalRate: 2
33 | scenarios:
34 | - flow:
35 | - get:
36 | url: "/get"
37 |
38 | config:
39 | target: '{API (Base URL)}'
40 | phases:
41 | - duration: 5
42 | arrivalRate: 2
43 | scenarios:
44 | - flow:
45 | - get:
46 | url: "/firstdemo"
47 |
48 |
49 |
50 |
51 | Run the code:
52 | -------------
53 | DEBUG=http* artillery run /home/ubuntu/firstdemo.yml
54 |
55 | Understanding Summary:
56 | -----------------------
57 | All VUs finished. Total time: {}
58 | http.codes.<status> -- Number of responses received for each specific HTTP status code.
59 | http.request_rate -- Rate of HTTP requests made over the time period.
60 | http.requests -- Number of HTTP requests made.
61 | http.responses -- Number of HTTP responses received.
62 |
63 |
64 |
65 | Case 2:
66 | ---------
67 | seconddemo.yml
68 |
69 | config:
70 | target: '{API (Base URL)}'
71 | phases:
72 | - name: reduce_load
73 | duration: 3h
74 | arrivalRate: 1
75 | - name: nothing
76 | pause: 2m
77 | - name: stress
78 | duration: 2m
79 | arrivalRate: 3
80 | scenarios:
81 | - flow:
82 | - get:
83 | url: "/firstdemo"
84 |
85 |
86 | Shell Script:
87 | ---------------
88 | seconddemo.sh
89 |
90 | DEBUG=http* artillery run /home/ubuntu/seconddemo.yml
91 |
92 | Code run:
93 | ----------
94 | chmod 755 seconddemo.sh
95 | nohup "/home/ubuntu/seconddemo.sh" > /home/ubuntu/seconddemo.out 2>&1 &
96 |
97 | [1] 3042
98 |
99 | Case 3:
100 | ---------
101 | thirddemo.yml
102 |
103 | config:
104 | target: https://www.hccreference.com
105 | phases:
106 | - name: high_traffic
107 | duration: 2m
108 | arrivalRate: 2
109 | - name: nothing
110 | pause: 1m
111 | - name: stress
112 | duration: 10
113 | arrivalRate: 32
114 | payload:
115 | path: /home/ubuntu/extarcted_data_for_testing30k.csv
116 | order: sequence # default: random
117 | loadAll: true
118 | skipHeader: true # default: false
119 | delimiter: "," # default: ,
120 | skipEmptyLines: true # default: true
121 | fields:
122 | - "dos_year"
123 | - "drf"
124 | - "dx_hcc"
125 | - "search"
126 |
127 | scenarios:
128 | - name: testhcc
129 | flow:
130 | - post:
131 | url: /api/search
132 | headers:
133 | Content-Type: application/json
134 | json:
135 | dos_year: "{{ dos_year }}"
136 | drf: "{{ drf }}"
137 | dx_hcc: "{{ dx_hcc }}"
138 | search: "{{ search }}"
139 |
140 | Shell Script:
141 | ---------------
142 | thirddemo.sh
143 |
144 | DEBUG=http* artillery run /home/ubuntu/thirddemo.yml
145 |
146 | Code run:
147 | ----------
148 | chmod 755 thirddemo.sh
149 | nohup "/home/ubuntu/thirddemo.sh" > /home/ubuntu/thirddemo.out 2>&1 &
150 |
151 | 3154
--------------------------------------------------------------------------------
/Bloom Filter Theory.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SatadruMukherjee/Data-Preprocessing-Models/3aeeda4e68aafd47fe784a71c4aa02a34be76820/Bloom Filter Theory.pdf
--------------------------------------------------------------------------------
/Build & Push Docker Image to AWS ECR using GitHub Actions.txt:
--------------------------------------------------------------------------------
1 | Reference:
2 | ------------
3 | https://github.com/aws-actions/amazon-ecr-login
4 | https://docs.aws.amazon.com/lambda/latest/dg/python-image.html
5 | https://github.com/SatadruMukherjee/Data-Preprocessing-Models/blob/main/Github%20Action%20with%20AWS.txt
6 |
7 | Github Action Code:
8 | -----------------------
9 | name: ecr_docker_deployment
10 | on: [push]
11 | jobs:
12 | docker_cicd:
13 | runs-on: ubuntu-latest
14 | steps:
15 | - uses: actions/checkout@v4
16 | - name: Configure AWS credentials
17 | uses: aws-actions/configure-aws-credentials@v1
18 | with:
19 | aws-access-key-id:
20 | aws-secret-access-key:
21 | aws-region: us-east-1
22 | - name: Login to Amazon ECR
23 | id: login-ecr
24 | uses: aws-actions/amazon-ecr-login@v2
25 |
26 | - name: Build, tag, and push docker image to Amazon ECR
27 | env:
28 | REGISTRY: ${{ steps.login-ecr.outputs.registry }}
29 | REPOSITORY: demoytcicdgithubaction
30 | IMAGE_TAG: ${{ github.sha }}
31 | run: |
32 | docker build -t $REGISTRY/$REPOSITORY:$IMAGE_TAG .
33 | docker push $REGISTRY/$REPOSITORY:$IMAGE_TAG
34 | aws lambda update-function-code \
35 | --function-name demoytcicdecrtest \
36 | --image-uri $REGISTRY/$REPOSITORY:$IMAGE_TAG
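The final CLI step has a direct boto3 equivalent, useful for a local smoke test of the deployment call; the registry and tag values below are placeholders mirroring the workflow variables:

import boto3

lambda_client = boto3.client('lambda', region_name='us-east-1')
response = lambda_client.update_function_code(
    FunctionName='demoytcicdecrtest',
    ImageUri='{registry}/demoytcicdgithubaction:{image_tag}'   # placeholder image URI
)
print(response['LastModified'], response.get('LastUpdateStatus'))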
--------------------------------------------------------------------------------
/Cake preparation-Detail Architecture (1).jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SatadruMukherjee/Data-Preprocessing-Models/3aeeda4e68aafd47fe784a71c4aa02a34be76820/Cake preparation-Detail Architecture (1).jpg
--------------------------------------------------------------------------------
/Create_,_Update_,_Delete_Athena_Prepared_Statement_using_boto3.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "nbformat": 4,
3 | "nbformat_minor": 0,
4 | "metadata": {
5 | "colab": {
6 | "provenance": [],
7 | "authorship_tag": "ABX9TyOldsBynHv6Pz97bdTKjPP2",
8 | "include_colab_link": true
9 | },
10 | "kernelspec": {
11 | "name": "python3",
12 | "display_name": "Python 3"
13 | },
14 | "language_info": {
15 | "name": "python"
16 | }
17 | },
18 | "cells": [
19 | {
20 | "cell_type": "markdown",
21 | "metadata": {
22 | "id": "view-in-github",
23 | "colab_type": "text"
24 | },
25 | "source": [
26 | "
"
27 | ]
28 | },
29 | {
30 | "cell_type": "markdown",
31 | "source": [
32 | "Prerequisite:\n",
33 | "https://github.com/SatadruMukherjee/Data-Preprocessing-Models/blob/main/Athena_parameterized_queries_using_Python.ipynb"
34 | ],
35 | "metadata": {
36 | "id": "999exryIKhvN"
37 | }
38 | },
39 | {
40 | "cell_type": "code",
41 | "execution_count": null,
42 | "metadata": {
43 | "id": "OqDN6-_N1yNb"
44 | },
45 | "outputs": [],
46 | "source": [
47 | "!pip install boto3"
48 | ]
49 | },
50 | {
51 | "cell_type": "code",
52 | "source": [
53 | "import boto3\n",
54 | "access_key=''\n",
55 | "secret_key=''\n",
56 | "session = boto3.Session(\n",
57 | " aws_access_key_id=access_key,\n",
58 | " aws_secret_access_key=secret_key,region_name='us-east-1'\n",
59 | ")\n",
60 | "athena_client = session.client('athena')\n"
61 | ],
62 | "metadata": {
63 | "id": "v_Ao18Cc116f"
64 | },
65 | "execution_count": null,
66 | "outputs": []
67 | },
68 | {
69 | "cell_type": "markdown",
70 | "source": [
71 | "# Create prepared statements using boto3"
72 | ],
73 | "metadata": {
74 | "id": "6Bxn7tJl2kFb"
75 | }
76 | },
77 | {
78 | "cell_type": "code",
79 | "source": [
80 | "resp = athena_client.create_prepared_statement(\n",
81 | " StatementName = \"iris_fulla\",\n",
82 | " WorkGroup = \"primary\",\n",
83 | " QueryStatement = \"\"\"\n",
84 | " SELECT sum(sepal_length) FROM irisdemo WHERE variety = ? \n",
85 | " \"\"\"\n",
86 | " )"
87 | ],
88 | "metadata": {
89 | "id": "z-vW5iQs17HT"
90 | },
91 | "execution_count": null,
92 | "outputs": []
93 | },
94 | {
95 | "cell_type": "markdown",
96 | "source": [
97 | "# Update prepared statements using boto3"
98 | ],
99 | "metadata": {
100 | "id": "FfniHtX924WU"
101 | }
102 | },
103 | {
104 | "cell_type": "code",
105 | "source": [
106 | "response = athena_client.update_prepared_statement(\n",
107 | " StatementName = \"iris_fulla\",\n",
108 | " WorkGroup = \"primary\",\n",
109 | " QueryStatement = \"\"\"\n",
110 | " SELECT sum(petal_length) FROM irisdemo WHERE variety = ? ;\n",
111 | " \"\"\"\n",
112 | ")"
113 | ],
114 | "metadata": {
115 | "id": "2k0vZkzP2zZd"
116 | },
117 | "execution_count": null,
118 | "outputs": []
119 | },
120 | {
121 | "cell_type": "markdown",
122 | "source": [
123 | "# Delete prepared statements\n"
124 | ],
125 | "metadata": {
126 | "id": "VWq2ha3a3jt9"
127 | }
128 | },
129 | {
130 | "cell_type": "code",
131 | "source": [
132 | "--to remove the prepared statement\n",
133 | "DEALLOCATE PREPARE iris_fulla;"
134 | ],
135 | "metadata": {
136 | "id": "YtU9ZqcY3mUg"
137 | },
138 | "execution_count": null,
139 | "outputs": []
140 | },
141 | {
142 | "cell_type": "code",
143 | "source": [
144 | "response = athena_client.delete_prepared_statement(\n",
145 | " StatementName='iris_fulla',\n",
146 | " WorkGroup='primary'\n",
147 | ")"
148 | ],
149 | "metadata": {
150 | "id": "140SfLvO6Tp1"
151 | },
152 | "execution_count": null,
153 | "outputs": []
154 | },
155 | {
156 | "cell_type": "markdown",
157 | "source": [
158 | "# *security **using** Amazon Athena parameterized queries*"
159 | ],
160 | "metadata": {
161 | "id": "pCzKTd1B7PRt"
162 | }
163 | },
164 | {
165 | "cell_type": "markdown",
166 | "source": [
167 | "https://aws.amazon.com/blogs/big-data/improve-reusability-and-security-using-amazon-athena-parameterized-queries/"
168 | ],
169 | "metadata": {
170 | "id": "tQ2DAFoJKq1d"
171 | }
172 | }
173 | ]
174 | }
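A minimal sketch of actually running the prepared statement created above, reusing athena_client from the earlier cell; the database name and output bucket are placeholders, and 'Setosa' is an example value (string parameters are passed with their single quotes):

query = athena_client.start_query_execution(
    QueryString='EXECUTE iris_fulla',
    QueryExecutionContext={'Database': '{database_name}'},                    # placeholder
    ResultConfiguration={'OutputLocation': 's3://{athena-results-bucket}/'},  # placeholder
    WorkGroup='primary',
    ExecutionParameters=["'Setosa'"]                                          # example parameter value
)
print(query['QueryExecutionId'])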
--------------------------------------------------------------------------------
/DLQ Redrive.drawio.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SatadruMukherjee/Data-Preprocessing-Models/3aeeda4e68aafd47fe784a71c4aa02a34be76820/DLQ Redrive.drawio.png
--------------------------------------------------------------------------------
/Data Warehouse in Snowflake.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SatadruMukherjee/Data-Preprocessing-Models/3aeeda4e68aafd47fe784a71c4aa02a34be76820/Data Warehouse in Snowflake.png
--------------------------------------------------------------------------------
/Data_Ingestion_from_GSheet_to_s3.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "nbformat": 4,
3 | "nbformat_minor": 0,
4 | "metadata": {
5 | "colab": {
6 | "provenance": [],
7 | "authorship_tag": "ABX9TyOVHQq+Baz2iKjbD87oOQB3",
8 | "include_colab_link": true
9 | },
10 | "kernelspec": {
11 | "name": "python3",
12 | "display_name": "Python 3"
13 | },
14 | "language_info": {
15 | "name": "python"
16 | }
17 | },
18 | "cells": [
19 | {
20 | "cell_type": "markdown",
21 | "metadata": {
22 | "id": "view-in-github",
23 | "colab_type": "text"
24 | },
25 | "source": [
26 | "
"
27 | ]
28 | },
29 | {
30 | "cell_type": "code",
31 | "execution_count": null,
32 | "metadata": {
33 | "id": "nzsqL_sbGhzp"
34 | },
35 | "outputs": [],
36 | "source": [
37 | "!pip install pandas gspread google-auth google-auth-oauthlib google-auth-httplib2 boto3"
38 | ]
39 | },
40 | {
41 | "cell_type": "code",
42 | "source": [
43 | "import os.path\n",
44 | "import pandas as pd\n",
45 | "from google.auth.transport.requests import Request\n",
46 | "from google_auth_oauthlib.flow import InstalledAppFlow\n",
47 | "from googleapiclient.discovery import build\n",
48 | "from googleapiclient.errors import HttpError\n",
49 | "from google.oauth2.service_account import Credentials\n",
50 | "from io import StringIO\n",
51 | "import boto3"
52 | ],
53 | "metadata": {
54 | "id": "31V0PWGyGkbl"
55 | },
56 | "execution_count": null,
57 | "outputs": []
58 | },
59 | {
60 | "cell_type": "code",
61 | "source": [
62 | "# The ID and range of a sample spreadsheet.\n",
63 | "SAMPLE_SPREADSHEET_ID = \"{}\"\n",
64 | "\n",
65 | "scopes = [\n",
66 | " 'https://www.googleapis.com/auth/spreadsheets',\n",
67 | " 'https://www.googleapis.com/auth/drive'\n",
68 | " ]\n",
69 | "credentials = Credentials.from_service_account_info({\n",
70 | "\n",
71 | "}\n",
72 | ", scopes=scopes)\n",
73 | "service = build(\"sheets\", \"v4\", credentials=credentials)\n",
74 | "sheet = service.spreadsheets()\n",
75 | "result = (\n",
76 | "sheet.values()\n",
77 | ".get(spreadsheetId=SAMPLE_SPREADSHEET_ID,range='{}')\n",
78 | ".execute()\n",
79 | ")\n",
80 | "values = result.get(\"values\", [])\n",
81 | "values"
82 | ],
83 | "metadata": {
84 | "id": "c1GJ_DWFG9RP"
85 | },
86 | "execution_count": null,
87 | "outputs": []
88 | },
89 | {
90 | "cell_type": "code",
91 | "source": [
92 | "df = pd.DataFrame(values[1:], columns=values[0])"
93 | ],
94 | "metadata": {
95 | "id": "19a3S9_gJp76"
96 | },
97 | "execution_count": null,
98 | "outputs": []
99 | },
100 | {
101 | "cell_type": "code",
102 | "source": [
103 | "df"
104 | ],
105 | "metadata": {
106 | "id": "_PgBK1DQKaHd"
107 | },
108 | "execution_count": null,
109 | "outputs": []
110 | },
111 | {
112 | "cell_type": "code",
113 | "source": [
114 | "# Initialize S3 client\n",
115 | "s3_client = boto3.client(\n",
116 | " \"s3\",\n",
117 | " aws_access_key_id='{}',\n",
118 | " aws_secret_access_key='{}',\n",
119 | " region_name='us-east-1'\n",
120 | ")\n",
121 | "\n",
122 | "# Convert DataFrame to CSV and upload to S3\n",
123 | "csv_buffer = StringIO()\n",
124 | "df.to_csv(csv_buffer, index=False)\n",
125 | "\n",
126 | "s3_client.put_object(\n",
127 | " Bucket='{}',\n",
128 | " Key='{}/write_drive_data.csv',\n",
129 | " Body=csv_buffer.getvalue()\n",
130 | ")\n",
131 | "\n",
132 | "print(f\"DataFrame successfully uploaded to s3\")"
133 | ],
134 | "metadata": {
135 | "id": "NSCAj0DP9Qah"
136 | },
137 | "execution_count": null,
138 | "outputs": []
139 | }
140 | ]
141 | }
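An optional verification step, assuming the same s3_client and the same bucket/key placeholders as the upload cell: read the object back into pandas.

import io

obj = s3_client.get_object(Bucket='{}', Key='{}/write_drive_data.csv')
df_check = pd.read_csv(io.BytesIO(obj['Body'].read()))
print(df_check.shape)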
--------------------------------------------------------------------------------
/DeltaLake Handwritten Notes.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SatadruMukherjee/Data-Preprocessing-Models/3aeeda4e68aafd47fe784a71c4aa02a34be76820/DeltaLake Handwritten Notes.pdf
--------------------------------------------------------------------------------
/Delta_Lake in AWS Lambda.txt:
--------------------------------------------------------------------------------
1 | lambda_function.py
2 | =================
3 | import os
4 | import boto3
5 | import pandas as pd
6 | from deltalake import DeltaTable
7 | from deltalake.writer import write_deltalake
8 | import json
9 |
10 | def get_s3_object(bucket, key):
11 | """Get object from S3"""
12 | s3_client = boto3.client('s3')
13 | try:
14 | response = s3_client.get_object(Bucket=bucket, Key=key)
15 | print(f"Successfully retrieved object from S3: {bucket}/{key}")
16 | return response['Body']
17 | except Exception as e:
18 | print(f"Error reading from S3: {str(e)}")
19 | raise
20 |
21 | def read_csv_from_s3(s3_object):
22 | """Read CSV data from S3 object into pandas DataFrame"""
23 | try:
24 | df = pd.read_csv(s3_object)
25 | print(f"Successfully read CSV with {len(df)} rows")
26 | return df
27 | except Exception as e:
28 | print(f"Error parsing CSV: {str(e)}")
29 | raise
30 |
31 | def write_to_delta(df, table_url):
32 | """Write DataFrame to Delta Lake"""
33 | try:
34 | print(f"Attempting to write to Delta Lake at: {table_url}")
35 | write_deltalake(table_url, df , mode='append')
36 | print("Successfully wrote to Delta Lake")
37 | return True
38 | except Exception as e:
39 | print(f"Error writing to Delta Lake: {str(e)}")
40 | raise
41 |
42 | def lambda_handler(event, context):
43 | """Main Lambda handler"""
44 | try:
45 | # Print the incoming event
46 | print(f"Received event: {json.dumps(event)}")
47 |
48 | # Get environment variables
49 | table_url = os.environ.get('TABLE_URL')
50 | if not table_url:
51 | raise ValueError("TABLE_URL environment variable is not set")
52 |
53 | print(f"Using TABLE_URL: {table_url}")
54 |
55 | # Get S3 event details
56 | records = event.get('Records', [])
57 | if not records:
58 | raise ValueError("No records found in event")
59 |
60 | # Process each record (usually there will be one)
61 | for record in records:
62 | # Extract S3 information
63 | bucket = record['s3']['bucket']['name']
64 | key = record['s3']['object']['key']
65 |
66 | print(f"Processing file {key} from bucket {bucket}")
67 |
68 | # Get and process the file
69 | s3_object = get_s3_object(bucket, key)
70 | df = read_csv_from_s3(s3_object)
71 |
72 | # Write to Delta Lake
73 | write_to_delta(df, table_url)
74 |
75 | print(f"Successfully processed {key}")
76 |
77 | return {
78 | 'statusCode': 200,
79 | 'headers': {'Content-Type': 'application/json'},
80 | 'body': json.dumps({'message': 'Successfully processed CSV to Delta Lake'})
81 | }
82 |
83 | except Exception as e:
84 | error_message = str(e)
85 | print(f"Error in lambda_handler: {error_message}")
86 | print(f"Full error details: {repr(e)}")
87 | return {
88 | 'statusCode': 500,
89 | 'headers': {'Content-Type': 'application/json'},
90 | 'body': json.dumps({'error': error_message})
91 | }
92 |
93 | Dockerfile:
94 | ============
95 | FROM public.ecr.aws/lambda/python:3.12
96 |
97 | # Copy requirements.txt
98 | COPY requirements.txt ${LAMBDA_TASK_ROOT}
99 |
100 | # Install the specified packages
101 | RUN pip install -r requirements.txt
102 |
103 | # Copy function code
104 | COPY lambda_function.py ${LAMBDA_TASK_ROOT}
105 |
106 | # Set the CMD to your handler (could also be done as a parameter override outside of the Dockerfile)
107 | CMD [ "lambda_function.lambda_handler" ]
108 |
109 |
110 | requirements.txt:
111 | =================
112 | pandas
113 | deltalake
114 |
115 | Github Action Code:
116 | -----------------------
117 | name: ecr_docker_deployment
118 | on: [push]
119 | jobs:
120 | docker_cicd:
121 | runs-on: ubuntu-latest
122 | steps:
123 | - uses: actions/checkout@v4
124 | - name: Configure AWS credentials
125 | uses: aws-actions/configure-aws-credentials@v1
126 | with:
127 | aws-access-key-id:
128 | aws-secret-access-key:
129 | aws-region: us-east-1
130 | - name: Login to Amazon ECR
131 | id: login-ecr
132 | uses: aws-actions/amazon-ecr-login@v2
133 |
134 | - name: Build, tag, and push docker image to Amazon ECR
135 | env:
136 | REGISTRY: ${{ steps.login-ecr.outputs.registry }}
137 | REPOSITORY: deltalakelambdayt
138 | IMAGE_TAG: ${{ github.sha }}
139 | run: |
140 | docker build -t $REGISTRY/$REPOSITORY:$IMAGE_TAG .
141 | docker push $REGISTRY/$REPOSITORY:$IMAGE_TAG
142 |
143 | Athena SQL:
144 | ============
145 | DROP TABLE IF EXISTS deltalake_db.lambda_delta;
146 |
147 | CREATE EXTERNAL TABLE deltalake_db.lambda_delta
148 | LOCATION 's3://{Bucket_Name}/delta_lake/'
149 | TBLPROPERTIES ('table_type' = 'DELTA');
150 |
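A quick read-back check with the same deltalake package the Lambda uses (it imports DeltaTable but only calls write_deltalake); the table location is the placeholder from the Athena DDL, and AWS credentials are assumed to be available in the environment:

from deltalake import DeltaTable

dt = DeltaTable('s3://{Bucket_Name}/delta_lake/')   # same location as the Athena external table
print(dt.version())           # latest Delta table version
print(dt.to_pandas().head())  # sample of the appended rows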
--------------------------------------------------------------------------------
/Delta_Sharing_Python.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "nbformat": 4,
3 | "nbformat_minor": 0,
4 | "metadata": {
5 | "colab": {
6 | "provenance": [],
7 | "authorship_tag": "ABX9TyPR8DU7eGWuInrHAmO4VpyP",
8 | "include_colab_link": true
9 | },
10 | "kernelspec": {
11 | "name": "python3",
12 | "display_name": "Python 3"
13 | },
14 | "language_info": {
15 | "name": "python"
16 | }
17 | },
18 | "cells": [
19 | {
20 | "cell_type": "markdown",
21 | "metadata": {
22 | "id": "view-in-github",
23 | "colab_type": "text"
24 | },
25 | "source": [
26 | "
"
27 | ]
28 | },
29 | {
30 | "cell_type": "code",
31 | "execution_count": null,
32 | "metadata": {
33 | "id": "EVzXlXSESBex"
34 | },
35 | "outputs": [],
36 | "source": [
37 | "!pip install --upgrade delta-sharing\n"
38 | ]
39 | },
40 | {
41 | "cell_type": "code",
42 | "source": [
43 | "import delta_sharing\n",
44 | "\n",
45 | "# Point to the profile file. It can be a file on the local file system or a file on a remote storage.\n",
46 | "profile_file = \"/content/config.share\"\n",
47 | "\n",
48 | "# Create a SharingClient.\n",
49 | "client = delta_sharing.SharingClient(profile_file)\n",
50 | "\n",
51 | "# List all shared tables.\n",
52 | "client.list_all_tables()"
53 | ],
54 | "metadata": {
55 | "id": "lhl16bDZTALh"
56 | },
57 | "execution_count": null,
58 | "outputs": []
59 | },
60 | {
61 | "cell_type": "code",
62 | "source": [
63 | "table_url = profile_file + \"#..\"\n",
64 | "\n",
65 | "# Fetch 10 rows from a table and convert it to a Pandas DataFrame. This can be used to read sample data\n",
66 | "# from a table that cannot fit in the memory.\n",
67 | "df=delta_sharing.load_as_pandas(table_url, limit=10)"
68 | ],
69 | "metadata": {
70 | "id": "T3vv8RmSTH33"
71 | },
72 | "execution_count": null,
73 | "outputs": []
74 | },
75 | {
76 | "cell_type": "code",
77 | "source": [
78 | "df.head()"
79 | ],
80 | "metadata": {
81 | "id": "3STzY8-XTXgu"
82 | },
83 | "execution_count": null,
84 | "outputs": []
85 | }
86 | ]
87 | }
--------------------------------------------------------------------------------
/Demo Data used in Generic Framework for External Table.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SatadruMukherjee/Data-Preprocessing-Models/3aeeda4e68aafd47fe784a71c4aa02a34be76820/Demo Data used in Generic Framework for External Table.zip
--------------------------------------------------------------------------------
/Docker Build Args in-depth intuition.txt:
--------------------------------------------------------------------------------
1 |
2 |
3 | Step 1:
4 | --------
5 | Upload requirements.txt, app.py & test.py in s3 folder
6 |
7 | Dockerfile:
8 | --------------
9 | FROM python:3.7
10 | ARG AWS_ACCESS_KEY_ID
11 | ARG AWS_SECRET_ACCESS_KEY
12 | ARG AWS_DEFAULT_REGION
13 | COPY . /app
14 | WORKDIR /app
15 | RUN curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "awscliv2.zip"
16 | RUN unzip awscliv2.zip
17 | RUN ./aws/install
18 | RUN aws s3 cp --recursive s3://gluezipp/demoyttestdockerset/ /app
19 | RUN pip install -r requirements.txt
20 | CMD ["python","app.py"]
21 |
22 | In Host:
23 | --------
24 | export AWS_ACCESS_KEY_ID={your_access_key_id}
25 | export AWS_SECRET_ACCESS_KEY={your_secret_access_key}
26 | export AWS_DEFAULT_REGION=us-east-1
27 |
28 | General Code:
29 | ---------------
30 | docker build -t welcome-app .
31 |
32 | docker build --build-arg AWS_ACCESS_KEY_ID=$AWS_ACCESS_KEY_ID --build-arg AWS_SECRET_ACCESS_KEY=$AWS_SECRET_ACCESS_KEY --build-arg AWS_DEFAULT_REGION=$AWS_DEFAULT_REGION -t welcome-app .
33 |
34 |
35 |
36 | Run Container:
37 | -----------------
38 | docker run -p 5000:5000 welcome-app
39 |
40 | Open another console:
41 | ------------------------
42 | docker container ls
43 | docker exec -it bf7f40daab84 sh
44 |
45 | Observation: The file is not there
46 |
47 |
48 |
--------------------------------------------------------------------------------
/ETL Automation with Metadata Management and Job Tracking.txt:
--------------------------------------------------------------------------------
1 | DynamoDB Table:
2 | ------------------
3 | DD_JobExecLog
4 |
5 | PK --JobName
6 | SK--JobRunID
7 |
8 | Glue Job:
9 | ------------
10 | import sys
11 | from awsglue.transforms import *
12 | from awsglue.utils import getResolvedOptions
13 | from pyspark.context import SparkContext
14 | from awsglue.context import GlueContext
15 | from awsglue.job import Job
16 | from pyspark.sql.functions import *
17 | from pyspark.sql.types import *
18 | from pyspark.sql import SparkSession
19 | spark = SparkSession.builder.getOrCreate()
20 |
21 |
22 | def main():
23 | ## @params: [JOB_NAME]
24 | args = getResolvedOptions(sys.argv, ["VAL1","VAL2"])
25 | file_name=args['VAL1']
26 | bucket_name=args['VAL2']
27 | print("Bucket Name" , bucket_name)
28 | print("File Name" , file_name)
29 | input_file_path="s3a://{}/{}".format(bucket_name,file_name)
30 | print("Input File Path : ",input_file_path);
31 | df = spark.read.csv(input_file_path, header = True)
32 |     df.repartition(1).write.mode('overwrite').parquet("s3a://{}/{}".format("{target_bucket}", file_name.split('.')[0]))  # fill in the destination bucket name for {target_bucket}
33 |
34 | main()
35 |
36 | Trigger Lambda:
37 | -----------------
38 | from datetime import datetime, timedelta
39 | import json
40 | import boto3
41 | client = boto3.client('glue')
42 | dd = boto3.resource('dynamodb')
43 | table = dd.Table('DD_JobExecLog')
44 |
45 | start_timestamp = str(datetime.now())
46 | glue_job_name="yt_csv_to_parquet"
47 | def lambda_handler(event, context):
48 | for record in event['Records']:
49 | file_name = record['s3']['object']['key']
50 | bucketName=record['s3']['bucket']['name']
51 | print("File Name : ",file_name)
52 | print("Bucket Name : ",bucketName)
53 | fullS3Path = "s3://" + bucketName + "/" + file_name
54 | glue=boto3.client('glue');
55 | response = glue.start_job_run(JobName = glue_job_name, Arguments={"--VAL1":file_name,"--VAL2":bucketName})
56 | print(response)
57 | # Converting "response" from Type dict to string
58 | string_response = json.dumps(response)
59 | # Parsing JSON response from Glue API
60 | parsed_response = json.loads(string_response)
61 | ###########################
62 | #
63 | #
64 | table.put_item(
65 | Item={
66 | 'JobName': glue_job_name,
67 | 'JobRunID': parsed_response['JobRunId'],
68 | 'job_state': 'STARTED (Lambda)',
69 | 'start_timestamp': start_timestamp,
70 | 'update_timestamp': 'null',
71 | 'job_message': 'Job Triggered by Lambda',
72 | 'job_severity': 'null',
73 | 's3_file_key': fullS3Path,
74 | 'job_region': 'null',
75 | 'job_time': 'null',
76 | 'job_account': 'null',
77 | 'glue_metadata': parsed_response
78 | }
79 | )
80 |
81 |
82 |
83 | Update Lambda:
84 | ------------------
85 | from datetime import datetime, timedelta
86 | import json
87 | import boto3
88 | dd = boto3.resource('dynamodb')
89 | table = dd.Table('DD_JobExecLog')
90 |
91 | def lambda_handler(event, context):
92 | print("Event",event)
93 | jobName = event['detail']['jobName']
94 | jobRunId = event['detail']['jobRunId']
95 | job_time = event['time']
96 | j_account = event['account']
97 | j_region = event['region']
98 | j_severity = event['detail']['severity']
99 | j_state = event['detail']['state']
100 | j_message = event['detail']['message']
101 | update_timestamp = str(datetime.now())
102 | #
103 | if jobName == "yt_csv_to_parquet":
104 | table.update_item(
105 | Key={
106 | 'JobName': jobName,
107 | 'JobRunID': jobRunId
108 | },
109 | UpdateExpression='SET job_message= :msg, job_severity= :sev, update_timestamp = :upd_ts, job_time= :jb_tm, job_region= :j_region, job_state= :v_state, job_account= :acc ',
110 | ExpressionAttributeValues={
111 | ':upd_ts': update_timestamp,
112 | ':jb_tm': job_time,
113 | ':j_region': j_region,
114 | ':sev': j_severity,
115 | ':v_state': j_state,
116 | ':msg': j_message,
117 | ':acc': j_account
118 | }
119 | )
120 | #
121 |
122 | Cloudwatch Rule:
123 | ------------------
124 | {
125 | "source": ["aws.glue"],
126 | "detail-type":
127 | [
128 | "Glue Job State Change",
129 | "Glue Job Run Status"
130 | ]
131 | }
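A small sketch for inspecting the run log that the two Lambdas write to DD_JobExecLog; the job name comes from the code above, the rest is stock boto3:

import boto3
from boto3.dynamodb.conditions import Key

table = boto3.resource('dynamodb').Table('DD_JobExecLog')
resp = table.query(KeyConditionExpression=Key('JobName').eq('yt_csv_to_parquet'))
for item in resp['Items']:
    print(item['JobRunID'], item.get('job_state'), item.get('update_timestamp'))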
--------------------------------------------------------------------------------
/Error Notifications for Snowflake Tasks using SNS.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SatadruMukherjee/Data-Preprocessing-Models/3aeeda4e68aafd47fe784a71c4aa02a34be76820/Error Notifications for Snowflake Tasks using SNS.png
--------------------------------------------------------------------------------
/Geolocation_from_IP.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "nbformat": 4,
3 | "nbformat_minor": 0,
4 | "metadata": {
5 | "colab": {
6 | "provenance": [],
7 | "authorship_tag": "ABX9TyMeWIuDcO1FM9I+cdmOIWqx"
8 | },
9 | "kernelspec": {
10 | "name": "python3",
11 | "display_name": "Python 3"
12 | },
13 | "language_info": {
14 | "name": "python"
15 | }
16 | },
17 | "cells": [
18 | {
19 | "cell_type": "markdown",
20 | "source": [
21 | "# **Resources**"
22 | ],
23 | "metadata": {
24 | "id": "l1rfOxVLKdoX"
25 | }
26 | },
27 | {
28 | "cell_type": "markdown",
29 | "source": [
30 | "* [ipinfo](https://ipinfo.io/)\n",
31 | "\n",
32 | "* [Country Code](https://country.io/names.json?ref=ipinfo.io)\n",
33 | "\n",
34 | "* [Json Formatter](https://jsonformatter.curiousconcept.com/#)\n",
35 | "\n",
36 | "\n"
37 | ],
38 | "metadata": {
39 | "id": "q0RwkuBsKL5h"
40 | }
41 | },
42 | {
43 | "cell_type": "code",
44 | "source": [
45 | "import json\n",
46 | "import requests"
47 | ],
48 | "metadata": {
49 | "id": "9XVg9L01HV44"
50 | },
51 | "execution_count": null,
52 | "outputs": []
53 | },
54 | {
55 | "cell_type": "code",
56 | "execution_count": null,
57 | "metadata": {
58 | "id": "IrZEkSwQHKyH"
59 | },
60 | "outputs": [],
61 | "source": [
62 | "def geolocation(public_ip):\n",
63 | "\turl = \"http://ipinfo.io/\"+ public_ip\n",
64 | "\tresponse = requests.get(url)\n",
65 | "\tdata = json.loads(response.text)\n",
66 | "\tprint(data)"
67 | ]
68 | },
69 | {
70 | "cell_type": "code",
71 | "source": [
72 | "public_ip='{Put your IP here}'"
73 | ],
74 | "metadata": {
75 | "id": "6M7wmQqzHfM2"
76 | },
77 | "execution_count": null,
78 | "outputs": []
79 | },
80 | {
81 | "cell_type": "code",
82 | "source": [
83 | "geolocation(public_ip)"
84 | ],
85 | "metadata": {
86 | "id": "Yl7RkWMXHcen"
87 | },
88 | "execution_count": null,
89 | "outputs": []
90 | },
91 | {
92 | "cell_type": "markdown",
93 | "source": [
94 | "# **Lambda Function**"
95 | ],
96 | "metadata": {
97 | "id": "qyJuBky0J_LR"
98 | }
99 | },
100 | {
101 | "cell_type": "code",
102 | "source": [
103 | "import json\n",
104 | "import requests\n",
105 | "def geolocation(public_ip):\n",
106 | "\turl = \"http://ipinfo.io/\"+ public_ip\n",
107 | "\tresponse = requests.get(url)\n",
108 | "\tdata = json.loads(response.text)\n",
109 | "\treturn data\n",
110 | "\n",
111 | "def lambda_handler(event, context):\n",
112 | " # TODO implement\n",
113 | " print(event)\n",
114 | " source_ip=event['requestContext']['http']['sourceIp']\n",
115 | " response=geolocation(source_ip)\n",
116 | " print(\"Response: \",response)"
117 | ],
118 | "metadata": {
119 | "id": "AkMAy9M2KBnu"
120 | },
121 | "execution_count": null,
122 | "outputs": []
123 | }
124 | ]
125 | }
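Note that the Lambda above depends on the requests package, which is not bundled with the Python Lambda runtime, so it needs a layer or a deployment package. A standard-library-only variant of the same lookup, as an alternative rather than the original author's code:

import json
import urllib.request

def geolocation(public_ip):
    # Same ipinfo.io endpoint as above, called with urllib instead of requests
    with urllib.request.urlopen("http://ipinfo.io/" + public_ip) as resp:
        return json.loads(resp.read().decode("utf-8"))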
--------------------------------------------------------------------------------
/Get_Data_Lineage_using_Rest_API.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "nbformat": 4,
3 | "nbformat_minor": 0,
4 | "metadata": {
5 | "colab": {
6 | "provenance": []
7 | },
8 | "kernelspec": {
9 | "name": "python3",
10 | "display_name": "Python 3"
11 | },
12 | "language_info": {
13 | "name": "python"
14 | }
15 | },
16 | "cells": [
17 | {
18 | "cell_type": "code",
19 | "execution_count": 23,
20 | "metadata": {
21 | "id": "lgVFFTmOPedj"
22 | },
23 | "outputs": [],
24 | "source": [
25 | "import requests\n",
26 | "import json"
27 | ]
28 | },
29 | {
30 | "cell_type": "code",
31 | "source": [
32 | "access_token = '{}'"
33 | ],
34 | "metadata": {
35 | "id": "gjGYxVGAPt6m"
36 | },
37 | "execution_count": 25,
38 | "outputs": []
39 | },
40 | {
41 | "cell_type": "code",
42 | "source": [
43 | "endpoint =\"https://{}/api/2.0/lineage-tracking/table-lineage\""
44 | ],
45 | "metadata": {
46 | "id": "m4ACYxOkP999"
47 | },
48 | "execution_count": 26,
49 | "outputs": []
50 | },
51 | {
52 | "cell_type": "code",
53 | "source": [
54 | "payload = {\"table_name\": \"workspace.default.iris_date_transformed\", \"include_entity_lineage\": True}"
55 | ],
56 | "metadata": {
57 | "id": "BeSnKJTvmGke"
58 | },
59 | "execution_count": 30,
60 | "outputs": []
61 | },
62 | {
63 | "cell_type": "code",
64 | "source": [
65 | "response= requests.get(endpoint, headers={\"Authorization\": f\"Bearer {access_token}\"}, data=json.dumps(payload))"
66 | ],
67 | "metadata": {
68 | "id": "Mb6ceI0CmAzN"
69 | },
70 | "execution_count": 31,
71 | "outputs": []
72 | },
73 | {
74 | "cell_type": "code",
75 | "source": [
76 | "print(response.text)"
77 | ],
78 | "metadata": {
79 | "colab": {
80 | "base_uri": "https://localhost:8080/"
81 | },
82 | "id": "9KT2nK8kmSmc",
83 | "outputId": "cb0ad7e7-f452-4d35-e773-8eaf67c3cdd8"
84 | },
85 | "execution_count": 32,
86 | "outputs": [
87 | {
88 | "output_type": "stream",
89 | "name": "stdout",
90 | "text": [
91 | "{\"upstreams\":[{\"tableInfo\":{\"name\":\"iris_date\",\"catalog_name\":\"workspace\",\"schema_name\":\"default\",\"table_type\":\"TABLE\",\"lineage_timestamp\":\"2025-05-15 16:12:35.0\"},\"notebookInfos\":[{\"workspace_id\":3835792019408392,\"notebook_id\":3011685692879898,\"lineage_timestamp\":\"2025-05-15 16:12:35.0\"}]}],\"downstreams\":[{\"tableInfo\":{\"name\":\"iris_gold\",\"catalog_name\":\"workspace\",\"schema_name\":\"default\",\"table_type\":\"PERSISTED_VIEW\",\"lineage_timestamp\":\"2025-05-15 16:14:22.0\"},\"queryInfos\":[{\"workspace_id\":3835792019408392,\"query_id\":\"f9185d3a-5537-464c-b9b9-5fd6ec9c6d1c\",\"lineage_timestamp\":\"2025-05-15 16:14:22.0\"},{\"workspace_id\":3835792019408392,\"query_id\":\"514c9f90-5ed9-4aa2-b8be-fd036c917a5a\",\"lineage_timestamp\":\"2025-05-15 15:52:35.0\"}]},{\"notebookInfos\":[{\"workspace_id\":3835792019408392,\"notebook_id\":3011685692879898,\"lineage_timestamp\":\"2025-05-15 16:12:40.0\"}]}]}\n"
92 | ]
93 | }
94 | ]
95 | },
96 | {
97 | "cell_type": "code",
98 | "source": [],
99 | "metadata": {
100 | "id": "VqRB08rTmTm0"
101 | },
102 | "execution_count": null,
103 | "outputs": []
104 | }
105 | ]
106 | }
--------------------------------------------------------------------------------
/Glue Workflow Lab.txt:
--------------------------------------------------------------------------------
1 | Step 1:
2 | --------
3 | Create an s3 bucket with 2 folders
4 |
5 | Step 2:
6 | --------
7 | Create a Glue role
8 |
9 | Step 3:
10 | --------
11 | Create a Glue Database
12 |
13 | Step 4:
14 | -------
15 | Create 2 Glue Crawlers
16 |
17 | Step 5:
18 | --------
19 | Create a Glue job with Job bookmark enabled--
20 |
21 | import sys
22 | from awsglue.transforms import *
23 | from awsglue.utils import getResolvedOptions
24 | from pyspark.context import SparkContext
25 | from awsglue.context import GlueContext
26 | from awsglue.job import Job
27 |
28 | ## @params: [JOB_NAME]
29 | args = getResolvedOptions(sys.argv, ['JOB_NAME'])
30 |
31 | sc = SparkContext()
32 | glueContext = GlueContext(sc)
33 | spark = glueContext.spark_session
34 | job = Job(glueContext)
35 | job.init(args['JOB_NAME'], args)
36 |
37 | datasource0 = glueContext.create_dynamic_frame.from_catalog(database = "workflowdemoyt", table_name = "csvstorer", transformation_ctx = "datasource0")
38 |
39 | datasink4 = glueContext.write_dynamic_frame.from_options(frame = datasource0, connection_type = "s3",
40 | connection_options = {"path": "s3://{}/{}/"}, format = "parquet", transformation_ctx = "datasink4")
41 | job.commit()
42 |
43 | Step 6:
44 | --------
45 | Create the Glue Workflow
46 |
47 | Step 7:
48 | --------
49 | Download the Snowflake data --
50 | select * from books where publishyear=2002 and PUBLISHMONTH=23;
51 |
52 | Step 8:
53 | -------
54 | Trigger the Glue workflow
55 |
56 | Step 9:
57 | --------
58 | Query using Athena--
59 |
60 |
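For reference, a minimal boto3 sketch for running the Athena check from Python -- the table name and the query-result s3 location are placeholders to adjust, the database matches the one used in the Glue job above:

import boto3

athena = boto3.client('athena', region_name='us-east-1')

# Run a simple validation query against the table created by the parquet crawler
query = athena.start_query_execution(
    QueryString="SELECT count(*) FROM {parquet_table_name}",
    QueryExecutionContext={'Database': 'workflowdemoyt'},
    ResultConfiguration={'OutputLocation': 's3://{athena-query-results-bucket}/output/'}
)
print(query['QueryExecutionId'])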
61 | Step 10:
62 | ----------
63 | Download the Snowflake data --
64 | select * from books where publishyear=2001 and PUBLISHMONTH=1;
65 |
66 | Step 11:
67 | ---------
68 | Trigger the Glue workflow
69 |
70 | Step 12:
71 | ---------
72 | Query using Athena--
73 |
--------------------------------------------------------------------------------
/HTTP Sink Connector.txt:
--------------------------------------------------------------------------------
1 | Start Zookeeper:
2 | ---------------------
3 | F:/kafka_2.12-3.3.1/bin/windows/zookeeper-server-start.bat F:/kafka_2.12-3.3.1/config/zookeeper.properties
4 |
5 | Start Kafka Server:
6 | ---------------------
7 | F:/kafka_2.12-3.3.1/bin/windows/kafka-server-start.bat F:/kafka_2.12-3.3.1/config/server.properties
8 |
9 | Create Source Topic:
10 | ---------------------
11 | F:/kafka_2.12-3.3.1/bin/windows/kafka-topics.bat --create --topic http-messages --bootstrap-server localhost:9092 --replication-factor 1 --partitions 1
12 |
13 | http_config.properties:
14 | -------------------------
15 | name=HttpSinkBasicAuth
16 | topics=http-messages
17 | tasks.max=1
18 | connector.class=io.confluent.connect.http.HttpSinkConnector
19 | # key/val converters
20 | key.converter=org.apache.kafka.connect.storage.StringConverter
21 | value.converter=org.apache.kafka.connect.storage.StringConverter
22 | # licensing for local single-node Kafka cluster
23 | confluent.topic.bootstrap.servers=localhost:9092
24 | confluent.topic.replication.factor=1
25 | # connect reporter required bootstrap server
26 | reporter.bootstrap.servers=localhost:9092
27 | reporter.result.topic.name=success-responses
28 | reporter.result.topic.replication.factor=1
29 | behavior.on.error=log
30 | reporter.error.topic.name=error-responses
31 | reporter.error.topic.replication.factor=1
32 | # http sink connector configs
33 | http.api.url=https://api.mailjet.com/v3.1/send
34 | auth.type=BASIC
35 | connection.user={}
36 | connection.password={}
37 |
38 | Start HTTP Sink Connector:
39 | ------------------------------
40 | F:/kafka_2.12-3.3.1/bin/windows/connect-standalone.bat F:/kafka_2.12-3.3.1/config/connect-standalone.properties F:/kafka_2.12-3.3.1/config/http_config.properties
41 |
42 | Start Console Producer to Source Topic:
43 | --------------------------------------------
44 | F:/kafka_2.12-3.3.1/bin/windows/kafka-console-producer.bat --topic http-messages --bootstrap-server localhost:9092
45 |
46 | Start Console Consumer to Source Topic:
47 | --------------------------------------------
48 | F:/kafka_2.12-3.3.1/bin/windows/kafka-console-consumer.bat --topic http-messages --from-beginning --bootstrap-server localhost:9092
49 |
50 | Start Console Consumer to Response Topic:
51 | --------------------------------------------
52 | F:/kafka_2.12-3.3.1/bin/windows/kafka-console-consumer.bat --topic success-responses --from-beginning --bootstrap-server localhost:9092
53 |
54 | Start Console Consumer to Error Topic:
55 | --------------------------------------------
56 | F:/kafka_2.12-3.3.1/bin/windows/kafka-console-consumer.bat --topic error-responses --from-beginning --bootstrap-server localhost:9092
57 |
58 |
59 | Sample Message:
60 | -----------------
61 | {"Messages":[{"From":{"Email":"{}","Name":"{}"},"To":[{"Email":"{}","Name":"{}"}],"Subject":"My first Marketing email","TextPart":"Greetings from Knowledge Amplifier. to viewer 1","HTMLPart":"Subscribe our channel for more interesting videos!","CustomID":"AppGettingStartedTest"}]}
62 |
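Instead of typing the Mailjet payload into the console producer, you can also publish it from Python with kafka-python -- a minimal sketch against the local broker (the From/To fields stay placeholders):

from kafka import KafkaProducer
import json

producer = KafkaProducer(bootstrap_servers=['localhost:9092'],
                         value_serializer=lambda x: json.dumps(x).encode('utf-8'))

message = {"Messages": [{"From": {"Email": "{}", "Name": "{}"},
                         "To": [{"Email": "{}", "Name": "{}"}],
                         "Subject": "My first Marketing email",
                         "TextPart": "Greetings from Knowledge Amplifier. to viewer 1",
                         "CustomID": "AppGettingStartedTest"}]}

producer.send('http-messages', value=message)
producer.flush()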
--------------------------------------------------------------------------------
/Incremental Crawling , Incremental ETL with Glue Workflow.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SatadruMukherjee/Data-Preprocessing-Models/3aeeda4e68aafd47fe784a71c4aa02a34be76820/Incremental Crawling , Incremental ETL with Glue Workflow.png
--------------------------------------------------------------------------------
/Kafka Consumer Internals with Auto Offset Commits and At Least Once Processing.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SatadruMukherjee/Data-Preprocessing-Models/3aeeda4e68aafd47fe784a71c4aa02a34be76820/Kafka Consumer Internals with Auto Offset Commits and At Least Once Processing.png
--------------------------------------------------------------------------------
/Kafka Consumer Lag Monitoring.txt:
--------------------------------------------------------------------------------
1 | Download Kafka:
2 | ---------------------
3 | wget https://downloads.apache.org/kafka/3.6.0/kafka_2.12-3.6.0.tgz
4 | tar -xvf kafka_2.12-3.6.0.tgz
5 |
6 | Download Java:
7 | ---------------------
8 | java -version
9 | sudo yum -y install java-1.8.0-openjdk
10 | java -version
11 |
12 | vi kafka_2.12-3.6.0/config/server.properties
13 |
14 | Start Zookeeper:
15 | -----------------
16 | cd kafka_2.12-3.6.0
17 | bin/zookeeper-server-start.sh config/zookeeper.properties
18 |
19 | Start Kafka-server:
20 | ----------------------------------------
21 | export KAFKA_HEAP_OPTS="-Xmx256M -Xms128M"
22 |
23 | cd kafka_2.12-3.6.0
24 | bin/kafka-server-start.sh config/server.properties
25 |
26 | Create Topic:
27 | ----------------
28 | cd kafka_2.12-3.6.0
29 | bin/kafka-topics.sh --create --topic hello_world2 --bootstrap-server {}:9092 --replication-factor 1 --partitions 3
30 |
31 |
32 | Shell Script to monitor Consumer Lag:
33 | -----------------------------------------
34 | bin/kafka-consumer-groups.sh --bootstrap-server {}:9092 --group hello_world1 --describe
35 |
36 |
37 | Producer Code:
38 | --------------
39 | from time import sleep
40 | from json import dumps
41 | from kafka import KafkaProducer
42 |
43 | topic_name='{}'
44 | kafka_server='{}'
45 | def custom_partitioner(key, all_partitions, available):
46 | """
47 | Custom Kafka partitioner to get the partition corresponding to key
48 | :param key: partitioning key
49 | :param all_partitions: list of all partitions sorted by partition ID
50 | :param available: list of available partitions in no particular order
51 | :return: one of the values from all_partitions or available
52 | """
53 | print("The key is : {}".format(key))
54 | print("All partitions : {}".format(all_partitions))
55 | print("After decoding of the key : {}".format(key.decode('UTF-8')))
56 | return int(key.decode('UTF-8'))%len(all_partitions)
57 |
58 |
59 | producer = KafkaProducer(bootstrap_servers=[kafka_server],value_serializer=lambda x: dumps(x).encode('utf-8'),
60 | partitioner=custom_partitioner)
61 |
62 | for e in range(1000):
63 | data = {'number' : e}
64 | print(data)
65 | producer.send(topic_name, key=str(e).encode(),value=data)
66 | sleep(0.4)
67 |
68 | Consumer Code:
69 | --------------
70 | from kafka import KafkaConsumer
71 | from kafka import TopicPartition , OffsetAndMetadata
72 | from time import sleep
73 | import json
74 |
75 | topic='{}'
76 | group_id=topic
77 | kafka_server='{}'
78 | consumer = KafkaConsumer (topic,bootstrap_servers = [kafka_server],
79 | value_deserializer=lambda m: json.loads(m.decode('utf-8')),group_id=group_id,auto_offset_reset='earliest',
80 | enable_auto_commit =False)
81 |
82 |
83 | for message in consumer:
84 | print(message)
85 | tp = TopicPartition(message.topic, message.partition)
86 | om = OffsetAndMetadata(message.offset + 1, message.timestamp)
87 | consumer.commit({tp: om})
88 | sleep(0.4)
89 |
90 | Log Monitor:
91 | -------------
92 | from kafka import KafkaConsumer
93 | from kafka import TopicPartition , OffsetAndMetadata
94 | from time import sleep
95 | import json
96 |
97 | topic='{}'
98 | group_id=topic
99 | kafka_server='{}'
100 | consumer = KafkaConsumer (topic,bootstrap_servers = [kafka_server],group_id=group_id)
101 |
102 | partitions=consumer.partitions_for_topic(topic)
103 | print(partitions)
104 |
105 | "***********************************************************************************************"
106 | #
107 | tp = [TopicPartition(topic, partition) for partition in partitions]
108 |
109 | topic_partition_last_offset = consumer.end_offsets(tp)
110 | print(topic_partition_last_offset)
111 |
112 | # "***********************************************************************************************"
113 | #
114 | #
115 | for i in tp:
116 | consumer_committed_offset=0 if consumer.committed(i) is None else consumer.committed(i)
117 | last_offset_stored_by_broker_in_partition=topic_partition_last_offset[i]
118 | lag=last_offset_stored_by_broker_in_partition-consumer_committed_offset
119 | print(f"Topic: {topic} - Partition: {i.partition} - Current Consumer Offset: {consumer_committed_offset} - Last Offset: {last_offset_stored_by_broker_in_partition} - Lag : {lag}")
120 | print('*'*100)
121 |
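To turn the one-shot check above into continuous monitoring, the lag calculation can simply be wrapped in a loop -- a minimal sketch re-using the consumer, tp and topic objects defined above:

from time import sleep

while True:
    end_offsets = consumer.end_offsets(tp)
    for i in tp:
        committed = consumer.committed(i) or 0
        lag = end_offsets[i] - committed
        print(f"Topic: {topic} - Partition: {i.partition} - Committed: {committed} - End: {end_offsets[i]} - Lag: {lag}")
    print('*' * 100)
    sleep(30)  # re-check every 30 seconds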
--------------------------------------------------------------------------------
/Kafka Faust Snowflake Python.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SatadruMukherjee/Data-Preprocessing-Models/3aeeda4e68aafd47fe784a71c4aa02a34be76820/Kafka Faust Snowflake Python.PNG
--------------------------------------------------------------------------------
/Kafka Producer Internals.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SatadruMukherjee/Data-Preprocessing-Models/3aeeda4e68aafd47fe784a71c4aa02a34be76820/Kafka Producer Internals.png
--------------------------------------------------------------------------------
/Kafka Retry Architecture.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SatadruMukherjee/Data-Preprocessing-Models/3aeeda4e68aafd47fe784a71c4aa02a34be76820/Kafka Retry Architecture.png
--------------------------------------------------------------------------------
/Long-Running Workflows with Human Interactions using Step Functions.txt:
--------------------------------------------------------------------------------
1 | Step Function Code:
2 | --------------------------
3 | {
4 | "Comment": "A description of my state machine",
5 | "StartAt": "SQS SendMessage",
6 | "States": {
7 | "SQS SendMessage": {
8 | "Type": "Task",
9 | "Resource": "arn:aws:states:::sqs:sendMessage.waitForTaskToken",
10 | "Parameters": {
11 | "MessageBody": {
12 | "input.$": "$",
13 | "MyTaskToken.$": "$$.Task.Token"
14 | },
15 | "QueueUrl": "{Put the SQS Queue URL here}"
16 | },
17 | "Next": "Choice"
18 | },
19 | "Choice": {
20 | "Type": "Choice",
21 | "Choices": [
22 | {
23 | "Variable": "$.body",
24 | "StringEquals": "Approved",
25 | "Next": "Success"
26 | }
27 | ],
28 | "Default": "Fail"
29 | },
30 | "Success": {
31 | "Type": "Succeed"
32 | },
33 | "Fail": {
34 | "Type": "Fail"
35 | }
36 | }
37 | }
38 |
39 | Step Function Sample Input:
40 | -----------------------------
41 | {
42 | "Manager Mail Address": "{}",
43 | "Employee Name":"{}"
44 | }
45 |
46 | Callback Lambda Code:
47 | -----------------------------
48 | import json
49 | import boto3
50 | import time
51 | import urllib
52 |
53 | client = boto3.client("ses")
54 |
55 | def lambda_handler(event, context):
56 | main_message=json.loads(event['Records'][0]['body'])
57 | print("Main Message Part : {}".format(main_message))
58 |
59 | step_function_input=main_message['input']
60 |
61 | manager_mail_address=step_function_input['Manager Mail Address']
62 | employee_to_be_promoted=step_function_input['Employee Name']
63 |
64 |
65 | task_token=main_message['MyTaskToken']
66 | print("The task token is : {}".format(task_token))
67 | task_token_encode=urllib.parse.quote(task_token)
68 | body = """
69 | Hi,
70 | {} has been nominated for promotion!
71 |
72 | Can you please approve:
73 |
74 | {Put the API Invoke URL here}/approve?TaskToken={}
75 |
76 | Or reject:
77 |
78 | {Put the API Invoke URL here}/reject?TaskToken={}
79 | """.format(employee_to_be_promoted, task_token_encode,task_token_encode)
80 |
81 | message = {"Subject": {"Data": 'Your Approval Needed for Promotion!'}, "Body": {"Html": {"Data": body}}}
82 |
83 | response = client.send_email(Source = manager_mail_address, Destination = {"ToAddresses": [manager_mail_address]}, Message = message)
84 |
85 | print("The mail is sent successfully")
86 |
87 |
88 |
89 | Approve Handler:
90 | ------------------------
91 | import json
92 | import boto3
93 | import time
94 |
95 | client = boto3.client('stepfunctions')
96 |
97 | def lambda_handler(event, context):
98 | task_token=event['queryStringParameters']['TaskToken']
99 | print(task_token)
100 | response = client.send_task_success(
101 | taskToken=task_token,
102 | output=json.dumps({'body':'Approved'})
103 | )
104 |
105 |
106 | Reject Handler:
107 | ------------------
108 | import json
109 | import boto3
110 | import time
111 |
112 | client = boto3.client('stepfunctions')
113 |
114 | def lambda_handler(event, context):
115 | task_token=event['queryStringParameters']['TaskToken']
116 | response = client.send_task_success(
117 | taskToken=task_token,
118 | output=json.dumps({'body':'Rejected'})
119 | )
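
Note: the Reject Handler above deliberately calls send_task_success and lets the Choice state route the 'Rejected' body to the Fail state. If you would rather fail the task token directly (so the Choice state is never reached), a minimal alternative sketch using the same boto3 client is:

response = client.send_task_failure(
    taskToken=task_token,
    error='Rejected',
    cause='Manager rejected the request'
)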
--------------------------------------------------------------------------------
/MSK Lab 1.txt:
--------------------------------------------------------------------------------
1 | Step 1:
2 | ------------
3 | Create VPC -- Name -- virtual-private-cloud IPv4 CIDR -- 10.0.0.0/16
4 | Host address range -- 10.0.0.1 - 10.0.255.254
5 |
6 | Step 2:
7 | -----------
8 | Create 2 public subnets
9 | Public-Subnet-A--10.0.0.0/24
10 | Host address range -- 10.0.0.1 - 10.0.0.254
11 |
12 | Public-Subnet-B--10.0.1.0/24
13 | Host address range -- 10.0.1.1 - 10.0.1.254
14 |
15 | Step 3:
16 | ------------
17 | Check the default route table -- you will see the above 2 subnets have not been explicitly associated with any route tables and are therefore associated with the main route table.
18 |
19 | Step 4:
20 | ------------
21 | Create a IGW & connect with VPC
22 |
23 | Step 5:
24 | ------------
25 | Add the IGW in default route table
26 |
27 |
28 | Step 6:
29 | ---------
30 | Launch MSK Cluster with the VPC you created , unauthenticated access allowed , plaintext encryption
31 | (keep security group as it is)
32 |
33 | Step 7:
34 | ------------
35 | Launch Linux EC2
36 | In the list Network choose the VPC previously created.
37 | In the list Auto-assign Public IP, choose Enable.
38 |
39 | Step 8:
40 | ---------
41 | Once the client for Amazon MSK has been created, the security group rules must be configured to allow the connection between the cluster and the client that we have just created.
42 |
43 | For that , add an inbound rule for all traffic to the MSK cluster security group , with the EC2 instance's security group id as the source
44 |
45 | Repeat these steps to add an inbound rule in the security group that corresponds to your client computer to allow it to receive traffic from the security group from the VPC. Now your client computer can communicate bidirectionally with the MSK Cluster.
46 |
47 | Once this is done, the newly created and configured client can be accessed.
48 |
49 | Step 9:
50 | -----------
51 | sudo yum install java-1.8.0-openjdk
52 | wget https://archive.apache.org/dist/kafka/2.8.1/kafka_2.12-2.8.1.tgz
53 | tar -xvf kafka_2.12-2.8.1.tgz
54 | cd kafka_2.12-2.8.1
55 |
56 | bin/kafka-topics.sh --create --topic demo_testing2 --bootstrap-server {Put the MSK bootstrap server URLs here} --replication-factor 1 --partitions 1
57 |
58 |
59 | bin/kafka-topics.sh --create --topic helloworld --bootstrap-server {Put the MSK bootstrap server URLs here} --replication-factor 1 --partitions 1
60 |
61 | Step 10:
62 | -----------
63 | Start the kafka Producer
64 | ---------------------------
65 | bin/kafka-console-producer.sh --topic demo_testing2 --bootstrap-server {Put the MSK bootstrap server URLs here}
66 |
67 | In a new console start the kafka consumer--
68 | cd kafka_2.12-2.8.1
69 | bin/kafka-console-consumer.sh --topic helloworld --bootstrap-server {Put the MSK bootstrap server URLs here}
70 |
71 | Step 11:
72 | -----------
73 | Install confluent kafka (within kafka_2.12-2.8.1)
74 | wget http://packages.confluent.io/archive/5.1/confluent-5.1.2-2.11.zip
75 | unzip confluent-5.1.2-2.11.zip
76 |
77 | export CONFLUENT_HOME=/home/ec2-user/kafka_2.12-2.8.1/confluent-5.1.2
78 | export PATH=$PATH:$CONFLUENT_HOME/bin
79 | (Note , if installing confluent kafka , where kafka is installed (i.e. in /home/ec2-user) , then CONFLUENT_HOME should be -- /home/ec2-user/confluent-5.1.2)
80 |
81 | Step 12:
82 | -----------
83 | Change the bootstrap.servers in confluent-5.1.2/etc/kafka-rest/kafka-rest.properties
84 |
85 |
86 |
87 | Step 13:
88 | -----------
89 | Start Kafka Rest
90 | /home/ec2-user/kafka_2.12-2.8.1/confluent-5.1.2/bin/kafka-rest-start /home/ec2-user/kafka_2.12-2.8.1/confluent-5.1.2/etc/kafka-rest/kafka-rest.properties
91 |
92 | (Don't forget to allow all traffic to the security group of EC2 client machine)
93 |
94 | Url to post messages using Kafka rest API--
95 | http://{Put your client machine's Public IP here}:8082/topics/demo_testing2
96 |
97 | Content-Type: application/vnd.kafka.json.v2+json
98 |
99 | Sample Message:
100 | ------------------
101 | {"records":[{"value":{"name": "testUser"}}]}
102 |
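If you prefer to post from Python instead of a REST client, a minimal sketch using the requests library (the public IP is the same placeholder as above):

import requests

url = "http://{Put your client machine's Public IP here}:8082/topics/demo_testing2"
headers = {"Content-Type": "application/vnd.kafka.json.v2+json"}
payload = {"records": [{"value": {"name": "testUser"}}]}

response = requests.post(url, json=payload, headers=headers)
print(response.status_code, response.text)
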
103 | Start consumer to see the messages:
104 | ----------------------------------------
105 | cd kafka_2.12-2.8.1
106 | bin/kafka-console-consumer.sh --topic demo_testing2 --bootstrap-server {Put the MSK bootstrap server URLs here}
107 |
108 |
109 |
110 |
111 |
112 |
113 |
--------------------------------------------------------------------------------
/MSK To AWS Lambda.txt:
--------------------------------------------------------------------------------
1 | Step 1:
2 | ------------
3 | Create VPC -- Name -- virtual-private-cloud-lambda IPv4 CIDR -- 11.0.0.0/16
4 | Host address range -- 11.0.0.1 - 11.0.255.254
5 |
6 | Step 2:
7 | -----------
8 | Create 2 public subnets
9 | Public-Subnet-A-lambda--11.0.0.0/24--us-east-1a
10 | Host address range -- 11.0.0.1 - 11.0.0.254
11 |
12 | Public-Subnet-B-lambda--11.0.1.0/24--us-east-1b
13 | Host address range -- 11.0.1.1 - 11.0.1.254
14 |
15 | Private-Subnet-A-lambda--11.0.2.0/24--us-east-1a
16 | Host address range -- 11.0.2.1 - 11.0.2.254
17 |
18 | Private-Subnet-B-lambda--11.0.3.0/24--us-east-1b
19 | Host address range -- 11.0.3.1 - 11.0.3.254
20 |
21 | Step 3:
22 | ------------
23 | Create an IGW and attach with VPC
24 |
25 | Step 4:
26 | ---------
27 | Create 2 route tables 1 for Public subnets and 1 for Private subnets
28 | (Attach IGW with Public route tables)
29 |
30 | Step 5:
31 | ------------
32 | Launch MSK Cluster in Private subnets (keep unauthenticated access and plaintext encryption)
33 |
34 | Step 6:
35 | -----------
36 | Create NAT Gateway in public subnet and attach with Private Subnet route table
37 |
38 | Step 7:
39 | -------------
40 | Launch an EC2 in a public subnet of the same VPC as the MSK Cluster.
41 | Launch an EC2 in a private subnet of the same VPC as the MSK Cluster.
42 |
43 | Step 8:
44 | -----------
45 | Allow all traffic both ways between the private EC2's security group and the MSK security group.
46 |
47 | Step 9:
48 | -------------
49 | SSH into the EC2 in the public subnet , and from there SSH into the EC2 in the private subnet.
50 |
51 | Step 10:
52 | -----------
53 | sudo yum install java-1.8.0-openjdk
54 | wget https://archive.apache.org/dist/kafka/2.8.1/kafka_2.12-2.8.1.tgz
55 | tar -xvf kafka_2.12-2.8.1.tgz
56 | cd kafka_2.12-2.8.1
57 |
58 | bin/kafka-topics.sh --create --topic demo_testing2 --bootstrap-server {Put the MSK bootstrap server URLs here} --replication-factor 1 --partitions 2
59 |
60 | Step 11:
61 | ----------
62 | Perform local testing:
63 | -----------------------------
64 | bin/kafka-console-producer.sh --topic demo_testing2 --bootstrap-server {Put the MSK bootstrap server URLs here}
65 |
66 | In a new console start the kafka consumer--
67 | cd kafka_2.12-2.8.1
68 | bin/kafka-console-consumer.sh --topic demo_testing2 --bootstrap-server {Put the MSK bootstrap server URLs here}
69 |
70 |
71 |
72 | Step 12:
73 | -----------
74 | You set up a target Lambda function with MSK and VPC access.
75 |
76 | Step 13:
77 | -----------
78 | Create Lambda Function with MSK Trigger
79 |
80 | Sample Event:
81 | ------------------
82 | {
83 | "eventSource":"aws:kafka",
84 | "eventSourceArn":"",
85 | "bootstrapServers":"",
86 | "records":{
87 | "demo_testing2-0":[
88 | {
89 | "topic":"demo_testing2",
90 | "partition":0,
91 | "offset":34,
92 | "timestamp":1674023898925,
93 | "timestampType":"CREATE_TIME",
94 | "value":"eyJIZWxsbyI6IldvcmxkIn0=",
95 | "headers":[
96 |
97 | ]
98 | }
99 | ]
100 | }
101 | }
102 |
103 |
104 |
105 | import base64
106 | import boto3
107 | import json
108 |
109 | def lambda_handler(event, context):
110 | # TODO implement
111 | print(event)
112 | for partition_key in event['records']:
113 | partition_value=event['records'][partition_key]
114 | for record_value in partition_value:
115 | print((base64.b64decode(record_value['value'])).decode())
116 |
117 |
118 |
119 |
120 |
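As a quick sanity check of the decode step, the value field in the sample event above is just base64-encoded JSON:

import base64
import json

# "eyJIZWxsbyI6IldvcmxkIn0=" from the sample event decodes to {"Hello": "World"}
print(json.loads(base64.b64decode("eyJIZWxsbyI6IldvcmxkIn0=").decode()))
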
--------------------------------------------------------------------------------
/MSK project with serverless Producer & Consumer.txt:
--------------------------------------------------------------------------------
1 | Step 1:
2 | --------
3 | Create a NAT Gateway & attach with Private subnet route table
4 |
5 | Step 2:
6 | ---------
7 | Launch one MSK Cluster in private subnet
8 |
9 | Step 3:
10 | ----------
11 | Create a Lambda code (Python 3.8)
12 |
13 | from time import sleep
14 | from json import dumps
15 | from kafka import KafkaProducer
16 | import json
17 |
18 | topic_name='{Provide the topic name here}'
19 | producer = KafkaProducer(bootstrap_servers=['{Put the broker URLs here}'
20 | ,'{Put the broker URLs here}'],value_serializer=lambda x: dumps(x).encode('utf-8'))
21 |
22 | def lambda_handler(event, context):
23 | print(event)
24 | for i in event['Records']:
25 | sqs_message =json.loads((i['body']))
26 | print(sqs_message)
27 | producer.send(topic_name, value=sqs_message)
28 |
29 | producer.flush()
30 |
31 | Step 4:
32 | ----------
33 | Increase the timeout for Lambda to 2 mins , provide SQS,MSK and VPC access & put in Private VPC (where MSK Brokers are running)
34 |
35 | Configure Lambda Layer--
36 | Reference:
37 | ------------
38 | https://youtube.com/watch?v=uleTVY7LkMM&feature=shares
39 |
40 |
41 |
42 | Step 5:
43 | ---------
44 | Launch one SQS Queue with visibility timeout to 240 sec
45 |
46 | Step 6:
47 | ----------
48 | Create an API Gateway and setup integration with SQS Queue
49 |
50 | Step 7:
51 | ---------
52 | Test the integration , if works , then setup integration with AWS Lambda Producer
53 |
54 |
55 | Step 8:
56 | ---------
57 | Create an s3 bucket for data archival
58 |
59 | Step 9:
60 | ---------
61 | Configure kinesis Firehose
62 |
63 |
64 | Step 10:
65 | -----------
66 | Configure the Consumer Lambda Code:
67 |
68 | import base64
69 | import boto3
70 | import json
71 |
72 | client = boto3.client('firehose')
73 |
74 | def lambda_handler(event, context):
75 | print(event)
76 | for partition_key in event['records']:
77 | partition_value=event['records'][partition_key]
78 | for record_value in partition_value:
79 | actual_message=json.loads((base64.b64decode(record_value['value'])).decode('utf-8'))
80 | print(actual_message)
81 | newImage = (json.dumps(actual_message)+'\n').encode('utf-8')
82 | print(newImage)
83 | response = client.put_record(
84 | DeliveryStreamName='{Kinesis Delivery Stream Name}',
85 | Record={
86 | 'Data': newImage
87 | })
88 |
89 | Step 11:
90 | -----------
91 | Provide KinesisFirehose write access , VPC access , MSK access to this Lambda
92 |
93 |
94 | Step 12:
95 | ----------
96 | Launch an EC2 in a public subnet of the same VPC as the MSK Cluster.
97 | Launch an EC2 in a private subnet of the same VPC as the MSK Cluster.
98 |
99 |
100 | Step 13:
101 | -----------
102 | Allow all traffic both ways between the private EC2's security group and the MSK security group.
103 |
104 | Step 14:
105 | -------------
106 | SSH into the EC2 in the public subnet , and from there SSH into the EC2 in the private subnet.
107 |
108 | sudo yum install java-1.8.0-openjdk
109 | wget https://archive.apache.org/dist/kafka/2.8.1/kafka_2.12-2.8.1.tgz
110 | tar -xvf kafka_2.12-2.8.1.tgz
111 | cd kafka_2.12-2.8.1
112 |
113 | bin/kafka-topics.sh --create --topic demo_testing2 --bootstrap-server {} --replication-factor 1 --partitions 2
114 |
115 |
116 | Step 15:
117 | ------------
118 | Start the kafka console consumer and check whether the messages published from the Lambda are arriving in the kafka topic
119 | bin/kafka-console-consumer.sh --topic demo_testing2 --bootstrap-server {}
120 |
121 |
122 | Step 16:
123 | ------------
124 | Add MSK Trigger from Consumer Lambda
125 |
126 |
127 | Step 17:
128 | ---------
129 | Perform end-to-end testing
130 |
131 | {"station":"OH","temp":"26.39f"}
132 | {"station":"WA","temp":"40.00F"}
133 | {"station":"TX","temp":"15.01F"}
134 | {"station":"NC","temp":"32.36f"}
135 | {"station":"WA","temp":"62.86F"}
136 | {"station":"NC","temp":"49.43f"}
137 | {"station":"MD","temp":"2.30f"}
138 |
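To push the sample records above through the API Gateway -> SQS -> producer Lambda path from Python, a minimal sketch (the invoke URL is a placeholder):

import requests

api_url = "{Put the API Gateway invoke URL here}"
records = [
    {"station": "OH", "temp": "26.39f"},
    {"station": "WA", "temp": "40.00F"},
    {"station": "TX", "temp": "15.01F"}
]

for record in records:
    # Each POST lands in SQS, which in turn triggers the producer Lambda
    response = requests.post(api_url, json=record)
    print(response.status_code)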
--------------------------------------------------------------------------------
/MSK with s3 sink connector and Snowflake.txt:
--------------------------------------------------------------------------------
1 | Documentation:
2 | ------------------
3 | https://aws.amazon.com/blogs/big-data/back-up-and-restore-kafka-topic-data-using-amazon-msk-connect/
4 |
5 |
6 | Step 1:Launch MSK Cluster:
7 | -------------------------------------
8 | Configure NAT Gateway & launch MSK Cluster in Private Subnet
9 |
10 | Step 2:
11 | ----------
12 | Create IAM role for MSK Connect--s3connectordemoyt
13 |
14 | IAM Role:s3--give s3 full access,kms access , msk access
15 |
16 | Trust Relationship--
17 |
18 | {
19 | "Version": "2012-10-17",
20 | "Statement": [
21 | {
22 | "Effect": "Allow",
23 | "Principal": {
24 | "Service": "kafkaconnect.amazonaws.com"
25 | },
26 | "Action": "sts:AssumeRole",
27 | "Condition": {
28 | "StringEquals": {
29 | "aws:SourceAccount": ""
30 | }
31 | }
32 | }
33 | ]
34 | }
35 |
36 | Step 3:
37 | -----------
38 | Download the Jar from the below link --
39 | https://github.com/lensesio/stream-reactor/releases
40 |
41 | Upload the jar file to s3
42 |
43 |
44 | Step 4:
45 | ---------
46 | Create the custom plugin using the jar uploaded to s3 in Step 3
47 | s3sinkconnectortest123
48 |
49 |
50 | Step 5:
51 | ---------
52 | Create public and private EC2 instances which will act as client machines for the MSK Cluster
53 |
54 |
55 | Step 6:In EC2 Client Machine:
56 | -----------------------------
57 | sudo yum install java-1.8.0-openjdk
58 | wget https://archive.apache.org/dist/kafka/2.8.1/kafka_2.12-2.8.1.tgz
59 | tar -xvf kafka_2.12-2.8.1.tgz
60 | cd kafka_2.12-2.8.1
61 |
62 |
63 | bin/kafka-topics.sh --create --topic demotesting3 --bootstrap-server {} --replication-factor 1 --partitions 2
64 |
65 | Step 7:Create the connector
66 | -----------------------------
67 | Documentation Link:
68 | ----------------------
69 | https://docs.lenses.io/5.0/integrations/connectors/stream-reactor/sinks/s3sinkconnector/
70 |
71 |
72 |
73 | connector.class=io.lenses.streamreactor.connect.aws.s3.sink.S3SinkConnector
74 | tasks.max=2
75 | topics=demotesting3
76 | connect.s3.vhost.bucket=true
77 | schema.enable=false
78 | key.converter.schemas.enable=false
79 | connect.s3.kcql=INSERT INTO irisseta:MSKBuildLabClusterdate SELECT * FROM demotesting3 PARTITIONBY _date.uuuu,_date.LL,_date.dd STOREAS `JSON` WITHPARTITIONER=Values WITH_FLUSH_SIZE = 10000 WITH_FLUSH_INTERVAL = 300 WITH_FLUSH_COUNT = 20
80 | aws.region=us-east-1
81 | aws.custom.endpoint=https://s3.us-east-1.amazonaws.com/
82 | value.converter.schemas.enable=false
83 | connect.s3.aws.region=us-east-1
84 | value.converter=org.apache.kafka.connect.json.JsonConverter
85 | errors.log.enable=true
86 | key.converter=org.apache.kafka.connect.json.JsonConverter
87 |
88 |
89 | Step 8:Setup Snowflake Table ,Snowpipe and s3 event notifications
90 | ------------------------------------------------------------------
91 | --drop database if exists
92 | drop database if exists s3_to_snowflake;
93 |
94 | --Database Creation
95 | create database if not exists s3_to_snowflake;
96 |
97 | --Use the database
98 | use s3_to_snowflake;
99 |
100 | --create the external stage
101 | create or replace stage s3_to_snowflake.PUBLIC.Snow_stage url="s3://{}"
102 | credentials=(aws_key_id=''
103 | aws_secret_key='');
104 |
105 | list @Snow_stage;
106 |
107 | create or replace table s3_to_snowflake.PUBLIC.real_time_demo(data variant);
108 |
109 |
110 | --Create the Pipe
111 | create or replace pipe s3_to_snowflake.PUBLIC.for_kafka_ingestion
112 | auto_ingest=true as copy into s3_to_snowflake.PUBLIC.real_time_demo from
113 | @s3_to_snowflake.PUBLIC.Snow_stage FILE_FORMAT=(type = 'JSON');
114 |
115 | show pipes;
116 |
117 |
118 |
119 |
120 | Test:Start consumer in a new window
121 | ------------------------------------------
122 |
123 | cd kafka_2.12-2.8.1
124 | bin/kafka-console-consumer.sh --topic demotesting3 --bootstrap-server b-1.kafkas3connectordemoyt.pvvnij.c3.kafka.us-east-1.amazonaws.com:9092,b-2.kafkas3connectordemoyt.pvvnij.c3.kafka.us-east-1.amazonaws.com:9092
125 |
126 | Produce messages:
127 | ---------------------
128 | pip install kafka-python
129 |
130 |
131 | from time import sleep
132 | from json import dumps
133 | from kafka import KafkaProducer
134 | topic_name='demotesting3'
135 | producer = KafkaProducer(bootstrap_servers=['{}'],value_serializer=lambda x: dumps(x).encode('utf-8'))
136 |
137 | for e in range(1000):
138 | data = {'number' : e}
139 | print(data)
140 | producer.send(topic_name, value=data)
141 | sleep(0.2)
142 |
143 |
144 |
145 |
146 | Observe the partitioning:
147 | ---------------------------
148 | bucket/prefix/customValue/topic(partition_offset)
149 |
150 | Download the data and observe the json
151 |
152 | Check in Snowflake
153 | ------------------------
154 |
155 |
156 | select * from s3_to_snowflake.PUBLIC.real_time_demo;
157 |
158 | select count(*) from s3_to_snowflake.PUBLIC.real_time_demo;
159 |
160 |
161 | select parse_json(Data):number as value_part from s3_to_snowflake.PUBLIC.real_time_demo order by value_part;
162 |
163 |
164 |
165 |
--------------------------------------------------------------------------------
/Manual Offset Commits and At-most Once Processing.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SatadruMukherjee/Data-Preprocessing-Models/3aeeda4e68aafd47fe784a71c4aa02a34be76820/Manual Offset Commits and At-most Once Processing.png
--------------------------------------------------------------------------------
/Manual Offset Commits and Exactly Once Processing.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SatadruMukherjee/Data-Preprocessing-Models/3aeeda4e68aafd47fe784a71c4aa02a34be76820/Manual Offset Commits and Exactly Once Processing.png
--------------------------------------------------------------------------------
/Multimodal Embedding using BedRock.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SatadruMukherjee/Data-Preprocessing-Models/3aeeda4e68aafd47fe784a71c4aa02a34be76820/Multimodal Embedding using BedRock.zip
--------------------------------------------------------------------------------
/Publish Message in MSK Cluster from AWS Lambda.txt:
--------------------------------------------------------------------------------
1 | Code to install Java & Kafka in EC2 Client Machine:
2 | ---------------------------------------------------------------------------
3 | sudo yum install java-1.8.0-openjdk
4 | wget https://archive.apache.org/dist/kafka/2.8.1/kafka_2.12-2.8.1.tgz
5 | tar -xvf kafka_2.12-2.8.1.tgz
6 |
7 | Creating Kafka Topic:
8 | ----------------------------------
9 | cd kafka_2.12-2.8.1
10 | bin/kafka-topics.sh --create --topic {topic_name} --bootstrap-server {Put the MSK bootstrap server URLs here} --replication-factor 1 --partitions 1
11 |
12 | Start Console Consumer:
13 | ----------------------------------
14 | cd kafka_2.12-2.8.1
15 | bin/kafka-console-consumer.sh --topic {Topic Name} --bootstrap-server {Put the MSK bootstrap server URLs here}
16 |
17 | Start Python Producer:
18 | ----------------------------------
19 | Install the Python Module:
20 | ------------------------------------
21 | pip install kafka-python
22 |
23 | Producer Code:
24 | ------------------------------------
25 | from time import sleep
26 | from json import dumps
27 | from kafka import KafkaProducer
28 |
29 | topic_name='{Topic Name}'
30 | producer = KafkaProducer(bootstrap_servers=['{Put 1st MSK bootstrap server URL here}','{Put 2nd MSK bootstrap server URL here}',...],value_serializer=lambda x: dumps(x).encode('utf-8'))
31 |
32 | for e in range(1000):
33 | data = {'number' : e}
34 | print(data)
35 | producer.send(topic_name, value=data)
36 | sleep(1)
37 |
38 |
39 |
40 |
41 |
42 | Code to create the Lambda Layer:
43 | ---------------------------------------------------------
44 | sudo apt-get update
45 | sudo apt install python3-virtualenv
46 | virtualenv kafka_yt
47 | source kafka_yt/bin/activate
48 | python3 --version
49 | sudo apt install python3-pip
50 | python3 -m pip install --upgrade pip
51 | mkdir -p lambda_layers/python/lib/python3.8/site-packages
52 | cd lambda_layers/python/lib/python3.8/site-packages
53 | pip install kafka-python -t .
54 | cd /mnt/c/Users/USER/lambda_layers
55 | sudo apt install zip
56 | zip -r kafka_yt_demo.zip *
57 |
58 |
59 | Lambda Security Configuration:
60 | --------------------------------------
61 | Provide AmazonVPCFullAccess to the Lambda execution role
62 | Configure VPC for Lambda (Choose both subnets & provide the security group of MSK Cluster)
63 |
64 |
65 | Lambda Code to publish messages in MSK Topic:
66 | ----------------------------------------------------------------
67 | from time import sleep
68 | from json import dumps
69 | from kafka import KafkaProducer
70 |
71 | topic_name='{Topic Name}'
72 | producer = KafkaProducer(bootstrap_servers=['{Put 1st MSK bootstrap server URL here}','{Put 2nd MSK bootstrap server URL here}',...],value_serializer=lambda x: dumps(x).encode('utf-8'))
73 |
74 | def lambda_handler(event, context):
75 | for e in range(10):
76 | data = e
77 | producer.send(topic_name, value=data)
78 | sleep(0.5)
79 |
80 |
81 |
82 |
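Note: the Lambda execution environment can be frozen as soon as the handler returns, so buffered records may never actually reach MSK. A minimal variant of the handler above (re-using the same producer and topic_name) that flushes before returning:

def lambda_handler(event, context):
    for e in range(10):
        producer.send(topic_name, value=e)
        sleep(0.5)
    # Block until every buffered record has been sent to the brokers
    producer.flush()
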
--------------------------------------------------------------------------------
/RAG using Kendra & Langchain AWS.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "nbformat": 4,
3 | "nbformat_minor": 0,
4 | "metadata": {
5 | "colab": {
6 | "provenance": [],
7 | "authorship_tag": "ABX9TyMC8x+t5ojmOvhyzoCu1KA6",
8 | "include_colab_link": true
9 | },
10 | "kernelspec": {
11 | "name": "python3",
12 | "display_name": "Python 3"
13 | },
14 | "language_info": {
15 | "name": "python"
16 | }
17 | },
18 | "cells": [
19 | {
20 | "cell_type": "markdown",
21 | "metadata": {
22 | "id": "view-in-github",
23 | "colab_type": "text"
24 | },
25 | "source": [
26 | "
"
27 | ]
28 | },
29 | {
30 | "cell_type": "code",
31 | "execution_count": null,
32 | "metadata": {
33 | "id": "tAPKj2yh_frr"
34 | },
35 | "outputs": [],
36 | "source": [
37 | "!pip install boto3 langchain langchain-pinecone\n"
38 | ]
39 | },
40 | {
41 | "cell_type": "code",
42 | "source": [
43 | "import boto3\n",
44 | "import os\n",
45 | "\n",
46 | "\n",
47 | "boto3_bedrock = boto3.client('bedrock-runtime',region_name='us-east-1',aws_access_key_id='',aws_secret_access_key='')\n",
48 | "\n",
49 | "kendra_client=boto3.client('kendra',region_name='us-east-1',aws_access_key_id='',aws_secret_access_key='')"
50 | ],
51 | "metadata": {
52 | "id": "d61bXvX_GDDe"
53 | },
54 | "execution_count": null,
55 | "outputs": []
56 | },
57 | {
58 | "cell_type": "code",
59 | "source": [
60 | "from langchain_community.retrievers import AmazonKendraRetriever\n",
61 | "\n",
62 | "retriever = AmazonKendraRetriever(index_id=\"d0215a1d-87f2-41de-906f-edd13da9fb62\",client =kendra_client)\n",
63 | "\n",
64 | "\n",
65 | "\n",
66 | "response=retriever.get_relevant_documents(\"How many major terror attacks happened in any city in India since 2014?\")\n",
67 | "\n",
68 | "response"
69 | ],
70 | "metadata": {
71 | "id": "QHiLvSKSGF3r"
72 | },
73 | "execution_count": null,
74 | "outputs": []
75 | },
76 | {
77 | "cell_type": "code",
78 | "source": [
79 | "from langchain import PromptTemplate\n",
80 | "\n",
81 | "RAG_PROMPT_TEMPLATE = '''Here is some important context which can help inform the questions the Human asks.\n",
82 | "Make sure to not make anything up to answer the question if it is not provided in the context.\n",
83 | "\n",
84 | "\n",
85 | "{context}\n",
86 | "\n",
87 | "\n",
88 | "Human: {human_input}\n",
89 | "\n",
90 | "Assistant:\n",
91 | "'''\n",
92 | "PROMPT = PromptTemplate.from_template(RAG_PROMPT_TEMPLATE)\n",
93 | "\n",
94 | "import json\n",
95 | "human_input=input(\"Enter the question : \")\n",
96 | "search_results =retriever.get_relevant_documents(human_input)\n",
97 | "context_string = '\\n\\n'.join([f'Document {ind+1}: ' + i.page_content for ind, i in enumerate(search_results)])\n",
98 | "prompt_data = PROMPT.format(human_input=human_input, context=context_string)\n",
99 | "inputText=prompt_data\n",
100 | "body_part=json.dumps({'inputText': inputText,\n",
101 | " 'textGenerationConfig': {'maxTokenCount': 8192,\n",
102 | " 'stopSequences': [],\n",
103 | " 'temperature': 0,\n",
104 | " 'topP': 1}})\n",
105 | "response = boto3_bedrock.invoke_model(\n",
106 | " body=body_part,\n",
107 | " contentType=\"application/json\",\n",
108 | " accept=\"application/json\",\n",
109 | " modelId='amazon.titan-text-express-v1'\n",
110 | ")\n",
111 | "output_text=json.loads(response['body'].read())['results'][0]['outputText']\n",
112 | "output_text"
113 | ],
114 | "metadata": {
115 | "id": "X5m4SQ9GH9Gv"
116 | },
117 | "execution_count": null,
118 | "outputs": []
119 | }
120 | ]
121 | }
--------------------------------------------------------------------------------
/Real-Time Streaming Project with Smartphone Data.txt:
--------------------------------------------------------------------------------
1 | Launch Kafka:
2 | -------------------
3 |
4 | https://github.com/SatadruMukherjee/Data-Preprocessing-Models/blob/main/kafka_yt_demo.zip
5 |
6 | Lambda Code:
7 | ----------------
8 | import json
9 | from time import sleep
10 | from json import dumps
11 | from kafka import KafkaProducer
12 | import json
13 |
14 | topic_name='sensor_data_consumer'
15 | producer = KafkaProducer(bootstrap_servers=['52.87.254.233:9092']
16 | ,value_serializer=lambda x: dumps(x).encode('utf-8'))
17 |
18 | def lambda_handler(event, context):
19 | # TODO implement
20 | print(event)
21 | payload_part=json.loads(event['body'])['payload']
22 | for i in payload_part:
23 | light_illumination=i['values']['lux']
24 | capture_time=i['time']
25 | data={"light_illumination":light_illumination,"capture_time":capture_time}
26 | print(data)
27 | producer.send(topic_name, value=data)
28 | producer.flush()
29 | return {
30 | 'statusCode': 200,
31 | 'body': json.dumps('Hello from Lambda!')
32 | }
33 |
34 |
35 |
36 |
37 |
38 | wget https://dlcdn.apache.org/kafka/3.4.0/kafka_2.13-3.4.0.tgz
39 | tar -xvf kafka_2.13-3.4.0.tgz
40 |
41 |
42 | To install Java --
43 | ----------------------------------------
44 | java -version
45 | sudo yum install java-1.8.0-openjdk
46 | java -version
47 | cd kafka_2.13-3.4.0
48 |
49 | Start Zoo-keeper:
50 | -------------------------------
51 | bin/zookeeper-server-start.sh config/zookeeper.properties
52 |
53 |
54 | Start Kafka-server:
55 | ----------------------------------------
56 | Duplicate the session & enter in a new console --
57 | export KAFKA_HEAP_OPTS="-Xmx256M -Xms128M"
58 | cd kafka_2.13-3.4.0
59 | bin/kafka-server-start.sh config/server.properties
60 |
61 | By default it advertises the private hostname , so change server.properties to advertise the public IP
62 |
63 | To do this , you can follow either of the 2 approaches shared below --
64 | 1)Do a vi config/server.properties in insert mode -- change advertised.listeners to the public IP of the EC2 instance
65 | 2)You can also modify the file using WinSCP
66 |
67 | To create topic:
68 | ------------------
69 | cd kafka_2.13-3.4.0
70 | bin/kafka-topics.sh --create --topic sensor_data_consumer --bootstrap-server 100.26.220.99:9092 --replication-factor 1 --partitions 1
71 |
72 | Start Kafka Console Consumer:
73 | --------------------------------------
74 | bin/kafka-console-consumer.sh --topic sensor_data_consumer --bootstrap-server 100.26.220.99:9092
75 |
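To consume the light readings from Python instead of the console consumer, a minimal kafka-python sketch (using the same public IP and topic as the commands above):

from kafka import KafkaConsumer
import json

consumer = KafkaConsumer('sensor_data_consumer',
                         bootstrap_servers=['100.26.220.99:9092'],
                         value_deserializer=lambda m: json.loads(m.decode('utf-8')),
                         auto_offset_reset='earliest')

for message in consumer:
    data = message.value
    print(data['capture_time'], data['light_illumination'])
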
76 | Sample Json event received from API:
77 | ---------------------------------------
78 | {
79 | "version":"2.0",
80 | "routeKey":"POST /publishtokafka",
81 | "rawPath":"/publishtokafka",
82 | "rawQueryString":"",
83 | "headers":{
84 | "accept-encoding":"gzip",
85 | "content-length":"1141",
86 | "content-type":"application/json",
87 | "host":"szl0e9g8og.execute-api.us-east-1.amazonaws.com",
88 | "user-agent":"okhttp/4.9.2",
89 | "x-amzn-trace-id":"Root=1-641b2ce4-718735061957bb75192829c1",
90 | "x-forwarded-for":"43.226.31.179",
91 | "x-forwarded-port":"443",
92 | "x-forwarded-proto":"https"
93 | },
94 | "requestContext":{
95 | "accountId":"825865577047",
96 | "apiId":"szl0e9g8og",
97 | "domainName":"szl0e9g8og.execute-api.us-east-1.amazonaws.com",
98 | "domainPrefix":"szl0e9g8og",
99 | "http":{
100 | "method":"POST",
101 | "path":"/publishtokafka",
102 | "protocol":"HTTP/1.1",
103 | "sourceIp":"43.226.31.179",
104 | "userAgent":"okhttp/4.9.2"
105 | },
106 | "requestId":"CMPzriYgoAMEJ1Q=",
107 | "routeKey":"POST /publishtokafka",
108 | "stage":"$default",
109 | "time":"22/Mar/2023:16:29:24 +0000",
110 | "timeEpoch":1679502564025
111 | },
112 | "body":"{\"messageId\":22,\"sessionId\":\"9dc9bf11-6301-477c-97a8-50cfc08c77d6\",\"deviceId\":\"a390a36d-eee4-466c-8287-3360165e351c\",\"payload\":[{\"name\":\"light\",\"values\":{\"lux\":6.329113960266113},\"accuracy\":3,\"time\":1679502563452012300},{\"name\":\"light\",\"values\":{\"lux\":6.329113960266113},\"accuracy\":3,\"time\":1679502563552018200},{\"name\":\"light\",\"values\":{\"lux\":6.329113960266113},\"accuracy\":3,\"time\":1679502563652024000},{\"name\":\"light\",\"values\":{\"lux\":6.329113960266113},\"accuracy\":3,\"time\":1679502563752029700},{\"name\":\"light\",\"values\":{\"lux\":6.329113960266113},\"accuracy\":3,\"time\":1679502563852004600},{\"name\":\"light\",\"values\":{\"lux\":6.329113960266113},\"accuracy\":3,\"time\":1679502563952010800},{\"name\":\"light\",\"values\":{\"lux\":6.329113960266113},\"accuracy\":3,\"time\":1679502564052016400},{\"name\":\"light\",\"values\":{\"lux\":6.329113960266113},\"accuracy\":3,\"time\":1679502564152022300},{\"name\":\"light\",\"values\":{\"lux\":6.329113960266113},\"accuracy\":3,\"time\":1679502564252028000},{\"name\":\"light\",\"values\":{\"lux\":6.329113960266113},\"accuracy\":3,\"time\":1679502564352003000},{\"name\":\"light\",\"values\":{\"lux\":6.329113960266113},\"accuracy\":3,\"time\":1679502564452009000}]}",
113 | "isBase64Encoded":false
114 | }
115 |
116 |
--------------------------------------------------------------------------------
/Recursion Pattern with AWS Step Funciton & Lambda.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SatadruMukherjee/Data-Preprocessing-Models/3aeeda4e68aafd47fe784a71c4aa02a34be76820/Recursion Pattern with AWS Step Funciton & Lambda.png
--------------------------------------------------------------------------------
/Run Batch job Using AWS Lambda.txt:
--------------------------------------------------------------------------------
1 | s3 bucket:
2 | DynamoDB:
3 |
4 | Docker Install:
5 | ---------------
6 | AWS Linux
7 | sudo yum update -y
8 | sudo amazon-linux-extras install docker
9 | sudo service docker start
10 | sudo usermod -a -G docker ec2-user
11 |
12 | mkdir lambdafargate
13 | cd lambdafargate
14 |
15 | requirements.txt:
16 | ---------------------
17 | boto3
18 |
19 | Dockerfile:
20 | -----------
21 | # Use an official Python runtime as a parent image
22 | FROM python:3.8-slim
23 |
24 | # Set the working directory to /app
25 | WORKDIR /app
26 |
27 | # Copy the current directory contents into the container at /app
28 | COPY . /app
29 |
30 | # Install any needed packages specified in requirements.txt
31 | RUN pip install --no-cache-dir -r requirements.txt
32 |
33 | # Run script.py when the container launches
34 | CMD ["echo" "hello"]
35 |
36 | ["python","script.py","demoytlambdabatchtest","Setosa1.csv"]
37 |
38 | script.py:
39 | ----------
40 | import boto3
41 | import csv
42 | import sys
43 | from io import StringIO
44 |
45 | def main(bucket, key):
46 | s3_client = boto3.client('s3',region_name='us-east-1')
47 | dynamodb_client = boto3.resource('dynamodb',region_name='us-east-1')
48 |
49 | # Read the CSV file from S3
50 | file_obj = s3_client.get_object(Bucket=bucket, Key=key)
51 | csv_content = file_obj['Body'].read().decode('utf-8')
52 |
53 | # Define the DynamoDB table
54 | table = dynamodb_client.Table('iris_dataset')
55 |
56 | # Read the CSV content
57 | csv_reader = csv.DictReader(StringIO(csv_content))
58 |
59 | # Iterate through the CSV and write to DynamoDB
60 | for row in csv_reader:
61 | Id = int(row['Id'])
62 | SEPAL_LENGTH = row['SEPAL_LENGTH']
63 | SEPAL_WIDTH = (row['SEPAL_WIDTH'])
64 | PETAL_LENGTH = row['PETAL_LENGTH']
65 | PETAL_WIDTH = row['PETAL_WIDTH']
66 | CLASS_NAME = row['CLASS_NAME']
67 |
68 | # Write to DynamoDB
69 | table.put_item(
70 | Item={
71 | 'Id':Id,
72 | 'SEPAL_LENGTH': SEPAL_LENGTH,
73 | 'SEPAL_WIDTH': SEPAL_WIDTH,
74 | 'PETAL_LENGTH': PETAL_LENGTH,
75 | 'PETAL_WIDTH': PETAL_WIDTH,
76 | 'CLASS_NAME':CLASS_NAME
77 | }
78 | )
79 |
80 | print('CSV processed successfully!')
81 |
82 | if __name__ == "__main__":
83 | # Extract command-line arguments
84 | if len(sys.argv) != 3:
85 | print("Usage: python script.py ")
86 | sys.exit(1)
87 |
88 | s3_bucket = sys.argv[1]
89 | s3_key = sys.argv[2]
90 |
91 | # Execute the main function with provided arguments
92 | main(s3_bucket, s3_key)
93 |
94 | Docker Image Build & Test:
95 | ---------------------------
96 | ECR Push
97 |
98 | AWS Batch Components Creation
99 |
100 | Batch IAM Role creation -- dynamodb , ecs task execution role , s3 access
101 |
102 | Lambda Function:
103 | ------------------
104 | import boto3
105 | import json
106 |
107 | def lambda_handler(event, context):
108 | print(event)
109 | # Extract necessary information from the S3 event
110 | s3_bucket = event['Records'][0]['s3']['bucket']['name']
111 | s3_key = event['Records'][0]['s3']['object']['key']
112 |
113 | # Specify your AWS Batch job definition name
114 | job_definition_name = '{}'
115 |
116 | # Specify your AWS Batch job queue name
117 | job_queue_name = '{}'
118 |
119 | # Specify the command to pass to the AWS Batch job
120 | command = f"python script.py {s3_bucket} {s3_key}"
121 | print("Executing the command : ", command)
122 | # Create an AWS Batch client
123 | batch_client = boto3.client('batch')
124 |
125 | # Submit a job to AWS Batch
126 | response = batch_client.submit_job(
127 | jobName='{}',
128 | jobQueue=job_queue_name,
129 | jobDefinition=job_definition_name,
130 | containerOverrides={
131 | 'command': command.split(' ')
132 | },
133 | retryStrategy={
134 | 'attempts': 1
135 | },
136 | )
137 |
138 | # Print the AWS Batch job response
139 | print(json.dumps(response, indent=2))
140 |
141 | return {
142 | 'statusCode': 200,
143 | 'body': json.dumps('AWS Batch job submitted successfully!')
144 | }
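
For a quick test of the handler above (assuming AWS credentials are configured and the job definition / queue placeholders are filled in), you can feed it a hand-built S3 event -- the bucket and key below just reuse the sample values from the container override example:

sample_event = {
    "Records": [
        {"s3": {"bucket": {"name": "demoytlambdabatchtest"},
                "object": {"key": "Setosa1.csv"}}}
    ]
}
lambda_handler(sample_event, None)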
--------------------------------------------------------------------------------
/SNS_Message_Publish.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "nbformat": 4,
3 | "nbformat_minor": 0,
4 | "metadata": {
5 | "colab": {
6 | "name": "SNS Message Publish.ipynb",
7 | "provenance": [],
8 | "authorship_tag": "ABX9TyMQXNzei1cXERatoSYN2Tth",
9 | "include_colab_link": true
10 | },
11 | "kernelspec": {
12 | "name": "python3",
13 | "display_name": "Python 3"
14 | },
15 | "language_info": {
16 | "name": "python"
17 | }
18 | },
19 | "cells": [
20 | {
21 | "cell_type": "markdown",
22 | "metadata": {
23 | "id": "view-in-github",
24 | "colab_type": "text"
25 | },
26 | "source": [
27 | "
"
28 | ]
29 | },
30 | {
31 | "cell_type": "code",
32 | "source": [
33 | "!pip install boto3"
34 | ],
35 | "metadata": {
36 | "id": "6cKOYN27ysNV"
37 | },
38 | "execution_count": null,
39 | "outputs": []
40 | },
41 | {
42 | "cell_type": "code",
43 | "source": [
44 | "import json\n",
45 | "import boto3\n",
46 | "access_key=\"\"\n",
47 | "secret_key=\"\"\n",
48 | "client = boto3.client('sns',aws_access_key_id=access_key,\n",
49 | "aws_secret_access_key=secret_key, region_name='')"
50 | ],
51 | "metadata": {
52 | "id": "uoqibyJQyxCM"
53 | },
54 | "execution_count": 11,
55 | "outputs": []
56 | },
57 | {
58 | "cell_type": "code",
59 | "execution_count": 23,
60 | "metadata": {
61 | "id": "4J0MyeFAyo1b"
62 | },
63 | "outputs": [],
64 | "source": [
65 | "def message_publish(arn,message,destination):\n",
66 | " response = client.publish(\n",
67 | " TargetArn=arn,\n",
68 | " Message=json.dumps({'default': json.dumps(message)}),\n",
69 | " MessageStructure='json',\n",
70 | " MessageAttributes={\n",
71 | " 'Destination': {\n",
72 | " 'DataType': 'String',\n",
73 | " 'StringValue': destination\n",
74 | " }\n",
75 | " }\n",
76 | " )\n",
77 | " return response"
78 | ]
79 | },
80 | {
81 | "cell_type": "code",
82 | "source": [
83 | "arn=\"\"\n",
84 | "message=\"\"\n",
85 | "destination=\"\"\n",
86 | "response=message_publish(arn,message,destination)"
87 | ],
88 | "metadata": {
89 | "id": "b4lyRnEozTAA"
90 | },
91 | "execution_count": 24,
92 | "outputs": []
93 | },
94 | {
95 | "cell_type": "code",
96 | "source": [
97 | "response"
98 | ],
99 | "metadata": {
100 | "id": "rORBL0jk0_gK"
101 | },
102 | "execution_count": null,
103 | "outputs": []
104 | },
105 | {
106 | "cell_type": "code",
107 | "source": [
108 | ""
109 | ],
110 | "metadata": {
111 | "id": "-0iW4CGZ_ogN"
112 | },
113 | "execution_count": null,
114 | "outputs": []
115 | }
116 | ]
117 | }
--------------------------------------------------------------------------------
/Serverless Manual Approval Steps in AWS Step Functions and Amazon API Gateway.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SatadruMukherjee/Data-Preprocessing-Models/3aeeda4e68aafd47fe784a71c4aa02a34be76820/Serverless Manual Approval Steps in AWS Step Functions and Amazon API Gateway.png
--------------------------------------------------------------------------------
/Setup PySpark in ec2 using conda.txt:
--------------------------------------------------------------------------------
1 | Think of Conda as a more powerful virtualenv that not only handles virtual environments but also manages packages and dependencies across multiple languages (not just Python)
2 |
3 |
4 | Install Miniconda:
5 | ===================
6 | wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh
7 | bash ~/Miniconda3-latest-Linux-x86_64.sh
8 |
9 | Check whether conda installed successfully or not:
10 | ==================================================
11 | conda list
12 |
13 | PATH is an environment variable on Unix-like operating systems, DOS, OS/2, and Microsoft Windows, specifying a set of directories where executable programs are located.
14 |
15 | cat ~/.bashrc
16 | vi ~/.bashrc
17 | export PATH=~/miniconda3/bin:$PATH
18 | source ~/.bashrc
19 | conda list
20 | which python
21 |
22 |
23 | Configuring yml file for conda env setup(https://docs.conda.io/projects/conda/en/latest/user-guide/tasks/manage-environments.html#creating-an-environment-file-manually)
24 |
25 | vi environment.yml
26 |
27 | name: pyspark_demo
28 | channels:
29 | - conda-forge
30 | dependencies:
31 | - findspark=2.0.1
32 | - jupyter=1.0.0
33 | - pyspark=3.5.0
34 | - openjdk=11.0.13
35 | - python=3.12
36 | - python-dotenv
37 |
38 | Note: A channel is a location (a URL) where conda can search for packages to install on your machine e.g. https://anaconda.org/conda-forge/repo
39 |
40 | conda env create -f environment.yml
41 |
42 | conda activate pyspark_demo
43 |
44 | jupyter notebook --generate-config
45 |
46 | vi /home/ec2-user/.jupyter/jupyter_notebook_config.py
47 |
48 | c.NotebookApp.ip = '*'
49 | c.NotebookApp.open_browser = False
50 |
51 | (Reference: https://testnb.readthedocs.io/en/stable/examples/Notebook/Configuring%20the%20Notebook%20and%20Server.html#running-a-public-notebook-server)
52 |
53 | jupyter notebook
54 |
55 |
56 | Spark Code:
57 | ============
58 |
59 | import os
60 | from dotenv import load_dotenv
61 | import findspark
62 | findspark.init()
63 | findspark.find()
64 |
65 | import pyspark
66 | from pyspark.sql import SparkSession
67 | from pyspark.sql import functions as F
68 | from pyspark.sql import SparkSession
69 | from pyspark.sql.types import StructType, StructField, IntegerType, StringType
70 |
71 | spark = SparkSession.builder \
72 | .appName("WriteToS3") \
73 | .config("spark.jars.packages", "org.apache.hadoop:hadoop-aws:3.3.4,com.amazonaws:aws-java-sdk-bundle:1.12.262") \
74 | .config("spark.hadoop.fs.s3a.access.key", "") \
75 | .config("spark.hadoop.fs.s3a.secret.key", "") \
76 | .config("spark.hadoop.fs.s3a.endpoint", "s3.amazonaws.com") \
77 | .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem") \
78 | .getOrCreate()
79 |
80 | # Define schema
81 | schema = StructType([
82 | StructField("id", IntegerType(), True),
83 | StructField("name", StringType(), True),
84 | StructField("age", IntegerType(), True)
85 | ])
86 |
87 | # Create dummy data
88 | data = [
89 | (1, "Alice", 25),
90 | (2, "Bob", 30),
91 | (3, "Charlie", 28)
92 | ]
93 |
94 | # Create DataFrame
95 | df = spark.createDataFrame(data, schema=schema)
96 |
97 | # Show DataFrame
98 | df.show()
99 |
100 | # Define S3 path
101 | s3_path = "s3a://{Bucket Name}/test_conda/"
102 |
103 | # Write DataFrame to S3 in Parquet format
104 | df.write.mode("overwrite").parquet(s3_path)
105 |
106 |
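A quick read-back of the same prefix confirms the write landed. This is a minimal sketch that reuses the spark session and s3_path defined above (the bucket placeholder still needs to be filled in):

# Verify the write by reading the Parquet files back from the same S3 prefix
df_check = spark.read.parquet(s3_path)
df_check.show()
print("Rows written:", df.count(), "| Rows read back:", df_check.count())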
--------------------------------------------------------------------------------
/Shake detection using Accelerometer , Kafka & Python.txt:
--------------------------------------------------------------------------------
1 |
2 |
3 | EC2 Code:
4 | ------------
5 | wget https://dlcdn.apache.org/kafka/3.4.0/kafka_2.13-3.4.0.tgz
6 | tar -xvf kafka_2.13-3.4.0.tgz
7 | sudo yum install java-1.8.0-openjdk
8 | cd kafka_2.13-3.4.0
9 | bin/zookeeper-server-start.sh config/zookeeper.properties
10 | export KAFKA_HEAP_OPTS="-Xmx256M -Xms128M"
11 | cd kafka_2.13-3.4.0
12 | bin/kafka-server-start.sh config/server.properties
13 | To create topic:
14 | ------------------
15 | cd kafka_2.13-3.4.0
16 | bin/kafka-topics.sh --create --topic sensor_data_consumer --bootstrap-server {}:9092 --replication-factor 1 --partitions 1
17 |
18 | Start Kafka Console Consumer:
19 | --------------------------------------
20 | bin/kafka-console-consumer.sh --topic sensor_data_consumer --bootstrap-server {}:9092
21 |
22 |
23 | Lambda Code:
24 | ----------------
25 | import json
26 | from time import sleep
27 | from json import dumps
28 | from kafka import KafkaProducer
29 | import json
30 |
31 | topic_name='sensor_data_consumer'
32 | producer = KafkaProducer(bootstrap_servers=['{}:9092']
33 | ,value_serializer=lambda x: dumps(x).encode('utf-8'))
34 |
35 | def lambda_handler(event, context):
36 | # TODO implement
37 | print(event)
38 | payload_part=json.loads(event['body'])['payload']
39 | for i in payload_part:
40 | acc_x=i['values']['x']
41 | acc_y=i['values']['y']
42 | acc_z=i['values']['z']
43 | capture_time=i['time']
44 | data={"acc_x":acc_x,"acc_y":acc_y,"acc_z":acc_z,"capture_time":capture_time}
45 | print(data)
46 | producer.send(topic_name, value=data)
47 | producer.flush()
48 | return {
49 | 'statusCode': 200,
50 | 'body': json.dumps('Hello from Lambda!')
51 | }
52 |
53 |
54 |
55 |
56 | Consumer Code:
57 | ----------------
58 | from kafka import KafkaConsumer
59 | from kafka import TopicPartition , OffsetAndMetadata
60 | import json
61 |
62 |
63 | consumer = KafkaConsumer ('sensor_data_consumer',bootstrap_servers = ['{}:9092'],
64 | value_deserializer=lambda m: json.loads(m.decode('utf-8')),group_id='acceleration_test',auto_offset_reset='earliest',
65 | enable_auto_commit =False)
66 |
67 |
68 | for message in consumer:
69 | data=message.value
70 | x_accl=data['acc_x']
71 | y_accl = data['acc_y']
72 | z_accl = data['acc_z']
73 | mag=(abs(x_accl)+abs(y_accl)+abs(z_accl))
74 | if(mag>=15):
75 | print('*' * 100)
76 | print("Shaking")
77 | print('*' * 100)
78 | else:
79 | print("Idle")
80 | tp=TopicPartition(message.topic,message.partition)
81 | om = OffsetAndMetadata(message.offset+1, message.timestamp)
82 | consumer.commit({tp:om})
83 |
84 |
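The shake decision in the consumer boils down to the simple magnitude check above. A small local sketch (no Kafka required; the sample readings are made up) shows how the >=15 cutoff behaves:

# Local sanity check of the consumer's threshold logic; the sample readings are hypothetical
samples = [
    {"acc_x": 0.1, "acc_y": 9.8, "acc_z": 0.2},   # phone roughly at rest
    {"acc_x": 7.5, "acc_y": 6.2, "acc_z": 4.1},   # vigorous shake
]
for data in samples:
    mag = abs(data["acc_x"]) + abs(data["acc_y"]) + abs(data["acc_z"])
    print("Shaking" if mag >= 15 else "Idle", "-> magnitude:", round(mag, 2))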
--------------------------------------------------------------------------------
/Simple OTP System using AWS Serverless.txt:
--------------------------------------------------------------------------------
1 | DynamoDB Table:
2 | -----------------
3 | Table Name: otp_holder
4 | Primary Key: email_id
5 | Sort Key: EXPIRATION_TIME
6 |
7 |
8 |
9 | OTP Generator:
10 | ------------------
11 | import json
12 | import boto3
13 | import time
14 | from random import randint
15 |
16 | client_dynamo=boto3.resource('dynamodb')
17 |
18 | table=client_dynamo.Table('otp_holder')
19 |
20 | default_ttl = 120
21 |
22 | def lambda_handler(event, context):
23 |
24 | email_id=event['queryStringParameters']['email_address']
25 |
26 | otp_value=randint(100000, 999999)
27 |
28 | entry={
29 | 'email_id': email_id,
30 | 'OTP': otp_value,
31 | 'EXPIRATION_TIME': int(time.time()) + default_ttl
32 | }
33 |
34 | response=table.put_item(Item=entry)
35 |
36 | return "A verification code is sent to the email address you provided."
37 |
38 |
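Since EXPIRATION_TIME stores an epoch timestamp, DynamoDB's TTL feature can also purge stale OTP rows automatically. This optional one-off step is not part of the original setup; a minimal boto3 sketch:

# Optional: enable TTL on otp_holder so expired OTP items are deleted automatically
import boto3

dynamodb = boto3.client('dynamodb')
dynamodb.update_time_to_live(
    TableName='otp_holder',
    TimeToLiveSpecification={'Enabled': True, 'AttributeName': 'EXPIRATION_TIME'}
)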
39 | Send Email:
40 | --------------
41 | import json
42 | import boto3
43 | client = boto3.client("ses")
44 |
45 |
46 | def lambda_handler(event, context):
47 | print(event)
48 | if(event['Records'][0]['eventName']=='INSERT'):
49 | mail_id=event['Records'][0]['dynamodb']['Keys']['email_id']['S']
50 | print("The mail id is : {}".format(mail_id))
51 |
52 | otp=event['Records'][0]['dynamodb']['NewImage']['OTP']['N']
53 | print("The mail id is : {}".format(otp))
54 |
55 | body = """
56 | Use this code to verify your login at Simple Website
57 |
58 | {}
59 | """.format(otp)
60 |
61 | message = {"Subject": {"Data": 'Your OTP (valid for only 2 mins)!'}, "Body": {"Html": {"Data": body}}}
62 |
63 | response = client.send_email(Source = '{FromAddress}', Destination = {"ToAddresses": [mail_id]}, Message = message)
64 |
65 | print("The mail is sent successfully")
66 |
67 | Verify OTP:
68 | --------------
69 | import json
70 | import boto3
71 | import time
72 | client = boto3.client('dynamodb')
73 |
74 | def lambda_handler(event, context):
75 | # TODO implement
76 |
77 | email_id=event['queryStringParameters']['email_address']
78 | print("The received email id : {}".format(email_id))
79 |
80 | otp_from_user=event['queryStringParameters']['otp']
81 | print("The received otp : {}".format(otp_from_user))
82 |
83 | response = client.query(
84 | TableName='otp_holder',
85 | KeyConditionExpression='email_id = :email_id',
86 | ExpressionAttributeValues={
87 | ':email_id': {'S': email_id}
88 | },ScanIndexForward = False, Limit = 1)
89 |
90 | if(response['Count']==0):
91 | return "No such OTP was shared"
92 | else:
93 | latest_stored_otp_value=response['Items'][0]['OTP']['N']
94 | print("Latest Stored OTP Value : {}".format(latest_stored_otp_value))
95 |
96 | if(int(response['Items'][0]['EXPIRATION_TIME']['N']) < int(time.time())):
97 | return "The OTP has expired, please request a new one"
98 | elif(str(latest_stored_otp_value)==str(otp_from_user)):
99 | return "OTP verified successfully"
100 | else:
101 | return "Invalid OTP"
80 | group by
81 | l_returnflag,
82 | l_linestatus
83 | order by
84 | l_returnflag,
85 | l_linestatus;
86 |
87 | select METADATA$FILENAME as File_Name from @Snow_stage/unloadlineitem limit 10;
88 |
89 | select METADATA$FILENAME as File_Name,split_part(METADATA$FILENAME,'/',2) as Partition_Name from @Snow_stage/unloadlineitem limit 10;
90 |
91 | --Create partitioned Table
92 | create or replace external table s3_to_snowflake.PUBLIC.table_with_partition_for_yt_demo (
93 | L_ORDERKEY number(38,0) as (Value:c1::int),
94 | L_PARTKEY NUMBER(38,0) as (Value:c2::int),
95 | L_SUPPKEY NUMBER(38,0) as (Value:c3::int),
96 | L_LINENUMBER NUMBER(38,0) as (Value:c4::int),
97 | L_QUANTITY NUMBER(12,2) as (Value:c5::number(12,2)),
98 | L_EXTENDEDPRICE NUMBER(12,2) as (Value:c6::number(12,2)),
99 | L_DISCOUNT NUMBER(12,2) as (Value:c7::number(12,2)),
100 | L_TAX NUMBER(12,2) as (Value:c8::number(12,2)),
101 | L_RETURNFLAG VARCHAR(1) as (Value:c9::varchar),
102 | L_LINESTATUS VARCHAR(1) as (Value:c10::varchar),
103 | L_SHIPDATE DATE as (Value:c11::DATE),
104 | L_COMMITDATE DATE as (Value:c12::DATE),
105 | L_RECEIPTDATE DATE as (Value:c13::DATE),
106 | L_SHIPINSTRUCT VARCHAR(25) as (Value:c14::varchar),
107 | L_SHIPMODE VARCHAR(10) as (Value:c15::varchar),
108 | L_COMMENT VARCHAR(44) as (Value:c16::varchar),File_Partition NUMBER(38,0) as (split_part(METADATA$FILENAME,'/',2)::int)) PARTITION BY(File_Partition) with
109 | location =@s3_to_snowflake.PUBLIC.Snow_stage/unloadlineitem file_format ='my_csv_format' ;
110 |
111 |
112 | select
113 | l_returnflag,
114 | l_linestatus,
115 | sum(l_quantity) as sum_qty,
116 | sum(l_extendedprice) as sum_base_price,
117 | sum(l_extendedprice * (1-l_discount)) as sum_disc_price,
118 | sum(l_extendedprice * (1-l_discount) * (1+l_tax)) as sum_charge,
119 | avg(l_quantity) as avg_qty,
120 | avg(l_extendedprice) as avg_price,
121 | avg(l_discount) as avg_disc,
122 | count(*) as count_order
123 | from
124 | s3_to_snowflake.PUBLIC.table_with_partition_for_yt_demo
125 | where
126 | File_Partition >=1998
127 | group by
128 | l_returnflag,
129 | l_linestatus
130 | order by
131 | l_returnflag,
132 | l_linestatus;
133 |
--------------------------------------------------------------------------------
/Snowflake Parallel Processing using Python Lab.txt:
--------------------------------------------------------------------------------
1 |
2 | Snowflake Code:
3 | ---------------------------------
4 | --drop database if required
5 | drop database if exists ramu;
6 | --Create Database
7 | create database if not exists ramu;
8 | --use the database
9 | use ramu;
10 |
11 |
12 | create or replace table employee_history(employee_id number,
13 | empl_join_date date,
14 | dept varchar(10),
15 | salary number,
16 | manager_id number,t timestamp default current_timestamp());
17 |
18 |
19 |
20 | create or replace procedure demostoredprocedure(run_this_sql_query Varchar)
21 | returns String not null
22 | language javascript
23 | as
24 | $$
25 | var my_sql_command_3 ="call system$wait(10)"
26 | var statement3 = snowflake.createStatement( {sqlText: my_sql_command_3} );
27 | var result_set3 = statement3.execute();
28 | var my_sql_command_2 = RUN_THIS_SQL_QUERY;
29 | var statement2 = snowflake.createStatement( {sqlText: my_sql_command_2} );
30 | var result_set2 = statement2.execute();
31 | return "Done"; // Replace with something more useful.
32 | $$
33 | ;
34 |
35 |
36 |
37 | select * from employee_history;
38 |
39 |
40 | create or replace table employee_history_parallel(employee_id number,
41 | empl_join_date date,
42 | dept varchar(10),
43 | salary number,
44 | manager_id number,t timestamp default current_timestamp());
45 |
46 | select * from employee_history_parallel;
47 |
48 |
49 |
50 | Sequential Execution:
51 | -------------------------
52 | import pandas as pd
53 | import time
54 | import snowflake.connector as sf
55 |
56 | user=""
57 | password=""
58 | account=""
59 | conn=sf.connect(user=user,password=password,account=account)
60 |
61 |
62 | cursor = conn.cursor()
63 |
64 | statement_1 = 'use warehouse COMPUTE_WH'
65 | cursor.execute(statement_1)
66 | statement2 = "alter warehouse COMPUTE_WH resume IF SUSPENDED"
67 | cursor.execute(statement2)
68 | statement3 = "use database RAMU"
69 | cursor.execute(statement3)
70 | statement4 = "use role ACCOUNTADMIN"
71 | cursor.execute(statement4)
72 | statement5 = "use schema PUBLIC"
73 | cursor.execute(statement5)
74 |
75 |
76 |
77 |
78 | df=pd.read_csv('{}')
79 |
80 | for index,row in df.iterrows():
81 | query_to_be_executed=row['Query'].replace("'","''")
82 | print(query_to_be_executed)
83 | execute_snowflake_query="""call demostoredprocedure('{}');""".format(query_to_be_executed)
84 | print("Executing the query : {}".format(execute_snowflake_query))
85 | cursor.execute(execute_snowflake_query)
86 |
87 | Parallel Execution:
88 | -------------------------
89 |
90 | import pandas as pd
91 | import time
92 | import snowflake.connector as sf
93 | import warnings
94 | warnings.filterwarnings('ignore')
95 |
96 | user=""
97 | password=""
98 | account=""
99 | conn=sf.connect(user=user,password=password,account=account)
100 |
101 |
102 | cursor = conn.cursor()
103 |
104 | statement_1 = 'use warehouse COMPUTE_WH'
105 | cursor.execute(statement_1)
106 | statement2 = "alter warehouse COMPUTE_WH resume IF SUSPENDED"
107 | cursor.execute(statement2)
108 | statement3 = "use database RAMU"
109 | cursor.execute(statement3)
110 | statement4 = "use role ACCOUNTADMIN"
111 | cursor.execute(statement4)
112 | statement5 = "use schema PUBLIC"
113 | cursor.execute(statement5)
114 |
115 |
116 | def get_status(cur_list):
117 | print("Check the status of the query list : {}".format(cur_list))
118 | status=[]
119 | df=pd.DataFrame(columns=['Query_id','Status'])
120 | arr=cur_list
121 | for query_id in cur_list:
122 | status_for_the_query=conn.get_query_status(query_id).name
123 | status.append(status_for_the_query)
124 | df=pd.concat([df,pd.DataFrame([{'Query_id':query_id,'Status':status_for_the_query}])],ignore_index=True)
125 | if status.count('RUNNING')>0:
126 | del status[:]
127 | print(df)
128 | print("One or more commands still running")
129 | time.sleep(5)
130 | get_status(arr)
131 | else:
132 | print("All commands execution done!")
133 | print(df)
134 | return
135 |
136 |
137 | query_ids=[]
138 |
139 | df=pd.read_csv('{}')
140 |
141 | for index,row in df.iterrows():
142 | query_to_be_executed=row['Query'].replace("'","''")
143 | print(query_to_be_executed)
144 | execute_snowflake_query="""call demostoredprocedure('{}');""".format(query_to_be_executed)
145 | print("Executing the query : {}".format(execute_snowflake_query))
146 | cursor.execute_async(execute_snowflake_query)
147 | query_id=cursor.sfqid
148 | print("Query id for the above query execution : {}".format(query_id))
149 | query_ids.append(query_id)
150 |
151 | get_status(query_ids)
152 |
153 |
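The connector also ships helpers for exactly this polling pattern. A hedged alternative to the recursive get_status() above, assuming the same conn and query_ids from the parallel run:

# Poll the async queries with the connector's built-in status helpers
import time

pending = list(query_ids)
while pending:
    still_running = []
    for query_id in pending:
        status = conn.get_query_status_throw_if_error(query_id)  # raises if the query failed
        if conn.is_still_running(status):
            still_running.append(query_id)
        else:
            print("Query {} finished with status {}".format(query_id, status.name))
    pending = still_running
    if pending:
        time.sleep(5)
print("All commands execution done!")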
--------------------------------------------------------------------------------
/Snowflake Row Level Security.sql:
--------------------------------------------------------------------------------
1 | drop database if exists ramu;
2 |
3 | create database ramu;
4 |
5 | create or replace table ramu.public.employees(employee_id number,
6 | empl_join_date date,
7 | dept varchar(10),
8 | salary number,
9 | manager_id number);
10 |
11 | insert into ramu.public.employees values(1,'2014-10-01','HR',40000,4),
12 | (2,'2014-09-01','Tech',50000,9),
13 | (3,'2018-09-01','Marketing',30000,5),
14 | (4,'2017-09-01','HR',10000,5),
15 | (5,'2019-09-01','HR',35000,9),
16 | (6,'2015-09-01','Tech',90000,4),
17 | (7,'2016-09-01','Marketing',20000,1);
18 |
19 | select * from ramu.public.employees;
20 |
21 | CREATE OR REPLACE TABLE access_management_lookup (role string, dept_name string);
22 |
23 | INSERT INTO access_management_lookup VALUES ('HR_ADMIN', 'HR'), ('TECH_ADMIN', 'Tech'),('MARKETING_ADMIN', 'Marketing');
24 |
25 | select * from access_management_lookup;
26 |
27 |
28 |
29 | create or replace row access policy dept_level_access as (dept varchar) returns boolean ->
30 | current_role()='ACCOUNTADMIN'
31 | or exists (
32 | select 1 from access_management_lookup
33 | where role = current_role()
34 | and dept_name = dept
35 | );
36 |
37 |
38 | ALTER TABLE ramu.public.employees ADD ROW ACCESS POLICY dept_level_access ON (dept);
39 |
40 | select * from ramu.public.employees;
41 |
42 |
43 |
44 |
45 | create or replace role HR_Admin;
46 | create or replace role Tech_Admin;
47 | create or replace role Marketing_Admin;
48 |
49 | grant usage on warehouse compute_Wh to role HR_Admin;
50 | grant usage on warehouse compute_Wh to role Tech_Admin;
51 | grant usage on warehouse compute_Wh to role Marketing_Admin;
52 |
53 | grant usage on database ramu to role HR_Admin;
54 | grant usage on database ramu to role Tech_Admin;
55 | grant usage on database ramu to role Marketing_Admin;
56 |
57 | grant usage on schema public to role HR_Admin;
58 | grant usage on schema public to role Tech_Admin;
59 | grant usage on schema public to role Marketing_Admin;
60 |
61 | grant select on table ramu.public.employees to role HR_Admin;
62 | grant select on table ramu.public.employees to role Tech_Admin;
63 | grant select on table ramu.public.employees to role Marketing_Admin;
64 |
65 |
66 | create or replace user jadu_hr password = '123456';
67 | grant role HR_Admin to user jadu_hr;
68 |
69 |
70 |
71 | create or replace user mimo_marketing password = '456789';
72 | grant role Marketing_Admin to user mimo_marketing;
73 |
74 |
75 | create or replace user jimo_tech password = '147258';
76 | grant role Tech_Admin to user jimo_tech;
77 |
78 |
79 | drop user jimo_tech;
80 | drop user mimo_marketing;
81 | drop user jadu_hr;
82 |
83 | drop role HR_Admin;
84 | drop role Tech_Admin;
85 | drop role Marketing_Admin;
86 |
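To see the policy in action from a client, a hedged Python check (account locator is a placeholder) can log in as the HR demo user created above and confirm that only HR rows come back:

# Verify row level security as the HR demo user
import snowflake.connector as sf

conn = sf.connect(user='jadu_hr', password='123456', account='')  # fill in your account locator
cur = conn.cursor()
cur.execute("use role HR_Admin")
cur.execute("use warehouse compute_wh")
cur.execute("select dept, count(*) from ramu.public.employees group by dept")
print(cur.fetchall())  # expected: only the HR department is visible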
--------------------------------------------------------------------------------
/Snowflake Schema Detection.txt:
--------------------------------------------------------------------------------
1 | Create 3 buckets:
2 | -----------------
3 | 1)For storing csv file
4 | 2)For storing parquet file
5 | 3)For storing athena logs
6 |
7 |
8 |
9 | CSV to parquet conversion using AWS Athena:
10 | --------------------------------------------
11 | CREATE EXTERNAL TABLE helloworld.hellocsv (
12 | `Id` int,
13 | `SEPAL_LENGTH` double,
14 | `SEPAL_WIDTH` double,
15 | `PETAL_LENGTH` double,
16 | `PETAL_WIDTH` double,
17 | `CLASS_NAME` string
18 | )
19 | ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe'
20 | WITH SERDEPROPERTIES (
21 | 'serialization.format' = ',',
22 | 'field.delim' = ','
23 | ) LOCATION 's3://{}/'
24 | TBLPROPERTIES ('has_encrypted_data'='false','skip.header.line.count'='1');
25 |
26 |
27 |
28 |
29 | CREATE TABLE helloworld.helloparquet
30 | WITH (
31 | format = 'PARQUET',
32 | parquet_compression = 'SNAPPY',
33 | external_location = 's3://{}'
34 | ) AS SELECT * FROM helloworld.hellocsv ;
35 |
36 |
37 | Inferschema:
38 | -------------
39 | --drop database if required
40 | drop database ramu;
41 | --Create Database
42 | create database if not exists ramu;
43 | --use the database
44 | use ramu;
45 | --create file format
46 | create file format parquet_format TYPE=parquet;
47 | --create external stage
48 | create or replace stage ramu.PUBLIC.snow_simple url="s3://{}/"
49 | credentials=(aws_key_id='{}'
50 | aws_secret_key='{}')
51 | file_format = parquet_format;
52 | --list stage elements
53 | list @ramu.PUBLIC.snow_simple;
54 |
55 | select * from TABLE(INFER_SCHEMA (LOCATION=>'@ramu.PUBLIC.snow_simple',FILE_FORMAT=>'parquet_format'));
56 |
57 |
58 | CREATE TABLE … USING TEMPLATE:
59 | -------------------------------
60 | Recap:
61 | ------
62 | create table demo_table_1 (province varchar, created_date date);
63 | insert into demo_table_1 (province, created_date) values
64 | ('Manitoba', '2020-01-18'::date),
65 | ('Alberta', '2020-01-19'::date);
66 |
67 | select * from demo_table_1;
68 | +----------+--------------+
69 | | PROVINCE | CREATED_DATE |
70 | |----------+--------------|
71 | | Manitoba | 2020-01-18 |
72 | | Alberta | 2020-01-19 |
73 | +----------+--------------+
74 |
75 |
76 | select object_construct(*) from demo_table_1;
77 | +---------------------------------+
78 | | OBJECT_CONSTRUCT(*) |
79 | |---------------------------------|
80 | | { |
81 | | "CREATED_DATE": "2020-01-18", |
82 | | "PROVINCE": "Manitoba" |
83 | | } |
84 | | { |
85 | | "CREATED_DATE": "2020-01-19", |
86 | | "PROVINCE": "Alberta" |
87 | | } |
88 | +---------------------------------+
89 |
90 |
91 | select array_agg(object_construct(*)) from demo_table_1;
92 |
93 | [
94 | {
95 | "CREATED_DATE": "2020-01-18",
96 | "PROVINCE": "Manitoba"
97 | },
98 | {
99 | "CREATED_DATE": "2020-01-19",
100 | "PROVINCE": "Alberta"
101 | }
102 | ]
103 |
104 |
105 | select ARRAY_AGG(OBJECT_CONSTRUCT(*)) from TABLE(INFER_SCHEMA (LOCATION=>'@ramu.PUBLIC.snow_simple',FILE_FORMAT=>'parquet_format'));
106 |
107 | create or replace table helloparquet using template(select ARRAY_AGG(OBJECT_CONSTRUCT(*)) from TABLE(INFER_SCHEMA (LOCATION=>'@ramu.PUBLIC.snow_simple',FILE_FORMAT=>'parquet_format')));
108 |
109 | --load data
110 | copy into ramu.PUBLIC.helloparquet from @ramu.PUBLIC.snow_simple MATCH_BY_COLUMN_NAME=CASE_INSENSITIVE FILE_FORMAT=parquet_format;
111 |
112 | select * from helloparquet;
113 |
114 | GENERATE_COLUMN_DESCRIPTION:
115 | ----------------------------
116 | select generate_column_description(ARRAY_AGG(OBJECT_CONSTRUCT(*)),'table') from TABLE(INFER_SCHEMA (LOCATION=>'@ramu.PUBLIC.snow_simple',FILE_FORMAT=>'parquet_format')) ;
117 |
118 |
119 | select generate_column_description(ARRAY_AGG(OBJECT_CONSTRUCT(*)),'view') from TABLE(INFER_SCHEMA (LOCATION=>'@ramu.PUBLIC.snow_simple',FILE_FORMAT=>'parquet_format')) ;
120 |
121 | select generate_column_description(ARRAY_AGG(OBJECT_CONSTRUCT(*)),'external_table') from TABLE(INFER_SCHEMA (LOCATION=>'@ramu.PUBLIC.snow_simple',FILE_FORMAT=>'parquet_format')) ;
122 |
123 | --generate complete ddl script using string concatenation
124 | select 'create or replace external table ramu.PUBLIC.Iris_dataset( ' ||
125 | generate_column_description(ARRAY_AGG(OBJECT_CONSTRUCT(*)),'external_table') ||
126 | ') with location = @ramu.PUBLIC.snow_simple file_format =''parquet_format''' from TABLE(INFER_SCHEMA (LOCATION=>'@ramu.PUBLIC.snow_simple',FILE_FORMAT=>'parquet_format')) ;
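The same INFER_SCHEMA call can be driven from Python when the inferred columns need to feed downstream automation. A minimal connector sketch (credentials are placeholders; stage and file format as created above):

# Fetch the inferred schema of the staged Parquet files from Python
import snowflake.connector as sf

conn = sf.connect(user='', password='', account='')  # placeholders
cur = conn.cursor()
cur.execute("use warehouse compute_wh")
cur.execute("use database ramu")
cur.execute("select COLUMN_NAME, TYPE from TABLE(INFER_SCHEMA("
            "LOCATION=>'@ramu.PUBLIC.snow_simple', FILE_FORMAT=>'parquet_format'))")
for column_name, column_type in cur.fetchall():
    print(column_name, column_type)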
127 |
128 |
129 |
130 |
131 |
132 |
133 |
134 |
135 |
--------------------------------------------------------------------------------
/Snowflake Stored Porcedure Parallel execution (Part 1).txt:
--------------------------------------------------------------------------------
1 | --drop database if required
2 | drop database if exists ramu;
3 | --Create Database
4 | create database if not exists ramu;
5 | --use the database
6 | use ramu;
7 |
8 |
9 |
10 | // Prepare table
11 | CREATE OR REPLACE TABLE video_demo (
12 | ID INT AUTOINCREMENT START = 1 INCREMENT =1,
13 | NAME VARCHAR(40) DEFAULT 'DemoYoutube' ,
14 | CREATE_DATE timestamp);
15 |
16 |
17 | create or replace procedure dummy_executor()
18 | returns string not null
19 | language javascript
20 | as
21 | $$
22 | var my_sql_command = "INSERT INTO video_demo(CREATE_DATE) VALUES(CURRENT_TIMESTAMP)";
23 | var statement1 = snowflake.createStatement( {sqlText: my_sql_command} );
24 | var result_set1 = statement1.execute();
25 | return "Successfully Executed"
26 | $$
27 | ;
28 |
29 |
30 | set my_variable=(select current_timestamp());
31 |
32 | select $my_variable;
33 |
34 |
35 |
36 |
37 | // Create task
38 | CREATE OR REPLACE TASK INSERT_DATA_SET_123456789
39 | WAREHOUSE = COMPUTE_WH
40 | SCHEDULE = '1 MINUTE'
41 | AS call dummy_executor()
42 | ;
43 |
44 |
45 | SHOW TASKS;
46 |
47 | select * from video_demo;
48 |
49 | // Task starting and suspending
50 | ALTER TASK INSERT_DATA_SET_123456789 RESUME;
51 |
52 |
53 | SELECT * FROM TABLE(INFORMATION_SCHEMA.TASK_HISTORY()) WHERE SCHEMA_NAME = 'PUBLIC' AND NAME = 'INSERT_DATA_SET_123456789' ;
54 |
55 |
56 |
57 | SELECT * FROM TABLE(INFORMATION_SCHEMA.TASK_HISTORY())
58 | WHERE SCHEMA_NAME = 'PUBLIC' AND NAME = 'INSERT_DATA_SET_123456789' AND QUERY_START_TIME IS NOT NULL AND SCHEDULED_TIME > $my_variable
59 | ORDER BY SCHEDULED_TIME DESC LIMIT 1;
60 |
61 | SELECT COALESCE(QUERY_ID, '') AS QUERY_ID FROM TABLE(INFORMATION_SCHEMA.TASK_HISTORY())
62 | WHERE SCHEMA_NAME = 'PUBLIC' AND NAME = 'INSERT_DATA_SET_123456789' AND QUERY_START_TIME IS NOT NULL AND SCHEDULED_TIME > $my_variable
63 | ORDER BY SCHEDULED_TIME DESC LIMIT 1;
64 |
65 |
66 |
67 |
68 | ALTER TASK INSERT_DATA_SET_123456789 SUSPEND;
69 |
70 |
71 |
72 |
73 | create or replace procedure dummy_executoryt123()
74 | returns string not null
75 | language javascript
76 | EXECUTE AS CALLER
77 | as
78 | $$
79 |
80 | function sleep(milliseconds) {
81 | const date = Date.now();
82 | let currentDate = null;
83 | do {
84 | currentDate = Date.now();
85 | } while (currentDate - date < milliseconds);
86 | };
87 |
88 |
89 |
90 | var v_Now = new Date().toISOString();
91 |
92 |
93 | var query = 'CREATE OR REPLACE TASK RAMU.PUBLIC.MULTITHREAD_CHILD_12345678 WAREHOUSE=COMPUTE_WH SCHEDULE = \'1 MINUTE\' AS call RAMU.PUBLIC.dummy_executor();';
94 | var statement = snowflake.createStatement( {sqlText: query} );
95 | var result_set = statement.execute();
96 |
97 | query = 'ALTER TASK RAMU.PUBLIC.MULTITHREAD_CHILD_12345678 RESUME;';
98 | var statement = snowflake.createStatement( {sqlText: query} );
99 | var result_set = statement.execute();
100 |
101 | var v_QueryID = '';
102 |
103 | sleep(90000);
104 |
105 | while (v_QueryID == '')
106 | {
107 |
108 | v_Query = 'SELECT COALESCE(QUERY_ID, \'\') AS QUERY_ID FROM TABLE(INFORMATION_SCHEMA.TASK_HISTORY()) \
109 | WHERE SCHEMA_NAME = \'PUBLIC\' AND NAME = \'MULTITHREAD_CHILD_12345678\' AND QUERY_START_TIME IS NOT NULL AND SCHEDULED_TIME > \'' + v_Now + '\' \
110 | ORDER BY SCHEDULED_TIME DESC LIMIT 1;';
111 |
112 |
113 | v_Statement = snowflake.createStatement( {sqlText: v_Query} );
114 | rs = v_Statement.execute();
115 |
116 | while (rs.next())
117 | {
118 | v_QueryID = rs.getColumnValue('QUERY_ID');
119 | };
120 |
121 | sleep(10000);
122 | };
123 |
124 | // Suspend the main task & clean up all tasks
125 | v_Query = 'ALTER TASK RAMU.PUBLIC.MULTITHREAD_CHILD_12345678 SUSPEND;';
126 | snowflake.execute( {sqlText: v_Query} );
127 | v_Statement = snowflake.createStatement( {sqlText: v_Query} );
128 | v_Statement.execute();
129 | return "Success"
130 | $$
131 | ;
132 |
133 |
134 | call dummy_executoryt123();
135 |
136 |
137 | select * from video_demo;
138 |
--------------------------------------------------------------------------------
/Snowflake logging (1).txt:
--------------------------------------------------------------------------------
1 | DROP DATABASE IF EXISTS RAMU;
2 |
3 | CREATE DATABASE RAMU;
4 |
5 | USE RAMU;
6 |
7 | CREATE or replace TABLE log_storer(log_key number ,Stored_procedure_name text,Success_status VARCHAR(1),log_message Text,Start_time String,End_time String);
8 |
9 | CREATE OR REPLACE PROCEDURE open_sp_process(Stored_procedure_name STRING)
10 | RETURNS STRING
11 | LANGUAGE JAVASCRIPT
12 | AS
13 | $$
14 |
15 | var my_sql_command_date = "select to_char(current_timestamp(2)) as curr_time";
16 | var statement1 = snowflake.createStatement( {sqlText: my_sql_command_date} );
17 | var result_set1 = statement1.execute();
18 | result_set1.next();
19 | var time_info= result_set1.getColumnValue(1);
20 |
21 | var my_sql_command_2 = "select coalesce( max(log_key),0 ) from log_storer";
22 | var statement2 = snowflake.createStatement( {sqlText: my_sql_command_2} );
23 | var result_set2 = statement2.execute();
24 | result_set2.next();
25 | var log_key_for_the_entry= result_set2.getColumnValue(1)+1;
26 |
27 | var my_sql_command_3 = "INSERT INTO log_storer (log_key,Stored_procedure_name,Success_status,Start_time) values ("+log_key_for_the_entry+",'"+STORED_PROCEDURE_NAME+"','"+'I'+"','"+time_info+"')";
28 | var statement3 = snowflake.createStatement( {sqlText: my_sql_command_3} );
29 | var result_set3 = statement3.execute();
30 |
31 |
32 | return log_key_for_the_entry;
33 | $$;
34 |
35 |
36 | CREATE OR REPLACE PROCEDURE close_sp_process(log_key String,success_status String,log_message STRING)
37 | RETURNS String
38 | LANGUAGE JAVASCRIPT
39 | AS
40 | $$
41 |
42 | var my_sql_command_date = "select to_char(current_timestamp(2)) as curr_time";
43 | var statement1 = snowflake.createStatement( {sqlText: my_sql_command_date} );
44 | var result_set1 = statement1.execute();
45 | result_set1.next();
46 | var time_info= result_set1.getColumnValue(1);
47 |
48 |
49 | var my_sql_command_3 = "UPDATE log_storer set Success_status='"+SUCCESS_STATUS+"',log_message='"+LOG_MESSAGE+"',End_time='"+time_info+"' where log_key="+LOG_KEY;
50 | var statement3 = snowflake.createStatement( {sqlText: my_sql_command_3} );
51 | var result_set3 = statement3.execute();
52 | return 'Done'
53 | $$;
54 |
55 |
56 |
57 |
58 | --Case 1: Testing correct Procedure
59 |
60 | CREATE OR REPLACE PROCEDURE my_test1()
61 | RETURNS STRING
62 | LANGUAGE JAVASCRIPT
63 | AS
64 | $$
65 | function sql_runner_with_return(sql_to_be_executed)
66 | {
67 | var my_sql_command = sql_to_be_executed;
68 | var statement1 = snowflake.createStatement( {sqlText: my_sql_command} );
69 | var result_set1 = statement1.execute();
70 | result_set1.next();
71 | var log_key= result_set1.getColumnValue(1);
72 | return log_key
73 | }
74 |
75 | function sql_runner_without_return(sql_to_be_executed)
76 | {
77 | var my_sql_command = sql_to_be_executed;
78 | var statement1 = snowflake.createStatement( {sqlText: my_sql_command} );
79 | var result_set1 = statement1.execute();
80 | }
81 |
82 | try
83 | {
84 | var log_key=sql_runner_with_return("call open_sp_process('RAMU.PUBLIC.my_test')")
85 | var x=10/10;
86 | sql_runner_with_return("call system$wait(10)");
87 | var closing_command ="call close_sp_process("+log_key+",'C','Stored Procedure Successfully completed')"
88 | sql_runner_without_return(closing_command)
89 | return log_key;
90 | }
91 | catch(ERROR)
92 | {
93 | var closing_command ="call close_sp_process("+log_key+",'E','"+ERROR+"')"
94 | sql_runner_without_return(closing_command)
95 | return ERROR
96 | }
97 | $$;
98 |
99 | select * from log_storer;
100 |
101 | call my_test1();
102 |
103 | select * from log_storer;
104 |
105 | --Case 2: Testing wrong Procedure
106 |
107 | CREATE OR REPLACE PROCEDURE my_test2()
108 | RETURNS STRING
109 | LANGUAGE JAVASCRIPT
110 | AS
111 | $$
112 | function sql_runner_with_return(sql_to_be_executed)
113 | {
114 | var my_sql_command = sql_to_be_executed;
115 | var statement1 = snowflake.createStatement( {sqlText: my_sql_command} );
116 | var result_set1 = statement1.execute();
117 | result_set1.next();
118 | var log_key= result_set1.getColumnValue(1);
119 | return log_key
120 | }
121 | function sql_runner_without_return(sql_to_be_executed)
122 | {
123 | var my_sql_command = sql_to_be_executed;
124 | var statement1 = snowflake.createStatement( {sqlText: my_sql_command} );
125 | var result_set1 = statement1.execute();
126 | }
127 | try
128 | {
129 | var log_key=sql_runner_with_return("call open_sp_process('RAMU.PUBLIC.my_test')")
130 | var x=10/10;
131 | sql_runner_with_return("call system$wait(10)");
132 | SUM(2,3)
133 | var closing_command ="call close_sp_process("+log_key+",'C','Stored Procedure Successfully completed')"
134 | sql_runner_without_return(closing_command)
135 | return log_key;
136 | }
137 | catch(ERROR)
138 | {
139 | var closing_command ="call close_sp_process("+log_key+",'E','"+ERROR+"')"
140 | sql_runner_without_return(closing_command)
141 | return ERROR
142 | }
143 | $$;
144 |
145 |
146 | call my_test2();
147 |
--------------------------------------------------------------------------------
/Snowflake_SP_Util.py:
--------------------------------------------------------------------------------
1 |
2 | --Create Database
3 | Create or replace database snowpark_helper;
4 | use snowpark_helper;
5 |
6 | -- Create a sample table
7 | CREATE OR REPLACE TABLE sample_table (
8 | id INTEGER,
9 | name STRING,
10 | score INTEGER
11 | );
12 |
13 | -- Insert sample data into the table
14 | INSERT INTO sample_table (id, name, score)
15 | VALUES
16 | (1, 'ALICE', 85),
17 | (2, 'BOB', 90),
18 | (3, 'CHARLIE', 75),
19 | (4, 'DAVID', 60),
20 | (5, 'EVE', 95);
21 |
22 | select * from sample_table;
23 |
24 |
25 | CREATE OR REPLACE PROCEDURE process_data()
26 | RETURNS STRING
27 | LANGUAGE PYTHON
28 | RUNTIME_VERSION = '3.8'
29 | PACKAGES = ('snowflake-snowpark-python')
30 | Handler='run'
31 | EXECUTE AS CALLER
32 | AS
33 | $$
34 | from snowflake.snowpark import Session
35 | from snowflake.snowpark.functions import *
36 |
37 | def convert_string_to_lowercase(df):
38 | return df.with_column("name", lower(df["name"]))
39 |
40 | def filter_based_on_score(df, threshold):
41 | return df.filter(df["score"] > threshold)
42 |
43 | def write_data_to_table(df, target_table):
44 | df.write.save_as_table(target_table, mode="overwrite")
45 |
46 | def run(session: Session) -> str:
47 | # Load the data from the source table
48 | df = session.table("sample_table")
49 |
50 | # Apply the transformations
51 | df_lowercase = convert_string_to_lowercase(df)
52 | df_filtered = filter_based_on_score(df_lowercase, 80)
53 |
54 | # Write the transformed data to the target table
55 | target_table = "transformed_table"
56 | write_data_to_table(df_filtered, target_table)
57 |
58 | return "Data transformation and write successful!"
59 |
60 | $$;
61 |
62 | call process_data();
63 |
64 | select * from transformed_table;
65 |
66 |
67 | --concept of imports
68 | Create or replace stage snowpark_helper.PUBLIC.snowpark_reusable_code url="s3://{}/"
69 | credentials=(aws_key_id=''
70 | aws_secret_key='');
71 |
72 | list @snowpark_helper.PUBLIC.snowpark_reusable_code;
73 |
74 |
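The snowpark_modules.zip referenced in the IMPORTS clause below is not shown in this repo. Judging from the import paths and the inline functions in process_data(), its contents are presumably along these lines (zip the snowpark_modules folder, including an empty __init__.py, and upload it to the stage listed above):

# snowpark_modules/transformation.py  (sketch of the assumed helper module)
from snowflake.snowpark.functions import lower

def convert_string_to_lowercase(df):
    return df.with_column("name", lower(df["name"]))

def filter_based_on_score(df, threshold):
    return df.filter(df["score"] > threshold)

# snowpark_modules/write_data.py  (sketch of the assumed helper module)
def write_data_to_table(df, target_table):
    df.write.save_as_table(target_table, mode="overwrite")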
75 |
76 | CREATE OR REPLACE PROCEDURE process_data_with_util()
77 | RETURNS STRING
78 | LANGUAGE PYTHON
79 | RUNTIME_VERSION = '3.8'
80 | PACKAGES = ('snowflake-snowpark-python')
81 | IMPORTS = ('@snowpark_helper.PUBLIC.snowpark_reusable_code/snowpark_modules.zip')
82 | Handler='run'
83 | EXECUTE AS CALLER
84 | AS
85 | $$
86 | from snowflake.snowpark import Session
87 | from snowflake.snowpark.functions import *
88 | from snowpark_modules.transformation import *
89 | from snowpark_modules.write_data import *
90 |
91 | def run(session: Session) -> str:
92 | # Load the data from the source table
93 | df = session.table("sample_table")
94 |
95 | # Apply the transformations
96 | df_lowercase = convert_string_to_lowercase(df)
97 | df_filtered = filter_based_on_score(df_lowercase, 80)
98 |
99 | # Write the transformed data to the target table
100 | target_table = "transformed_table_with_util"
101 | write_data_to_table(df_filtered, target_table)
102 |
103 | return "Data transformation and write successful!"
104 |
105 | $$;
106 |
107 | call process_data_with_util();
108 |
109 | select * from SNOWPARK_HELPER.PUBLIC.TRANSFORMED_TABLE_WITH_UTIL;
110 |
111 |
112 |
113 |
114 |
115 | CREATE OR REPLACE PROCEDURE process_data_with_util_and_param(threshold_value Integer)
116 | RETURNS STRING
117 | LANGUAGE PYTHON
118 | RUNTIME_VERSION = '3.8'
119 | PACKAGES = ('snowflake-snowpark-python')
120 | IMPORTS = ('@snowpark_helper.PUBLIC.snowpark_reusable_code/tf.zip')
121 | Handler='run'
122 | EXECUTE AS CALLER
123 | AS
124 | $$
125 | from snowflake.snowpark import Session
126 | from snowflake.snowpark.functions import *
127 | from tf.transformation import *
128 | from tf.write_data import *
129 |
130 | def run(session: Session,threshold_value) -> str:
131 | # Load the data from the source table
132 | df = session.table("sample_table")
133 |
134 | # Apply the transformations
135 | df_lowercase = convert_string_to_lowercase(df)
136 | df_filtered = filter_based_on_score(df_lowercase, threshold_value)
137 |
138 | # Write the transformed data to the target table
139 | target_table = "transformed_table_with_util_and_param"
140 | write_data_to_table(df_filtered, target_table)
141 |
142 | return "Data transformation and write successful!"
143 |
144 | $$;
145 |
146 | call process_data_with_util_and_param(70);
147 |
148 | select * from SNOWPARK_HELPER.PUBLIC.TRANSFORMED_TABLE_WITH_UTIL_AND_PARAM;
--------------------------------------------------------------------------------
/Snyk Code for Github Action.yml:
--------------------------------------------------------------------------------
1 | name: Example workflow for Python using Snyk
2 | on: push
3 |
4 | jobs:
5 | security:
6 | runs-on: ubuntu-latest
7 | steps:
8 | - uses: actions/checkout@master
9 | - name: Run Snyk to check for vulnerabilities
10 | uses: snyk/actions/python@master
11 | continue-on-error: true # To make sure that SARIF upload gets called
12 | env:
13 | SNYK_TOKEN: ${{ secrets.SNYK_TOKEN }}
14 | with:
15 | command: code test
16 | args: --sarif-file-output=snyk.sarif
17 | - name: Count total number of vulnerabilities
18 | run: |
19 | RESULTS_LENGTH=$(jq '.runs[0].results | length' snyk.sarif)
20 | echo "RESULTS_LENGTH=$RESULTS_LENGTH" >> $GITHUB_ENV
21 | echo $RESULTS_LENGTH
22 | - name: Pass_or_Fail_the_job
23 | run: |
24 | if [ "$RESULTS_LENGTH" != 0 ]; then
25 | echo "Job Failed"
26 | exit 1
27 | else
28 | echo "Pass"
29 | fi
30 |
31 | - name: Send notification on Slack using Webhooks
32 | uses: slackapi/slack-github-action@v1.24.0
33 | if: always()
34 | with:
35 | payload: |
36 | {
37 | "text": "*The Snyk scan result for repo is : ${{ job.status }}* \n*Number of vulnerabilities : ${{ env.RESULTS_LENGTH }}* \n*Detail*: https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}"
38 | }
39 | env:
40 | SLACK_WEBHOOK_URL: ${{ secrets.slack_webhook_url }}
--------------------------------------------------------------------------------
/Talend with EMR & Snowflake.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SatadruMukherjee/Data-Preprocessing-Models/3aeeda4e68aafd47fe784a71c4aa02a34be76820/Talend with EMR & Snowflake.png
--------------------------------------------------------------------------------
/Time Traven in Snowflake.txt:
--------------------------------------------------------------------------------
1 | use database Ramu;
2 |
3 | create or replace TABLE BUSINESSES (
4 | BUSINESS_ID VARCHAR(100),
5 | NAME VARCHAR(100),
6 | CITY VARCHAR(50),
7 | STATE VARCHAR(2),
8 | REVIEW_COUNT NUMBER(38,4),
9 | STARS NUMBER(38,4)
10 | );
11 |
12 | INSERT INTO BUSINESSES VALUES
13 | ('QNcv3mwnHJ5w4YB4giqkWw','Preferred Veterinary Care','Pittsburgh','PA',4,3.5),
14 | ('oZG8sxDL54ki9pmDfyL7rA','Not My Dog','Toronto','ON',9,3.5),
15 | ('S06JfRM3ICESOHc1pr3LOA','Chase Bank','Las Vegas','NV',3,5.0),
16 | ('NL_BfZ4BkQXJSYAFouJqsQ','24 hr lockouts','Las Vegas','NV',3,1.0),
17 | ('AnUyv2zHq_35gCeHr8555w','Soma Restaurant','Las Vegas','NV',12,3.0),
18 | ('jjBTBObnHrY87qQIMybjzQ','Blue Jade','Cleveland','OH',24,3.5),
19 | ('PhL85G9Y6OstQzThDIllMQ','Animalerie Little Bear','Westmount','QC',9,4.0),
20 | ('SkRqx-hxVPLgV4K5hxNa9g','Parkview Dental Associates','Sun Prairie','WI',4,3.0),
21 | ('tWX7j4Qg4cXofQqmoNKH3A','Sir Hobbs','Sun Prairie','WI',35,3.0),
22 | ('4a9Rypytzdz9NZuGMS2ZYw','Rogue Bar','Scottsdale','AZ',80,3.5),
23 | ('oYWy-hOTCOF7h8DCAZ_Mxw','Cool Girl','Toronto','ON',48,3.5),
24 | ('AMxxi7jyxhcdNF7FIRbUVA','Remington''s Restaurant','Scottsdale','AZ',103,3.0),
25 | ('d01d-w7pxHrMCX5mDwaaHQ','D Liche','Montréal','QC',89,4.5),
26 | ('66DKb6APF96InEKrUVIbZw','Allo Inde','Montréal','QC',3,3.5);
27 |
28 |
29 | SELECT * from BUSINESSES
30 | WHERE City='Las Vegas';
31 |
32 | ALTER SESSION SET TIMEZONE = 'UTC';
33 | select getdate();
34 | --2021-02-12 10:55:49.190 +0000
35 |
36 | SELECT * from BUSINESSES
37 | WHERE City='Las Vegas';
38 |
39 | SELECT * from BUSINESSES at(timestamp => '2021-02-12 10:55:49.190 +0000'::timestamp)
40 | WHERE City='Las Vegas';
41 |
42 | INSERT INTO BUSINESSES
43 | SELECT * from BUSINESSES at(timestamp => '2021-02-12 10:55:49.190 +0000'::timestamp)
44 | WHERE City='Las Vegas';
45 |
46 | SELECT * from BUSINESSES
47 | WHERE City='Las Vegas';
48 |
--------------------------------------------------------------------------------
/Unstructured Data processing with Snowflake.txt:
--------------------------------------------------------------------------------
1 | --drop database if exists
2 | drop database if exists s3_to_snowflake;
3 |
4 | --Database Creation
5 | create database if not exists s3_to_snowflake;
6 |
7 | --Use the database
8 | use s3_to_snowflake;
9 |
10 |
11 | --create the external stage
12 | create or replace stage s3_to_snowflake.PUBLIC.Snow_stage url="{s3 location}"
13 | credentials=(aws_key_id='{AWS Access Key}'
14 | aws_secret_key='{AWS Secret Key}');
15 |
16 |
17 | list @s3_to_snowflake.PUBLIC.Snow_stage;
18 |
19 |
20 | CREATE OR REPLACE PROCEDURE count_no_of_pages_sp(file_name string)
21 | RETURNS integer
22 | LANGUAGE PYTHON
23 | RUNTIME_VERSION = '3.8'
24 | PACKAGES = ('snowflake-snowpark-python','PyPDF2')
25 | HANDLER = 'main_fn'
26 | AS
27 | $$
28 | from snowflake.snowpark.files import SnowflakeFile
29 | import PyPDF2
30 | def main_fn(session,file_name):
31 | f=SnowflakeFile.open(file_name,'rb')
32 | pdf_object=PyPDF2.PdfReader(f);
33 | return len(pdf_object.pages)
34 | $$;
35 |
36 | call count_no_of_pages_sp(BUILD_SCOPED_FILE_URL( @s3_to_snowflake.PUBLIC.Snow_stage , 'crm_system.pdf' ));
37 |
38 | --select BUILD_SCOPED_FILE_URL( @s3_to_snowflake.PUBLIC.Snow_stage , 'upgrades-white-paper-final.pdf' );
39 |
40 |
41 | CREATE OR REPLACE function count_no_of_pages_udf(file_name string)
42 | RETURNS integer
43 | LANGUAGE PYTHON
44 | RUNTIME_VERSION = '3.8'
45 | PACKAGES = ('snowflake-snowpark-python','PyPDF2')
46 | HANDLER = 'main_fn'
47 | AS
48 | $$
49 | from snowflake.snowpark.files import SnowflakeFile
50 | import PyPDF2
51 | def main_fn(file_name):
52 | f=SnowflakeFile.open(file_name,'rb')
53 | pdf_object=PyPDF2.PdfReader(f);
54 | return len(pdf_object.pages)
55 | $$;
56 |
57 | select count_no_of_pages_udf(BUILD_SCOPED_FILE_URL( @s3_to_snowflake.PUBLIC.Snow_stage , 'sample_pdf_1.pdf' ));
58 |
59 | list @s3_to_snowflake.PUBLIC.Snow_stage;
60 | SELECT "name",count_no_of_pages_udf(BUILD_SCOPED_FILE_URL( @s3_to_snowflake.PUBLIC.Snow_stage , file_name )) as pdf_page_count FROM
61 | (select "name",split_part("name",'/',-1) as file_name from TABLE(RESULT_SCAN(LAST_QUERY_ID())));
62 |
63 |
64 |
65 |
66 | --create the external stage with directory table enabled
67 | create or replace stage s3_to_snowflake.PUBLIC.Snow_stage_directory_table_yt url="{s3 location}"
68 | credentials=(aws_key_id='{AWS Access Key}'
69 | aws_secret_key='{AWS Secret Key}')
70 | Directory=(ENABLE=TRUE);
71 |
72 | list @s3_to_snowflake.PUBLIC.Snow_stage_directory_table_yt;
73 |
74 | select * from directory(@s3_to_snowflake.PUBLIC.Snow_stage_directory_table_yt);
75 |
76 | SELECT RELATIVE_PATH,count_no_of_pages_udf(BUILD_SCOPED_FILE_URL( @s3_to_snowflake.PUBLIC.Snow_stage_directory_table_yt , RELATIVE_PATH )) as pdf_page_count FROM directory(@s3_to_snowflake.PUBLIC.Snow_stage_directory_table_yt);
77 |
78 | alter stage s3_to_snowflake.PUBLIC.Snow_stage_directory_table_yt refresh;
79 |
80 | --create the external stage with directory table enabled & automatic refresh
81 | create or replace stage s3_to_snowflake.PUBLIC.Snow_stage_directory_table_yt_auto_refresh url="{s3 location}"
82 | credentials=(aws_key_id='{AWS Access Key}'
83 | aws_secret_key='{AWS Secret Key}')
84 | Directory=(ENABLE=TRUE AUTO_REFRESH=TRUE);
85 |
86 | desc stage s3_to_snowflake.PUBLIC.Snow_stage_directory_table_yt_auto_refresh;
87 |
88 |
89 | SELECT RELATIVE_PATH,count_no_of_pages_udf(BUILD_SCOPED_FILE_URL( @s3_to_snowflake.PUBLIC.Snow_stage_directory_table_yt_auto_refresh , RELATIVE_PATH )) as pdf_page_count FROM directory(@s3_to_snowflake.PUBLIC.Snow_stage_directory_table_yt_auto_refresh);
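For debugging the PyPDF2 call outside Snowflake, the UDF body can be exercised locally. A minimal sketch where the file path is just a placeholder on your machine:

# Local equivalent of count_no_of_pages_udf: count pages of a PDF with PyPDF2
import PyPDF2

with open('sample_pdf_1.pdf', 'rb') as f:   # placeholder local path
    pdf_object = PyPDF2.PdfReader(f)
    print(len(pdf_object.pages))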
--------------------------------------------------------------------------------
/Updated GenAI Notes.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SatadruMukherjee/Data-Preprocessing-Models/3aeeda4e68aafd47fe784a71c4aa02a34be76820/Updated GenAI Notes.pdf
--------------------------------------------------------------------------------
/_Manual Offset Commits & At Least Once Processing in Kafka Consumer.drawio.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SatadruMukherjee/Data-Preprocessing-Models/3aeeda4e68aafd47fe784a71c4aa02a34be76820/_Manual Offset Commits & At Least Once Processing in Kafka Consumer.drawio.png
--------------------------------------------------------------------------------
/airflow_emr_s3_snowflake_setup.txt:
--------------------------------------------------------------------------------
1 |
2 |
3 | sudo apt update
4 | sudo apt install -y python3-pip
5 | sudo apt install -y sqlite3
6 | sudo apt-get install -y libpq-dev
7 | pip3 install --upgrade awscli
8 | pip3 install boto3
9 | sudo pip3 install virtualenv
10 | virtualenv venv
11 | source venv/bin/activate
12 | pip install "apache-airflow[postgres]==2.5.0" --constraint "https://raw.githubusercontent.com/apache/airflow/constraints-2.5.0/constraints-3.7.txt"
13 | pip install pandas apache-airflow-providers-snowflake==2.1.0 snowflake-connector-python==2.5.1 snowflake-sqlalchemy==1.2.5
14 | airflow db init
15 | sudo apt-get install postgresql postgresql-contrib
16 | sudo -i -u postgres
17 | psql
18 | CREATE DATABASE airflow;
19 | CREATE USER airflow WITH PASSWORD 'airflow';
20 | GRANT ALL PRIVILEGES ON DATABASE airflow TO airflow;
21 | exit
22 | exit
23 | ls
24 | cd airflow
25 | sed -i 's#sqlite:////home/ubuntu/airflow/airflow.db#postgresql+psycopg2://airflow:airflow@localhost/airflow#g' airflow.cfg
26 | sed -i 's#SequentialExecutor#LocalExecutor#g' airflow.cfg
27 | airflow db init
28 | airflow users create -u airflow -f airflow -l airflow -r Admin -e airflow1@gmail.com
29 | User id --airflow
30 | password--admin@123!
31 | mkdir /home/ubuntu/dags
32 | cd airflow
33 | vi airflow.cfg
34 | change the below properties --
35 | dags_folder = /home/ubuntu/dags
36 | load_examples = False
37 |
38 |
39 |
40 |
41 | airflow db init
42 | airflow webserver
43 |
44 | source venv/bin/activate
45 | airflow scheduler
46 |
47 | Create external stage & tables in Snowflake
48 |
49 | Snowflake Queries:
50 | ------------------------
51 | drop database if exists s3_to_snowflake;
52 |
53 | use role accountadmin;
54 | --Database Creation
55 | create database if not exists s3_to_snowflake;
56 |
57 | --Specify the active/current database for the session.
58 | use s3_to_snowflake;
59 |
60 |
61 |
62 |
63 | create or replace stage s3_to_snowflake.PUBLIC.snow_simple url="s3://irisseta/output_folder/"
64 | credentials=(aws_key_id=''
65 | aws_secret_key='');
66 |
67 |
68 |
69 | list @s3_to_snowflake.PUBLIC.snow_simple;
70 |
71 | --File Format Creation
72 | create or replace file format my_parquet_format
73 | type = parquet;
74 |
75 |
76 |
77 | --Table Creation
78 | create or replace external table s3_to_snowflake.PUBLIC.Iris_dataset (CLASS_NAME varchar(20) as (Value:CLASS_NAME::varchar),
79 | Count_Value Number as (Value:count::Number)) with location = @s3_to_snowflake.PUBLIC.snow_simple
80 | file_format ='my_parquet_format';
81 |
82 |
83 | select * from s3_to_snowflake.PUBLIC.Iris_dataset;
84 |
85 |
86 |
87 | Create Snowflake Connection
88 |
89 |
90 |
91 |
92 |
93 |
94 |
95 |
96 |
97 |
98 |
99 |
100 |
101 |
102 |
103 |
104 |
--------------------------------------------------------------------------------
/airflow_install.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash -xe
2 | upload_log() {
3 | aws s3 cp /tmp/userdata_execution.log s3://demoytuserdataairflow/logs/
4 | }
5 |
6 | trap 'upload_log' EXIT
7 |
8 | sudo -u ubuntu -i <<'EOF'
9 |
10 | exec &>> /tmp/userdata_execution.log
11 |
12 |
13 | sudo apt update
14 | sudo apt -y install awscli
15 | sudo apt --yes install python3-pip
16 | sudo apt --yes install sqlite3
17 | sudo apt-get --yes install libpq-dev
18 | pip3 install --upgrade awscli
19 | sudo pip3 install virtualenv
20 | python3 -m virtualenv /home/ubuntu/venv
21 | source /home/ubuntu/venv/bin/activate
22 | pip install "apache-airflow[postgres]==2.5.0" --constraint "https://raw.githubusercontent.com/apache/airflow/constraints-2.5.0/constraints-3.7.txt" pandas boto3
23 | airflow db init
24 | sudo apt-get --yes install postgresql postgresql-contrib
25 | sudo -i -u postgres <<'EOpostgres'
26 | psql -U postgres -c "CREATE DATABASE airflow;"
27 | psql -U postgres -c "CREATE USER airflow WITH PASSWORD 'airflow';"
28 | psql -U postgres -c "GRANT ALL PRIVILEGES ON DATABASE airflow TO airflow;"
29 | EOpostgres
30 | sed -i 's#sqlite:////home/ubuntu/airflow/airflow.db#postgresql+psycopg2://airflow:airflow@localhost/airflow#g' /home/ubuntu/airflow/airflow.cfg
31 | sed -i 's#SequentialExecutor#LocalExecutor#g' /home/ubuntu/airflow/airflow.cfg
32 | airflow db init
33 | airflow users create -u airflow -f airflow -l airflow -r Admin -e airflow@gmail.com -p admin@123!
34 | mkdir /home/ubuntu/dags
35 | aws s3 cp s3://demoytuserdataairflow/dags/hello_world.py /home/ubuntu/dags
36 | sed -i 's/^dags_folder = .*/dags_folder = \/home\/ubuntu\/dags/' /home/ubuntu/airflow/airflow.cfg
37 | sed -i 's/^load_examples = .*/load_examples = False/' /home/ubuntu/airflow/airflow.cfg
38 | airflow db init
39 | EOF
--------------------------------------------------------------------------------
/airflow_news_data_pipeline.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import airflow
3 | from airflow import DAG
4 | from airflow.operators.python_operator import PythonOperator
5 | from airflow.contrib.operators.snowflake_operator import SnowflakeOperator
6 | from airflow.contrib.hooks.snowflake_hook import SnowflakeHook
7 | from airflow.operators.bash_operator import BashOperator
8 | from datetime import datetime, timedelta
9 | from news_fetcher_etl import runner
10 | logging.basicConfig(level=logging.INFO)
11 | logger = logging.getLogger(__name__)
12 |
13 |
14 | args = {"owner": "Airflow", "start_date": airflow.utils.dates.days_ago(2)}
15 |
16 | dag = DAG(
17 | dag_id="snowflake_automation_dag", default_args=args, schedule_interval=None
18 | )
19 |
20 |
21 |
22 |
23 | with dag:
24 |
25 | extract_news_info = PythonOperator(
26 | task_id='extract_news_info',
27 | python_callable=runner,
28 | dag=dag,
29 | )
30 |
31 | move_file_to_s3 = BashOperator(
32 | task_id="move_file_to_s3",
33 | bash_command='aws s3 mv {{ ti.xcom_pull("extract_news_info")}} s3://irisseta',
34 | )
35 |
36 | snowflake_create_table=SnowflakeOperator(
37 | task_id="snowflake_create_table",
38 | sql="""create table if not exists helloparquet using template(select ARRAY_AGG(OBJECT_CONSTRUCT(*)) from TABLE(INFER_SCHEMA (LOCATION=>'@ramu.PUBLIC.snow_simple',FILE_FORMAT=>'parquet_format')))
39 | """ ,
40 | snowflake_conn_id="snowflake_conn"
41 | )
42 |
43 |
44 | snowflake_copy=SnowflakeOperator(
45 | task_id="snowflake_copy",
46 | sql="""copy into ramu.PUBLIC.helloparquet from @ramu.PUBLIC.snow_simple MATCH_BY_COLUMN_NAME=CASE_INSENSITIVE FILE_FORMAT=parquet_format
47 | """ ,
48 | snowflake_conn_id="snowflake_conn"
49 | )
50 |
51 |
52 | extract_news_info >> move_file_to_s3 >> snowflake_create_table >> snowflake_copy
--------------------------------------------------------------------------------
/airflow_talend_runner.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import airflow
3 | from airflow import DAG
4 | from airflow.operators.python_operator import PythonOperator
5 | from airflow.operators.bash_operator import BashOperator
6 | from datetime import datetime, timedelta
7 | from datetime import datetime, timedelta
8 | from airflow.models import DAG
9 | from airflow.providers.amazon.aws.sensors.s3 import S3KeySensor
10 | import boto3
11 | logging.basicConfig(level=logging.INFO)
12 | logger = logging.getLogger(__name__)
13 |
14 |
15 | args = {"owner": "Airflow", "start_date": airflow.utils.dates.days_ago(2)}
16 |
17 | dag = DAG(
18 | dag_id="Hello_World", default_args=args, schedule_interval=None
19 | )
20 |
21 |
22 | s3_bucketname = 's3datacapturetestconnector'
23 | s3_loc = 'landing_directory/success.txt'
24 |
25 |
26 |
27 | with dag:
28 |
29 | s3_sensor = S3KeySensor(
30 | task_id='success_flg_check',
31 | bucket_name=s3_bucketname,
32 | bucket_key=s3_loc,
33 | aws_conn_id='aws_default',
34 | mode='poke',
35 | poke_interval=5,
36 | timeout=15,
37 | soft_fail=False
38 | )
39 |
40 | load_data_sf_table=BashOperator(
41 | task_id="move_file_from_s3_to_snowflake",
42 | bash_command='sh /home/ubuntu/dags/elt_runner/elt_runner_run.sh ',
43 | )
44 |
45 | s3_sensor >> load_data_sf_table
--------------------------------------------------------------------------------
/airflow_talend_success_file_snesor.bash:
--------------------------------------------------------------------------------
1 | #!/bin/bash -xe
2 | upload_log() {
3 | aws s3 cp /tmp/userdata_execution.log s3://s3datacapturetestconnector/logs/
4 | }
5 |
6 | trap 'upload_log' EXIT
7 |
8 | sudo -u ubuntu -i <<'EOF'
9 |
10 | exec &>> /tmp/userdata_execution.log
11 |
12 |
13 | sudo apt update
14 | sudo apt-get install -y openjdk-8-jdk
15 | sudo apt-get install -y unzip
16 | sudo apt -y install awscli
17 | sudo apt --yes install python3-pip
18 | sudo apt --yes install sqlite3
19 | sudo apt-get --yes install libpq-dev
20 | pip3 install --upgrade awscli
21 | sudo pip3 install virtualenv
22 | python3 -m virtualenv /home/ubuntu/venv
23 | source /home/ubuntu/venv/bin/activate
24 | pip install "apache-airflow[postgres]==2.5.0" --constraint "https://raw.githubusercontent.com/apache/airflow/constraints-2.5.0/constraints-3.7.txt" pandas boto3
25 | airflow db init
26 | sudo apt-get --yes install postgresql postgresql-contrib
27 | sudo -i -u postgres <<'EOpostgres'
28 | psql -U postgres -c "CREATE DATABASE airflow;"
29 | psql -U postgres -c "CREATE USER airflow WITH PASSWORD 'airflow';"
30 | psql -U postgres -c "GRANT ALL PRIVILEGES ON DATABASE airflow TO airflow;"
31 | EOpostgres
32 | sed -i 's#sqlite:////home/ubuntu/airflow/airflow.db#postgresql+psycopg2://airflow:airflow@localhost/airflow#g' /home/ubuntu/airflow/airflow.cfg
33 | sed -i 's#SequentialExecutor#LocalExecutor#g' /home/ubuntu/airflow/airflow.cfg
34 | airflow db init
35 | airflow users create -u airflow -f airflow -l airflow -r Admin -e airflow@gmail.com -p admin@123!
36 | mkdir /home/ubuntu/dags
37 | pip install apache-airflow-providers-amazon[apache.hive]
38 | aws s3 cp s3://s3datacapturetestconnector/codebase /home/ubuntu/dags --recursive
39 | unzip -d /home/ubuntu/dags /home/ubuntu/dags/elt_runner_0.1.zip
40 | sed -i 's/^dags_folder = .*/dags_folder = \/home\/ubuntu\/dags/' /home/ubuntu/airflow/airflow.cfg
41 | sed -i 's/^load_examples = .*/load_examples = False/' /home/ubuntu/airflow/airflow.cfg
42 | airflow db init
43 | EOF
--------------------------------------------------------------------------------
/algolia_layer1.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SatadruMukherjee/Data-Preprocessing-Models/3aeeda4e68aafd47fe784a71c4aa02a34be76820/algolia_layer1.zip
--------------------------------------------------------------------------------
/aws-eventbridge-kinesisfirehose-s3.drawio.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SatadruMukherjee/Data-Preprocessing-Models/3aeeda4e68aafd47fe784a71c4aa02a34be76820/aws-eventbridge-kinesisfirehose-s3.drawio.png
--------------------------------------------------------------------------------
/dbscan_visualization.txt:
--------------------------------------------------------------------------------
1 | clc
2 | clear all
3 | close all
4 | warning off
5 | load matlab.mat
6 | x=[x';x2';x3';t1'];
7 | y=[y';y2';y3';y1'];
8 | for i=1:length(x)
9 | plot(x(i),y(i),'.','MarkerSize',10);
10 | axis([-3 3 -1.5 3.5]);
11 | drawnow limitrate;
12 | hold on;
13 | end
14 | hold on;
15 | data=[x y];
16 | idx=dbscan(data,0.2,15);
17 | for i=1:length(x)
18 | if(idx(i)==1)
19 | plot(x(i),y(i),'r.','MarkerSize',20);
20 | elseif(idx(i)==2)
21 | plot(x(i),y(i),'g.','MarkerSize',20);
22 | elseif(idx(i)==3)
23 | plot(x(i),y(i),'b.','MarkerSize',20);
24 | elseif(idx(i)==4)
25 | plot(x(i),y(i),'m.','MarkerSize',20);
26 | else
27 | plot(x(i),y(i),'c.','MarkerSize',20);
28 | end
29 | drawnow limitrate;
30 | hold on;
31 | end
32 |
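A rough Python counterpart of the MATLAB clustering above, using scikit-learn's DBSCAN with the same eps=0.2 and min_samples=15; x and y are assumed to hold the smiling-face points produced as in generate_smiling_face_cluster.txt:

# Hedged Python sketch: cluster and colour the points with scikit-learn's DBSCAN
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import DBSCAN

data = np.column_stack((x, y))                      # x, y assumed as 1-D arrays of points
labels = DBSCAN(eps=0.2, min_samples=15).fit_predict(data)
plt.scatter(data[:, 0], data[:, 1], c=labels, s=20)
plt.axis([-3, 3, -1.5, 3.5])
plt.show()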
--------------------------------------------------------------------------------
/generate_smiling_face_cluster.txt:
--------------------------------------------------------------------------------
1 | clc
2 | clear all
3 | close all
4 | warning off
5 | t=0:0.01:2*pi;
6 | x=2*cos(t)+0.08*randn(length(t),1)';
7 | y=1+2*sin(t)+0.08*randn(length(t),1)';
8 | scatter(x,y);
9 | axis square
10 | hold on
11 | t=0:0.08:2*pi;
12 | x2=-1+0.1*cos(t)+0.08*randn(length(t),1)';
13 | y2=2+0.1*sin(t)+0.08*randn(length(t),1)';
14 | scatter(x2,y2,'k');
15 | hold on;
16 | x3=1+0.1*cos(t)+0.08*randn(length(t),1)';
17 | y3=2+0.1*sin(t)+0.08*randn(length(t),1)';
18 | scatter(x3,y3,'k')
19 | t1=-0.6:0.01:0.6;
20 | y1=t1.^2+0.08*rand(length(t1),1)';
21 | scatter(t1,y1)
--------------------------------------------------------------------------------
/generic_job_s3_to_snowflake_using_copy_command.py:
--------------------------------------------------------------------------------
1 | """
2 | pip install pandas
3 | pip install -r https://raw.githubusercontent.com/snowflakedb/snowflake-connector-python/v2.3.10/tested_requirements/requirements_38.reqs -t .
4 | pip install snowflake-connector-python==2.3.10 -t .
5 | """
6 | import pandas as pd
7 | import snowflake.connector as sf
8 |
9 | user=""
10 | password="@"
11 | account=""
12 | conn=sf.connect(user=user,password=password,account=account)
13 |
14 |
15 | def run_query(query):
16 | print("Executing the query : {}".format(query))
17 | cursor = conn.cursor()
18 | cursor.execute(query)
19 | cursor.close()
20 |
21 |
22 | def run_query_single_value_return(query):
23 | print("Executing the query : {}".format(query))
24 | cursor = conn.cursor()
25 | cursor.execute(query)
26 | records = cursor.fetchone()[0]
27 | cursor.close()
28 | return records
29 |
30 |
31 | def run_copy_command(query):
32 | print("Executing the query : {}".format(query))
33 | cursor = conn.cursor()
34 | cursor.execute(query)
35 | query_id=cursor.sfqid
36 | cursor.close()
37 | return query_id
38 |
39 |
40 | def execute_copy_cmd():
41 | print("Reading the metadata file...")
42 |
43 | df=pd.read_csv('{}')
44 |
45 | for index,row in df.iterrows():
46 | database=row['DATABASE']
47 | schema=row['SCHEMA']
48 | table=row['TABLE_NAME']
49 | external_stage_object=row['STAGE_OBJECT']
50 | s3_file_path=row['S3_FILE_PATH_TO_BE_APPENDED_WITH_STAGE_OBJECT']
51 | warehouse=row['WAREHOUSE']
52 | snowflake_role=row['SNOWFLAKE_ROLE']
53 | file_format=row['FILE_FORMAT']
54 | pattern=row['PATTERN']
55 |
56 |
57 | #set up the env of execution
58 | statement1 = 'use warehouse ' + warehouse
59 | statement2 = 'alter warehouse ' + warehouse + " resume IF SUSPENDED"
60 | statement3 = "use database " + database
61 | statement4 = "use role " + snowflake_role
62 | statement5 = "use schema " + schema
63 | run_query(statement1)
64 | run_query(statement2)
65 | run_query(statement3)
66 | run_query(statement4)
67 | run_query(statement5)
68 |
69 | #executing the copy command
70 | copy_command="""copy into {}.{}.{} from @{}/{}/ FILE_FORMAT={} PATTERN='{}' ON_ERROR=CONTINUE""".format(database,schema,table,external_stage_object,s3_file_path,file_format,pattern)
71 | query_id_of_the_copy_command=run_copy_command(copy_command)
72 |
73 | #check whether copy command picked up any file or not
74 | detecting_copy_command_picked_up_file_or_not="""SELECT "status" FROM TABLE(RESULT_SCAN('{}')) limit 1;""".format(query_id_of_the_copy_command)
75 | first_value_of_status_in_copy_command_output= run_query_single_value_return(detecting_copy_command_picked_up_file_or_not)
76 | print("First value of result-set of the above copy command execution : {}".format(first_value_of_status_in_copy_command_output))
77 | count_no_of_rows_inserted_due_to_copy_command=0
78 |
79 | if(first_value_of_status_in_copy_command_output!='Copy executed with 0 files processed.'):
80 | #rows inserted by copy command
81 | command_to_get_no_of_rows_inserted_due_to_copy_command="""SELECT sum("rows_loaded") FROM TABLE(RESULT_SCAN('{}'));""".format(query_id_of_the_copy_command)
82 | count_no_of_rows_inserted_due_to_copy_command=run_query_single_value_return(command_to_get_no_of_rows_inserted_due_to_copy_command)
83 | print("No. of rows inserted due to this copy command execution : {}".format(count_no_of_rows_inserted_due_to_copy_command))
84 |
85 | #Capture the rejected records
86 | rejects_collector = """insert into {}.{}.copy_cmd_rejects select '{}' QUERY_ID,'{}' TABLE_NAME, CURRENT_TIMESTAMP(),A.* from table(validate({}.{}.{},job_id=>'{}')) A""".format(database,schema,query_id_of_the_copy_command,table,database,schema,table,query_id_of_the_copy_command)
87 | run_query(rejects_collector)
88 |
89 |
90 | #get total number of rejected records
91 | rejected_records="select count(distinct ROW_NUMBER) from {}.{}.copy_cmd_rejects where QUERY_ID='{}'".format(database,schema,query_id_of_the_copy_command)
92 | count_of_rejected_records=run_query_single_value_return(rejected_records)
93 |
94 | #audit the records
95 | audit_copy="""insert into {}.{}.COPY_AUDIT select QUERY_ID,QUERY_TEXT,DATABASE_NAME,'{}' ROWS_INSERTED,'{}'
96 | ROWS_REJECTED,SCHEMA_NAME,ROLE_NAME,WAREHOUSE_NAME,WAREHOUSE_SIZE,EXECUTION_STATUS,ERROR_MESSAGE,EXECUTION_TIME ,current_timestamp() ETL_TS
97 | FROM table(information_schema.query_history()) where query_type='COPY' AND QUERY_ID='{}' """.format(database,schema,count_no_of_rows_inserted_due_to_copy_command,
98 | count_of_rejected_records,query_id_of_the_copy_command)
99 | run_query(audit_copy)
100 |
101 |
102 | execute_copy_cmd()
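
Note on the metadata file: the pd.read_csv('{}') call above expects a metadata CSV whose path has been left as a placeholder. A minimal sketch of producing such a file; only the column names come from the loop above, every value is an illustrative assumption:

import pandas as pd

# Every value below is a placeholder; only the column names are taken from the loop above.
metadata = pd.DataFrame([{
    "DATABASE": "MY_DB",
    "SCHEMA": "PUBLIC",
    "TABLE_NAME": "IRIS_DATA",
    "STAGE_OBJECT": "MY_EXTERNAL_STAGE",
    "S3_FILE_PATH_TO_BE_APPENDED_WITH_STAGE_OBJECT": "input_folder",
    "WAREHOUSE": "COMPUTE_WH",
    "SNOWFLAKE_ROLE": "SYSADMIN",
    "FILE_FORMAT": "MY_CSV_FORMAT",
    "PATTERN": ".*[.]csv",
}])
metadata.to_csv("copy_command_metadata.csv", index=False)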
--------------------------------------------------------------------------------
/incremental_etl.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SatadruMukherjee/Data-Preprocessing-Models/3aeeda4e68aafd47fe784a71c4aa02a34be76820/incremental_etl.zip
--------------------------------------------------------------------------------
/ingest.sh:
--------------------------------------------------------------------------------
1 | wget -O - https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data | aws s3 cp - s3://irisseta/input_folder/hello_world.csv
--------------------------------------------------------------------------------
/iris_partitioned_Data.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SatadruMukherjee/Data-Preprocessing-Models/3aeeda4e68aafd47fe784a71c4aa02a34be76820/iris_partitioned_Data.zip
--------------------------------------------------------------------------------
/kafka snowflake integration.txt:
--------------------------------------------------------------------------------
1 | Kafka Snowflake Integration:
2 | --------------------------------------------------------
3 | Download the required jar file -- https://mvnrepository.com/artifact/com.snowflake/snowflake-kafka-connector/1.5.0
4 |
5 | Put this jar in the Kafka libs folder.
6 |
7 | Update plugin.path in Kafka's connect-standalone.properties file.
8 |
9 | Create Private & Public key-pair:
10 | --------------------------------------------------------------
11 | openssl genrsa -out rsa_key.pem 2048
12 | openssl rsa -in rsa_key.pem -pubout -out rsa_key.pub
13 |
14 |
15 | Configure the public key in Snowflake:
16 | ----------------------------------------------------------------
17 |
18 | alter user {User_name} set rsa_public_key='{Put the Public key content here}';
19 |
20 | Verify that the public key is configured properly --
21 | desc user {User_name};
22 |
23 |
24 |
25 | Create an SF_connect.properties file with the below properties in the config folder --
26 |
27 | connector.class=com.snowflake.kafka.connector.SnowflakeSinkConnector
28 | tasks.max=8
29 | topics={topic_name}
30 | snowflake.topic2table.map={topic_name}:{snowflake_table_name}
31 | buffer.count.records=10000
32 | buffer.flush.time=60
33 | buffer.size.bytes=5000000
34 | snowflake.url.name={Snowflake URL}
35 | snowflake.user.name={Snowflake User Name}
36 | snowflake.private.key={Put the Private key content here}
37 | snowflake.database.name={Snowflake Database Name}
38 | snowflake.schema.name={Snowflake Schema Name}
39 | key.converter=com.snowflake.kafka.connector.records.SnowflakeJsonConverter
40 | value.converter=com.snowflake.kafka.connector.records.SnowflakeJsonConverter
41 | name={}
42 |
43 | Create the topic if it does not already exist & run the Python code to ingest data into the topic (see the producer sketch at the end of this note).
44 |
45 |
46 | Start the Kafka Connector:
47 | ---------------------------------------------------------
48 | F:/kafka_2.12-3.2.0/bin/windows/connect-standalone.bat F:/kafka_2.12-3.2.0/config/connect-standalone.properties F:/kafka_2.12-3.2.0/config/SF_connect.properties
49 |
50 | To unset the Public Key in Snowflake:
51 | ----------------------------------------------------------------------
52 | alter user {User_name} unset rsa_public_key;
53 |
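
The step above that says to run "the Python code" to ingest data into the topic is not included in this note. A minimal producer sketch, assuming kafka-python is installed, a broker on localhost:9092, and a topic name that matches the topics property configured above (both names are assumptions here):

from json import dumps
from kafka import KafkaProducer

topic_name = 'snowflake_sink_topic'  # assumption: must match the topics property in SF_connect.properties
producer = KafkaProducer(bootstrap_servers=['localhost:9092'],
                         value_serializer=lambda x: dumps(x).encode('utf-8'))
for i in range(10):
    # JSON payloads end up in the target table's RECORD_CONTENT column via the sink connector
    producer.send(topic_name, value={'number': i})
producer.flush()
producer.close()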
--------------------------------------------------------------------------------
/kafka_producer_with_topic_partitioning.py:
--------------------------------------------------------------------------------
1 | from time import sleep
2 | from json import dumps
3 | from kafka import KafkaProducer
4 |
5 |
6 | #Lab 1: Write message to a partition (mentioning the partition number while publishing the message)
7 |
8 | topic_name='hello_world1'
9 | producer = KafkaProducer(bootstrap_servers=['localhost:9092'],value_serializer=lambda x: dumps(x).encode('utf-8'))
10 | data1 = {'number' : 1}
11 | data2 = {'number' : 2}
12 | data3 = {'number' : 3}
13 | data4 = {'number' : 4}
14 | data5 = {'number' : 5}
15 | data6 = {'number' : 6}
16 | producer.send(topic_name, value=data1,partition=1)
17 | producer.send(topic_name, value=data2,partition=1)
18 | producer.send(topic_name, value=data3,partition=1)
19 | producer.send(topic_name, value=data4,partition=2)
20 | producer.send(topic_name, value=data5,partition=2)
21 | producer.send(topic_name, value=data6,partition=0)
22 | producer.close()
23 |
24 | #Lab 2: Pass key value pair
25 | from json import dumps
26 | from kafka import KafkaProducer
27 | producer = KafkaProducer(bootstrap_servers=['localhost:9092'])
28 | topic_name='hello_world2'
29 | producer.send(topic_name, key=b'foo', value=b'bar') # Note: key & value are already passed as bytes while publishing the message,
30 | # so no key or value serializer is configured explicitly
31 | producer.send(topic_name, key=b'foo', value=b'bar')
32 | producer.close()
33 |
34 | #Lab 3: Pass key-value pairs with the key & value serializers explicitly configured
35 | from json import dumps
36 | from kafka import KafkaProducer
37 | producer = KafkaProducer(bootstrap_servers=['localhost:9092'],key_serializer=str.encode,value_serializer=lambda x: dumps(x).encode('utf-8'))
38 | topic_name='hello_world3'
39 | data1 = {'number' : 1}
40 | data2 = {'number' : 2}
41 | data3 = {'number' : 3}
42 | data4 = {'number' : 4}
43 | data5 = {'number' : 5}
44 | data6 = {'number' : 6}
45 | producer.send(topic_name, key='ping',value=data1)
46 | producer.send(topic_name, key='ping',value=data2)
47 | producer.send(topic_name, key='ping',value=data3)
48 | producer.send(topic_name, key='pong',value=data4)
49 | producer.send(topic_name, key='pong',value=data5)
50 | producer.send(topic_name, key='pong',value=data6)
51 | producer.close()
52 |
53 |
54 |
55 | #Lab 4: Customize a partitioner
56 | from time import sleep
57 | from json import dumps
58 | from kafka import KafkaProducer
59 |
60 |
61 | def custom_partitioner(key, all_partitions, available):
62 | """
63 | Custom Kafka partitioner to get the partition corresponding to the key
64 | :param key: partitioning key
65 | :param all_partitions: list of all partitions sorted by partition ID
66 | :param available: list of available partitions in no particular order
67 | :return: one of the values from all_partitions or available
68 | """
69 | print("The key is : {}".format(key))
70 | print("All partitions : {}".format(all_partitions))
71 | print("After decoding of the key : {}".format(key.decode('UTF-8')))
72 | return int(key.decode('UTF-8'))%len(all_partitions)
73 |
74 |
75 | producer = KafkaProducer(bootstrap_servers=['localhost:9092'],partitioner=custom_partitioner)
76 | topic_name='hello_world4'
77 | producer.send(topic_name, key=b'3', value=b'Hello Partitioner')
78 | producer.send(topic_name, key=b'2', value=b'Hello Partitioner123')
79 | producer.send(topic_name, key=b'369', value=b'Hello Partitioner')
80 | producer.send(topic_name, key=b'301', value=b'Hello Partitioner')
81 | producer.flush()  # ensure buffered messages are delivered before the script exits
82 |
83 |
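
For Lab 4, the custom partitioner routes each message to int(key) % number_of_partitions. A small consumer sketch (not part of the original labs; it assumes the same local broker and that hello_world4 already exists) to verify which partition each message actually landed in:

from kafka import KafkaConsumer

consumer = KafkaConsumer('hello_world4',
                         bootstrap_servers=['localhost:9092'],
                         auto_offset_reset='earliest',
                         consumer_timeout_ms=10000)  # stop polling after 10s of silence
for message in consumer:
    print("key={} partition={} value={}".format(message.key, message.partition, message.value))
consumer.close()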
--------------------------------------------------------------------------------
/kafka_yt_demo.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SatadruMukherjee/Data-Preprocessing-Models/3aeeda4e68aafd47fe784a71c4aa02a34be76820/kafka_yt_demo.zip
--------------------------------------------------------------------------------
/key monitor.txt:
--------------------------------------------------------------------------------
1 | import json
2 | import boto3
3 | import datetime
4 | from datetime import datetime,timedelta
5 | from dateutil import tz
6 |
7 | def lambda_handler(event, context):
8 | client=boto3.client('iam')
9 | time_utc=datetime.now(tz.tzutc())
10 | response=client.list_users();
11 | print("List of users : ",response)
12 | for user in response['Users']:
13 | print("Access keys for user : {}".format(user['UserName']))
14 | access_key_info=client.list_access_keys(UserName=user['UserName'])
15 | print(access_key_info)
16 | access_key_metadata=access_key_info['AccessKeyMetadata']
17 | for key in access_key_metadata:
18 | if(key['CreateDate']<(time_utc-timedelta(days=90))):
19 | print("Alert!!")
20 |
--------------------------------------------------------------------------------
/lambda_powertools.py:
--------------------------------------------------------------------------------
1 | import json
2 | from aws_lambda_powertools.event_handler.exceptions import NotFoundError
3 | from aws_lambda_powertools.event_handler import (
4 | APIGatewayRestResolver,
5 | Response,
6 | content_types,
7 | )
8 | app = APIGatewayRestResolver()
9 |
10 | @app.not_found
11 | def handle_not_found_errors(exc: NotFoundError) -> Response:
12 | return Response(status_code=418, content_type=content_types.TEXT_PLAIN, body="No such resource path found")
13 |
14 | @app.get("/v1")
15 | def v1_call():
16 | print("Inside v1")
17 | return {"path":1}
18 |
19 | @app.get("/v2")
20 | def v2_call():
21 | print("Inside v2")
22 | return {"path":2}
23 |
24 | @app.get("/v4/")
25 | def v4_call(animal_name: str):
26 | print("Inside v4")
27 | return {"value":animal_name}
28 |
29 |
30 | @app.post("/v3")
31 | def v3_call():
32 | print("Inside v3 post endpoint")
33 | todo_data: dict = app.current_event.json_body
34 | return f"I love my {todo_data['country']}"
35 |
36 | def lambda_handler(event, context):
37 | # TODO implement
38 | print("Input Event: ",event)
39 | return app.resolve(event, context)
40 |
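
A minimal local-invocation sketch for the resolver above, assuming aws-lambda-powertools is installed and the file is importable as lambda_powertools; the event carries only the REST-proxy fields needed for routing, which is an assumption (real API Gateway events contain many more fields):

from lambda_powertools import lambda_handler

event = {
    "httpMethod": "GET",
    "path": "/v1",
    "headers": {},
    "queryStringParameters": None,
    "body": None,
    "isBase64Encoded": False,
}
# Expect a 200 response whose body wraps {"path": 1}
print(lambda_handler(event, None))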
--------------------------------------------------------------------------------
/mysql_cdc_fetcher_runner.py:
--------------------------------------------------------------------------------
1 | import json
2 | import boto3
3 |
4 | from pymysqlreplication import BinLogStreamReader
5 | from pymysqlreplication.row_event import (
6 | DeleteRowsEvent,
7 | UpdateRowsEvent,
8 | WriteRowsEvent,
9 | )
10 |
11 | def main():
12 | kinesis = boto3.client("kinesis",region_name='{}')
13 | stream = BinLogStreamReader(
14 | connection_settings= {
15 | "host": "{}",
16 | "port":{} ,
17 | "user": "{}",
18 | "passwd": "{}"},
19 | server_id=100,
20 | blocking=True,
21 | resume_stream=True,
22 | only_events=[DeleteRowsEvent, WriteRowsEvent, UpdateRowsEvent])
23 | for binlogevent in stream:
24 | for row in binlogevent.rows:
25 | event = {"schema": binlogevent.schema,
26 | "table": binlogevent.table,
27 | "type": type(binlogevent).__name__,
28 | "row": row
29 | }
30 | kinesis.put_record(StreamName="{}", Data=str(event), PartitionKey="1")
31 | print(json.dumps(event))
32 |
33 | main()
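
The script above ships each CDC event to Kinesis as str(event), i.e. a Python repr rather than JSON. A hedged alternative (an editorial sketch, not the original design) is to serialize with json.dumps and default=str so that datetime/Decimal values in binlog row images do not break serialization:

import json

def serialize_cdc_event(event: dict) -> str:
    # default=str stringifies datetime/Decimal values that appear in binlog row images
    return json.dumps(event, default=str)

# Hypothetical usage inside the loop above:
#   kinesis.put_record(StreamName="{}", Data=serialize_cdc_event(event), PartitionKey="1")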
--------------------------------------------------------------------------------
/news_fetcher.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import json
3 | import requests
4 | import os
5 | from base64 import b64decode
6 | import datetime
7 | from datetime import date
8 | import uuid
9 | import os
10 |
11 |
12 | def runner():
13 | today = (date.today())
14 | api_key = ''
15 |
16 | base_url = "https://newsapi.org/v2/everything?q={}&from={}&to={}&sortBy=popularity&apiKey={}&language=en"
17 | print(base_url)
18 | start_date_value = str(today - datetime.timedelta(days=1))
19 | end_date_value = str(today)
20 |
21 | df = pd.DataFrame(columns=['newsTitle', 'timestamp', 'url_source', 'content', 'source', 'author', 'urlToImage'])
22 |
23 | url_extractor = base_url.format('India', start_date_value, end_date_value, api_key)
24 | print(url_extractor)
25 | response = requests.get(url_extractor)
26 | d = response.json()
27 |
28 | for i in d['articles']:
29 | newsTitle = i['title']
30 | timestamp = i['publishedAt']
31 | trimmed_part = "None"
32 | url_source = i['url']
33 | source = i['source']
34 | author = i['author']
35 | urlToImage = i['urlToImage']
36 | partial_content = ""
37 | if (str(i['content']) != 'None'):
38 | partial_content = i['content']
39 | if (len(partial_content) >= 200):
40 | partial_content = partial_content[0:199]
41 | if ('.' in partial_content):
42 | trimmed_part = partial_content[:partial_content.rindex('.')]
43 | else:
44 | trimmed_part = partial_content
45 | df = pd.concat([df, pd.DataFrame(
46 | {'newsTitle': newsTitle, 'timestamp': timestamp, 'url_source': url_source, 'content': trimmed_part,
47 | 'source': source, 'author': author, 'urlToImage': urlToImage})], ignore_index=True)
48 |
49 |
50 | output_file = "/home/ubuntu/news_data.parquet"
51 | df1 = df.drop_duplicates()
52 | df1.to_parquet(output_file)
53 |
54 | runner()
--------------------------------------------------------------------------------
/news_fetcher_etl.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import json
3 | import requests
4 | import os
5 | from base64 import b64decode
6 | import datetime
7 | from datetime import date
8 | import uuid
9 | import os
10 |
11 |
12 | def runner():
13 | today = (date.today())
14 | api_key = '{}'
15 |
16 | base_url = "https://newsapi.org/v2/everything?q={}&from={}&to={}&sortBy=popularity&apiKey={}&language=en"
17 | print(base_url)
18 | start_date_value = str(today - datetime.timedelta(days=1))
19 | end_date_value = str(today)
20 |
21 | df = pd.DataFrame(columns=['newsTitle', 'timestamp', 'url_source', 'content', 'source', 'author', 'urlToImage'])
22 |
23 | url_extractor = base_url.format('Covid', start_date_value, end_date_value, api_key)
24 | print(url_extractor)
25 | response = requests.get(url_extractor)
26 | d = response.json()
27 |
28 | for i in d['articles']:
29 | newsTitle = i['title']
30 | timestamp = i['publishedAt']
31 | trimmed_part = "None"
32 | url_source = i['url']
33 | source = i['source']
34 | author = i['author']
35 | urlToImage = i['urlToImage']
36 | partial_content = ""
37 | if (str(i['content']) != 'None'):
38 | partial_content = i['content']
39 | if (len(partial_content) >= 200):
40 | partial_content = partial_content[0:199]
41 | if ('.' in partial_content):
42 | trimmed_part = partial_content[:partial_content.rindex('.')]
43 | else:
44 | trimmed_part = partial_content
45 | df = pd.concat([df, pd.DataFrame(
46 | {'newsTitle': newsTitle, 'timestamp': timestamp, 'url_source': url_source, 'content': trimmed_part,
47 | 'source': source, 'author': author, 'urlToImage': urlToImage})], ignore_index=True)
48 |
49 | filename = str(uuid.uuid4())
50 | output_file = "/home/ubuntu/{}.parquet".format(filename)
51 | df1 = df.drop_duplicates()
52 | df1.to_parquet(output_file)
53 | return output_file
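
runner() above only writes the parquet file locally and returns its path. A minimal Airflow DAG sketch showing one way it could be wired up (entirely an assumption; the bucket name, key prefix, and schedule are placeholders):

from datetime import datetime
import boto3
from airflow import DAG
from airflow.operators.python import PythonOperator
from news_fetcher_etl import runner

def fetch_and_upload():
    output_file = runner()  # writes /home/ubuntu/<uuid>.parquet and returns the path
    s3 = boto3.client("s3")
    s3.upload_file(output_file, "my-news-bucket", "news/" + output_file.split("/")[-1])  # placeholder bucket

with DAG(dag_id="news_fetcher_etl_dag",
         start_date=datetime(2024, 1, 1),
         schedule_interval="@daily",
         catchup=False) as dag:
    PythonOperator(task_id="fetch_and_upload_news", python_callable=fetch_and_upload)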
--------------------------------------------------------------------------------
/otp system.drawio.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SatadruMukherjee/Data-Preprocessing-Models/3aeeda4e68aafd47fe784a71c4aa02a34be76820/otp system.drawio.png
--------------------------------------------------------------------------------
/scd type 1 Snowflake.txt:
--------------------------------------------------------------------------------
1 | create or replace database ramu;
2 | use ramu;
3 |
4 | create or replace sequence seq_01 start = 1 increment = 1;
5 |
6 | create or replace table source_table( emp_no int,emp_name text,salary int, hra int );
7 |
8 |
9 |
10 | INSERT INTO source_table VALUES (100, 'A' ,2000, 100),
11 | (101, 'B' ,5000, 300),
12 | (102, 'C' ,6000, 400),
13 | (103, 'D' ,500, 50),
14 | (104, 'E' ,15000, 3000),
15 | (105, 'F' ,150000, 20050);
16 |
17 | select * from source_table;
18 |
19 |
20 | create or replace table target_table(surrogate_key int default seq_01.nextval,emp_no int,emp_name text,salary int,
21 | hra int);
22 |
23 |
24 | select * from target_table;
25 |
26 |
27 | INSERT INTO PUBLIC.target_table(emp_no, emp_name, salary, hra)
28 | SELECT t.emp_no, t.emp_name, t.salary, t.hra FROM PUBLIC.source_table t
29 | LEFT JOIN PUBLIC.target_table d ON d.emp_no = t.emp_no WHERE ( d.emp_no IS NULL);
30 |
31 |
32 | select * from target_table;
33 |
34 |
35 |
36 | update source_table set salary=5690 where emp_name='A';
37 |
38 | select * from source_table;
39 | select * from target_table;
40 |
41 |
42 | UPDATE PUBLIC.target_table d SET emp_name = t.emp_name, salary = t.salary, hra = t.hra
43 | FROM PUBLIC.source_table t WHERE d.emp_no = t.emp_no AND (( d.emp_name <> t.emp_name) OR ( d.salary <> t.salary) OR ( d.hra <> t.hra));
44 |
45 |
46 | select * from target_table;
47 |
48 | update source_table set salary=6000 where emp_name='B';
49 | update source_table set HRA=3000 where emp_name='B';
50 | INSERT INTO source_table VALUES (1001, 'MG' ,2000, 100);
51 |
52 | select * from source_table;
53 | select * from target_table;
54 |
55 |
56 | INSERT INTO PUBLIC.target_table(emp_no, emp_name, salary, hra)
57 | SELECT t.emp_no, t.emp_name, t.salary, t.hra FROM PUBLIC.source_table t
58 | LEFT JOIN PUBLIC.target_table d ON d.emp_no = t.emp_no WHERE ( d.emp_no IS NULL);
59 | UPDATE PUBLIC.target_table d SET emp_name = t.emp_name, salary = t.salary, hra = t.hra
60 | FROM PUBLIC.source_table t WHERE d.emp_no = t.emp_no AND (( d.emp_name <> t.emp_name) OR ( d.salary <> t.salary) OR ( d.hra <> t.hra));
61 |
62 | select * from source_table;
63 | select * from target_table;
--------------------------------------------------------------------------------
/scd_type_2_snowflake_queries.sql:
--------------------------------------------------------------------------------
1 | create or replace table source_table( emp_no int,emp_name text,salary int, hra int );
2 |
3 |
4 |
5 | INSERT INTO source_table VALUES (100, 'A' ,2000, 100),
6 | (101, 'B' ,5000, 300),
7 | (102, 'C' ,6000, 400),
8 | (103, 'D' ,500, 50),
9 | (104, 'E' ,15000, 3000),
10 | (105, 'F' ,150000, 20050);
11 |
12 |
13 | create or replace sequence seq_01 start = 1 increment = 1;
14 |
15 |
16 | create or replace table target_table( surrogate_key int default seq_01.nextval,emp_no int,emp_name text,salary int,
17 | hra int,start_date string default current_timestamp()::string ,end_date string,activeflag text default 'Y' );
18 |
19 | SELECT * FROM source_table;
20 |
21 | select * from target_table;
22 |
23 |
24 | select * from updated_emp_id;
25 |
26 |
27 |
28 |
29 |
30 | INSERT INTO source_table VALUES (110, 'AB' ,5600, 180);
31 | INSERT INTO source_table VALUES (115, 'CD' ,5670, 185);
32 |
33 |
34 | update source_table set salary=5690 where emp_name='A';
35 | update source_table set HRA=645 where emp_name='CD';
36 |
37 | delete from source_table where emp_name='B';
38 |
39 |
40 | INSERT INTO source_table VALUES (1010, 'B' ,5600, 180);
41 | update source_table set salary=7000 where emp_name='A';
42 | delete from source_table where emp_name='C';
43 |
44 | select * from target_table;
45 |
46 |
47 |
--------------------------------------------------------------------------------
/snowflake cortex fine tuning.txt:
--------------------------------------------------------------------------------
1 | --Lab section:
2 | DROP DATABASE IF EXISTS snowflake_llm_poc;
3 | CREATE Database snowflake_llm_poc;
4 | use snowflake_llm_poc;
5 |
6 | CREATE or REPLACE file format csvformat
7 | SKIP_HEADER = 1
8 | FIELD_OPTIONALLY_ENCLOSED_BY = '"'
9 | type = 'CSV';
10 |
11 | CREATE or REPLACE stage support_tickets_data_stage
12 | file_format = csvformat
13 | url = 's3://sfquickstarts/finetuning_llm_using_snowflake_cortex_ai/';
14 |
15 | CREATE or REPLACE TABLE SUPPORT_TICKETS (
16 | ticket_id VARCHAR(60),
17 | customer_name VARCHAR(60),
18 | customer_email VARCHAR(60),
19 | service_type VARCHAR(60),
20 | request VARCHAR,
21 | contact_preference VARCHAR(60)
22 | );
23 |
24 | COPY into SUPPORT_TICKETS
25 | from @support_tickets_data_stage;
26 |
27 | select * from SUPPORT_TICKETS;
28 |
29 | --with mistral-large
30 | select *,snowflake.cortex.complete('mistral-large',concat('You are an agent that helps organize requests that come to our support team.
31 |
32 | The request category is the reason why the customer reached out. These are the possible types of request categories:
33 |
34 | Roaming fees
35 | Slow data speed
36 | Lost phone
37 | Add new line
38 | Closing account
39 |
40 | Try doing it for this request and return only the request category only.
41 | ',REQUEST,'')) as classification_result from SUPPORT_TICKETS;
42 |
43 |
44 |
45 | --with mistral-7b
46 | select *,snowflake.cortex.complete('mistral-7b',concat('You are an agent that helps organize requests that come to our support team.
47 |
48 | The request category is the reason why the customer reached out. These are the possible types of request categories:
49 |
50 | Roaming fees
51 | Slow data speed
52 | Lost phone
53 | Add new line
54 | Closing account
55 |
56 | Try doing it for this request and return only the request category only.
57 | ',REQUEST,'')) as classification_result from SUPPORT_TICKETS;
58 |
59 |
60 | --step 1 : create the data format
61 | Create or replace table snowflake_llm_poc.public.annotated_data_for_finetuning as
62 | (select *,concat('You are an agent that helps organize requests that come to our support team.
63 |
64 | The request category is the reason why the customer reached out. These are the possible types of request categories:
65 |
66 | Roaming fees
67 | Slow data speed
68 | Lost phone
69 | Add new line
70 | Closing account
71 |
72 | Try doing it for this request and return only the request category only.
73 | ',REQUEST,'') as prompt,snowflake.cortex.complete('mistral-large',concat('You are an agent that helps organize requests that come to our support team.
74 |
75 | The request category is the reason why the customer reached out. These are the possible types of request categories:
76 |
77 | Roaming fees
78 | Slow data speed
79 | Lost phone
80 | Add new line
81 | Closing account
82 |
83 | Try doing it for this request and return only the request category only.
84 | ',REQUEST,'')) as classification_result from SUPPORT_TICKETS);
85 |
86 | select * from snowflake_llm_poc.public.annotated_data_for_finetuning;
87 |
88 | --splitting into training & test dataset
89 | create or replace table snowflake_llm_poc.public.trainig_data as select * from snowflake_llm_poc.public.annotated_data_for_finetuning sample(80);
90 |
91 | select * from snowflake_llm_poc.public.trainig_data;
92 |
93 | create or replace table snowflake_llm_poc.public.validation_data as select * from snowflake_llm_poc.public.annotated_data_for_finetuning minus
94 | select * from snowflake_llm_poc.public.trainig_data;
95 |
96 | select * from snowflake_llm_poc.public.validation_data;
97 |
98 | select * from snowflake_llm_poc.public.trainig_data
99 | intersect
100 | select * from snowflake_llm_poc.public.validation_data;
101 |
102 | --fine-tuning
103 | select snowflake.cortex.finetune(
104 | 'CREATE',
105 | 'snowflake_llm_poc.PUBLIC.SUPPORT_TICKETS_FINETUNED_MISTRAL_7B', 'mistral-7b',
106 | 'SELECT prompt, CLASSIFICATION_RESULT as completion from snowflake_llm_poc.PUBLIC.trainig_data',
107 | 'SELECT prompt, CLASSIFICATION_RESULT as completion from snowflake_llm_poc.PUBLIC.validation_data'
108 | );
109 |
110 | --check the fine-tune job completed or not
111 | select SNOWFLAKE.CORTEX.FINETUNE(
112 | 'DESCRIBE',
113 | 'CortexFineTuningWorkflow_398c6ef0-afcf-4934-913c-546285e53ec7'
114 | );
115 |
116 | --Inferencing the fine-tuned model
117 | select *,snowflake.cortex.complete('snowflake_llm_poc.PUBLIC.SUPPORT_TICKETS_FINETUNED_MISTRAL_7B',concat('You are an agent that helps organize requests that come to our support team.
118 |
119 | The request category is the reason why the customer reached out. These are the possible types of request categories:
120 |
121 | Roaming fees
122 | Slow data speed
123 | Lost phone
124 | Add new line
125 | Closing account
126 |
127 | Try doing it for this request and return only the request category only.
128 | ',REQUEST,'')) as classification_result from SUPPORT_TICKETS;
129 |
--------------------------------------------------------------------------------
/snowflake_connector_python-2.3.8-py3-none-any.whl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SatadruMukherjee/Data-Preprocessing-Models/3aeeda4e68aafd47fe784a71c4aa02a34be76820/snowflake_connector_python-2.3.8-py3-none-any.whl
--------------------------------------------------------------------------------
/snowflake_dq_framework.py:
--------------------------------------------------------------------------------
1 | #code explanation: https://youtu.be/Rp0RHsT0jIA?si=zFrYgPChEJn4aWv4
2 | #pip install snowflake-connector-python
3 | #pip install "snowflake-connector-python[pandas]" -t .
4 | #pip install pandas -t .
5 |
6 |
7 | from snowflake.connector import connect
8 | import pandas as pd
9 | import os
10 |
11 |
12 | def run_query(conn, query):
13 | cursor = conn.cursor()
14 | cursor.execute(query)
15 | cursor.close()
16 |
17 | def run_query1(conn, query):
18 | cursor = conn.cursor()
19 | cursor.execute(query)
20 | records=cursor.fetchone()[0]
21 | cursor.close()
22 | return records
23 |
24 | def execute_test(
25 | db_conn,
26 | script_1,
27 | script_2,
28 | comp_operator):
29 | print("1st SQL Query : ",script_1)
30 | result_1=run_query1(db_conn,script_1)
31 | print("2nd SQL Query : ", script_2)
32 | result_2 = run_query1(db_conn, script_2)
33 | print("result 1 = " + str(result_1))
34 | print("result 2 = " + str(result_2))
35 | # compare values based on the comp_operator
36 | if comp_operator == "equals":
37 | return result_1 == result_2
38 | elif comp_operator == "greater_equals":
39 | return result_1 >= result_2
40 | elif comp_operator == "greater":
41 | return result_1 > result_2
42 | elif comp_operator == "less_equals":
43 | return result_1 <= result_2
44 | elif comp_operator == "less":
45 | return result_1 < result_2
46 | elif comp_operator == "not_equal":
47 | return result_1 != result_2
48 | # if we made it here, something went wrong
49 | return False
50 |
51 |
52 |
53 | user=''
54 | password=''
55 | account=""
56 | database=""
57 | warehouse=""
58 | schema=""
59 | role=""
60 | conn = connect(
61 | user=user,
62 | password=password,
63 | account=account,
64 | database=database,
65 | schema=schema,
66 | warehouse=warehouse,
67 | role=role
68 | )
69 |
70 |
71 |
72 | sql_query ="""select * from dq_check where table_name='dummy_table'"""
73 | cursor = conn.cursor()
74 | cursor.execute(sql_query)
75 |
76 | df=cursor.fetch_pandas_all()
77 | cursor.close()
78 |
79 | test_case_output_df=pd.DataFrame(columns=['Check_Description','Status'])
80 |
81 | for index,row in df.iterrows():
82 | table_name=row["TABLE_NAME"]
83 | description=row['DESCRIPTION']
84 | print('*'*100)
85 | print("Performing check : ",description)
86 | sql_query_1=row['SQL_QUERY_1']
87 | sql_query_2=row['SQL_QUERY_2']
88 | comparison_type=row['COMPARISON_TYPE']
89 | outcome=execute_test(
90 | conn,
91 | sql_query_1,
92 | sql_query_2,
93 | comparison_type)
94 | testcase_pass_fail= "Pass" if outcome else "Failed"
95 | print("Testcase Results : ",testcase_pass_fail)
96 | new_row=({'Check_Description': description, 'Status': testcase_pass_fail})
97 | test_case_output_df = pd.concat([test_case_output_df, pd.DataFrame([new_row])], ignore_index=True)
98 | print('*' * 100)
99 |
100 | print(test_case_output_df)
--------------------------------------------------------------------------------
/study_data.csv:
--------------------------------------------------------------------------------
1 | repetition_time,study_time,knowledge_level
2 | 0,0,Low
3 | 0.24,0.9,High
4 | 0.25,0.33,Low
5 | 0.65,0.3,High
6 | 0.98,0.24,Low
7 | 0.1,0.66,High
8 | 0.29,0.56,High
9 | 0.4,0.01,Low
10 | 0.72,0.25,Low
11 | 0.2,0.85,High
12 | 0.3,0.81,High
13 | 0.41,0.3,Low
14 | 0.78,0.34,High
15 | 0.15,0.9,High
16 | 0.3,0.6,High
17 | 0.35,0.8,High
18 | 0.01,0.05,Low
19 | 0.08,0.33,Low
20 | 0.27,0.29,Low
21 | 0.49,0.56,High
22 | 0.78,0.2,Low
23 | 0.12,0.66,High
24 | 0.29,0.65,High
25 | 0.42,0.28,Low
26 | 0.76,0.25,Low
27 | 0.18,0.85,High
28 | 0.25,0.1,Low
29 | 0.45,0.25,Low
30 | 0.94,0.56,High
31 | 0.21,0.81,High
32 | 0.31,0.59,High
33 | 0.65,0.24,Low
34 | 0.76,0.16,Low
35 | 0.19,0.82,High
36 | 0.31,0.78,High
37 | 0.43,0.29,Low
38 | 0.72,0.26,Low
39 | 0.08,0.33,Low
40 | 0.26,0,Low
41 | 0.49,0.45,High
42 | 0.76,0.1,Low
43 | 0.2,0.78,High
44 | 0.29,0.6,High
45 | 0.64,0.25,High
46 | 0.27,0.04,Low
47 | 0.14,0.66,High
48 | 0.31,0.62,High
49 | 0.38,0.77,High
50 | 0.71,0.9,High
51 | 0.18,0.67,High
52 | 0.28,0.25,Low
53 | 0.51,0.45,High
54 | 0.78,0.05,Low
55 | 0.18,0.86,High
56 | 0.29,0.55,High
57 | 0.42,0.26,Low
58 | 0.84,0.25,High
59 | 0.19,0.59,High
60 | 0.33,0.82,High
61 | 0.64,0.1,Low
62 | 0.75,0.01,Low
63 | 0.19,0.56,High
64 | 0.3,0.51,High
65 | 0.48,0.28,Low
66 | 0.8,0.28,High
67 | 0.09,0.15,Low
68 | 0.26,0.24,Low
69 | 0.55,0.51,High
70 | 0.28,0.32,Low
71 | 0.02,0.67,High
72 | 0.29,0.58,High
73 | 0.42,0.75,High
74 | 0.66,0.08,Low
75 | 0.3,0.53,High
76 | 0.33,0.78,High
77 | 0.4,0.1,Low
78 | 0.75,0.1,Low
79 | 0.1,0.3,Low
80 | 0.26,0.2,Low
81 | 0.44,0.28,Low
82 | 0.76,0.24,Low
83 | 0.12,0.67,High
84 | 0.29,0.62,High
85 | 0.48,0.26,Low
86 | 0.7,0.25,Low
87 | 0.2,0.03,Low
88 | 0.31,0.79,High
89 | 0.41,0.28,Low
90 | 0.78,0.18,Low
91 | 0.09,0.67,High
92 | 0.29,0.56,High
93 | 0.78,0.34,High
94 | 0.6,0.09,Low
95 | 0.23,0.24,Low
96 | 0.32,0.8,High
97 | 0.62,0.15,Low
98 | 0.77,0.19,Low
99 | 0.33,0.77,High
100 | 0.29,0.57,High
101 | 0.42,0.29,Low
102 | 0.48,0.26,Low
103 | 0.33,0.87,High
104 | 0.31,0.54,High
105 | 0.49,0.27,Low
106 | 0.76,0.1,Low
107 | 0.25,0.67,High
108 | 0.29,0.59,High
109 | 0.4,0.54,High
110 | 0.81,0.3,High
111 | 0.37,0.84,High
112 | 0.27,0.33,Low
113 | 0.4,0.3,Low
114 | 0.89,0.58,High
115 | 0.4,0.79,High
116 | 0.31,0.55,High
117 | 0.61,0.45,High
118 | 0.66,0.07,Low
119 | 0.8,0.7,High
120 | 0.17,0.66,High
121 | 0.32,0.81,High
122 | 0.65,0.13,Low
123 | 0.72,0.25,Low
124 | 0.11,0.333,Low
125 | 0.25,0.83,High
126 | 0.49,0.76,High
127 | 0.92,0.5,High
128 | 0.22,0.66,High
129 | 0.28,0.28,Low
130 | 0.63,0.14,Low
131 | 0.88,0.28,High
132 | 0.06,0.34,Low
133 | 0.26,0.67,High
134 | 0.55,0.07,Low
135 | 0.7,0.71,High
136 | 0.1,0.65,High
137 | 0.31,0.5,High
138 | 0.48,0.26,Low
139 | 0.78,0.1,Low
140 | 0.18,0.58,High
141 | 0.27,0.3,Low
142 | 0.55,0.1,Low
143 | 0.78,0.4,High
144 | 0.22,0.56,High
145 | 0.22,0.29,Low
146 | 0.56,0.48,High
147 | 0.95,0.65,High
148 | 0.24,0.35,Low
149 | 0.33,0.1,Low
150 | 0.64,0.13,Low
151 | 0.65,0.77,High
152 | 0.14,0.86,High
153 | 0.32,0.3,Low
154 | 0.48,0.13,Low
155 | 0.77,0.14,Low
156 | 0.09,0.64,High
157 | 0.33,0.52,High
158 | 0.36,0.51,High
159 | 0.77,0.83,High
160 | 0.18,0.59,High
161 | 0.31,0.54,High
162 | 0.61,0.18,Low
163 | 0.84,0.3,High
164 | 0.24,0.88,High
165 | 0.27,0.89,High
166 | 0.49,0.12,Low
167 | 0.3,0.9,High
168 | 0.2,0.61,High
169 | 0.49,0.78,High
170 | 0.6,0.16,Low
171 | 0.21,0.92,High
172 | 0.04,0.25,Low
173 | 0.33,0.49,High
174 | 0.53,0.85,High
175 | 0.75,0.16,Low
176 | 0.12,0.66,High
177 | 0.33,0.3,Low
178 | 0.65,0.19,Low
179 | 0.75,0.71,High
180 | 0.22,0.6,High
181 | 0.26,0.83,High
182 | 0.63,0.18,Low
183 | 0.99,0.55,High
184 | 0.24,0.89,High
185 | 0.29,0.3,Low
186 | 0.62,0.2,Low
187 | 0.78,0.21,Low
188 | 0.01,0.93,High
189 | 0.29,0.57,High
190 | 0.55,0.25,Low
191 | 0.9,0.47,High
192 | 0.16,0.64,High
193 | 0.3,0.8,High
194 | 0.4,0.5,High
195 | 0.88,0.67,High
196 | 0.11,0.66,High
197 | 0.25,0.29,Low
198 | 0.48,0.1,Low
199 | 0.72,0.26,Low
200 | 0.18,0.63,High
201 | 0.3,0.1,Low
202 | 0.55,0.09,Low
203 | 0.65,0.5,High
204 | 0.08,0.1,Low
205 | 0.3,0.29,Low
206 | 0.65,0.75,High
207 | 0.81,0.15,Low
208 | 0.09,0.66,High
209 | 0.31,0.53,High
210 | 0.48,0.11,Low
211 | 0.8,0.68,High
212 | 0.14,0.62,High
213 | 0.31,0.51,High
214 | 0.58,0.79,High
215 | 0.83,0.34,High
216 | 0.2,0.6,High
217 | 0.29,0.3,Low
218 | 0.5,0.3,Low
219 | 0.87,0.58,High
220 | 0.17,0.64,High
221 | 0.28,0.3,Low
222 | 0.62,0.24,Low
223 | 0.78,0.28,High
224 | 0.2,0.66,High
225 | 0.31,0.57,High
226 | 0.63,0.21,Low
227 | 0.82,0.68,High
228 | 0.23,0.59,High
229 | 0.29,0.31,Low
230 | 0.55,0.78,High
231 | 0.7,0.69,High
232 | 0.12,0.65,High
233 | 0.28,0.28,Low
234 | 0.59,0.23,Low
235 | 0.91,0.66,High
236 | 0.15,0.62,High
237 | 0.27,0.3,Low
238 | 0.6,0.22,Low
239 | 0.4,0.83,High
240 | 0.13,0.64,High
241 | 0.3,0.52,High
242 | 0.62,0.2,Low
243 | 0.78,0.86,High
244 | 0.18,0.63,High
245 | 0.27,0.25,Low
246 | 0.65,0.25,High
247 | 0.89,0.88,High
248 | 0.1,0.66,High
249 | 0.29,0.29,Low
250 | 0.65,0.9,High
251 | 0.79,0.45,High
252 | 0.09,0.66,High
253 | 0.31,0.5,High
254 | 0.64,0.19,Low
255 | 0.92,0.58,High
256 | 0.19,0.6,High
257 | 0.29,0.77,High
258 | 0.61,0.26,High
259 | 0.87,0.74,High
260 |
--------------------------------------------------------------------------------
/test123.txt:
--------------------------------------------------------------------------------
1 | Getting Started with AWS Managed Streaming for Kafka with in-depth service setup
2 | https://youtube.com/watch?v=BFKmQAafE_c&feature=shares
3 | Capturing client events using Amazon API Gateway and Amazon EventBridge
4 | https://youtube.com/watch?v=mcpnhZThZ7s&feature=shares
5 | End to End Streaming Data Pipeline Using AWS MSK & AWS Serverless Services
6 | https://youtube.com/watch?v=l5ypWBHMsNY&feature=shares
--------------------------------------------------------------------------------
/testa:
--------------------------------------------------------------------------------
1 | import boto3
2 | from io import StringIO
3 |
4 | service_name = 's3'
5 | region_name = 'us-east-2'
6 | aws_access_key_id = ''
7 | aws_secret_access_key = ''
8 |
9 | s3_resource = boto3.resource(
10 | service_name=service_name,
11 | region_name=region_name,
12 | aws_access_key_id=aws_access_key_id,
13 | aws_secret_access_key=aws_secret_access_key
14 | )
15 | bucket='destinationbucketdemoshow'
16 | # initial_df (the source iris DataFrame) and s3_file_key are assumed to be defined earlier
17 | df = initial_df[(initial_df.species == "setosa")]
18 | csv_buffer = StringIO()
19 | df.to_csv(csv_buffer,index=False)
20 | s3_resource.Object(bucket, s3_file_key).put(Body=csv_buffer.getvalue())
21 |
--------------------------------------------------------------------------------
/transform.py:
--------------------------------------------------------------------------------
1 | import sys
2 |
3 |
4 | from pyspark.sql import SparkSession
5 | from pyspark import SparkContext
6 | from pyspark.sql.types import *
7 |
8 |
9 | spark = SparkSession \
10 | .builder \
11 | .appName("airflow_with_emr") \
12 | .getOrCreate()
13 |
14 |
15 |
16 | def main():
17 | s3_location="s3://irisseta/input_folder/";
18 | iris = spark.read.format("csv").option("inferSchema","true").load(s3_location).toDF('SEPAL_LENGTH','SEPAL_WIDTH','PETAL_LENGTH','PETAL_WIDTH','CLASS_NAME');
19 | ms=iris.groupBy("CLASS_NAME").count()
20 | ms.coalesce(1).write.format("parquet").mode('overwrite').save("s3://irisseta/output_folder/")
21 |
22 | main()
23 |
24 |
25 |
--------------------------------------------------------------------------------
/translator_with_polly.py:
--------------------------------------------------------------------------------
1 | import streamlit as st
2 | import boto3
3 | from botocore.exceptions import BotoCoreError, ClientError
4 | import os
5 | import sys
6 | from tempfile import gettempdir
7 | from contextlib import closing
8 |
9 |
10 | aws_access_key="{}"
11 | aws_secret_key="{}"
12 |
13 | session=boto3.Session(aws_access_key_id =aws_access_key,aws_secret_access_key = aws_secret_key,region_name ='{}')
14 |
15 | translate = session.client('translate')
16 |
17 | polly = session.client("polly")
18 |
19 |
20 |
21 | def translate_text(text, source_language,destination_language):
22 | """
23 |
24 | :param text: Input text which has to be translated
25 | :param source_language: The Input Language
26 | :param destination_language: The desired output Language
27 | :return: the translated text
28 | """
29 | result = translate.translate_text(
30 | Text=text,
31 | SourceLanguageCode=source_language,
32 | TargetLanguageCode=destination_language
33 | )
34 | return result['TranslatedText']
35 |
36 |
37 | def text_to_speech(text_part):
38 | """
39 | :param text_part: The text which has to be converted to Hindi audio
40 | :return: temporary path where the audio is stored
41 | """
42 |
43 | print("Input text part for text to speech conversion : ",text_part)
44 | try:
45 | # Request speech synthesis
46 | response = polly.synthesize_speech(Text=text_part, LanguageCode="hi-IN",OutputFormat="mp3",
47 | VoiceId="Joanna")
48 | except (BotoCoreError, ClientError) as error:
49 | # The service returned an error, exit gracefully
50 | print(error)
51 | sys.exit(-1)
52 | # Access the audio stream from the response
53 | if "AudioStream" in response:
54 | # Note: Closing the stream is important because the service throttles on the
55 | # number of parallel connections. Here we are using contextlib.closing to
56 | # ensure the close method of the stream object will be called automatically
57 | # at the end of the with statement's scope.
58 | with closing(response["AudioStream"]) as stream:
59 | output = os.path.join(gettempdir(), "speech.mp3")
60 |
61 | try:
62 | # Open a file for writing the output as a binary stream
63 | with open(output, "wb") as file:
64 | file.write(stream.read())
65 | except IOError as error:
66 | # Could not write to file, exit gracefully
67 | print(error)
68 | sys.exit(-1)
69 | print("Output Path where audio is stored :",output)
70 | return output
71 |
72 |
73 |
74 |
75 |
76 | def runner():
77 | col11, col22 = st.columns(2)
78 | with col11:
79 | st.title("Language Translation")
80 | st.markdown('Feel the power of Neural Machine Translation')
81 | with col22:
82 | st.image('Capture.PNG', use_column_width=True);
83 | col1, col2 = st.columns(2)
84 | conversion_list={"English":"en","Bengali":"bn","Hindi":"hi","French":"fr"}
85 | with col1:
86 | source_language = st.selectbox('Select the Source Language', ['Default', 'English', 'Bengali','Hindi','French'])
87 | input_text = st.text_input('Enter the Input text', 'Enter text here')
88 | with col2:
89 | destination_language = st.selectbox('Select the Destination', ['Default', 'English', 'Bengali','Hindi','French'])
90 |
91 | with col1:
92 | button_value = st.checkbox(label='Translate')
93 | translated_text=""
94 | if button_value:
95 | if source_language!=destination_language:
96 | print("The Source Language is {}".format(conversion_list[source_language]))
97 | print("The Destination Language is {}".format(conversion_list[destination_language]))
98 | translated_text=translate_text(input_text, conversion_list[source_language],conversion_list[destination_language])
99 | else:
100 | translated_text=input_text
101 | with col2:
102 | st.text_input('Translated Text', translated_text)
103 | print("Translated Text : ",translated_text)
104 | if (destination_language == 'Hindi'):
105 | button_value_text_to_speech = st.checkbox(label='Audio Form')
106 | if(button_value_text_to_speech):
107 | audio_path = text_to_speech(translated_text)
108 | audio_file = open(audio_path, 'rb')
109 | audio_bytes = audio_file.read()
110 | st.audio(audio_bytes, format='audio/mp3')  # Polly returned mp3 audio
111 |
112 |
113 |
114 | runner()
115 |
116 |
117 |
118 |
119 |
--------------------------------------------------------------------------------
/user_data_yt.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash -xe
2 | exec &>> /tmp/userdata_execution.log
3 |
4 | upload_log() {
5 | aws s3 cp /tmp/userdata_execution.log s3://demoytuserdata/logs/
6 | sudo shutdown now -h
7 | }
8 |
9 | trap 'upload_log' EXIT
10 |
11 | sudo apt update
12 | sudo apt -y install awscli
13 | sudo apt -y install python3-pip
14 | pip3 install --upgrade awscli
15 | pip3 install boto3 pandas pyarrow fastparquet
16 | aws s3 cp s3://demoytuserdata/script/news_fetcher.py .
17 | python3 news_fetcher.py
18 | aws s3 mv /home/ubuntu/news_data.parquet s3://demoytuserdata/outputdirectory/
--------------------------------------------------------------------------------