├── API_Load_Testing_using_Cloudwatch_Custom_Metric.ipynb ├── AWS Bedrock Gen AI Notes.pdf ├── AWS ElastiCache with Python.txt ├── AWS Glue Data Quality Check.txt ├── AWS Glue Job trigger from Lambda.JPG ├── AWS Lambda Trigger from PostgreSQL.txt ├── AWS MSK Connector for Snowflake.txt ├── AWS Pinpoint using Boto3.ipynb ├── AWS Serverless Data Analytics Pipeline for File Processing .txt ├── AWSS3ToSnowflake.zip ├── AWS_Bedrock_Text_Summarization.ipynb ├── Access_s3_data_in_Spark_outside_AWS_Account.ipynb ├── Analyze Apache Parquet optimized data using Amazon Kinesis Data Firehose, Amazon Athena.png ├── Approximate Duplicate Detection using Weighted Fuzzy.ipynb ├── Architecture of the generic Job to ingest data from s3 to Snowflake.png ├── Artilary Demystified.txt ├── Athena_Table_as_source_for_Apache_Spark.ipynb ├── Athena_parameterized_queries_using_Python.ipynb ├── Automatic Mean Imputation.ipynb ├── Automatic Mode Imputation.ipynb ├── Bloom Filter Demo.ipynb ├── Bloom Filter Theory.pdf ├── Build & Push Docker Image to AWS ECR using GitHub Actions.txt ├── Building_RAG_Application_using_Textract_&_Bedrock.ipynb ├── CSV_Sentiment.ipynb ├── Cake preparation-Detail Architecture (1).jpg ├── Classification_with_GenAI.ipynb ├── Cloudy-Snow.ipynb ├── Consumer_File SQS.ipynb ├── Create_,_Update_,_Delete_Athena_Prepared_Statement_using_boto3.ipynb ├── DLQ Redrive.drawio.png ├── Data Analysis Practice.ipynb ├── Data Warehouse in Snowflake.png ├── DataBricks Quickstart Guide with AWS.ipynb ├── Data_Analysis_Play.tsv ├── Data_Analysis_Practice.ipynb ├── Data_Engineering_Essentials(news sentiment analysis).ipynb ├── Data_Ingestion_from_GSheet_to_s3.ipynb ├── Data_Quality_Check_using_pydeequ.ipynb ├── Databricks column description addition using Gen AI.ipynb ├── Databricks workflow demo.txt ├── Databricks_SQL_Connector_for_Python.ipynb ├── Delta Sharing Databricks Notebook.ipynb ├── DeltaLake AWS Foundation from scratch.ipynb ├── DeltaLake Handwritten Notes.pdf ├── Delta_Lake in AWS Lambda.txt ├── Delta_Lake_using_Python.ipynb ├── Delta_Sharing_Python.ipynb ├── Delta_Sharing_Spark.ipynb ├── Demo Data used in Generic Framework for External Table.zip ├── Docker Build Args in-depth intuition.txt ├── ETL Automation with Metadata Management and Job Tracking.txt ├── End-to-End Project on Location Based Service.txt ├── Error Notifications for Snowflake Tasks using SNS.png ├── Error_Handler_Python_File SQS.ipynb ├── FAISS_Similarity_Search.ipynb ├── Feature Scaling.ipynb ├── Feature Selection based on correlation.ipynb ├── Flat Any JSON File PySpark.ipynb ├── Flat any Json Using_Recursion,Filter & Map.ipynb ├── Flat_DataFrame.ipynb ├── Flatten JSON & Harmonize Schema.ipynb ├── Fundamentals of Data Preprocessing using Python.ipynb ├── Generative AI on AWS with AWS Bedrock.ipynb ├── Geolocation_from_IP.ipynb ├── Get_Data_Lineage_using_Rest_API.ipynb ├── Github Action with AWS.txt ├── Glue Workflow Lab.txt ├── Go Deeper in Partitioning.ipynb ├── HTTP Sink Connector.txt ├── Incremental Crawling , Incremental ETL with Glue Workflow.png ├── Intelligent Text Classification using Databricks Gen AI (1).ipynb ├── Kafka , Faust & Snowflake.txt ├── Kafka Consumer Internals with Auto Offset Commits and At Least Once Processing.png ├── Kafka Consumer Lag Monitoring.txt ├── Kafka Faust Snowflake Python.PNG ├── Kafka Producer Internals.png ├── Kafka Retry Architecture.png ├── Kmeans.ipynb ├── Leverage_pgvector_and_Amazon_PostgreSQL_for_Natural_Language_Processing.ipynb ├── Limit_and_offset_concept_in_API.ipynb ├── 
Long-Running Workflows with Human Interactions using Step Functions.txt ├── MSK Lab 1.txt ├── MSK To AWS Lambda.txt ├── MSK project with serverless Producer & Consumer.txt ├── MSK with s3 sink connector and Snowflake.txt ├── Manual Offset Commits and At-most Once Processing.png ├── Manual Offset Commits and Exactly Once Processing.png ├── Master Containerization with AWS.txt ├── Monitor_Data_Pipeline_using_CloudWatch_Custom_Metrics.ipynb ├── Multicollinearity Effect in Regression.ipynb ├── Multimodal Embedding using BedRock.zip ├── Multiple Linear Regression Code.ipynb ├── Null_Value_Handling_Pyspark.ipynb ├── One Hot Encoding.ipynb ├── Outliers_&_Skewness_Handling.ipynb ├── Parallel_processing_in_Pandas_using_pandarallel.ipynb ├── Partitioning in Athena using AWS Lambda.txt ├── Percentage_of_Null_values_in_each_column_of_Dataframe.ipynb ├── Pivoting.ipynb ├── Practice Session Data Analysis.ipynb ├── Publish Message in MSK Cluster from AWS Lambda.txt ├── Publisher SQS.ipynb ├── PySpark5.ipynb ├── PySpark6.ipynb ├── Pyspark1.ipynb ├── Pyspark2.ipynb ├── Pyspark3.ipynb ├── Pyspark4.ipynb ├── Python_&_Gsheet.ipynb ├── RAG using Kendra & Langchain AWS.ipynb ├── RAG using Snowflake.sql ├── Real-Time Streaming Project with Smartphone Data.txt ├── Receive_message_from_SQS_Queue.ipynb ├── Recursion Pattern with AWS Step Funciton & Lambda.png ├── Run Batch job Using AWS Lambda.txt ├── SNS_Message_Publish.ipynb ├── Semantic_clustering.ipynb ├── Send_message_SQS_Queue.ipynb ├── Serverless Manual Approval Steps in AWS Step Functions and Amazon API Gateway.png ├── Setup PySpark in ec2 using conda.txt ├── Shake detection using Accelerometer , Kafka & Python.txt ├── Simple Linear Regression (ML).ipynb ├── Simple OTP System using AWS Serverless.txt ├── Snowflake Codes for generic s3 to snowflake loader.txt ├── Snowflake External Table Partitioning.txt ├── Snowflake Parallel Processing using Python Lab.txt ├── Snowflake Row Level Security.sql ├── Snowflake Schema Detection.txt ├── Snowflake Stored Porcedure Parallel execution (Part 1).txt ├── Snowflake Stored Porcedure Parallel execution (Part 2).txt ├── Snowflake code for External Table refresh framework.txt ├── Snowflake logging (1).txt ├── Snowflake_SP_Util.py ├── Snyk Code for Github Action.yml ├── Sorting.ipynb ├── Spark Caching In-Depth.ipynb ├── String Functions.ipynb ├── String_similarity_using_Fuzzy.ipynb ├── Success File in PySpark.txt ├── Talend with EMR & Snowflake.png ├── Time Traven in Snowflake.txt ├── Trigger Airflow code via rest api.txt ├── Unstructured Data processing with Snowflake.txt ├── Untitled7.ipynb ├── Unusual Usecases of Time Travel.ipynb ├── Updated GenAI Notes.pdf ├── Using Ephemeral Storage for AWS Lambda.ipynb ├── Using KMS for Client Side Encryption.ipynb ├── _Manual Offset Commits & At Least Once Processing in Kafka Consumer.drawio.png ├── airflow_emr_s3_snowflake_setup.txt ├── airflow_emr_spark_s3_snowflake.py ├── airflow_install.sh ├── airflow_news_data_pipeline.py ├── airflow_talend_runner.py ├── airflow_talend_success_file_snesor.bash ├── algolia_layer1.zip ├── aws-eventbridge-kinesisfirehose-s3.drawio.png ├── bronze_to_silver_data_lineage.ipynb ├── context_aware_rag.sql ├── convert_json_to_csv_in_kinesis_firehose_transformation.ipynb ├── dbscan_visualization.txt ├── discoverx Lab 1.ipynb ├── dynamic_compaction.ipynb ├── entity_extraction_templating_using_aws_bedrock.py ├── generate_smiling_face_cluster.txt ├── generic lambda_layer_creation_framework.txt ├── generic_job_s3_to_snowflake_using_copy_command.py ├── 
incremental_etl.zip ├── ingest.sh ├── iris_partitioned_Data.zip ├── isNull,isNotNull,case.ipynb ├── isin,sample,limit.ipynb ├── kafka snowflake integration.txt ├── kafka source rest project.txt ├── kafka_producer_with_topic_partitioning.py ├── kafka_yt_demo.zip ├── key monitor.txt ├── lambda_powertools.py ├── mysql_cdc_fetcher_runner.py ├── news_fetcher.py ├── news_fetcher_etl.py ├── otp system.drawio.png ├── scd type 1 Snowflake.txt ├── scd_type_2_snowflake.py ├── scd_type_2_snowflake_queries.sql ├── scd_type_2_snowflake_version_2.py ├── snowflake cortex fine tuning.txt ├── snowflake_connector_python-2.3.8-py3-none-any.whl ├── snowflake_dq_framework.py ├── snowflake_elt_talend_lab_24_08_2022.txt ├── source_to_bronze_data_lineage.ipynb ├── study_data.csv ├── test123.txt ├── testa ├── transform.py ├── transientcluster.ipynb ├── translator_with_polly.py └── user_data_yt.sh /API_Load_Testing_using_Cloudwatch_Custom_Metric.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "provenance": [], 7 | "authorship_tag": "ABX9TyPx14exc0tt0xFeYnpWFa2c", 8 | "include_colab_link": true 9 | }, 10 | "kernelspec": { 11 | "name": "python3", 12 | "display_name": "Python 3" 13 | }, 14 | "language_info": { 15 | "name": "python" 16 | } 17 | }, 18 | "cells": [ 19 | { 20 | "cell_type": "markdown", 21 | "metadata": { 22 | "id": "view-in-github", 23 | "colab_type": "text" 24 | }, 25 | "source": [ 26 | "\"Open" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": null, 32 | "metadata": { 33 | "id": "DKGsa9g-0rSj" 34 | }, 35 | "outputs": [], 36 | "source": [ 37 | "!pip3 install boto3 requests pandas" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "source": [ 43 | "import time\n", 44 | "import json\n", 45 | "import pandas as pd\n", 46 | "import requests\n", 47 | "import boto3" 48 | ], 49 | "metadata": { 50 | "id": "z3IlVwZXAYES" 51 | }, 52 | "execution_count": null, 53 | "outputs": [] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "source": [ 58 | "\n", 59 | "# Initialize CloudWatch client\n", 60 | "cloudwatch = boto3.client('cloudwatch', aws_access_key_id='', aws_secret_access_key='', region_name='us-east-1')\n", 61 | "\n", 62 | "# Read the CSV file\n", 63 | "csv_file = \"/content/extarcted_data_for_testing30k.csv\" # Replace with the path to your CSV file\n", 64 | "data = pd.read_csv(csv_file)\n", 65 | "data = data.head(300)\n", 66 | "\n", 67 | "# API URL and headers\n", 68 | "api_url = \"https://hccreference.com/api/search\"\n", 69 | "headers = {\n", 70 | " 'accept': 'application/json, text/plain, */*',\n", 71 | " 'accept-language': 'en-US,en;q=0.9,bn;q=0.8,hi;q=0.7',\n", 72 | " 'cache-control': 'no-cache',\n", 73 | " 'content-type': 'application/json',\n", 74 | " 'origin': 'https://hccreference.com',\n", 75 | " 'pragma': 'no-cache',\n", 76 | " 'referer': 'https://hccreference.com/',\n", 77 | " 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36',\n", 78 | "}\n", 79 | "\n", 80 | "def send_metrics_to_cloudwatch(metric_name, value, unit='Milliseconds', namespace='API_RESPONSE_DATA_CUSTOM_METRIC'):\n", 81 | " try:\n", 82 | " cloudwatch.put_metric_data(\n", 83 | " MetricData=[\n", 84 | " {\n", 85 | " 'MetricName': metric_name,\n", 86 | " 'Unit': unit,\n", 87 | " 'Value': value\n", 88 | " },\n", 89 | " ],\n", 90 | " Namespace=namespace\n", 91 | " )\n", 92 | " print(f\"Metric {metric_name} sent 
successfully.\")\n", 93 | " except Exception as e:\n", 94 | " print(f\"Error sending metric {metric_name}: {e}\")\n", 95 | "\n", 96 | "def make_api_requests(data):\n", 97 | " for index, row in data.iterrows():\n", 98 | " payload = {\n", 99 | " \"dx_hcc\": row['dx_hcc'],\n", 100 | " \"dos_year\": row['dos_year'],\n", 101 | " \"drf\": row['drf'],\n", 102 | " \"search\": row['search']\n", 103 | " }\n", 104 | " try:\n", 105 | " start_time = time.perf_counter()\n", 106 | " response = requests.post(api_url, headers=headers, json=payload)\n", 107 | " end_time = time.perf_counter()\n", 108 | "\n", 109 | " response_time = (end_time - start_time) * 1000 # Convert to milliseconds\n", 110 | " print(f\"Row {index + 1}: Response time {response_time:.2f} ms\")\n", 111 | "\n", 112 | " send_metrics_to_cloudwatch('API_Response_Time', response_time)\n", 113 | "\n", 114 | " if response.status_code != 200:\n", 115 | " print(f\"Row {index + 1}: API error - {response.status_code}\")\n", 116 | " send_metrics_to_cloudwatch('API_Errors', 1, unit='Count')\n", 117 | " except Exception as e:\n", 118 | " print(f\"Error processing row {index + 1}: {e}\")\n", 119 | " send_metrics_to_cloudwatch('API_Errors', 1, unit='Count')\n", 120 | "\n", 121 | "# Run the API requests\n", 122 | "make_api_requests(data)\n" 123 | ], 124 | "metadata": { 125 | "id": "I25hiqiz02Aj" 126 | }, 127 | "execution_count": null, 128 | "outputs": [] 129 | } 130 | ] 131 | } -------------------------------------------------------------------------------- /AWS Bedrock Gen AI Notes.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SatadruMukherjee/Data-Preprocessing-Models/3aeeda4e68aafd47fe784a71c4aa02a34be76820/AWS Bedrock Gen AI Notes.pdf -------------------------------------------------------------------------------- /AWS Glue Job trigger from Lambda.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SatadruMukherjee/Data-Preprocessing-Models/3aeeda4e68aafd47fe784a71c4aa02a34be76820/AWS Glue Job trigger from Lambda.JPG -------------------------------------------------------------------------------- /AWS Lambda Trigger from PostgreSQL.txt: -------------------------------------------------------------------------------- 1 | Step 1: Create Postgres Instance 2 | 3 | Instance identifier: 4 | Master User Name: 5 | Master Password: 6 | Security Group : 7 | Database Port: 8 | DB Name: 9 | 10 | Step 2: Create the Lambda Function 11 | 12 | import json 13 | from time import sleep 14 | 15 | def lambda_handler(event, context): 16 | print(event) 17 | sleep(2) 18 | # TODO implement 19 | return { 20 | 'statusCode': 200, 21 | 'body': json.dumps('Hello from Lambda!') 22 | } 23 | 24 | 25 | Step 3: Create IAM Role with Lambda Access & Assign to AWS RDS 26 | 27 | Step 4: Connect to DB 28 | 29 | Driver: 30 | 31 | URL: jdbc:postgresql://host:port/name_of_database 32 | Username: 33 | Password: 34 | 35 | 36 | Step 5: Run SQL Query: 37 | 38 | SELECT current_database(); 39 | 40 | CREATE EXTENSION IF NOT EXISTS aws_lambda CASCADE; 41 | 42 | --sync 43 | SELECT * from aws_lambda.invoke(aws_commons.create_lambda_function_arn('arn:aws:lambda:us-east-1:825865577047:function:triggerfromdbrds', 'us-east-1'), 44 | '{"body": "Hello from Postgres second time!"}'::json ); 45 | 46 | --async 47 | SELECT * FROM aws_lambda.invoke(aws_commons.create_lambda_function_arn('arn:aws:lambda:us-east-1:825865577047:function:triggerfromdbrds', 'us-east-1'), 48 | 
'{"body": "Hello from Postgres async!"}'::json, 'Event'); 49 | 50 | --create table 51 | -- Create the inventory table 52 | CREATE TABLE inventory ( 53 | id SERIAL PRIMARY KEY, 54 | product_name VARCHAR(100) NOT NULL, 55 | quantity INTEGER NOT NULL, 56 | price DECIMAL(10, 2) NOT NULL 57 | ); 58 | 59 | -- Insert some sample data into the inventory table 60 | INSERT INTO inventory (product_name, quantity, price) VALUES ('Product A', 10, 50.00); 61 | INSERT INTO inventory (product_name, quantity, price) VALUES ('Product B', 5, 40.00); 62 | INSERT INTO inventory (product_name, quantity, price) VALUES ('Product C', 0, 30.00); 63 | 64 | --create trigger 65 | -- Create or replace the function to invoke Lambda with inserted row as input 66 | CREATE OR REPLACE FUNCTION invoke_lambda_on_insert() 67 | RETURNS TRIGGER AS $$ 68 | BEGIN 69 | PERFORM aws_lambda.invoke( 70 | aws_commons.create_lambda_function_arn('arn:aws:lambda:us-east-1:825865577047:function:triggerfromdbrds'), 71 | row_to_json(NEW), 72 | 'Event' 73 | ); 74 | RETURN NEW; 75 | END; 76 | $$ LANGUAGE plpgsql; 77 | 78 | -- Create or replace the trigger to call the function on insert 79 | CREATE TRIGGER call_lambda_on_insert 80 | AFTER INSERT ON inventory 81 | FOR EACH ROW 82 | EXECUTE FUNCTION invoke_lambda_on_insert(); 83 | 84 | INSERT INTO inventory (product_name, quantity, price) VALUES ('Product E', 0, 30.00); 85 | INSERT INTO inventory (product_name, quantity, price) VALUES ('Product F', 0, 30.00); 86 | INSERT INTO inventory (product_name, quantity, price) VALUES ('Product G', 0, 30.00); 87 | ; 88 | 89 | INSERT INTO inventory (product_name, quantity, price) VALUES ('Product H', 0, 30.00),('Product I', 0, 30.00); 90 | 91 | SELECT * FROM inventory; -------------------------------------------------------------------------------- /AWS MSK Connector for Snowflake.txt: -------------------------------------------------------------------------------- 1 | Launch MSK Cluster: 2 | ---------------------- 3 | Configure NAT Gateway & launch MSK Cluster in Private Subnet 4 | 5 | 6 | openssl genrsa -out rsa_key.pem 2048 7 | openssl rsa -in rsa_key.pem -pubout -out rsa_key.pub 8 | 9 | These files contain keys that may contain spaces and new line characters which need to be removed-- 10 | export SNOWFLAKE_PVT_KEY=$(echo `sed -e '2,$!d' -e '$d' -e 's/\n/ /g' rsa_key.pem`|tr -d ' ') 11 | echo $SNOWFLAKE_PVT_KEY > rsa_key_p8.out 12 | 13 | Configure Snowflake: 14 | -------------------------- 15 | cat rsa_key.pub 16 | 17 | DROP DATABASE IF EXISTS RAMU; 18 | Create database ramu; 19 | alter user Satadru set rsa_public_key=''; 20 | 21 | desc user satadru; 22 | use ramu; 23 | show tables; 24 | 25 | 26 | In EC2 Client Machine: 27 | ----------------------------- 28 | sudo yum install java-1.8.0-openjdk 29 | wget https://archive.apache.org/dist/kafka/2.8.1/kafka_2.12-2.8.1.tgz 30 | tar -xvf kafka_2.12-2.8.1.tgz 31 | cd kafka_2.12-2.8.1 32 | 33 | bin/kafka-topics.sh --create --topic demo_testing2 --bootstrap-server {} --replication-factor 1 --partitions 2 34 | 35 | 36 | Create Custom plugins: 37 | ------------------------- 38 | https://mvnrepository.com/artifact/com.snowflake/snowflake-kafka-connector/1.5.0 39 | 40 | 41 | For Kafka Connect Config: 42 | --------------------------- 43 | IAM Role:s3--give s3 full access 44 | 45 | Trust Relationship-- 46 | 47 | { 48 | "Version": "2012-10-17", 49 | "Statement": [ 50 | { 51 | "Effect": "Allow", 52 | "Principal": { 53 | "Service": "kafkaconnect.amazonaws.com" 54 | }, 55 | "Action": "sts:AssumeRole", 56 | "Condition": { 57 
| "StringEquals": { 58 | "aws:SourceAccount": "Account ID" 59 | } 60 | } 61 | } 62 | ] 63 | } 64 | 65 | Create a cloudwatch log group 66 | 67 | 68 | Connector Config: 69 | ------------------------- 70 | 71 | connector.class=com.snowflake.kafka.connector.SnowflakeSinkConnector 72 | tasks.max=8 73 | topics=demo_testing2 74 | snowflake.topic2table.map=demo_testing2:fake_data_real_time_demo 75 | buffer.count.records=10000 76 | buffer.flush.time=60 77 | buffer.size.bytes=5000000 78 | snowflake.url.name= 79 | snowflake.user.name= 80 | snowflake.private.key= 81 | snowflake.database.name=RAMU 82 | snowflake.schema.name=PUBLIC 83 | key.converter=com.snowflake.kafka.connector.records.SnowflakeJsonConverter 84 | value.converter=com.snowflake.kafka.connector.records.SnowflakeJsonConverter 85 | 86 | 87 | Test: 88 | ------- 89 | Produce messages: 90 | --------------------- 91 | bin/kafka-console-producer.sh --topic demo_testing2 --bootstrap-server {} 92 | 93 | Consume messages: 94 | --------------------- 95 | bin/kafka-console-consumer.sh --topic demo_testing2 --bootstrap-server {} 96 | 97 | Destination: 98 | ----------------- 99 | select * from ramu.public.fake_data_real_time_demo; 100 | 101 | Sample Data to Publish: 102 | ------------------------------ 103 | {"email":"wzanettinirp@stanford.edu","timestamp":1663420415,"event":"spamreport","gender":"Female","ip_address":"8.166.173.156"} 104 | {"email":"pstegersrq@reddit.com","timestamp":1664321942,"event":"spamreport","gender":"Female","ip_address":"128.214.160.228"} 105 | {"email":"avlahosrr@posterous.com","timestamp":1646024825,"event":"bounce","gender":"Female","ip_address":"147.51.176.231"} -------------------------------------------------------------------------------- /AWSS3ToSnowflake.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SatadruMukherjee/Data-Preprocessing-Models/3aeeda4e68aafd47fe784a71c4aa02a34be76820/AWSS3ToSnowflake.zip -------------------------------------------------------------------------------- /AWS_Bedrock_Text_Summarization.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "id": "x5HdV-yvIty2" 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "!pip install boto3 langchain pypdf unstructured[pdf] langchain-community" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": null, 17 | "metadata": { 18 | "id": "5sqr0kkWTAaa" 19 | }, 20 | "outputs": [], 21 | "source": [ 22 | "from langchain_community.document_loaders import UnstructuredPDFLoader\n", 23 | "from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter\n", 24 | "import json\n", 25 | "import boto3" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": null, 31 | "metadata": { 32 | "id": "ehFtAxHGUy6j" 33 | }, 34 | "outputs": [], 35 | "source": [ 36 | "boto3_bedrock = boto3.client('bedrock-runtime',region_name='us-east-1',aws_access_key_id='{}',aws_secret_access_key='{}')" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": null, 42 | "metadata": { 43 | "id": "CXLmd31WUfvZ" 44 | }, 45 | "outputs": [], 46 | "source": [ 47 | "def summarizer(prompt_data):\n", 48 | " inputText=prompt_data\n", 49 | " body_part=json.dumps({'inputText': inputText,\n", 50 | " 'textGenerationConfig': {'maxTokenCount': 8192,\n", 51 | " 'stopSequences': [],\n", 52 | " 'temperature': 0,\n", 53 | " 'topP': 
1}})\n", 54 | " response = boto3_bedrock.invoke_model(\n", 55 | " body=body_part,\n", 56 | " contentType=\"application/json\",\n", 57 | " accept=\"application/json\",\n", 58 | " modelId='amazon.titan-text-express-v1'\n", 59 | " )\n", 60 | " output_text=json.loads(response['body'].read())['results'][0]['outputText']\n", 61 | " return output_text" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": null, 67 | "metadata": { 68 | "id": "oJgKzeNvSiSj" 69 | }, 70 | "outputs": [], 71 | "source": [ 72 | "def read_pdf_and_split(filename):\n", 73 | " loader = UnstructuredPDFLoader(filename)\n", 74 | " data = loader.load()\n", 75 | " print(data)\n", 76 | " splitter = RecursiveCharacterTextSplitter(\n", 77 | " chunk_size=1000,\n", 78 | " chunk_overlap=100,\n", 79 | " length_function=len,\n", 80 | " add_start_index=True\n", 81 | " )\n", 82 | " splitted_text = splitter.split_documents(data)\n", 83 | "\n", 84 | " return splitted_text\n" 85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": null, 90 | "metadata": { 91 | "id": "Kbw4ars1UBFI" 92 | }, 93 | "outputs": [], 94 | "source": [ 95 | "pdf_document = read_pdf_and_split('/content/YOGI_2_0.pdf')" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "source": [ 101 | "pdf_document" 102 | ], 103 | "metadata": { 104 | "id": "1Ygmv_JZQZh4" 105 | }, 106 | "execution_count": null, 107 | "outputs": [] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "execution_count": null, 112 | "metadata": { 113 | "id": "n10KYlgRU27h" 114 | }, 115 | "outputs": [], 116 | "source": [ 117 | "len(pdf_document)" 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": null, 123 | "metadata": { 124 | "id": "47e8ZSV4YEwe" 125 | }, 126 | "outputs": [], 127 | "source": [ 128 | "summary= \"\"\n", 129 | "for i in pdf_document:\n", 130 | " # gathering the text content of that specific chunk\n", 131 | " chunk_content = i.page_content\n", 132 | " # creating the prompt that will be passed into Bedrock with the text content of the chunk\n", 133 | " prompt = f\"\"\"Human: Provide a detailed summary for the chunk of text provided to you:\n", 134 | " Text: {chunk_content}\"\"\"\n", 135 | " # passing the prompt into the summarizer function to generate the summary of that chunk, and appending it to\n", 136 | " # the summary string\n", 137 | " summary += summarizer(prompt)\n", 138 | "\n", 139 | "final_summary_prompt = f\"\"\"Human: You will be given a set of summaries from a document. Create a cohesive\n", 140 | "summary from the provided individual summaries. 
The summary should very detailed.\n", 141 | "Summaries: {summary}\"\"\"\n", 142 | "# generating the final summary of all the summaries we have previously generated.\n", 143 | "print(summarizer(final_summary_prompt))" 144 | ] 145 | } 146 | ], 147 | "metadata": { 148 | "colab": { 149 | "provenance": [], 150 | "authorship_tag": "ABX9TyNlKIthr1rY+Vrj0gHzReFL" 151 | }, 152 | "kernelspec": { 153 | "display_name": "Python 3", 154 | "name": "python3" 155 | }, 156 | "language_info": { 157 | "name": "python" 158 | } 159 | }, 160 | "nbformat": 4, 161 | "nbformat_minor": 0 162 | } -------------------------------------------------------------------------------- /Analyze Apache Parquet optimized data using Amazon Kinesis Data Firehose, Amazon Athena.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SatadruMukherjee/Data-Preprocessing-Models/3aeeda4e68aafd47fe784a71c4aa02a34be76820/Analyze Apache Parquet optimized data using Amazon Kinesis Data Firehose, Amazon Athena.png -------------------------------------------------------------------------------- /Architecture of the generic Job to ingest data from s3 to Snowflake.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SatadruMukherjee/Data-Preprocessing-Models/3aeeda4e68aafd47fe784a71c4aa02a34be76820/Architecture of the generic Job to ingest data from s3 to Snowflake.png -------------------------------------------------------------------------------- /Artilary Demystified.txt: -------------------------------------------------------------------------------- 1 | VM Launch: 2 | ---- 3 | ec2-ubuntu machine 4 | 5 | Installation: 6 | ------------- 7 | sudo su 8 | curl -o- https://raw.githubusercontent.com/nvm-sh/nvm/v0.34.0/install.sh | bash 9 | . ~/.nvm/nvm.sh 10 | nvm install node 11 | nvm --version 12 | npm install -g artillery@latest 13 | artillery version 14 | artillery dino -m "Knowledge Amplifier" -r 15 | 16 | 17 | 18 | Case 1: 19 | -------- 20 | {API (Base URL)} 21 | 22 | /firstdemo 23 | 24 | YAML File: 25 | ----------- 26 | firstdemo.yml 27 | 28 | config: 29 | target: 'https://httpbin.org/' 30 | phases: 31 | - duration: 5 32 | arrivalRate: 2 33 | scenarios: 34 | - flow: 35 | - get: 36 | url: "/get" 37 | 38 | config: 39 | target: '{API (Base URL)}' 40 | phases: 41 | - duration: 5 42 | arrivalRate: 2 43 | scenarios: 44 | - flow: 45 | - get: 46 | url: "/firstdemo" 47 | 48 | 49 | 50 | 51 | Run the code: 52 | ------------- 53 | DEBUG=http* artillery run /home/ubuntu/firstdemo.yml 54 | 55 | Understanding Summary: 56 | ----------------------- 57 | All VUs finished. Total time: {} 58 | http.codes.--Number of codes received for each specific status code. 59 | http.request_rate--Rate of http requests done over the time period. 60 | http.requests -- Number of HTTP requests made. 61 | http.responses -- Number of HTTP responses received. 
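Worked example (rough numbers, assuming the Case 1 firstdemo.yml above):
--------------------------------------------------------------------------
With duration: 5 and arrivalRate: 2, Artillery starts roughly 5 x 2 = 10 virtual users.
Each virtual user runs the single-GET scenario once, so expect http.requests of about 10,
and if every call succeeds, http.codes.200 and http.responses should also be about 10,
with http.request_rate of around 2 requests per second over the run.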
62 | 63 | 64 | 65 | Case 2: 66 | --------- 67 | seconddemo.yml 68 | 69 | config: 70 | target: '{API (Base URL)}' 71 | phases: 72 | - name: reduce_load 73 | duration: 3h 74 | arrivalRate: 1 75 | - name: nothing 76 | pause: 2m 77 | - name: stress 78 | duration: 2m 79 | arrivalRate: 3 80 | scenarios: 81 | - flow: 82 | - get: 83 | url: "/firstdemo" 84 | 85 | 86 | Shell Script: 87 | --------------- 88 | seconddemo.sh 89 | 90 | DEBUG=http* artillery run /home/ubuntu/seconddemo.yml 91 | 92 | Code run: 93 | ---------- 94 | chmod 755 seconddemo.sh 95 | nohup "/home/ubuntu/seconddemo.sh" > /home/ubuntu/seconddemo.out 2>&1 & 96 | 97 | [1] 3042 98 | 99 | Case 3: 100 | --------- 101 | thirddemo.yml 102 | 103 | config: 104 | target: https://www.hccreference.com 105 | phases: 106 | - name: high_traffic 107 | duration: 2m 108 | arrivalRate: 2 109 | - name: nothing 110 | pause: 1m 111 | - name: stress 112 | duration: 10 113 | arrivalRate: 32 114 | payload: 115 | path: /home/ubuntu/extarcted_data_for_testing30k.csv 116 | order: sequence # default: random 117 | loadAll: true 118 | skipHeader: true # default: false 119 | delimiter: "," # default: , 120 | skipEmptyLines: true # default: true 121 | fields: 122 | - "dos_year" 123 | - "drf" 124 | - "dx_hcc" 125 | - "search" 126 | 127 | scenarios: 128 | - name: testhcc 129 | flow: 130 | - post: 131 | url: /api/search 132 | headers: 133 | Content-Type: application/json 134 | json: 135 | dos_year: "{{ dos_year }}" 136 | drf: "{{ drf }}" 137 | dx_hcc: "{{ dx_hcc }}" 138 | search: "{{ search }}" 139 | 140 | Shell Script: 141 | --------------- 142 | thirddemo.sh 143 | 144 | DEBUG=http* artillery run /home/ubuntu/thirddemo.yml 145 | 146 | Code run: 147 | ---------- 148 | chmod 755 thirddemo.sh 149 | nohup "/home/ubuntu/thirddemo.sh" > /home/ubuntu/thirddemo.out 2>&1 & 150 | 151 | 3154 -------------------------------------------------------------------------------- /Bloom Filter Theory.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SatadruMukherjee/Data-Preprocessing-Models/3aeeda4e68aafd47fe784a71c4aa02a34be76820/Bloom Filter Theory.pdf -------------------------------------------------------------------------------- /Build & Push Docker Image to AWS ECR using GitHub Actions.txt: -------------------------------------------------------------------------------- 1 | Reference: 2 | ------------ 3 | https://github.com/aws-actions/amazon-ecr-login 4 | https://docs.aws.amazon.com/lambda/latest/dg/python-image.html 5 | https://github.com/SatadruMukherjee/Data-Preprocessing-Models/blob/main/Github%20Action%20with%20AWS.txt 6 | 7 | Github Action Code: 8 | ----------------------- 9 | name: ecr_docker_deployment 10 | on: [push] 11 | jobs: 12 | docker_cicd: 13 | runs-on: ubuntu-latest 14 | steps: 15 | - uses: actions/checkout@v4 16 | - name: Configure AWS credentials 17 | uses: aws-actions/configure-aws-credentials@v1 18 | with: 19 | aws-access-key-id: 20 | aws-secret-access-key: 21 | aws-region: us-east-1 22 | - name: Login to Amazon ECR 23 | id: login-ecr 24 | uses: aws-actions/amazon-ecr-login@v2 25 | 26 | - name: Build, tag, and push docker image to Amazon ECR 27 | env: 28 | REGISTRY: ${{ steps.login-ecr.outputs.registry }} 29 | REPOSITORY: demoytcicdgithubaction 30 | IMAGE_TAG: ${{ github.sha }} 31 | run: | 32 | docker build -t $REGISTRY/$REPOSITORY:$IMAGE_TAG . 
33 | docker push $REGISTRY/$REPOSITORY:$IMAGE_TAG 34 | aws lambda update-function-code \ 35 | --function-name demoytcicdecrtest \ 36 | --image-uri $REGISTRY/$REPOSITORY:$IMAGE_TAG -------------------------------------------------------------------------------- /Cake preparation-Detail Architecture (1).jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SatadruMukherjee/Data-Preprocessing-Models/3aeeda4e68aafd47fe784a71c4aa02a34be76820/Cake preparation-Detail Architecture (1).jpg -------------------------------------------------------------------------------- /Create_,_Update_,_Delete_Athena_Prepared_Statement_using_boto3.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "provenance": [], 7 | "authorship_tag": "ABX9TyOldsBynHv6Pz97bdTKjPP2", 8 | "include_colab_link": true 9 | }, 10 | "kernelspec": { 11 | "name": "python3", 12 | "display_name": "Python 3" 13 | }, 14 | "language_info": { 15 | "name": "python" 16 | } 17 | }, 18 | "cells": [ 19 | { 20 | "cell_type": "markdown", 21 | "metadata": { 22 | "id": "view-in-github", 23 | "colab_type": "text" 24 | }, 25 | "source": [ 26 | "\"Open" 27 | ] 28 | }, 29 | { 30 | "cell_type": "markdown", 31 | "source": [ 32 | "Prerequisite:\n", 33 | "https://github.com/SatadruMukherjee/Data-Preprocessing-Models/blob/main/Athena_parameterized_queries_using_Python.ipynb" 34 | ], 35 | "metadata": { 36 | "id": "999exryIKhvN" 37 | } 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": null, 42 | "metadata": { 43 | "id": "OqDN6-_N1yNb" 44 | }, 45 | "outputs": [], 46 | "source": [ 47 | "!pip install boto3" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "source": [ 53 | "import boto3\n", 54 | "access_key=''\n", 55 | "secret_key=''\n", 56 | "session = boto3.Session(\n", 57 | " aws_access_key_id=access_key,\n", 58 | " aws_secret_access_key=secret_key,region_name='us-east-1'\n", 59 | ")\n", 60 | "athena_client = session.client('athena')\n" 61 | ], 62 | "metadata": { 63 | "id": "v_Ao18Cc116f" 64 | }, 65 | "execution_count": null, 66 | "outputs": [] 67 | }, 68 | { 69 | "cell_type": "markdown", 70 | "source": [ 71 | "# Create prepared statements using boto3" 72 | ], 73 | "metadata": { 74 | "id": "6Bxn7tJl2kFb" 75 | } 76 | }, 77 | { 78 | "cell_type": "code", 79 | "source": [ 80 | "resp = athena_client.create_prepared_statement(\n", 81 | " StatementName = \"iris_fulla\",\n", 82 | " WorkGroup = \"primary\",\n", 83 | " QueryStatement = \"\"\"\n", 84 | " SELECT sum(sepal_length) FROM irisdemo WHERE variety = ? \n", 85 | " \"\"\"\n", 86 | " )" 87 | ], 88 | "metadata": { 89 | "id": "z-vW5iQs17HT" 90 | }, 91 | "execution_count": null, 92 | "outputs": [] 93 | }, 94 | { 95 | "cell_type": "markdown", 96 | "source": [ 97 | "# Update prepared statements using boto3" 98 | ], 99 | "metadata": { 100 | "id": "FfniHtX924WU" 101 | } 102 | }, 103 | { 104 | "cell_type": "code", 105 | "source": [ 106 | "response = athena_client.update_prepared_statement(\n", 107 | " StatementName = \"iris_fulla\",\n", 108 | " WorkGroup = \"primary\",\n", 109 | " QueryStatement = \"\"\"\n", 110 | " SELECT sum(petal_length) FROM irisdemo WHERE variety = ? 
;\n", 111 | " \"\"\"\n", 112 | ")" 113 | ], 114 | "metadata": { 115 | "id": "2k0vZkzP2zZd" 116 | }, 117 | "execution_count": null, 118 | "outputs": [] 119 | }, 120 | { 121 | "cell_type": "markdown", 122 | "source": [ 123 | "# Delete prepared statements\n" 124 | ], 125 | "metadata": { 126 | "id": "VWq2ha3a3jt9" 127 | } 128 | }, 129 | { 130 | "cell_type": "code", 131 | "source": [ 132 | "--to remove the prepared statement\n", 133 | "DEALLOCATE PREPARE iris_fulla;" 134 | ], 135 | "metadata": { 136 | "id": "YtU9ZqcY3mUg" 137 | }, 138 | "execution_count": null, 139 | "outputs": [] 140 | }, 141 | { 142 | "cell_type": "code", 143 | "source": [ 144 | "response = athena_client.delete_prepared_statement(\n", 145 | " StatementName='iris_fulla',\n", 146 | " WorkGroup='primary'\n", 147 | ")" 148 | ], 149 | "metadata": { 150 | "id": "140SfLvO6Tp1" 151 | }, 152 | "execution_count": null, 153 | "outputs": [] 154 | }, 155 | { 156 | "cell_type": "markdown", 157 | "source": [ 158 | "# *security **using** Amazon Athena parameterized queries*" 159 | ], 160 | "metadata": { 161 | "id": "pCzKTd1B7PRt" 162 | } 163 | }, 164 | { 165 | "cell_type": "markdown", 166 | "source": [ 167 | "https://aws.amazon.com/blogs/big-data/improve-reusability-and-security-using-amazon-athena-parameterized-queries/" 168 | ], 169 | "metadata": { 170 | "id": "tQ2DAFoJKq1d" 171 | } 172 | } 173 | ] 174 | } -------------------------------------------------------------------------------- /DLQ Redrive.drawio.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SatadruMukherjee/Data-Preprocessing-Models/3aeeda4e68aafd47fe784a71c4aa02a34be76820/DLQ Redrive.drawio.png -------------------------------------------------------------------------------- /Data Warehouse in Snowflake.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SatadruMukherjee/Data-Preprocessing-Models/3aeeda4e68aafd47fe784a71c4aa02a34be76820/Data Warehouse in Snowflake.png -------------------------------------------------------------------------------- /Data_Ingestion_from_GSheet_to_s3.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "provenance": [], 7 | "authorship_tag": "ABX9TyOVHQq+Baz2iKjbD87oOQB3", 8 | "include_colab_link": true 9 | }, 10 | "kernelspec": { 11 | "name": "python3", 12 | "display_name": "Python 3" 13 | }, 14 | "language_info": { 15 | "name": "python" 16 | } 17 | }, 18 | "cells": [ 19 | { 20 | "cell_type": "markdown", 21 | "metadata": { 22 | "id": "view-in-github", 23 | "colab_type": "text" 24 | }, 25 | "source": [ 26 | "\"Open" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": null, 32 | "metadata": { 33 | "id": "nzsqL_sbGhzp" 34 | }, 35 | "outputs": [], 36 | "source": [ 37 | "!pip install pandas gspread google-auth google-auth-oauthlib google-auth-httplib2 boto3" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "source": [ 43 | "import os.path\n", 44 | "import pandas as pd\n", 45 | "from google.auth.transport.requests import Request\n", 46 | "from google_auth_oauthlib.flow import InstalledAppFlow\n", 47 | "from googleapiclient.discovery import build\n", 48 | "from googleapiclient.errors import HttpError\n", 49 | "from google.oauth2.service_account import Credentials\n", 50 | "from io import StringIO\n", 51 | "import boto3" 52 | ], 53 | "metadata": { 54 | "id": "31V0PWGyGkbl" 
55 | }, 56 | "execution_count": null, 57 | "outputs": [] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "source": [ 62 | "# The ID and range of a sample spreadsheet.\n", 63 | "SAMPLE_SPREADSHEET_ID = \"{}\"\n", 64 | "\n", 65 | "scopes = [\n", 66 | " 'https://www.googleapis.com/auth/spreadsheets',\n", 67 | " 'https://www.googleapis.com/auth/drive'\n", 68 | " ]\n", 69 | "credentials = Credentials.from_service_account_info({\n", 70 | "\n", 71 | "}\n", 72 | ", scopes=scopes)\n", 73 | "service = build(\"sheets\", \"v4\", credentials=credentials)\n", 74 | "sheet = service.spreadsheets()\n", 75 | "result = (\n", 76 | "sheet.values()\n", 77 | ".get(spreadsheetId=SAMPLE_SPREADSHEET_ID,range='{}')\n", 78 | ".execute()\n", 79 | ")\n", 80 | "values = result.get(\"values\", [])\n", 81 | "values" 82 | ], 83 | "metadata": { 84 | "id": "c1GJ_DWFG9RP" 85 | }, 86 | "execution_count": null, 87 | "outputs": [] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "source": [ 92 | "df = pd.DataFrame(values[1:], columns=values[0])" 93 | ], 94 | "metadata": { 95 | "id": "19a3S9_gJp76" 96 | }, 97 | "execution_count": null, 98 | "outputs": [] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "source": [ 103 | "df" 104 | ], 105 | "metadata": { 106 | "id": "_PgBK1DQKaHd" 107 | }, 108 | "execution_count": null, 109 | "outputs": [] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "source": [ 114 | "# Initialize S3 client\n", 115 | "s3_client = boto3.client(\n", 116 | " \"s3\",\n", 117 | " aws_access_key_id='{}',\n", 118 | " aws_secret_access_key='{}',\n", 119 | " region_name='us-east-1'\n", 120 | ")\n", 121 | "\n", 122 | "# Convert DataFrame to CSV and upload to S3\n", 123 | "csv_buffer = StringIO()\n", 124 | "df.to_csv(csv_buffer, index=False)\n", 125 | "\n", 126 | "s3_client.put_object(\n", 127 | " Bucket='{}',\n", 128 | " Key='{}/write_drive_data.csv',\n", 129 | " Body=csv_buffer.getvalue()\n", 130 | ")\n", 131 | "\n", 132 | "print(f\"DataFrame successfully uploaded to s3\")" 133 | ], 134 | "metadata": { 135 | "id": "NSCAj0DP9Qah" 136 | }, 137 | "execution_count": null, 138 | "outputs": [] 139 | } 140 | ] 141 | } -------------------------------------------------------------------------------- /DeltaLake Handwritten Notes.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SatadruMukherjee/Data-Preprocessing-Models/3aeeda4e68aafd47fe784a71c4aa02a34be76820/DeltaLake Handwritten Notes.pdf -------------------------------------------------------------------------------- /Delta_Lake in AWS Lambda.txt: -------------------------------------------------------------------------------- 1 | lambda_function.py 2 | ================= 3 | import os 4 | import boto3 5 | import pandas as pd 6 | from deltalake import DeltaTable 7 | from deltalake.writer import write_deltalake 8 | import json 9 | 10 | def get_s3_object(bucket, key): 11 | """Get object from S3""" 12 | s3_client = boto3.client('s3') 13 | try: 14 | response = s3_client.get_object(Bucket=bucket, Key=key) 15 | print(f"Successfully retrieved object from S3: {bucket}/{key}") 16 | return response['Body'] 17 | except Exception as e: 18 | print(f"Error reading from S3: {str(e)}") 19 | raise 20 | 21 | def read_csv_from_s3(s3_object): 22 | """Read CSV data from S3 object into pandas DataFrame""" 23 | try: 24 | df = pd.read_csv(s3_object) 25 | print(f"Successfully read CSV with {len(df)} rows") 26 | return df 27 | except Exception as e: 28 | print(f"Error parsing CSV: {str(e)}") 29 | raise 30 | 31 | def write_to_delta(df, 
table_url): 32 | """Write DataFrame to Delta Lake""" 33 | try: 34 | print(f"Attempting to write to Delta Lake at: {table_url}") 35 | write_deltalake(table_url, df , mode='append') 36 | print("Successfully wrote to Delta Lake") 37 | return True 38 | except Exception as e: 39 | print(f"Error writing to Delta Lake: {str(e)}") 40 | raise 41 | 42 | def lambda_handler(event, context): 43 | """Main Lambda handler""" 44 | try: 45 | # Print the incoming event 46 | print(f"Received event: {json.dumps(event)}") 47 | 48 | # Get environment variables 49 | table_url = os.environ.get('TABLE_URL') 50 | if not table_url: 51 | raise ValueError("TABLE_URL environment variable is not set") 52 | 53 | print(f"Using TABLE_URL: {table_url}") 54 | 55 | # Get S3 event details 56 | records = event.get('Records', []) 57 | if not records: 58 | raise ValueError("No records found in event") 59 | 60 | # Process each record (usually there will be one) 61 | for record in records: 62 | # Extract S3 information 63 | bucket = record['s3']['bucket']['name'] 64 | key = record['s3']['object']['key'] 65 | 66 | print(f"Processing file {key} from bucket {bucket}") 67 | 68 | # Get and process the file 69 | s3_object = get_s3_object(bucket, key) 70 | df = read_csv_from_s3(s3_object) 71 | 72 | # Write to Delta Lake 73 | write_to_delta(df, table_url) 74 | 75 | print(f"Successfully processed {key}") 76 | 77 | return { 78 | 'statusCode': 200, 79 | 'headers': {'Content-Type': 'application/json'}, 80 | 'body': json.dumps({'message': 'Successfully processed CSV to Delta Lake'}) 81 | } 82 | 83 | except Exception as e: 84 | error_message = str(e) 85 | print(f"Error in lambda_handler: {error_message}") 86 | print(f"Full error details: {repr(e)}") 87 | return { 88 | 'statusCode': 500, 89 | 'headers': {'Content-Type': 'application/json'}, 90 | 'body': json.dumps({'error': error_message}) 91 | } 92 | 93 | Dockerfile: 94 | ============ 95 | FROM public.ecr.aws/lambda/python:3.12 96 | 97 | # Copy requirements.txt 98 | COPY requirements.txt ${LAMBDA_TASK_ROOT} 99 | 100 | # Install the specified packages 101 | RUN pip install -r requirements.txt 102 | 103 | # Copy function code 104 | COPY lambda_function.py ${LAMBDA_TASK_ROOT} 105 | 106 | # Set the CMD to your handler (could also be done as a parameter override outside of the Dockerfile) 107 | CMD [ "lambda_function.lambda_handler" ] 108 | 109 | 110 | requirements.txt: 111 | ================= 112 | pandas 113 | deltalake 114 | 115 | Github Action Code: 116 | ----------------------- 117 | name: ecr_docker_deployment 118 | on: [push] 119 | jobs: 120 | docker_cicd: 121 | runs-on: ubuntu-latest 122 | steps: 123 | - uses: actions/checkout@v4 124 | - name: Configure AWS credentials 125 | uses: aws-actions/configure-aws-credentials@v1 126 | with: 127 | aws-access-key-id: 128 | aws-secret-access-key: 129 | aws-region: us-east-1 130 | - name: Login to Amazon ECR 131 | id: login-ecr 132 | uses: aws-actions/amazon-ecr-login@v2 133 | 134 | - name: Build, tag, and push docker image to Amazon ECR 135 | env: 136 | REGISTRY: ${{ steps.login-ecr.outputs.registry }} 137 | REPOSITORY: deltalakelambdayt 138 | IMAGE_TAG: ${{ github.sha }} 139 | run: | 140 | docker build -t $REGISTRY/$REPOSITORY:$IMAGE_TAG . 
141 | docker push $REGISTRY/$REPOSITORY:$IMAGE_TAG 142 | 143 | Athena SQL: 144 | ============ 145 | DROP TABLE IF EXISTS deltalake_db.lambda_delta; 146 | 147 | CREATE EXTERNAL TABLE deltalake_db.lambda_delta 148 | LOCATION 's3://{Bucket_Name}/delta_lake/' 149 | TBLPROPERTIES ('table_type' = 'DELTA'); 150 | -------------------------------------------------------------------------------- /Delta_Sharing_Python.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "provenance": [], 7 | "authorship_tag": "ABX9TyPR8DU7eGWuInrHAmO4VpyP", 8 | "include_colab_link": true 9 | }, 10 | "kernelspec": { 11 | "name": "python3", 12 | "display_name": "Python 3" 13 | }, 14 | "language_info": { 15 | "name": "python" 16 | } 17 | }, 18 | "cells": [ 19 | { 20 | "cell_type": "markdown", 21 | "metadata": { 22 | "id": "view-in-github", 23 | "colab_type": "text" 24 | }, 25 | "source": [ 26 | "\"Open" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": null, 32 | "metadata": { 33 | "id": "EVzXlXSESBex" 34 | }, 35 | "outputs": [], 36 | "source": [ 37 | "!pip install --upgrade delta-sharing\n" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "source": [ 43 | "import delta_sharing\n", 44 | "\n", 45 | "# Point to the profile file. It can be a file on the local file system or a file on a remote storage.\n", 46 | "profile_file = \"/content/config.share\"\n", 47 | "\n", 48 | "# Create a SharingClient.\n", 49 | "client = delta_sharing.SharingClient(profile_file)\n", 50 | "\n", 51 | "# List all shared tables.\n", 52 | "client.list_all_tables()" 53 | ], 54 | "metadata": { 55 | "id": "lhl16bDZTALh" 56 | }, 57 | "execution_count": null, 58 | "outputs": [] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "source": [ 63 | "table_url = profile_file + \"#..\"\n", 64 | "\n", 65 | "# Fetch 10 rows from a table and convert it to a Pandas DataFrame. This can be used to read sample data\n", 66 | "# from a table that cannot fit in the memory.\n", 67 | "df=delta_sharing.load_as_pandas(table_url, limit=10)" 68 | ], 69 | "metadata": { 70 | "id": "T3vv8RmSTH33" 71 | }, 72 | "execution_count": null, 73 | "outputs": [] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "source": [ 78 | "df.head()" 79 | ], 80 | "metadata": { 81 | "id": "3STzY8-XTXgu" 82 | }, 83 | "execution_count": null, 84 | "outputs": [] 85 | } 86 | ] 87 | } -------------------------------------------------------------------------------- /Demo Data used in Generic Framework for External Table.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SatadruMukherjee/Data-Preprocessing-Models/3aeeda4e68aafd47fe784a71c4aa02a34be76820/Demo Data used in Generic Framework for External Table.zip -------------------------------------------------------------------------------- /Docker Build Args in-depth intuition.txt: -------------------------------------------------------------------------------- 1 | 2 | 3 | Step 1: 4 | -------- 5 | Upload requirements.txt, app.py & test.py in s3 folder 6 | 7 | Dockerfile: 8 | -------------- 9 | FROM python:3.7 10 | ARG AWS_ACCESS_KEY_ID 11 | ARG AWS_SECRET_ACCESS_KEY 12 | ARG AWS_DEFAULT_REGION 13 | COPY . 
/app 14 | WORKDIR /app 15 | RUN curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "awscliv2.zip" 16 | RUN unzip awscliv2.zip 17 | RUN ./aws/install 18 | RUN aws s3 cp --recursive s3://gluezipp/demoyttestdockerset/ /app 19 | RUN pip install -r requirements.txt 20 | CMD ["python","app.py"] 21 | 22 | In Host: 23 | -------- 24 | export AWS_ACCESS_KEY_ID=AKIA4ASLMUJL7WX73WBK 25 | export AWS_SECRET_ACCESS_KEY=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 26 | export AWS_DEFAULT_REGION=us-east-1 27 | 28 | General Code: 29 | --------------- 30 | docker build -t welcome-app . 31 | 32 | docker build --build-arg AWS_ACCESS_KEY_ID=$AWS_ACCESS_KEY_ID --build-arg AWS_SECRET_ACCESS_KEY=$AWS_SECRET_ACCESS_KEY --build-arg AWS_DEFAULT_REGION=$AWS_DEFAULT_REGION -t welcome-app . 33 | 34 | 35 | 36 | Run Container: 37 | ----------------- 38 | docker run -p 5000:5000 welcome-app 39 | 40 | Open another console: 41 | ------------------------ 42 | docker container ls 43 | docker exec -it bf7f40daab84 sh 44 | 45 | Observation: The file is not there 46 | 47 | 48 | -------------------------------------------------------------------------------- /ETL Automation with Metadata Management and Job Tracking.txt: -------------------------------------------------------------------------------- 1 | DynamoDB Table: 2 | ------------------ 3 | DD_JobExecLog 4 | 5 | PK --JobName 6 | SK--JobRunID 7 | 8 | Glue Job: 9 | ------------ 10 | import sys 11 | from awsglue.transforms import * 12 | from awsglue.utils import getResolvedOptions 13 | from pyspark.context import SparkContext 14 | from awsglue.context import GlueContext 15 | from awsglue.job import Job 16 | from pyspark.sql.functions import * 17 | from pyspark.sql.types import * 18 | from pyspark.sql import SparkSession 19 | spark = SparkSession.builder.getOrCreate() 20 | 21 | 22 | def main(): 23 | ## @params: [JOB_NAME] 24 | args = getResolvedOptions(sys.argv, ["VAL1","VAL2"]) 25 | file_name=args['VAL1'] 26 | bucket_name=args['VAL2'] 27 | print("Bucket Name" , bucket_name) 28 | print("File Name" , file_name) 29 | input_file_path="s3a://{}/{}".format(bucket_name,file_name) 30 | print("Input File Path : ",input_file_path); 31 | df = spark.read.csv(input_file_path, header = True) 32 | df.repartition(1).write.mode('overwrite').parquet("s3a://{target_bucket}/{}".format(file_name.split('.')[0])) 33 | 34 | main() 35 | 36 | Trigger Lambda: 37 | ----------------- 38 | from datetime import datetime, timedelta 39 | import json 40 | import boto3 41 | client = boto3.client('glue') 42 | dd = boto3.resource('dynamodb') 43 | table = dd.Table('DD_JobExecLog') 44 | 45 | start_timestamp = str(datetime.now()) 46 | glue_job_name="yt_csv_to_parquet" 47 | def lambda_handler(event, context): 48 | for record in event['Records']: 49 | file_name = record['s3']['object']['key'] 50 | bucketName=record['s3']['bucket']['name'] 51 | print("File Name : ",file_name) 52 | print("Bucket Name : ",bucketName) 53 | fullS3Path = "s3://" + bucketName + "/" + file_name 54 | glue=boto3.client('glue'); 55 | response = glue.start_job_run(JobName = glue_job_name, Arguments={"--VAL1":file_name,"--VAL2":bucketName}) 56 | print(response) 57 | # Converting "response" from Type dict to string 58 | string_response = json.dumps(response) 59 | # Parsing JSON response from Glue API 60 | parsed_response = json.loads(string_response) 61 | ########################### 62 | # 63 | # 64 | table.put_item( 65 | Item={ 66 | 'JobName': glue_job_name, 67 | 'JobRunID': parsed_response['JobRunId'], 68 | 'job_state': 'STARTED 
(Lambda)', 69 | 'start_timestamp': start_timestamp, 70 | 'update_timestamp': 'null', 71 | 'job_message': 'Job Triggered by Lambda', 72 | 'job_severity': 'null', 73 | 's3_file_key': fullS3Path, 74 | 'job_region': 'null', 75 | 'job_time': 'null', 76 | 'job_account': 'null', 77 | 'glue_metadata': parsed_response 78 | } 79 | ) 80 | 81 | 82 | 83 | Update Lambda: 84 | ------------------ 85 | from datetime import datetime, timedelta 86 | import json 87 | import boto3 88 | dd = boto3.resource('dynamodb') 89 | table = dd.Table('DD_JobExecLog') 90 | 91 | def lambda_handler(event, context): 92 | print("Event",event) 93 | jobName = event['detail']['jobName'] 94 | jobRunId = event['detail']['jobRunId'] 95 | job_time = event['time'] 96 | j_account = event['account'] 97 | j_region = event['region'] 98 | j_severity = event['detail']['severity'] 99 | j_state = event['detail']['state'] 100 | j_message = event['detail']['message'] 101 | update_timestamp = str(datetime.now()) 102 | # 103 | if jobName == "yt_csv_to_parquet": 104 | table.update_item( 105 | Key={ 106 | 'JobName': jobName, 107 | 'JobRunID': jobRunId 108 | }, 109 | UpdateExpression='SET job_message= :msg, job_severity= :sev, update_timestamp = :upd_ts, job_time= :jb_tm, job_region= :j_region, job_state= :v_state, job_account= :acc ', 110 | ExpressionAttributeValues={ 111 | ':upd_ts': update_timestamp, 112 | ':jb_tm': job_time, 113 | ':j_region': j_region, 114 | ':sev': j_severity, 115 | ':v_state': j_state, 116 | ':msg': j_message, 117 | ':acc': j_account 118 | } 119 | ) 120 | # 121 | 122 | Cloudwatch Rule: 123 | ------------------ 124 | { 125 | "source": ["aws.glue"], 126 | "detail-type": 127 | [ 128 | "Glue Job State Change", 129 | "Glue Job Run Status" 130 | ] 131 | } -------------------------------------------------------------------------------- /Error Notifications for Snowflake Tasks using SNS.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SatadruMukherjee/Data-Preprocessing-Models/3aeeda4e68aafd47fe784a71c4aa02a34be76820/Error Notifications for Snowflake Tasks using SNS.png -------------------------------------------------------------------------------- /Geolocation_from_IP.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "provenance": [], 7 | "authorship_tag": "ABX9TyMeWIuDcO1FM9I+cdmOIWqx" 8 | }, 9 | "kernelspec": { 10 | "name": "python3", 11 | "display_name": "Python 3" 12 | }, 13 | "language_info": { 14 | "name": "python" 15 | } 16 | }, 17 | "cells": [ 18 | { 19 | "cell_type": "markdown", 20 | "source": [ 21 | "# **Resources**" 22 | ], 23 | "metadata": { 24 | "id": "l1rfOxVLKdoX" 25 | } 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "source": [ 30 | "* [ipinfo](https://ipinfo.io/)\n", 31 | "\n", 32 | "* [Country Code](https://country.io/names.json?ref=ipinfo.io)\n", 33 | "\n", 34 | "* [Json Formatter](https://jsonformatter.curiousconcept.com/#)\n", 35 | "\n", 36 | "\n" 37 | ], 38 | "metadata": { 39 | "id": "q0RwkuBsKL5h" 40 | } 41 | }, 42 | { 43 | "cell_type": "code", 44 | "source": [ 45 | "import json\n", 46 | "import requests" 47 | ], 48 | "metadata": { 49 | "id": "9XVg9L01HV44" 50 | }, 51 | "execution_count": null, 52 | "outputs": [] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": null, 57 | "metadata": { 58 | "id": "IrZEkSwQHKyH" 59 | }, 60 | "outputs": [], 61 | "source": [ 62 | "def geolocation(public_ip):\n", 63 | 
"\turl = \"http://ipinfo.io/\"+ public_ip\n", 64 | "\tresponse = requests.get(url)\n", 65 | "\tdata = json.loads(response.text)\n", 66 | "\tprint(data)" 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "source": [ 72 | "public_ip='{Put your IP here}'" 73 | ], 74 | "metadata": { 75 | "id": "6M7wmQqzHfM2" 76 | }, 77 | "execution_count": null, 78 | "outputs": [] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "source": [ 83 | "geolocation(public_ip)" 84 | ], 85 | "metadata": { 86 | "id": "Yl7RkWMXHcen" 87 | }, 88 | "execution_count": null, 89 | "outputs": [] 90 | }, 91 | { 92 | "cell_type": "markdown", 93 | "source": [ 94 | "# **Lambda Function**" 95 | ], 96 | "metadata": { 97 | "id": "qyJuBky0J_LR" 98 | } 99 | }, 100 | { 101 | "cell_type": "code", 102 | "source": [ 103 | "import json\n", 104 | "import requests\n", 105 | "def geolocation(public_ip):\n", 106 | "\turl = \"http://ipinfo.io/\"+ public_ip\n", 107 | "\tresponse = requests.get(url)\n", 108 | "\tdata = json.loads(response.text)\n", 109 | "\treturn data\n", 110 | "\n", 111 | "def lambda_handler(event, context):\n", 112 | " # TODO implement\n", 113 | " print(event)\n", 114 | " source_ip=event['requestContext']['http']['sourceIp']\n", 115 | " response=geolocation(source_ip)\n", 116 | " print(\"Response: \",response)" 117 | ], 118 | "metadata": { 119 | "id": "AkMAy9M2KBnu" 120 | }, 121 | "execution_count": null, 122 | "outputs": [] 123 | } 124 | ] 125 | } -------------------------------------------------------------------------------- /Get_Data_Lineage_using_Rest_API.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "provenance": [] 7 | }, 8 | "kernelspec": { 9 | "name": "python3", 10 | "display_name": "Python 3" 11 | }, 12 | "language_info": { 13 | "name": "python" 14 | } 15 | }, 16 | "cells": [ 17 | { 18 | "cell_type": "code", 19 | "execution_count": 23, 20 | "metadata": { 21 | "id": "lgVFFTmOPedj" 22 | }, 23 | "outputs": [], 24 | "source": [ 25 | "import requests\n", 26 | "import json" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "source": [ 32 | "access_token = '{}'" 33 | ], 34 | "metadata": { 35 | "id": "gjGYxVGAPt6m" 36 | }, 37 | "execution_count": 25, 38 | "outputs": [] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "source": [ 43 | "endpoint =\"https://{}/api/2.0/lineage-tracking/table-lineage\"" 44 | ], 45 | "metadata": { 46 | "id": "m4ACYxOkP999" 47 | }, 48 | "execution_count": 26, 49 | "outputs": [] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "source": [ 54 | "payload = {\"table_name\": \"workspace.default.iris_date_transformed\", \"include_entity_lineage\": True}" 55 | ], 56 | "metadata": { 57 | "id": "BeSnKJTvmGke" 58 | }, 59 | "execution_count": 30, 60 | "outputs": [] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "source": [ 65 | "response= requests.get(endpoint, headers={\"Authorization\": f\"Bearer {access_token}\"}, data=json.dumps(payload))" 66 | ], 67 | "metadata": { 68 | "id": "Mb6ceI0CmAzN" 69 | }, 70 | "execution_count": 31, 71 | "outputs": [] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "source": [ 76 | "print(response.text)" 77 | ], 78 | "metadata": { 79 | "colab": { 80 | "base_uri": "https://localhost:8080/" 81 | }, 82 | "id": "9KT2nK8kmSmc", 83 | "outputId": "cb0ad7e7-f452-4d35-e773-8eaf67c3cdd8" 84 | }, 85 | "execution_count": 32, 86 | "outputs": [ 87 | { 88 | "output_type": "stream", 89 | "name": "stdout", 90 | "text": [ 91 | 
"{\"upstreams\":[{\"tableInfo\":{\"name\":\"iris_date\",\"catalog_name\":\"workspace\",\"schema_name\":\"default\",\"table_type\":\"TABLE\",\"lineage_timestamp\":\"2025-05-15 16:12:35.0\"},\"notebookInfos\":[{\"workspace_id\":3835792019408392,\"notebook_id\":3011685692879898,\"lineage_timestamp\":\"2025-05-15 16:12:35.0\"}]}],\"downstreams\":[{\"tableInfo\":{\"name\":\"iris_gold\",\"catalog_name\":\"workspace\",\"schema_name\":\"default\",\"table_type\":\"PERSISTED_VIEW\",\"lineage_timestamp\":\"2025-05-15 16:14:22.0\"},\"queryInfos\":[{\"workspace_id\":3835792019408392,\"query_id\":\"f9185d3a-5537-464c-b9b9-5fd6ec9c6d1c\",\"lineage_timestamp\":\"2025-05-15 16:14:22.0\"},{\"workspace_id\":3835792019408392,\"query_id\":\"514c9f90-5ed9-4aa2-b8be-fd036c917a5a\",\"lineage_timestamp\":\"2025-05-15 15:52:35.0\"}]},{\"notebookInfos\":[{\"workspace_id\":3835792019408392,\"notebook_id\":3011685692879898,\"lineage_timestamp\":\"2025-05-15 16:12:40.0\"}]}]}\n" 92 | ] 93 | } 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "source": [], 99 | "metadata": { 100 | "id": "VqRB08rTmTm0" 101 | }, 102 | "execution_count": null, 103 | "outputs": [] 104 | } 105 | ] 106 | } -------------------------------------------------------------------------------- /Glue Workflow Lab.txt: -------------------------------------------------------------------------------- 1 | Step 1: 2 | -------- 3 | Create a s3 bucket with 2 folders 4 | 5 | Step 2: 6 | -------- 7 | Create a Glue role 8 | 9 | Step 3: 10 | -------- 11 | Create a Glue Database 12 | 13 | Step 4: 14 | ------- 15 | Create 2 Glue Crawlers 16 | 17 | Step 5: 18 | -------- 19 | Create a Glue job with Job bookmark enabled-- 20 | 21 | import sys 22 | from awsglue.transforms import * 23 | from awsglue.utils import getResolvedOptions 24 | from pyspark.context import SparkContext 25 | from awsglue.context import GlueContext 26 | from awsglue.job import Job 27 | 28 | ## @params: [JOB_NAME] 29 | args = getResolvedOptions(sys.argv, ['JOB_NAME']) 30 | 31 | sc = SparkContext() 32 | glueContext = GlueContext(sc) 33 | spark = glueContext.spark_session 34 | job = Job(glueContext) 35 | job.init(args['JOB_NAME'], args) 36 | 37 | datasource0 = glueContext.create_dynamic_frame.from_catalog(database = "workflowdemoyt", table_name = "csvstorer", transformation_ctx = "datasource0") 38 | 39 | datasink4 = glueContext.write_dynamic_frame.from_options(frame = datasource0, connection_type = "s3", 40 | connection_options = {"path": "s3://{}/{}/"}, format = "parquet", transformation_ctx = "datasink4") 41 | job.commit() 42 | 43 | Step 6: 44 | -------- 45 | Create the Glue Workflow 46 | 47 | Step 7: 48 | -------- 49 | Download the Snowflake data -- 50 | select * from books where publishyear=2002 and PUBLISHMONTH=23; 51 | 52 | Step 8: 53 | ------- 54 | Trigger the Glue workflow 55 | 56 | Step 9: 57 | -------- 58 | Query using Athena-- 59 | 60 | 61 | Step 10: 62 | ---------- 63 | Download the Snowflake data -- 64 | select * from books where publishyear=2001 and PUBLISHMONTH=1; 65 | 66 | Step 11: 67 | --------- 68 | Trigger the Glue workflow 69 | 70 | Step 12: 71 | --------- 72 | Query using Athena-- 73 | -------------------------------------------------------------------------------- /HTTP Sink Connector.txt: -------------------------------------------------------------------------------- 1 | Start Zookeeper: 2 | --------------------- 3 | F:/kafka_2.12-3.3.1/bin/windows/zookeeper-server-start.bat F:/kafka_2.12-3.3.1/config/zookeeper.properties 4 | 5 | Start Kafka Server: 6 | 
--------------------- 7 | F:/kafka_2.12-3.3.1/bin/windows/kafka-server-start.bat F:/kafka_2.12-3.3.1/config/server.properties 8 | 9 | Create Source Topic: 10 | --------------------- 11 | F:/kafka_2.12-3.3.1/bin/windows/kafka-topics.bat --create --topic http-messages --bootstrap-server localhost:9092 --replication-factor 1 --partitions 1 12 | 13 | http_config.properties: 14 | ------------------------- 15 | name=HttpSinkBasicAuth 16 | topics=http-messages 17 | tasks.max=1 18 | connector.class=io.confluent.connect.http.HttpSinkConnector 19 | # key/val converters 20 | key.converter=org.apache.kafka.connect.storage.StringConverter 21 | value.converter=org.apache.kafka.connect.storage.StringConverter 22 | # licensing for local single-node Kafka cluster 23 | confluent.topic.bootstrap.servers=localhost:9092 24 | confluent.topic.replication.factor=1 25 | # connect reporter required bootstrap server 26 | reporter.bootstrap.servers=localhost:9092 27 | reporter.result.topic.name=success-responses 28 | reporter.result.topic.replication.factor=1 29 | behavior.on.error=log 30 | reporter.error.topic.name=error-responses 31 | reporter.error.topic.replication.factor=1 32 | # http sink connector configs 33 | http.api.url=https://api.mailjet.com/v3.1/send 34 | auth.type=BASIC 35 | connection.user={} 36 | connection.password={} 37 | 38 | Start HTTP Sink Connector: 39 | ------------------------------ 40 | F:/kafka_2.12-3.3.1/bin/windows/connect-standalone.bat F:/kafka_2.12-3.3.1/config/connect-standalone.properties F:/kafka_2.12-3.3.1/config/http_config.properties 41 | 42 | Start Console Producer to Source Topic: 43 | -------------------------------------------- 44 | F:/kafka_2.12-3.3.1/bin/windows/kafka-console-producer.bat --topic http-messages --bootstrap-server localhost:9092 45 | 46 | Start Console Consumer to Source Topic: 47 | -------------------------------------------- 48 | F:/kafka_2.12-3.3.1/bin/windows/kafka-console-consumer.bat --topic http-messages --from-beginning --bootstrap-server localhost:9092 49 | 50 | Start Console Consumer to Response Topic: 51 | -------------------------------------------- 52 | F:/kafka_2.12-3.3.1/bin/windows/kafka-console-consumer.bat --topic success-responses --from-beginning --bootstrap-server localhost:9092 53 | 54 | Start Console Consumer to Error Topic: 55 | -------------------------------------------- 56 | F:/kafka_2.12-3.3.1/bin/windows/kafka-console-consumer.bat --topic error-responses --from-beginning --bootstrap-server localhost:9092 57 | 58 | 59 | Smaple Message: 60 | ----------------- 61 | {"Messages":[{"From":{"Email":"{}","Name":"{}"},"To":[{"Email":"{}","Name":"{}"}],"Subject":"My first Marketing email","TextPart":"Greetings from Knowledge Amplifier. to viewer 1","HTMLPart":"
Dear Viewer 1, welcome to Knowledge Amplifier!
Subscribe our channel for more intresting videos!","CustomID":"AppGettingStartedTest"}]} 62 | -------------------------------------------------------------------------------- /Incremental Crawling , Incremental ETL with Glue Workflow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SatadruMukherjee/Data-Preprocessing-Models/3aeeda4e68aafd47fe784a71c4aa02a34be76820/Incremental Crawling , Incremental ETL with Glue Workflow.png -------------------------------------------------------------------------------- /Kafka Consumer Internals with Auto Offset Commits and At Least Once Processing.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SatadruMukherjee/Data-Preprocessing-Models/3aeeda4e68aafd47fe784a71c4aa02a34be76820/Kafka Consumer Internals with Auto Offset Commits and At Least Once Processing.png -------------------------------------------------------------------------------- /Kafka Consumer Lag Monitoring.txt: -------------------------------------------------------------------------------- 1 | Download Kafka: 2 | --------------------- 3 | wget https://downloads.apache.org/kafka/3.6.0/kafka_2.12-3.6.0.tgz 4 | tar -xvf kafka_2.12-3.6.0.tgz 5 | 6 | Download Java: 7 | --------------------- 8 | java -version 9 | sudo yum -y install java-1.8.0-openjdk 10 | java -version 11 | 12 | vi kafka_2.12-3.6.0/config/server.properties 13 | 14 | Start Zookepper: 15 | ----------------- 16 | cd kafka_2.12-3.6.0 17 | bin/zookeeper-server-start.sh config/zookeeper.properties 18 | 19 | Start Kafka-server: 20 | ---------------------------------------- 21 | export KAFKA_HEAP_OPTS="-Xmx256M -Xms128M" 22 | 23 | cd kafka_2.12-3.6.0 24 | bin/kafka-server-start.sh config/server.properties 25 | 26 | Create Topic: 27 | ---------------- 28 | cd kafka_2.12-3.6.0 29 | bin/kafka-topics.sh --create --topic hello_world2 --bootstrap-server {}:9092 --replication-factor 1 --partitions 3 30 | 31 | 32 | Shell Script to monitor Consumer Lag: 33 | ----------------------------------------- 34 | bin/kafka-consumer-groups.sh --bootstrap-server {}:9092 --group hello_world1 --describe 35 | 36 | 37 | Producer Code: 38 | -------------- 39 | from time import sleep 40 | from json import dumps 41 | from kafka import KafkaProducer 42 | 43 | topic_name='{}' 44 | kafka_server='{}' 45 | def custom_partitioner(key, all_partitions, available): 46 | """ 47 | Customer Kafka partitioner to get the partition corresponding to key 48 | :param key: partitioning key 49 | :param all_partitions: list of all partitions sorted by partition ID 50 | :param available: list of available partitions in no particular order 51 | :return: one of the values from all_partitions or available 52 | """ 53 | print("The key is : {}".format(key)) 54 | print("All partitions : {}".format(all_partitions)) 55 | print("After decoding of the key : {}".format(key.decode('UTF-8'))) 56 | return int(key.decode('UTF-8'))%len(all_partitions) 57 | 58 | 59 | producer = KafkaProducer(bootstrap_servers=[kafka_server],value_serializer=lambda x: dumps(x).encode('utf-8'), 60 | partitioner=custom_partitioner) 61 | 62 | for e in range(1000): 63 | data = {'number' : e} 64 | print(data) 65 | producer.send(topic_name, key=str(e).encode(),value=data) 66 | sleep(0.4) 67 | 68 | Consumer Code: 69 | -------------- 70 | from kafka import KafkaConsumer 71 | from kafka import TopicPartition , OffsetAndMetadata 72 | from time import sleep 73 | import json 74 | 75 | 
topic='{}' 76 | group_id=topic 77 | kafka_server='{}' 78 | consumer = KafkaConsumer (topic,bootstrap_servers = [kafka_server], 79 | value_deserializer=lambda m: json.loads(m.decode('utf-8')),group_id=group_id,auto_offset_reset='earliest', 80 | enable_auto_commit =False) 81 | 82 | 83 | for message in consumer: 84 | print(message) 85 | tp = TopicPartition(message.topic, message.partition) 86 | om = OffsetAndMetadata(message.offset + 1, message.timestamp) 87 | consumer.commit({tp: om}) 88 | sleep(0.4) 89 | 90 | Log Monitor: 91 | ------------- 92 | from kafka import KafkaConsumer 93 | from kafka import TopicPartition , OffsetAndMetadata 94 | from time import sleep 95 | import json 96 | 97 | topic='{}' 98 | group_id=topic 99 | kafka_server='{}' 100 | consumer = KafkaConsumer (topic,bootstrap_servers = [kafka_server],group_id=group_id) 101 | 102 | partitions=consumer.partitions_for_topic(topic) 103 | print(partitions) 104 | 105 | "***********************************************************************************************" 106 | # 107 | tp = [TopicPartition(topic, partition) for partition in partitions] 108 | 109 | topic_partition_last_offset = consumer.end_offsets(tp) 110 | print(topic_partition_last_offset) 111 | 112 | # "***********************************************************************************************" 113 | # 114 | # 115 | for i in tp: 116 | consumer_committed_offset=0 if consumer.committed(i) is None else consumer.committed(i) 117 | last_offset_stored_by_broker_in_partition=topic_partition_last_offset[i] 118 | lag=last_offset_stored_by_broker_in_partition-consumer_committed_offset 119 | print(f"Topic: {topic} - Partition: {i.partition} - Current Consumer Offset: {consumer_committed_offset} - Last Offset: {last_offset_stored_by_broker_in_partition} - Lag : {lag}") 120 | print('*'*100) 121 | -------------------------------------------------------------------------------- /Kafka Faust Snowflake Python.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SatadruMukherjee/Data-Preprocessing-Models/3aeeda4e68aafd47fe784a71c4aa02a34be76820/Kafka Faust Snowflake Python.PNG -------------------------------------------------------------------------------- /Kafka Producer Internals.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SatadruMukherjee/Data-Preprocessing-Models/3aeeda4e68aafd47fe784a71c4aa02a34be76820/Kafka Producer Internals.png -------------------------------------------------------------------------------- /Kafka Retry Architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SatadruMukherjee/Data-Preprocessing-Models/3aeeda4e68aafd47fe784a71c4aa02a34be76820/Kafka Retry Architecture.png -------------------------------------------------------------------------------- /Long-Running Workflows with Human Interactions using Step Functions.txt: -------------------------------------------------------------------------------- 1 | Step Function Code: 2 | -------------------------- 3 | { 4 | "Comment": "A description of my state machine", 5 | "StartAt": "SQS SendMessage", 6 | "States": { 7 | "SQS SendMessage": { 8 | "Type": "Task", 9 | "Resource": "arn:aws:states:::sqs:sendMessage.waitForTaskToken", 10 | "Parameters": { 11 | "MessageBody": { 12 | "input.$": "$", 13 | "MyTaskToken.$": "$$.Task.Token" 14 | }, 15 | "QueueUrl": "{Put the SQS Queue URL here}" 16 | }, 17 | 
"Next": "Choice" 18 | }, 19 | "Choice": { 20 | "Type": "Choice", 21 | "Choices": [ 22 | { 23 | "Variable": "$.body", 24 | "StringEquals": "Approved", 25 | "Next": "Success" 26 | } 27 | ], 28 | "Default": "Fail" 29 | }, 30 | "Success": { 31 | "Type": "Succeed" 32 | }, 33 | "Fail": { 34 | "Type": "Fail" 35 | } 36 | } 37 | } 38 | 39 | Step Function Sample Input: 40 | ----------------------------- 41 | { 42 | "Manager Mail Address": "{}", 43 | "Employee Name":"{}" 44 | } 45 | 46 | Callback Lambda Code: 47 | ----------------------------- 48 | import json 49 | import boto3 50 | import time 51 | import urllib 52 | 53 | client = boto3.client("ses") 54 | 55 | def lambda_handler(event, context): 56 | main_message=json.loads(event['Records'][0]['body']) 57 | print("Main Message Part : {}".format(main_message)) 58 | 59 | step_fucntion_input=main_message['input'] 60 | 61 | manager_main_address=step_fucntion_input['Manager Mail Address'] 62 | employee_to_be_promoted=step_fucntion_input['Employee Name'] 63 | 64 | 65 | task_token=main_message['MyTaskToken'] 66 | print("The task token is : {}".format(task_token)) 67 | task_token_encode=urllib.parse.quote(task_token) 68 | body = """ 69 | Hi,
70 | {} has been nominated for promotion!
. 71 | 72 | Can you please approve:
73 | 74 | {Put the API Invoke URL here}/approve?TaskToken={}
75 | 76 | Or reject:
77 | 78 | {Put the API Invoke URL here}/reject?TaskToken={} 79 | """.format(employee_to_be_promoted, task_token_encode,task_token_encode) 80 | 81 | message = {"Subject": {"Data": 'Your Approval Needed for Promotion!'}, "Body": {"Html": {"Data": body}}} 82 | 83 | response = client.send_email(Source = manager_main_address, Destination = {"ToAddresses": [manager_main_address]}, Message = message) 84 | 85 | print("The mail is sent successfully") 86 | 87 | 88 | 89 | Approve Handler: 90 | ------------------------ 91 | import json 92 | import boto3 93 | import time 94 | 95 | client = boto3.client('stepfunctions') 96 | 97 | def lambda_handler(event, context): 98 | task_token=event['queryStringParameters']['TaskToken'] 99 | print(task_token) 100 | response = client.send_task_success( 101 | taskToken=task_token, 102 | output=json.dumps({'body':'Approved'}) 103 | ) 104 | 105 | 106 | Reject Handler: 107 | ------------------ 108 | import json 109 | import boto3 110 | import time 111 | 112 | client = boto3.client('stepfunctions') 113 | 114 | def lambda_handler(event, context): 115 | task_token=event['queryStringParameters']['TaskToken'] 116 | response = client.send_task_success( 117 | taskToken=task_token, 118 | output=json.dumps({'body':'Rejected'}) 119 | ) -------------------------------------------------------------------------------- /MSK Lab 1.txt: -------------------------------------------------------------------------------- 1 | Step 1: 2 | ------------ 3 | Cretae VPC -- Name -- virtual-private-cloud IPv4 CIDR -- 10.0.0.0/16 4 | Host address range -- 10.0.0.1 - 10.0.255.254 5 | 6 | Step 2: 7 | ----------- 8 | Create 2 public subnets 9 | Public-Subnet-A--10.0.0.0/24 10 | Host address range -- 10.0.0.1 - 10.0.0.254 11 | 12 | Public-Subnet-B--10.0.1.0/24 13 | Host address range -- 10.0.1.1 - 10.0.1.254 14 | 15 | Step 3: 16 | ------------ 17 | Check the default route table -- you will see the above 2 subnets have not been explicitly associated with any route tables and are therefore associated with the main route table. 18 | 19 | Step 4: 20 | ------------ 21 | Create a IGW & connect with VPC 22 | 23 | Step 5: 24 | ------------ 25 | Add the IGW in default route table 26 | 27 | 28 | Step 6: 29 | --------- 30 | Launch MSK Cluster with vpc you created , unauthorised access allowed , plaintext enxryption 31 | (keep security group as it is) 32 | 33 | Step 7: 34 | ------------ 35 | Launch Linux EC2 36 | In the list Network choose the VPC previously created. 37 | In the list Auto-assign Public IP, choose Enable. 38 | 39 | Step 8: 40 | --------- 41 | Once the client for Amazon MSK has been created, the security group rules must be configured to allow the connection between the cluster and the client that we have just created. 42 | 43 | For that , Add the security group id of ec2 to msk cluster security group all traffic 44 | 45 | Repeat these steps to add an inbound rule in the security group that corresponds to your client computer to allow it to receive traffic from the security group from the VPC. Now your client computer can communicate bidirectionally with the MSK Cluster. 46 | 47 | Once this is done, the newly created and configured client can be accessed. 
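
(Optional) Scripted version of Step 8:
----------------------------------------
The two console rules described above can also be added with boto3. This is only a sketch: the security group IDs below are placeholders for the MSK cluster's and the EC2 client's actual groups, not values from this lab.

import boto3

ec2 = boto3.client('ec2', region_name='us-east-1')

MSK_SG_ID = 'sg-xxxxxxxx'      # security group attached to the MSK cluster (placeholder)
CLIENT_SG_ID = 'sg-yyyyyyyy'   # security group attached to the EC2 client (placeholder)

def allow_all_traffic_from(target_sg, source_sg):
    # Add an inbound rule on target_sg allowing all protocols and ports from source_sg
    ec2.authorize_security_group_ingress(
        GroupId=target_sg,
        IpPermissions=[{
            'IpProtocol': '-1',
            'UserIdGroupPairs': [{'GroupId': source_sg}]
        }]
    )

allow_all_traffic_from(MSK_SG_ID, CLIENT_SG_ID)   # cluster accepts traffic from the client
allow_all_traffic_from(CLIENT_SG_ID, MSK_SG_ID)   # client accepts traffic from the cluster
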
48 | 49 | Step 9: 50 | ----------- 51 | sudo yum install java-1.8.0-openjdk 52 | wget https://archive.apache.org/dist/kafka/2.8.1/kafka_2.12-2.8.1.tgz 53 | tar -xvf kafka_2.12-2.8.1.tgz 54 | cd kafka_2.12-2.8.1 55 | 56 | bin/kafka-topics.sh --create --topic demo_testing2 --bootstrap-server {Put the MSK bootstrap server URLs here} --replication-factor 1 --partitions 1 57 | 58 | 59 | bin/kafka-topics.sh --create --topic helloworld --bootstrap-server {Put the MSK bootstrap server URLs here} --replication-factor 1 --partitions 1 60 | 61 | Step 10: 62 | ----------- 63 | Start the kafka Producer 64 | --------------------------- 65 | bin/kafka-console-producer.sh --topic demo_testing2 --bootstrap-server {Put the MSK bootstrap server URLs here} 66 | 67 | In a new console start the kafka consumer-- 68 | cd kafka_2.12-2.8.1 69 | bin/kafka-console-consumer.sh --topic helloworld --bootstrap-server {Put the MSK bootstrap server URLs here} 70 | 71 | Step 11: 72 | ----------- 73 | Install confluent kafka within kafka_2.12-2.8.1) 74 | wget http://packages.confluent.io/archive/5.1/confluent-5.1.2-2.11.zip 75 | unzip confluent-5.1.2-2.11.zip 76 | 77 | export CONFLUENT_HOME=/home/ec2-user/kafka_2.12-2.8.1/confluent-5.1.2 78 | export PATH=$PATH:$CONFLUENT_HOME/bin 79 | (Note , if installing confluent kafka , where kafka is installed (i.e. in /home/ec2-user) , then CONFLUENT_HOME should be -- /home/ec2-user/confluent-5.1.2) 80 | 81 | Step 12: 82 | ----------- 83 | Change the bootstrap.servers in confluent-5.1.2/etc/kafka-rest/kafka-rest.properties 84 | 85 | 86 | 87 | Step 13: 88 | ----------- 89 | Start Kafka Rest 90 | /home/ec2-user/kafka_2.12-2.8.1/confluent-5.1.2/bin/kafka-rest-start /home/ec2-user/kafka_2.12-2.8.1/confluent-5.1.2/etc/kafka-rest/kafka-rest.properties 91 | 92 | (Don't forget to allow all traffic to the security group of EC2 client machine) 93 | 94 | Url to post messages using Kafka rest API-- 95 | http://{Put your cleint machine's Public IP here}:8082/topics/demo_testing2 96 | 97 | Content-Type: application/vnd.kafka.json.v2+json 98 | 99 | Sample Message: 100 | ------------------ 101 | {"records":[{"value":{"name": "testUser"}}]} 102 | 103 | Start consumer to see the messages: 104 | ---------------------------------------- 105 | cd kafka_2.12-2.8.1 106 | bin/kafka-console-consumer.sh --topic demo_testing2 --bootstrap-server {Put the MSK bootstrap server URLs here} 107 | 108 | 109 | 110 | 111 | 112 | 113 | -------------------------------------------------------------------------------- /MSK To AWS Lambda.txt: -------------------------------------------------------------------------------- 1 | Step 1: 2 | ------------ 3 | Cretae VPC -- Name -- virtual-private-cloud-lambda IPv4 CIDR -- 11.0.0.0/16 4 | Host address range -- 11.0.0.1 - 11.0.255.254 5 | 6 | Step 2: 7 | ----------- 8 | Create 2 public subnets 9 | Public-Subnet-A-lambda--11.0.0.0/24--us-east-1a 10 | Host address range -- 11.0.0.1 - 11.0.0.254 11 | 12 | Public-Subnet-B-lambda--11.0.1.0/24--us-east-1b 13 | Host address range -- 11.0.1.1 - 11.0.1.254 14 | 15 | Private-Subnet-A-lambda--11.0.2.0/24--us-east-1a 16 | Host address range -- 11.0.2.1 - 11.0.2.254 17 | 18 | Private-Subnet-B-lambda--11.0.3.0/24--us-east-1b 19 | Host address range -- 11.0.3.1 - 11.0.3.254 20 | 21 | Step 3: 22 | ------------ 23 | Create an IGW and attach with VPC 24 | 25 | Step 4: 26 | --------- 27 | Create 2 route tables 1 for Public subnets and 1 for Private subnets 28 | (Attach IGW with Public route tables) 29 | 30 | Step 5: 31 | ------------ 32 | Launch MSK 
Cluster in Private subnets(keep unauthorised access and plaintext authentication) 33 | 34 | Step 6: 35 | ----------- 36 | Create NAT Gateway in public subnet and attach with Private Subnet route table 37 | 38 | Step 7: 39 | ------------- 40 | Launch an EC2 in a public subnet in same VPC as of MSK Cluster in a public subnet. 41 | Launch an EC2 in private subnet in same VPC as of MSK Cluster in a private subnet. 42 | 43 | Step 8: 44 | ----------- 45 | Add private ec2 security group and msk security group both way all traffic. 46 | 47 | Step 9: 48 | ------------- 49 | Enter in public subnet , from there enter in private subnet. 50 | 51 | Step 7: 52 | ----------- 53 | sudo yum install java-1.8.0-openjdk 54 | wget https://archive.apache.org/dist/kafka/2.8.1/kafka_2.12-2.8.1.tgz 55 | tar -xvf kafka_2.12-2.8.1.tgz 56 | cd kafka_2.12-2.8.1 57 | 58 | bin/kafka-topics.sh --create --topic demo_testing2 --bootstrap-server --replication-factor 1 --partitions 2 59 | 60 | Step 8: 61 | ---------- 62 | Perform local testing: 63 | ----------------------------- 64 | bin/kafka-console-producer.sh --topic demo_testing2 --bootstrap-server 65 | 66 | In a new console start the kafka consumer-- 67 | cd kafka_2.12-2.8.1 68 | bin/kafka-console-consumer.sh --topic demo_testing2 --bootstrap-server 69 | 70 | 71 | 72 | Step 8: 73 | ----------- 74 | You set up a target Lambda function with MSK and VPC access. 75 | 76 | Step 9: 77 | ----------- 78 | Create Lambda Function with MSK Trigger 79 | 80 | Sample Event: 81 | ------------------ 82 | { 83 | "eventSource":"aws:kafka", 84 | "eventSourceArn":"", 85 | "bootstrapServers":"", 86 | "records":{ 87 | "demo_testing2-0":[ 88 | { 89 | "topic":"demo_testing2", 90 | "partition":0, 91 | "offset":34, 92 | "timestamp":1674023898925, 93 | "timestampType":"CREATE_TIME", 94 | "value":"eyJIZWxsbyI6IldvcmxkIn0=", 95 | "headers":[ 96 | 97 | ] 98 | } 99 | ] 100 | } 101 | } 102 | 103 | 104 | 105 | import base64 106 | import boto3 107 | import json 108 | 109 | def lambda_handler(event, context): 110 | # TODO implement 111 | print(event) 112 | for partition_key in event['records']: 113 | partition_value=event['records'][partition_key] 114 | for record_value in partition_value: 115 | print((base64.b64decode(record_value['value'])).decode()) 116 | 117 | 118 | 119 | 120 | -------------------------------------------------------------------------------- /MSK project with serverless Producer & Consumer.txt: -------------------------------------------------------------------------------- 1 | Step 1: 2 | -------- 3 | Create a NAT Gateway & attach with Private subnet route table 4 | 5 | Step 2: 6 | --------- 7 | Launch one MSK Cluster in private subnet 8 | 9 | Step 3: 10 | ---------- 11 | Create a Lambda code (Python 3.8) 12 | 13 | from time import sleep 14 | from json import dumps 15 | from kafka import KafkaProducer 16 | import json 17 | 18 | topic_name='{Provide the topic name here}' 19 | producer = KafkaProducer(bootstrap_servers=['{Put the broker URLs here}' 20 | ,'{Put the broker URLs here}'],value_serializer=lambda x: dumps(x).encode('utf-8')) 21 | 22 | def lambda_handler(event, context): 23 | print(event) 24 | for i in event['Records']: 25 | sqs_message =json.loads((i['body'])) 26 | print(sqs_message) 27 | producer.send(topic_name, value=sqs_message) 28 | 29 | producer.flush() 30 | 31 | Step 4: 32 | ---------- 33 | Increase the timeout for Lambda to 2 mins , provide SQS,MSK and VPC access & put in Private VPC (where MSK Brokers are running) 34 | 35 | Configure Lambda Layer-- 36 | Reference: 
37 | ------------ 38 | https://youtube.com/watch?v=uleTVY7LkMM&feature=shares 39 | 40 | 41 | 42 | Step 5: 43 | --------- 44 | Launch one SQS Queue with visibility timeout to 240 sec 45 | 46 | Step 6: 47 | ---------- 48 | Create an API Gateway and setup integration with SQS Queue 49 | 50 | Step 7: 51 | --------- 52 | Test the integration , if works , then setup integration with AWS Lambda Producer 53 | 54 | 55 | Step 8: 56 | --------- 57 | Create an s3 bucket for data archival 58 | 59 | Step 9: 60 | --------- 61 | Configure kinesis Firehose 62 | 63 | 64 | Step 10: 65 | ----------- 66 | Configure the Consumer Lambda Code: 67 | 68 | import base64 69 | import boto3 70 | import json 71 | 72 | client = boto3.client('firehose') 73 | 74 | def lambda_handler(event, context): 75 | print(event) 76 | for partition_key in event['records']: 77 | partition_value=event['records'][partition_key] 78 | for record_value in partition_value: 79 | actual_message=json.loads((base64.b64decode(record_value['value'])).decode('utf-8')) 80 | print(actual_message) 81 | newImage = (json.dumps(actual_message)+'\n').encode('utf-8') 82 | print(newImage) 83 | response = client.put_record( 84 | DeliveryStreamName='{Kinesis Delivery Stream Name}', 85 | Record={ 86 | 'Data': newImage 87 | }) 88 | 89 | Step 11: 90 | ----------- 91 | Provide KinesisFirehose write access , VPC access , MSK access to this Lambda 92 | 93 | 94 | Step 12: 95 | ---------- 96 | Launch an EC2 in a public subnet in same VPC as of MSK Cluster in a public subnet. 97 | Launch an EC2 in private subnet in same VPC as of MSK Cluster in a private subnet. 98 | 99 | 100 | Step 13: 101 | ----------- 102 | Add private ec2 security group and msk security group both way all traffic. 103 | 104 | Step 14: 105 | ------------- 106 | Enter in public subnet , from there enter in private subnet. 
107 | 108 | sudo yum install java-1.8.0-openjdk 109 | wget https://archive.apache.org/dist/kafka/2.8.1/kafka_2.12-2.8.1.tgz 110 | tar -xvf kafka_2.12-2.8.1.tgz 111 | cd kafka_2.12-2.8.1 112 | 113 | bin/kafka-topics.sh --create --topic demo_testing2 --bootstrap-server {} --replication-factor 1 --partitions 2 114 | 115 | 116 | Step 15: 117 | ------------ 118 | Start kafka console consumer and check whether from Lambda messages are getting published in kafka topic or not 119 | bin/kafka-console-consumer.sh --topic demo_testing2 --bootstrap-server {} 120 | 121 | 122 | Step 16: 123 | ------------ 124 | Add MSK Trigger from Consumer Lambda 125 | 126 | 127 | Step 17: 128 | --------- 129 | Peform end to end testing 130 | 131 | {"station":"OH","temp":"26.39f"} 132 | {"station":"WA","temp":"40.00F"} 133 | {"station":"TX","temp":"15.01F"} 134 | {"station":"NC","temp":"32.36f"} 135 | {"station":"WA","temp":"62.86F"} 136 | {"station":"NC","temp":"49.43f"} 137 | {"station":"MD","temp":"2.30f"} 138 | -------------------------------------------------------------------------------- /MSK with s3 sink connector and Snowflake.txt: -------------------------------------------------------------------------------- 1 | Documentation: 2 | ------------------ 3 | https://aws.amazon.com/blogs/big-data/back-up-and-restore-kafka-topic-data-using-amazon-msk-connect/ 4 | 5 | 6 | Step 1:Launch MSK Cluster: 7 | ------------------------------------- 8 | Configure NAT Gateway & launch MSK Cluster in Private Subnet 9 | 10 | Step 2: 11 | ---------- 12 | Create IAM role for MSK Connect--s3connectordemoyt 13 | 14 | IAM Role:s3--give s3 full access,kms access , msk access 15 | 16 | Trust Relationship-- 17 | 18 | { 19 | "Version": "2012-10-17", 20 | "Statement": [ 21 | { 22 | "Effect": "Allow", 23 | "Principal": { 24 | "Service": "kafkaconnect.amazonaws.com" 25 | }, 26 | "Action": "sts:AssumeRole", 27 | "Condition": { 28 | "StringEquals": { 29 | "aws:SourceAccount": "" 30 | } 31 | } 32 | } 33 | ] 34 | } 35 | 36 | Step 3: 37 | ----------- 38 | Download the Jar from the below link -- 39 | https://github.com/lensesio/stream-reactor/releases 40 | 41 | Uplaod the jar file in s3 42 | 43 | 44 | Step 4: 45 | --------- 46 | Create the custom plugin using the jar uplaoded in s3 in Step 3 47 | s3sinkconnectortest123 48 | 49 | 50 | Step 5: 51 | --------- 52 | Create public and private ec2 which will act as client machine for MSK Cluster 53 | 54 | 55 | Step 6:In EC2 Client Machine: 56 | ----------------------------- 57 | sudo yum install java-1.8.0-openjdk 58 | wget https://archive.apache.org/dist/kafka/2.8.1/kafka_2.12-2.8.1.tgz 59 | tar -xvf kafka_2.12-2.8.1.tgz 60 | cd kafka_2.12-2.8.1 61 | 62 | 63 | bin/kafka-topics.sh --create --topic demotesting3 --bootstrap-server {} --replication-factor 1 --partitions 2 64 | 65 | Step 7:Create the connector 66 | ----------------------------- 67 | Documentation Link: 68 | ---------------------- 69 | https://docs.lenses.io/5.0/integrations/connectors/stream-reactor/sinks/s3sinkconnector/ 70 | 71 | 72 | 73 | connector.class=io.lenses.streamreactor.connect.aws.s3.sink.S3SinkConnector 74 | tasks.max=2 75 | topics=demotesting3 76 | connect.s3.vhost.bucket=true 77 | schema.enable=false 78 | key.converter.schemas.enable=false 79 | connect.s3.kcql=INSERT INTO irisseta:MSKBuildLabClusterdate SELECT * FROM demotesting3 PARTITIONBY _date.uuuu,_date.LL,_date.dd STOREAS `JSON` WITHPARTITIONER=Values WITH_FLUSH_SIZE = 10000 WITH_FLUSH_INTERVAL = 300 WITH_FLUSH_COUNT = 20 80 | aws.region=us-east-1 81 | 
aws.custom.endpoint=https://s3.us-east-1.amazonaws.com/ 82 | value.converter.schemas.enable=false 83 | connect.s3.aws.region=us-east-1 84 | value.converter=org.apache.kafka.connect.json.JsonConverter 85 | errors.log.enable=true 86 | key.converter=org.apache.kafka.connect.json.JsonConverter 87 | 88 | 89 | Step 8:Setup Snowflake Table ,Snowpipe and s3 event notifications 90 | ------------------------------------------------------------------ 91 | --drop database if exists 92 | drop database if exists s3_to_snowflake; 93 | 94 | --Database Creation 95 | create database if not exists s3_to_snowflake; 96 | 97 | --Use the database 98 | use s3_to_snowflake; 99 | 100 | --create the external stage 101 | create or replace stage s3_to_snowflake.PUBLIC.Snow_stage url="s3://{}" 102 | credentials=(aws_key_id='' 103 | aws_secret_key=''); 104 | 105 | list @Snow_stage; 106 | 107 | create or replace table s3_to_snowflake.PUBLIC.real_time_demo(data variant); 108 | 109 | 110 | --Create the Pipe 111 | create or replace pipe s3_to_snowflake.PUBLIC.for_kafka_ingestion 112 | auto_ingest=true as copy into s3_to_snowflake.PUBLIC.real_time_demo from 113 | @s3_to_snowflake.PUBLIC.Snow_stage FILE_FORMAT=(type = 'JSON'); 114 | 115 | show pipes; 116 | 117 | 118 | 119 | 120 | Test:Start consumer in a new window 121 | ------------------------------------------ 122 | 123 | cd kafka_2.12-2.8.1 124 | bin/kafka-console-consumer.sh --topic demotesting3 --bootstrap-server b-1.kafkas3connectordemoyt.pvvnij.c3.kafka.us-east-1.amazonaws.com:9092,b-2.kafkas3connectordemoyt.pvvnij.c3.kafka.us-east-1.amazonaws.com:9092 125 | 126 | Produce messages: 127 | --------------------- 128 | pip install kafka-python 129 | 130 | 131 | from time import sleep 132 | from json import dumps 133 | from kafka import KafkaProducer 134 | topic_name='demotesting3' 135 | producer = KafkaProducer(bootstrap_servers=['{}'],value_serializer=lambda x: dumps(x).encode('utf-8')) 136 | 137 | for e in range(1000): 138 | data = {'number' : e} 139 | print(data) 140 | producer.send(topic_name, value=data) 141 | sleep(0.2) 142 | 143 | 144 | 145 | 146 | Observer partitioning : 147 | --------------------------- 148 | bucket/prefix/customValue/topic(partition_offset) 149 | 150 | Download the data and observe the json 151 | 152 | Check in Snowflake 153 | ------------------------ 154 | 155 | 156 | select * from s3_to_snowflake.PUBLIC.real_time_demo; 157 | 158 | select count(*) from s3_to_snowflake.PUBLIC.real_time_demo; 159 | 160 | 161 | select parse_json(Data):number as value_part from s3_to_snowflake.PUBLIC.real_time_demo order by value_part; 162 | 163 | 164 | 165 | -------------------------------------------------------------------------------- /Manual Offset Commits and At-most Once Processing.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SatadruMukherjee/Data-Preprocessing-Models/3aeeda4e68aafd47fe784a71c4aa02a34be76820/Manual Offset Commits and At-most Once Processing.png -------------------------------------------------------------------------------- /Manual Offset Commits and Exactly Once Processing.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SatadruMukherjee/Data-Preprocessing-Models/3aeeda4e68aafd47fe784a71c4aa02a34be76820/Manual Offset Commits and Exactly Once Processing.png -------------------------------------------------------------------------------- /Multimodal Embedding using BedRock.zip: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/SatadruMukherjee/Data-Preprocessing-Models/3aeeda4e68aafd47fe784a71c4aa02a34be76820/Multimodal Embedding using BedRock.zip -------------------------------------------------------------------------------- /Publish Message in MSK Cluster from AWS Lambda.txt: -------------------------------------------------------------------------------- 1 | Code to install Java & Kafka in EC2 Client Machine: 2 | --------------------------------------------------------------------------- 3 | sudo yum install java-1.8.0-openjdk 4 | wget https://archive.apache.org/dist/kafka/2.8.1/kafka_2.12-2.8.1.tgz 5 | tar -xvf kafka_2.12-2.8.1.tgz 6 | 7 | Creating Kafka Topic: 8 | ---------------------------------- 9 | cd kafka_2.12-2.8.1 10 | bin/kafka-topics.sh --create --topic {topic_name} --bootstrap-server {Put the MSK bootstrap server URLs here} --replication-factor 1 --partitions 1 11 | 12 | Start Console Consumer: 13 | ---------------------------------- 14 | cd kafka_2.12-2.8.1 15 | bin/kafka-console-consumer.sh --topic {Topic Name} --bootstrap-server {Put the MSK bootstrap server URLs here} 16 | 17 | Start Python Producer: 18 | ---------------------------------- 19 | Install the Python Module: 20 | ------------------------------------ 21 | pip install kafka-python 22 | 23 | Producer Code: 24 | ------------------------------------ 25 | from time import sleep 26 | from json import dumps 27 | from kafka import KafkaProducer 28 | 29 | topic_name='{Topic Name}' 30 | producer = KafkaProducer(bootstrap_servers=['{Put 1st MSK bootstrap server URL here}','{Put 2nd MSK bootstrap server URL here}',...],value_serializer=lambda x: dumps(x).encode('utf-8')) 31 | 32 | for e in range(1000): 33 | data = {'number' : e} 34 | print(data) 35 | producer.send(topic_name, value=data) 36 | sleep(1) 37 | 38 | 39 | 40 | 41 | 42 | Code to create the Lambda Layer: 43 | --------------------------------------------------------- 44 | sudo apt-get update 45 | sudo apt install python3-virtualenv 46 | virtualenv kafka_yt 47 | source kafka_yt/bin/activate 48 | python3 --version 49 | sudo apt install python3-pip 50 | python3 -m pip install --upgrade pip 51 | mkdir -p lambda_layers/python/lib/python3.8/site-packages 52 | cd lambda_layers/python/lib/python3.8/site-packages 53 | pip install kafka-python -t . 
54 | cd /mnt/c/Users/USER/lambda_layers 55 | sudo apt install zip 56 | zip -r kafka_yt_demo.zip * 57 | 58 | 59 | Lambda Security Confuguration: 60 | -------------------------------------- 61 | Provide AmazonVPCFullAccess to the Lambda execution role 62 | Configure VPC for Lambda (Choose both subnets & provide the security group of MSK Cluster) 63 | 64 | 65 | Lambda Code to publish messages in MSK Topic: 66 | ---------------------------------------------------------------- 67 | from time import sleep 68 | from json import dumps 69 | from kafka import KafkaProducer 70 | 71 | topic_name='{Topic Name}' 72 | producer = KafkaProducer(bootstrap_servers=['{Put 1st MSK bootstrap server URL here}','{Put 2nd MSK bootstrap server URL here}',...],value_serializer=lambda x: dumps(x).encode('utf-8')) 73 | 74 | def lambda_handler(event, context): 75 | for e in range(10): 76 | data = e 77 | producer.send(topic_name, value=data) 78 | sleep(0.5) 79 | 80 | 81 | 82 | -------------------------------------------------------------------------------- /RAG using Kendra & Langchain AWS.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "provenance": [], 7 | "authorship_tag": "ABX9TyMC8x+t5ojmOvhyzoCu1KA6", 8 | "include_colab_link": true 9 | }, 10 | "kernelspec": { 11 | "name": "python3", 12 | "display_name": "Python 3" 13 | }, 14 | "language_info": { 15 | "name": "python" 16 | } 17 | }, 18 | "cells": [ 19 | { 20 | "cell_type": "markdown", 21 | "metadata": { 22 | "id": "view-in-github", 23 | "colab_type": "text" 24 | }, 25 | "source": [ 26 | "\"Open" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": null, 32 | "metadata": { 33 | "id": "tAPKj2yh_frr" 34 | }, 35 | "outputs": [], 36 | "source": [ 37 | "!pip install boto3 langchain langchain-pinecone\n" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "source": [ 43 | "import boto3\n", 44 | "import os\n", 45 | "\n", 46 | "\n", 47 | "boto3_bedrock = boto3.client('bedrock-runtime',region_name='us-east-1',aws_access_key_id='',aws_secret_access_key='')\n", 48 | "\n", 49 | "kendra_client=boto3.client('kendra',region_name='us-east-1',aws_access_key_id='',aws_secret_access_key='')" 50 | ], 51 | "metadata": { 52 | "id": "d61bXvX_GDDe" 53 | }, 54 | "execution_count": null, 55 | "outputs": [] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "source": [ 60 | "from langchain_community.retrievers import AmazonKendraRetriever\n", 61 | "\n", 62 | "retriever = AmazonKendraRetriever(index_id=\"d0215a1d-87f2-41de-906f-edd13da9fb62\",client =kendra_client)\n", 63 | "\n", 64 | "\n", 65 | "\n", 66 | "response=retriever.get_relevant_documents(\"How many major terror attacks happened in any city in India since 2014?\")\n", 67 | "\n", 68 | "response" 69 | ], 70 | "metadata": { 71 | "id": "QHiLvSKSGF3r" 72 | }, 73 | "execution_count": null, 74 | "outputs": [] 75 | }, 76 | { 77 | "cell_type": "code", 78 | "source": [ 79 | "from langchain import PromptTemplate\n", 80 | "\n", 81 | "RAG_PROMPT_TEMPLATE = '''Here is some important context which can help inform the questions the Human asks.\n", 82 | "Make sure to not make anything up to answer the question if it is not provided in the context.\n", 83 | "\n", 84 | "\n", 85 | "{context}\n", 86 | "\n", 87 | "\n", 88 | "Human: {human_input}\n", 89 | "\n", 90 | "Assistant:\n", 91 | "'''\n", 92 | "PROMPT = PromptTemplate.from_template(RAG_PROMPT_TEMPLATE)\n", 93 | "\n", 94 | "import json\n", 95 | 
"human_input=input(\"Enter the question : \")\n", 96 | "search_results =retriever.get_relevant_documents(human_input)\n", 97 | "context_string = '\\n\\n'.join([f'Document {ind+1}: ' + i.page_content for ind, i in enumerate(search_results)])\n", 98 | "prompt_data = PROMPT.format(human_input=human_input, context=context_string)\n", 99 | "inputText=prompt_data\n", 100 | "body_part=json.dumps({'inputText': inputText,\n", 101 | " 'textGenerationConfig': {'maxTokenCount': 8192,\n", 102 | " 'stopSequences': [],\n", 103 | " 'temperature': 0,\n", 104 | " 'topP': 1}})\n", 105 | "response = boto3_bedrock.invoke_model(\n", 106 | " body=body_part,\n", 107 | " contentType=\"application/json\",\n", 108 | " accept=\"application/json\",\n", 109 | " modelId='amazon.titan-text-express-v1'\n", 110 | ")\n", 111 | "output_text=json.loads(response['body'].read())['results'][0]['outputText']\n", 112 | "output_text" 113 | ], 114 | "metadata": { 115 | "id": "X5m4SQ9GH9Gv" 116 | }, 117 | "execution_count": null, 118 | "outputs": [] 119 | } 120 | ] 121 | } -------------------------------------------------------------------------------- /Real-Time Streaming Project with Smartphone Data.txt: -------------------------------------------------------------------------------- 1 | Lauch Kafka: 2 | ------------------- 3 | 4 | https://github.com/SatadruMukherjee/Data-Preprocessing-Models/blob/main/kafka_yt_demo.zip 5 | 6 | Lambda Code: 7 | ---------------- 8 | import json 9 | from time import sleep 10 | from json import dumps 11 | from kafka import KafkaProducer 12 | import json 13 | 14 | topic_name='sensor_data_consumer' 15 | producer = KafkaProducer(bootstrap_servers=['52.87.254.233:9092'] 16 | ,value_serializer=lambda x: dumps(x).encode('utf-8')) 17 | 18 | def lambda_handler(event, context): 19 | # TODO implement 20 | print(event) 21 | payload_part=json.loads(event['body'])['payload'] 22 | for i in payload_part: 23 | light_illumination=i['values']['lux'] 24 | capture_time=i['time'] 25 | data={"light_illumination":light_illumination,"capture_time":capture_time} 26 | print(data) 27 | producer.send(topic_name, value=data) 28 | producer.flush() 29 | return { 30 | 'statusCode': 200, 31 | 'body': json.dumps('Hello from Lambda!') 32 | } 33 | 34 | 35 | 36 | 37 | 38 | wget https://dlcdn.apache.org/kafka/3.4.0/kafka_2.13-3.4.0.tgz 39 | tar -xvf kafka_2.13-3.4.0.tgz 40 | 41 | 42 | To install Java -- 43 | ---------------------------------------- 44 | java -version 45 | sudo yum install java-1.8.0-openjdk 46 | java -version 47 | cd kafka_2.13-3.4.0 48 | 49 | Start Zoo-keeper: 50 | ------------------------------- 51 | bin/zookeeper-server-start.sh config/zookeeper.properties 52 | 53 | 54 | Start Kafka-server: 55 | ---------------------------------------- 56 | Duplicate the session & enter in a new console -- 57 | export KAFKA_HEAP_OPTS="-Xmx256M -Xms128M" 58 | cd kafka_2.13-3.4.0 59 | bin/kafka-server-start.sh config/server.properties 60 | 61 | It is pointing to private server , change server.properties so that it can run in public IP 62 | 63 | To do this , you can follow any of the 2 approaches shared belwo -- 64 | 1)Do a vi config/server.properties in insert mode -- change ADVERTISED_LISTENERS to public ip of the EC2 instance 65 | 2)You can modify the file using Winscp also 66 | 67 | To create topic: 68 | ------------------ 69 | cd kafka_2.13-3.4.0 70 | bin/kafka-topics.sh --create --topic sensor_data_consumer --bootstrap-server 100.26.220.99:9092 --replication-factor 1 --partitions 1 71 | 72 | Start Kafka Console Consumer: 73 | 
-------------------------------------- 74 | bin/kafka-console-consumer.sh --topic sensor_data_consumer --bootstrap-server 100.26.220.99:9092 75 | 76 | Sample Json event received from API: 77 | --------------------------------------- 78 | { 79 | "version":"2.0", 80 | "routeKey":"POST /publishtokafka", 81 | "rawPath":"/publishtokafka", 82 | "rawQueryString":"", 83 | "headers":{ 84 | "accept-encoding":"gzip", 85 | "content-length":"1141", 86 | "content-type":"application/json", 87 | "host":"szl0e9g8og.execute-api.us-east-1.amazonaws.com", 88 | "user-agent":"okhttp/4.9.2", 89 | "x-amzn-trace-id":"Root=1-641b2ce4-718735061957bb75192829c1", 90 | "x-forwarded-for":"43.226.31.179", 91 | "x-forwarded-port":"443", 92 | "x-forwarded-proto":"https" 93 | }, 94 | "requestContext":{ 95 | "accountId":"825865577047", 96 | "apiId":"szl0e9g8og", 97 | "domainName":"szl0e9g8og.execute-api.us-east-1.amazonaws.com", 98 | "domainPrefix":"szl0e9g8og", 99 | "http":{ 100 | "method":"POST", 101 | "path":"/publishtokafka", 102 | "protocol":"HTTP/1.1", 103 | "sourceIp":"43.226.31.179", 104 | "userAgent":"okhttp/4.9.2" 105 | }, 106 | "requestId":"CMPzriYgoAMEJ1Q=", 107 | "routeKey":"POST /publishtokafka", 108 | "stage":"$default", 109 | "time":"22/Mar/2023:16:29:24 +0000", 110 | "timeEpoch":1679502564025 111 | }, 112 | "body":"{\"messageId\":22,\"sessionId\":\"9dc9bf11-6301-477c-97a8-50cfc08c77d6\",\"deviceId\":\"a390a36d-eee4-466c-8287-3360165e351c\",\"payload\":[{\"name\":\"light\",\"values\":{\"lux\":6.329113960266113},\"accuracy\":3,\"time\":1679502563452012300},{\"name\":\"light\",\"values\":{\"lux\":6.329113960266113},\"accuracy\":3,\"time\":1679502563552018200},{\"name\":\"light\",\"values\":{\"lux\":6.329113960266113},\"accuracy\":3,\"time\":1679502563652024000},{\"name\":\"light\",\"values\":{\"lux\":6.329113960266113},\"accuracy\":3,\"time\":1679502563752029700},{\"name\":\"light\",\"values\":{\"lux\":6.329113960266113},\"accuracy\":3,\"time\":1679502563852004600},{\"name\":\"light\",\"values\":{\"lux\":6.329113960266113},\"accuracy\":3,\"time\":1679502563952010800},{\"name\":\"light\",\"values\":{\"lux\":6.329113960266113},\"accuracy\":3,\"time\":1679502564052016400},{\"name\":\"light\",\"values\":{\"lux\":6.329113960266113},\"accuracy\":3,\"time\":1679502564152022300},{\"name\":\"light\",\"values\":{\"lux\":6.329113960266113},\"accuracy\":3,\"time\":1679502564252028000},{\"name\":\"light\",\"values\":{\"lux\":6.329113960266113},\"accuracy\":3,\"time\":1679502564352003000},{\"name\":\"light\",\"values\":{\"lux\":6.329113960266113},\"accuracy\":3,\"time\":1679502564452009000}]}", 113 | "isBase64Encoded":false 114 | } 115 | 116 | -------------------------------------------------------------------------------- /Recursion Pattern with AWS Step Funciton & Lambda.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SatadruMukherjee/Data-Preprocessing-Models/3aeeda4e68aafd47fe784a71c4aa02a34be76820/Recursion Pattern with AWS Step Funciton & Lambda.png -------------------------------------------------------------------------------- /Run Batch job Using AWS Lambda.txt: -------------------------------------------------------------------------------- 1 | s3 bucket: 2 | DynamoDB: 3 | 4 | Docker Install: 5 | --------------- 6 | AWS Linux 7 | sudo yum update -y 8 | sudo amazon-linux-extras install docker 9 | sudo service docker start 10 | sudo usermod -a -G docker ec2-user 11 | 12 | mkdir lambdafargate 13 | cd lambdafargate 14 | 15 | requirements.txt: 16 | 
--------------------- 17 | boto3 18 | 19 | Dockerfile: 20 | ----------- 21 | # Use an official Python runtime as a parent image 22 | FROM python:3.8-slim 23 | 24 | # Set the working directory to /app 25 | WORKDIR /app 26 | 27 | # Copy the current directory contents into the container at /app 28 | COPY . /app 29 | 30 | # Install any needed packages specified in requirements.txt 31 | RUN pip install --no-cache-dir -r requirements.txt 32 | 33 | # Run script.py when the container launches 34 | CMD ["echo" "hello"] 35 | 36 | ["python","script.py","demoytlambdabatchtest","Setosa1.csv"] 37 | 38 | script.py: 39 | ---------- 40 | import boto3 41 | import csv 42 | import sys 43 | from io import StringIO 44 | 45 | def main(bucket, key): 46 | s3_client = boto3.client('s3',region_name='us-east-1') 47 | dynamodb_client = boto3.resource('dynamodb',region_name='us-east-1') 48 | 49 | # Read the CSV file from S3 50 | file_obj = s3_client.get_object(Bucket=bucket, Key=key) 51 | csv_content = file_obj['Body'].read().decode('utf-8') 52 | 53 | # Define the DynamoDB table 54 | table = dynamodb_client.Table('iris_dataset') 55 | 56 | # Read the CSV content 57 | csv_reader = csv.DictReader(StringIO(csv_content)) 58 | 59 | # Iterate through the CSV and write to DynamoDB 60 | for row in csv_reader: 61 | Id = int(row['Id']) 62 | SEPAL_LENGTH = row['SEPAL_LENGTH'] 63 | SEPAL_WIDTH = (row['SEPAL_WIDTH']) 64 | PETAL_LENGTH = row['PETAL_LENGTH'] 65 | PETAL_WIDTH = row['PETAL_WIDTH'] 66 | CLASS_NAME = row['CLASS_NAME'] 67 | 68 | # Write to DynamoDB 69 | table.put_item( 70 | Item={ 71 | 'Id':Id, 72 | 'SEPAL_LENGTH': SEPAL_LENGTH, 73 | 'SEPAL_WIDTH': SEPAL_WIDTH, 74 | 'PETAL_LENGTH': PETAL_LENGTH, 75 | 'PETAL_WIDTH': PETAL_WIDTH, 76 | 'CLASS_NAME':CLASS_NAME 77 | } 78 | ) 79 | 80 | print('CSV processed successfully!') 81 | 82 | if __name__ == "__main__": 83 | # Extract command-line arguments 84 | if len(sys.argv) != 3: 85 | print("Usage: python script.py ") 86 | sys.exit(1) 87 | 88 | s3_bucket = sys.argv[1] 89 | s3_key = sys.argv[2] 90 | 91 | # Execute the main function with provided arguments 92 | main(s3_bucket, s3_key) 93 | 94 | Docker Image Build & Test: 95 | --------------------------- 96 | ECR Push 97 | 98 | AWS Batch Components Creation 99 | 100 | Batch IAM Role creation --dynmaodb , ecs task execution role , s3 access 101 | 102 | Lambda Function: 103 | ------------------ 104 | import boto3 105 | import json 106 | 107 | def lambda_handler(event, context): 108 | print(event) 109 | # Extract necessary information from the S3 event 110 | s3_bucket = event['Records'][0]['s3']['bucket']['name'] 111 | s3_key = event['Records'][0]['s3']['object']['key'] 112 | 113 | # Specify your AWS Batch job definition name 114 | job_definition_name = '{}' 115 | 116 | # Specify your AWS Batch job queue name 117 | job_queue_name = '{}' 118 | 119 | # Specify the command to pass to the AWS Batch job 120 | command = f"python script.py {s3_bucket} {s3_key}" 121 | print("Executing the command : ", command) 122 | # Create an AWS Batch client 123 | batch_client = boto3.client('batch') 124 | 125 | # Submit a job to AWS Batch 126 | response = batch_client.submit_job( 127 | jobName='{}', 128 | jobQueue=job_queue_name, 129 | jobDefinition=job_definition_name, 130 | containerOverrides={ 131 | 'command': command.split(' ') 132 | }, 133 | retryStrategy={ 134 | 'attempts': 1 135 | }, 136 | ) 137 | 138 | # Print the AWS Batch job response 139 | print(json.dumps(response, indent=2)) 140 | 141 | return { 142 | 'statusCode': 200, 143 | 'body': json.dumps('AWS 
Batch job submitted successfully!') 144 | } -------------------------------------------------------------------------------- /SNS_Message_Publish.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "SNS Message Publish.ipynb", 7 | "provenance": [], 8 | "authorship_tag": "ABX9TyMQXNzei1cXERatoSYN2Tth", 9 | "include_colab_link": true 10 | }, 11 | "kernelspec": { 12 | "name": "python3", 13 | "display_name": "Python 3" 14 | }, 15 | "language_info": { 16 | "name": "python" 17 | } 18 | }, 19 | "cells": [ 20 | { 21 | "cell_type": "markdown", 22 | "metadata": { 23 | "id": "view-in-github", 24 | "colab_type": "text" 25 | }, 26 | "source": [ 27 | "\"Open" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "source": [ 33 | "!pip install boto3" 34 | ], 35 | "metadata": { 36 | "id": "6cKOYN27ysNV" 37 | }, 38 | "execution_count": null, 39 | "outputs": [] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "source": [ 44 | "import json\n", 45 | "import boto3\n", 46 | "access_key=\"\"\n", 47 | "secret_key=\"\"\n", 48 | "client = boto3.client('sns',aws_access_key_id=access_key,\n", 49 | "aws_secret_access_key=secret_key, region_name='')" 50 | ], 51 | "metadata": { 52 | "id": "uoqibyJQyxCM" 53 | }, 54 | "execution_count": 11, 55 | "outputs": [] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": 23, 60 | "metadata": { 61 | "id": "4J0MyeFAyo1b" 62 | }, 63 | "outputs": [], 64 | "source": [ 65 | "def message_publish(arn,message,destination):\n", 66 | " response = client.publish(\n", 67 | " TargetArn=arn,\n", 68 | " Message=json.dumps({'default': json.dumps(message)}),\n", 69 | " MessageStructure='json',\n", 70 | " MessageAttributes={\n", 71 | " 'Destination': {\n", 72 | " 'DataType': 'String',\n", 73 | " 'StringValue': destination\n", 74 | " }\n", 75 | " }\n", 76 | " )\n", 77 | " return response" 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "source": [ 83 | "arn=\"\"\n", 84 | "message=\"\"\n", 85 | "destination=\"\"\n", 86 | "response=message_publish(arn,message,destination)" 87 | ], 88 | "metadata": { 89 | "id": "b4lyRnEozTAA" 90 | }, 91 | "execution_count": 24, 92 | "outputs": [] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "source": [ 97 | "response" 98 | ], 99 | "metadata": { 100 | "id": "rORBL0jk0_gK" 101 | }, 102 | "execution_count": null, 103 | "outputs": [] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "source": [ 108 | "" 109 | ], 110 | "metadata": { 111 | "id": "-0iW4CGZ_ogN" 112 | }, 113 | "execution_count": null, 114 | "outputs": [] 115 | } 116 | ] 117 | } -------------------------------------------------------------------------------- /Serverless Manual Approval Steps in AWS Step Functions and Amazon API Gateway.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SatadruMukherjee/Data-Preprocessing-Models/3aeeda4e68aafd47fe784a71c4aa02a34be76820/Serverless Manual Approval Steps in AWS Step Functions and Amazon API Gateway.png -------------------------------------------------------------------------------- /Setup PySpark in ec2 using conda.txt: -------------------------------------------------------------------------------- 1 | Think of Conda as a more powerful virtualenv that not only handles virtual environments but also manages packages and dependencies across multiple languages (not just Python) 2 | 3 | 4 | Install Miniconda: 5 | =================== 6 | wget 
https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh 7 | bash ~/Miniconda3-latest-Linux-x86_64.sh 8 | 9 | Check whether conda installed successfully or not: 10 | ================================================== 11 | conda list 12 | 13 | PATH is an environment variable on Unix-like operating systems, DOS, OS/2, and Microsoft Windows, specifying a set of directories where executable programs are located. 14 | 15 | cat ~/.bashrc 16 | vi ~/.bashrc 17 | export PATH=~/miniconda3/bin:$PATH 18 | source ~/.bashrc 19 | conda list 20 | which python 21 | 22 | 23 | Configuring yml file for conda env setup(https://docs.conda.io/projects/conda/en/latest/user-guide/tasks/manage-environments.html#creating-an-environment-file-manually) 24 | 25 | vi environment.yml 26 | 27 | name: pyspark_demo 28 | channels: 29 | - conda-forge 30 | dependencies: 31 | - findspark=2.0.1 32 | - jupyter=1.0.0 33 | - pyspark=3.5.0 34 | - openjdk=11.0.13 35 | - python=3.12 36 | - python-dotenv 37 | 38 | Note: A channel is a location (a URL) where conda can search for packages to install on your machine e.g. https://anaconda.org/conda-forge/repo 39 | 40 | conda env create -f environment.yml 41 | 42 | conda activate pyspark_demo 43 | 44 | jupyter notebook --generate-config 45 | 46 | vi /home/ec2-user/.jupyter/jupyter_notebook_config.py 47 | 48 | c.NotebookApp.ip = '*' 49 | c.NotebookApp.open_browser = False 50 | 51 | (Reference: https://testnb.readthedocs.io/en/stable/examples/Notebook/Configuring%20the%20Notebook%20and%20Server.html#running-a-public-notebook-server) 52 | 53 | jupyter notebook 54 | 55 | 56 | Spark Code: 57 | ============ 58 | 59 | import os 60 | from dotenv import load_dotenv 61 | import findspark 62 | findspark.init() 63 | findspark.find() 64 | 65 | import pyspark 66 | from pyspark.sql import SparkSession 67 | from pyspark.sql import functions as F 68 | from pyspark.sql import SparkSession 69 | from pyspark.sql.types import StructType, StructField, IntegerType, StringType 70 | 71 | spark = SparkSession.builder \ 72 | .appName("WriteToS3") \ 73 | .config("spark.jars.packages", "org.apache.hadoop:hadoop-aws:3.3.4,com.amazonaws:aws-java-sdk-bundle:1.12.262") \ 74 | .config("spark.hadoop.fs.s3a.access.key", "") \ 75 | .config("spark.hadoop.fs.s3a.secret.key", "") \ 76 | .config("spark.hadoop.fs.s3a.endpoint", "s3.amazonaws.com") \ 77 | .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem") \ 78 | .getOrCreate() 79 | 80 | # Define schema 81 | schema = StructType([ 82 | StructField("id", IntegerType(), True), 83 | StructField("name", StringType(), True), 84 | StructField("age", IntegerType(), True) 85 | ]) 86 | 87 | # Create dummy data 88 | data = [ 89 | (1, "Alice", 25), 90 | (2, "Bob", 30), 91 | (3, "Charlie", 28) 92 | ] 93 | 94 | # Create DataFrame 95 | df = spark.createDataFrame(data, schema=schema) 96 | 97 | # Show DataFrame 98 | df.show() 99 | 100 | # Define S3 path 101 | s3_path = "s3a://{Bucket Name}/test_conda/" 102 | 103 | # Write DataFrame to S3 in Parquet format 104 | df.write.mode("overwrite").parquet(s3_path) 105 | 106 | -------------------------------------------------------------------------------- /Shake detection using Accelerometer , Kafka & Python.txt: -------------------------------------------------------------------------------- 1 | 2 | 3 | EC2 Code: 4 | ------------ 5 | wget https://dlcdn.apache.org/kafka/3.4.0/kafka_2.13-3.4.0.tgz 6 | tar -xvf kafka_2.13-3.4.0.tgz 7 | sudo yum install java-1.8.0-openjdk 8 | cd kafka_2.13-3.4.0 9 | 
bin/zookeeper-server-start.sh config/zookeeper.properties 10 | export KAFKA_HEAP_OPTS="-Xmx256M -Xms128M" 11 | cd kafka_2.13-3.4.0 12 | bin/kafka-server-start.sh config/server.properties 13 | To create topic: 14 | ------------------ 15 | cd kafka_2.13-3.4.0 16 | bin/kafka-topics.sh --create --topic sensor_data_consumer --bootstrap-server {}:9092 --replication-factor 1 --partitions 1 17 | 18 | Start Kafka Console Consumer: 19 | -------------------------------------- 20 | bin/kafka-console-consumer.sh --topic sensor_data_consumer --bootstrap-server {}:9092 21 | 22 | 23 | Lambda Code: 24 | ---------------- 25 | import json 26 | from time import sleep 27 | from json import dumps 28 | from kafka import KafkaProducer 29 | import json 30 | 31 | topic_name='sensor_data_consumer' 32 | producer = KafkaProducer(bootstrap_servers=['{}:9092'] 33 | ,value_serializer=lambda x: dumps(x).encode('utf-8')) 34 | 35 | def lambda_handler(event, context): 36 | # TODO implement 37 | print(event) 38 | payload_part=json.loads(event['body'])['payload'] 39 | for i in payload_part: 40 | acc_x=i['values']['x'] 41 | acc_y=i['values']['y'] 42 | acc_z=i['values']['z'] 43 | capture_time=i['time'] 44 | data={"acc_x":acc_x,"acc_y":acc_y,"acc_z":acc_z,"capture_time":capture_time} 45 | print(data) 46 | producer.send(topic_name, value=data) 47 | producer.flush() 48 | return { 49 | 'statusCode': 200, 50 | 'body': json.dumps('Hello from Lambda!') 51 | } 52 | 53 | 54 | 55 | 56 | Consumer Code: 57 | ---------------- 58 | from kafka import KafkaConsumer 59 | from kafka import TopicPartition , OffsetAndMetadata 60 | import json 61 | 62 | 63 | consumer = KafkaConsumer ('sensor_data_consumer',bootstrap_servers = ['{}:9092'], 64 | value_deserializer=lambda m: json.loads(m.decode('utf-8')),group_id='acceleration_test',auto_offset_reset='earliest', 65 | enable_auto_commit =False) 66 | 67 | 68 | for message in consumer: 69 | data=message.value 70 | x_accl=data['acc_x'] 71 | y_accl = data['acc_y'] 72 | z_accl = data['acc_z'] 73 | mag=(abs(x_accl)+abs(y_accl)+abs(z_accl)) 74 | if(mag>=15): 75 | print('*' * 100) 76 | print("Shaking") 77 | print('*' * 100) 78 | else: 79 | print("Idle") 80 | tp=TopicPartition(message.topic,message.partition) 81 | om = OffsetAndMetadata(message.offset+1, message.timestamp) 82 | consumer.commit({tp:om}) 83 | 84 | -------------------------------------------------------------------------------- /Simple OTP System using AWS Serverless.txt: -------------------------------------------------------------------------------- 1 | DynamoDB Table: 2 | ----------------- 3 | Table Name: otp_holder 4 | Primary Key: email_id 5 | Sort Key: EXPIRATION_TIME 6 | 7 | 8 | 9 | OTP Generator: 10 | ------------------ 11 | import json 12 | import boto3 13 | import time 14 | from random import randint 15 | 16 | client_dynamo=boto3.resource('dynamodb') 17 | 18 | table=client_dynamo.Table('otp_holder') 19 | 20 | default_ttl = 120 21 | 22 | def lambda_handler(event, context): 23 | 24 | email_id=event['queryStringParameters']['email_address'] 25 | 26 | otp_value=randint(100000, 999999) 27 | 28 | entry={ 29 | 'email_id': email_id, 30 | 'OTP': otp_value, 31 | 'EXPIRATION_TIME': int(time.time()) + default_ttl 32 | } 33 | 34 | response=table.put_item(Item=entry) 35 | 36 | return "A verification code is sent to the email address you provided."
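Quick local test of the generator (a hedged sketch, not part of the original project):
------------------------------------------------------------------------------------
# Assumes the OTP Generator code above is saved as otp_generator.py (module name is illustrative)
# and that AWS credentials / region for the otp_holder table are configured locally.
from otp_generator import lambda_handler

# Minimal shape of the API Gateway event the handler expects
fake_event = {'queryStringParameters': {'email_address': 'someone@example.com'}}
print(lambda_handler(fake_event, None))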
37 | 38 | 39 | Send Email: 40 | -------------- 41 | import json 42 | import boto3 43 | client = boto3.client("ses") 44 | 45 | 46 | def lambda_handler(event, context): 47 | print(event) 48 | if(event['Records'][0]['eventName']=='INSERT'): 49 | mail_id=event['Records'][0]['dynamodb']['Keys']['email_id']['S'] 50 | print("The mail id is : {}".format(mail_id)) 51 | 52 | otp=event['Records'][0]['dynamodb']['NewImage']['OTP']['N'] 53 | print("The mail id is : {}".format(otp)) 54 | 55 | body = """ 56 | Use this code to verify your login at Simple Website
57 | 58 | {} 59 | """.format(otp) 60 | 61 | message = {"Subject": {"Data": 'Your OTP (valid for only 2 mins)!'}, "Body": {"Html": {"Data": body}}} 62 | 63 | response = client.send_email(Source = '{FromAddress}', Destination = {"ToAddresses": [mail_id]}, Message = message) 64 | 65 | print("The mail is sent successfully") 66 | 67 | Verify OTP: 68 | -------------- 69 | import json 70 | import boto3 71 | import time 72 | client = boto3.client('dynamodb') 73 | 74 | def lambda_handler(event, context): 75 | # TODO implement 76 | 77 | email_id=event['queryStringParameters']['email_address'] 78 | print("The received email id : {}".format(email_id)) 79 | 80 | otp_from_user=event['queryStringParameters']['otp'] 81 | print("The received otp : {}".format(otp_from_user)) 82 | 83 | response = client.query( 84 | TableName='otp_holder', 85 | KeyConditionExpression='email_id = :email_id', 86 | ExpressionAttributeValues={ 87 | ':email_id': {'S': email_id} 88 | },ScanIndexForward = False, Limit = 1) 89 | 90 | if(response['Count']==0): 91 | return "No such OTP was shared" 92 | else: 93 | latest_stored_otp_value=response['Items'][0]['OTP']['N'] 94 | print("Latest Stored OTP Value : {}".format(latest_stored_otp_value)) 95 | 96 | if(int(response['Items'][0]['EXPIRATION_TIME']['N'])=1998 80 | group by 81 | l_returnflag, 82 | l_linestatus 83 | order by 84 | l_returnflag, 85 | l_linestatus; 86 | 87 | select METADATA$FILENAME as File_Name from @Snow_stage/unloadlineitem limit 10; 88 | 89 | select METADATA$FILENAME as File_Name,split_part(METADATA$FILENAME,'/',2) as Partition_Name from @Snow_stage/unloadlineitem limit 10; 90 | 91 | --Create partitioned Table 92 | create or replace external table s3_to_snowflake.PUBLIC.table_with_partition_for_yt_demo ( 93 | L_ORDERKEY number(38,0) as (Value:c1::int), 94 | L_PARTKEY NUMBER(38,0) as (Value:c2::int), 95 | L_SUPPKEY NUMBER(38,0) as (Value:c3::int), 96 | L_LINENUMBER NUMBER(38,0) as (Value:c4::int), 97 | L_QUANTITY NUMBER(12,2) as (Value:c5::number(12,2)), 98 | L_EXTENDEDPRICE NUMBER(12,2) as (Value:c6::number(12,2)), 99 | L_DISCOUNT NUMBER(12,2) as (Value:c7::number(12,2)), 100 | L_TAX NUMBER(12,2) as (Value:c8::number(12,2)), 101 | L_RETURNFLAG VARCHAR(1) as (Value:c9::varchar), 102 | L_LINESTATUS VARCHAR(1) as (Value:c10::varchar), 103 | L_SHIPDATE DATE as (Value:c11::DATE), 104 | L_COMMITDATE DATE as (Value:c12::DATE), 105 | L_RECEIPTDATE DATE as (Value:c13::DATE), 106 | L_SHIPINSTRUCT VARCHAR(25) as (Value:c14::varchar), 107 | L_SHIPMODE VARCHAR(10) as (Value:c15::varchar), 108 | L_COMMENT VARCHAR(44) as (Value:c16::varchar),File_Partition NUMBER(38,0) as (split_part(METADATA$FILENAME,'/',2)::int)) PARTITION BY(File_Partition) with 109 | location =@s3_to_snowflake.PUBLIC.Snow_stage/unloadlineitem file_format ='my_csv_format' ; 110 | 111 | 112 | select 113 | l_returnflag, 114 | l_linestatus, 115 | sum(l_quantity) as sum_qty, 116 | sum(l_extendedprice) as sum_base_price, 117 | sum(l_extendedprice * (1-l_discount)) as sum_disc_price, 118 | sum(l_extendedprice * (1-l_discount) * (1+l_tax)) as sum_charge, 119 | avg(l_quantity) as avg_qty, 120 | avg(l_extendedprice) as avg_price, 121 | avg(l_discount) as avg_disc, 122 | count(*) as count_order 123 | from 124 | s3_to_snowflake.PUBLIC.table_with_partition_for_yt_demo 125 | where 126 | File_Partition >=1998 127 | group by 128 | l_returnflag, 129 | l_linestatus 130 | order by 131 | l_returnflag, 132 | l_linestatus; 133 | -------------------------------------------------------------------------------- /Snowflake Parallel 
Processing using Python Lab.txt: -------------------------------------------------------------------------------- 1 | 2 | Snowflake Code: 3 | --------------------------------- 4 | --drop database if required 5 | drop database if exists ramu; 6 | --Create Database 7 | create database if not exists ramu; 8 | --use the database 9 | use ramu; 10 | 11 | 12 | create or replace table employee_history(employee_id number, 13 | empl_join_date date, 14 | dept varchar(10), 15 | salary number, 16 | manager_id number,t timestamp default current_timestamp()); 17 | 18 | 19 | 20 | create or replace procedure demostoredprocedure(run_this_sql_query Varchar) 21 | returns String not null 22 | language javascript 23 | as 24 | $$ 25 | var my_sql_command_3 ="call system$wait(10)" 26 | var statement3 = snowflake.createStatement( {sqlText: my_sql_command_3} ); 27 | var result_set3 = statement3.execute(); 28 | var my_sql_command_2 = RUN_THIS_SQL_QUERY; 29 | var statement2 = snowflake.createStatement( {sqlText: my_sql_command_2} ); 30 | var result_set2 = statement2.execute(); 31 | return "Done"; // Replace with something more useful. 32 | $$ 33 | ; 34 | 35 | 36 | 37 | select * from employee_history; 38 | 39 | 40 | create or replace table employee_history_parallel(employee_id number, 41 | empl_join_date date, 42 | dept varchar(10), 43 | salary number, 44 | manager_id number,t timestamp default current_timestamp()); 45 | 46 | select * from employee_history_parallel; 47 | 48 | 49 | 50 | Sequential Execution: 51 | ------------------------- 52 | import pandas as pd 53 | import time 54 | import snowflake.connector as sf 55 | 56 | user="" 57 | password="" 58 | account="" 59 | conn=sf.connect(user=user,password=password,account=account) 60 | 61 | 62 | cursor = conn.cursor() 63 | 64 | statement_1 = 'use warehouse COMPUTE_WH' 65 | cursor.execute(statement_1) 66 | statement2 = "alter warehouse COMPUTE_WH resume IF SUSPENDED" 67 | cursor.execute(statement2) 68 | statement3 = "use database RAMU" 69 | cursor.execute(statement3) 70 | statement4 = "use role ACCOUNTADMIN" 71 | cursor.execute(statement4) 72 | statement5 = "use schema PUBLIC" 73 | cursor.execute(statement5) 74 | 75 | 76 | 77 | 78 | df=pd.read_csv('{}') 79 | 80 | for index,row in df.iterrows(): 81 | query_to_be_executed=row['Query'].replace("'","''") 82 | print(query_to_be_executed) 83 | exeucte_snowflake_query="""call demostoredprocedure('{}');""".format(query_to_be_executed) 84 | print("Executing the query :{} ".format(exeucte_snowflake_query)) 85 | cursor.execute(exeucte_snowflake_query) 86 | 87 | Parallel Execution: 88 | ------------------------- 89 | 90 | import pandas as pd 91 | import time 92 | import snowflake.connector as sf 93 | import warnings 94 | warnings.filterwarnings('ignore') 95 | 96 | user="" 97 | password="" 98 | account="" 99 | conn=sf.connect(user=user,password=password,account=account) 100 | 101 | 102 | cursor = conn.cursor() 103 | 104 | statement_1 = 'use warehouse COMPUTE_WH' 105 | cursor.execute(statement_1) 106 | statement2 = "alter warehouse COMPUTE_WH resume IF SUSPENDED" 107 | cursor.execute(statement2) 108 | statement3 = "use database RAMU" 109 | cursor.execute(statement3) 110 | statement4 = "use role ACCOUNTADMIN" 111 | cursor.execute(statement4) 112 | statement5 = "use schema PUBLIC" 113 | cursor.execute(statement5) 114 | 115 | 116 | def get_status(cur_list): 117 | print("Check the status of the query list : {}".format(cur_list)) 118 | status=[] 119 | df=pd.DataFrame(columns=['Query_id','Status']) 120 | arr=cur_list 121 | for query_id in 
cur_list: 122 | status_for_the_query=conn.get_query_status(query_id).name 123 | status.append(status_for_the_query) 124 | df=df.append({'Query_id':query_id,'Status':status_for_the_query},ignore_index=True) 125 | if status.count('RUNNING')>1: 126 | del status[:] 127 | print(df) 128 | print("One or more commands still running") 129 | time.sleep(5) 130 | get_status(arr) 131 | else: 132 | print("All commands execution done!") 133 | print(df) 134 | return 135 | 136 | 137 | query_ids=[] 138 | 139 | df=pd.read_csv('{}') 140 | 141 | for index,row in df.iterrows(): 142 | query_to_be_executed=row['Query'].replace("'","''") 143 | print(query_to_be_executed) 144 | exeucte_snowflake_query="""call demostoredprocedure('{}');""".format(query_to_be_executed) 145 | print("Executing the query :{} ".format(exeucte_snowflake_query)) 146 | cursor.execute_async(exeucte_snowflake_query) 147 | query_id=cursor.sfqid 148 | print("Query id for the above query execution : {}".format(query_id)) 149 | query_ids.append(query_id) 150 | 151 | get_status(query_ids) 152 | 153 | -------------------------------------------------------------------------------- /Snowflake Row Level Security.sql: -------------------------------------------------------------------------------- 1 | drop database if exists ramu; 2 | 3 | create database ramu; 4 | 5 | create or replace table ramu.public.employees(employee_id number, 6 | empl_join_date date, 7 | dept varchar(10), 8 | salary number, 9 | manager_id number); 10 | 11 | insert into ramu.public.employees values(1,'2014-10-01','HR',40000,4), 12 | (2,'2014-09-01','Tech',50000,9), 13 | (3,'2018-09-01','Marketing',30000,5), 14 | (4,'2017-09-01','HR',10000,5), 15 | (5,'2019-09-01','HR',35000,9), 16 | (6,'2015-09-01','Tech',90000,4), 17 | (7,'2016-09-01','Marketing',20000,1); 18 | 19 | select * from ramu.public.employees; 20 | 21 | CREATE OR REPLACE TABLE access_management_lookup (role string, dept_name string); 22 | 23 | INSERT INTO access_management_lookup VALUES ('HR_ADMIN', 'HR'), ('TECH_ADMIN', 'Tech'),('MARKETING_ADMIN', 'Marketing'); 24 | 25 | select * from access_management_lookup; 26 | 27 | 28 | 29 | create or replace row access policy dept_level_access as (dept varchar) returns boolean -> 30 | current_role()='ACCOUNTADMIN' 31 | or exists ( 32 | select 1 from access_management_lookup 33 | where role = current_role() 34 | and dept_name = dept 35 | ); 36 | 37 | 38 | ALTER TABLE ramu.public.employees ADD ROW ACCESS POLICY dept_level_access ON (dept); 39 | 40 | select * from ramu.public.employees; 41 | 42 | 43 | 44 | 45 | create or replace role HR_Admin; 46 | create or replace role Tech_Admin; 47 | create or replace role Marketing_Admin; 48 | 49 | grant usage on warehouse compute_Wh to role HR_Admin; 50 | grant usage on warehouse compute_Wh to role Tech_Admin; 51 | grant usage on warehouse compute_Wh to role Marketing_Admin; 52 | 53 | grant usage on database ramu to role HR_Admin; 54 | grant usage on database ramu to role Tech_Admin; 55 | grant usage on database ramu to role Marketing_Admin; 56 | 57 | grant usage on schema public to role HR_Admin; 58 | grant usage on schema public to role Tech_Admin; 59 | grant usage on schema public to role Marketing_Admin; 60 | 61 | grant select on table ramu.public.employees to role HR_Admin; 62 | grant select on table ramu.public.employees to role Tech_Admin; 63 | grant select on table ramu.public.employees to role Marketing_Admin; 64 | 65 | 66 | create or replace user jadu_hr password = '123456'; 67 | grant role HR_Admin to user jadu_hr; 68 | 69 |
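-- (Verification note added for clarity; a suggested check, not in the original script)
-- After logging in as jadu_hr with role HR_Admin, SELECT * FROM ramu.public.employees should
-- return only rows where dept = 'HR', while ACCOUNTADMIN continues to see every row.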
71 | create or replace user mimo_marketing password = '456789'; 72 | grant role Marketing_Admin to user mimo_marketing; 73 | 74 | 75 | create or replace user jimo_tech password = '147258'; 76 | grant role Tech_Admin to user jimo_tech; 77 | 78 | 79 | drop user jimo_tech; 80 | drop user mimo_marketing; 81 | drop user jadu_hr; 82 | 83 | drop role HR_Admin; 84 | drop role Tech_Admin; 85 | drop role Marketing_Admin; 86 | -------------------------------------------------------------------------------- /Snowflake Schema Detection.txt: -------------------------------------------------------------------------------- 1 | Create 3 buckets: 2 | ----------------- 3 | 1)For storing csv file 4 | 2)For storing parquet file 5 | 3)For storing athena logs 6 | 7 | 8 | 9 | CSV to parquet conversion using AWS Athena: 10 | -------------------------------------------- 11 | CREATE EXTERNAL TABLE helloworld.hellocsv ( 12 | `Id` int, 13 | `SEPAL_LENGTH` double, 14 | `SEPAL_WIDTH` double, 15 | `PETAL_LENGTH` double, 16 | `PETAL_WIDTH` double, 17 | `CLASS_NAME` string 18 | ) 19 | ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' 20 | WITH SERDEPROPERTIES ( 21 | 'serialization.format' = ',', 22 | 'field.delim' = ',' 23 | ) LOCATION 's3://{}/' 24 | TBLPROPERTIES ('has_encrypted_data'='false','skip.header.line.count'='1'); 25 | 26 | 27 | 28 | 29 | CREATE TABLE helloworld.helloparquet 30 | WITH ( 31 | format = 'PARQUET', 32 | parquet_compression = 'SNAPPY', 33 | external_location = 's3://{}' 34 | ) AS SELECT * FROM helloworld.hellocsv ; 35 | 36 | 37 | Inferschema: 38 | ------------- 39 | --drop database if required 40 | drop database ramu; 41 | --Create Database 42 | create database if not exists ramu; 43 | --use the database 44 | use ramu; 45 | --create file format 46 | create file format parquet_format TYPE=parquet; 47 | --create external stage 48 | create or replace stage ramu.PUBLIC.snow_simple url="s3://{}/" 49 | credentials=(aws_key_id='{}' 50 | aws_secret_key='{}') 51 | file_format = parquet_format; 52 | --list stage elements 53 | list @ramu.PUBLIC.snow_simple; 54 | 55 | select * from TABLE(INFER_SCHEMA (LOCATION=>'@ramu.PUBLIC.snow_simple',FILE_FORMAT=>'parquet_format')); 56 | 57 | 58 | CREATE TABLE … USING TEMPLATE: 59 | ------------------------------- 60 | Recap: 61 | ------ 62 | create table demo_table_1 (province varchar, created_date date); 63 | insert into demo_table_1 (province, created_date) values 64 | ('Manitoba', '2020-01-18'::date), 65 | ('Alberta', '2020-01-19'::date); 66 | 67 | select * from demo_table_1; 68 | +----------+--------------+ 69 | | PROVINCE | CREATED_DATE | 70 | |----------+--------------| 71 | | Manitoba | 2020-01-18 | 72 | | Alberta | 2020-01-19 | 73 | +----------+--------------+ 74 | 75 | 76 | select object_construct(*) from demo_table_1; 77 | +---------------------------------+ 78 | | OBJECT_CONSTRUCT(*) | 79 | |---------------------------------| 80 | | { | 81 | | "CREATED_DATE": "2020-01-18", | 82 | | "PROVINCE": "Manitoba" | 83 | | } | 84 | | { | 85 | | "CREATED_DATE": "2020-01-19", | 86 | | "PROVINCE": "Alberta" | 87 | | } | 88 | +---------------------------------+ 89 | 90 | 91 | select array_agg(object_construct(*)) from demo_table_1; 92 | 93 | [ 94 | { 95 | "CREATED_DATE": "2020-01-18", 96 | "PROVINCE": "Manitoba" 97 | }, 98 | { 99 | "CREATED_DATE": "2020-01-19", 100 | "PROVINCE": "Alberta" 101 | } 102 | ] 103 | 104 | 105 | select ARRAY_AGG(OBJECT_CONSTRUCT(*)) from TABLE(INFER_SCHEMA (LOCATION=>'@ramu.PUBLIC.snow_simple',FILE_FORMAT=>'parquet_format')); 106 
| 107 | create or replace table helloparquet using template(select ARRAY_AGG(OBJECT_CONSTRUCT(*)) from TABLE(INFER_SCHEMA (LOCATION=>'@ramu.PUBLIC.snow_simple',FILE_FORMAT=>'parquet_format'))); 108 | 109 | --load data 110 | copy into ramu.PUBLIC.helloparquet from @ramu.PUBLIC.snow_simple MATCH_BY_COLUMN_NAME=CASE_INSENSITIVE FILE_FORMAT=parquet_format; 111 | 112 | select * from helloparquet; 113 | 114 | GENERATE_COLUMN_DESCRIPTION: 115 | ---------------------------- 116 | select generate_column_description(ARRAY_AGG(OBJECT_CONSTRUCT(*)),'table') from TABLE(INFER_SCHEMA (LOCATION=>'@ramu.PUBLIC.snow_simple',FILE_FORMAT=>'parquet_format')) ; 117 | 118 | 119 | select generate_column_description(ARRAY_AGG(OBJECT_CONSTRUCT(*)),'view') from TABLE(INFER_SCHEMA (LOCATION=>'@ramu.PUBLIC.snow_simple',FILE_FORMAT=>'parquet_format')) ; 120 | 121 | select generate_column_description(ARRAY_AGG(OBJECT_CONSTRUCT(*)),'external_table') from TABLE(INFER_SCHEMA (LOCATION=>'@ramu.PUBLIC.snow_simple',FILE_FORMAT=>'parquet_format')) ; 122 | 123 | --generate complete ddl script using string concatenation 124 | select 'create or replace external table ramu.PUBLIC.Iris_dataset( ' || 125 | generate_column_description(ARRAY_AGG(OBJECT_CONSTRUCT(*)),'external_table') || 126 | ') with location = @ramu.PUBLIC.snow_simple file_format =''parquet_format''' from TABLE(INFER_SCHEMA (LOCATION=>'@ramu.PUBLIC.snow_simple',FILE_FORMAT=>'parquet_format')) ; 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | -------------------------------------------------------------------------------- /Snowflake Stored Porcedure Parallel execution (Part 1).txt: -------------------------------------------------------------------------------- 1 | --drop database if required 2 | drop database if exists ramu; 3 | --Create Database 4 | create database if not exists ramu; 5 | --use the database 6 | use ramu; 7 | 8 | 9 | 10 | // Prepare table 11 | CREATE OR REPLACE TABLE video_demo ( 12 | ID INT AUTOINCREMENT START = 1 INCREMENT =1, 13 | NAME VARCHAR(40) DEFAULT 'DemoYoutube' , 14 | CREATE_DATE timestamp); 15 | 16 | 17 | create or replace procedure dummy_executor() 18 | returns string not null 19 | language javascript 20 | as 21 | $$ 22 | var my_sql_command = "INSERT INTO video_demo(CREATE_DATE) VALUES(CURRENT_TIMESTAMP)"; 23 | var statement1 = snowflake.createStatement( {sqlText: my_sql_command} ); 24 | var result_set1 = statement1.execute(); 25 | return "Successfully Executed" 26 | $$ 27 | ; 28 | 29 | 30 | set my_variable=(select current_timestamp()); 31 | 32 | select $my_variable; 33 | 34 | 35 | 36 | 37 | // Create task 38 | CREATE OR REPLACE TASK INSERT_DATA_SET_123456789 39 | WAREHOUSE = COMPUTE_WH 40 | SCHEDULE = '1 MINUTE' 41 | AS call dummy_executor() 42 | ; 43 | 44 | 45 | SHOW TASKS; 46 | 47 | select * from video_demo; 48 | 49 | // Task starting and suspending 50 | ALTER TASK INSERT_DATA_SET_123456789 RESUME; 51 | 52 | 53 | SELECT * FROM TABLE(INFORMATION_SCHEMA.TASK_HISTORY()) WHERE SCHEMA_NAME = 'PUBLIC' AND NAME = 'INSERT_DATA_SET_123456789' ; 54 | 55 | 56 | 57 | SELECT * FROM TABLE(INFORMATION_SCHEMA.TASK_HISTORY()) 58 | WHERE SCHEMA_NAME = 'PUBLIC' AND NAME = 'INSERT_DATA_SET_123456789' AND QUERY_START_TIME IS NOT NULL AND SCHEDULED_TIME > $my_variable 59 | ORDER BY SCHEDULED_TIME DESC LIMIT 1; 60 | 61 | SELECT COALESCE(QUERY_ID, '') AS QUERY_ID FROM TABLE(INFORMATION_SCHEMA.TASK_HISTORY()) 62 | WHERE SCHEMA_NAME = 'PUBLIC' AND NAME = 'INSERT_DATA_SET_123456789' AND QUERY_START_TIME IS NOT NULL AND SCHEDULED_TIME > $my_variable 63 | 
ORDER BY SCHEDULED_TIME DESC LIMIT 1; 64 | 65 | 66 | 67 | 68 | ALTER TASK INSERT_DATA_SET_123456789 SUSPEND; 69 | 70 | 71 | 72 | 73 | create or replace procedure dummy_executoryt123() 74 | returns string not null 75 | language javascript 76 | EXECUTE AS CALLER 77 | as 78 | $$ 79 | 80 | function sleep(milliseconds) { 81 | const date = Date.now(); 82 | let currentDate = null; 83 | do { 84 | currentDate = Date.now(); 85 | } while (currentDate - date < milliseconds); 86 | }; 87 | 88 | 89 | 90 | var v_Now = new Date().toISOString(); 91 | 92 | 93 | var query = 'CREATE OR REPLACE TASK RAMU.PUBLIC.MULTITHREAD_CHILD_12345678 WAREHOUSE=COMPUTE_WH SCHEDULE = \'1 MINUTE\' AS call RAMU.PUBLIC.dummy_executor();'; 94 | var statement = snowflake.createStatement( {sqlText: query} ); 95 | var result_set = statement.execute(); 96 | 97 | query = 'ALTER TASK RAMU.PUBLIC.MULTITHREAD_CHILD_12345678 RESUME;'; 98 | var statement = snowflake.createStatement( {sqlText: query} ); 99 | var result_set = statement.execute(); 100 | 101 | var v_QueryID = ''; 102 | 103 | sleep(90000); 104 | 105 | while (v_QueryID == '') 106 | { 107 | 108 | v_Query = 'SELECT COALESCE(QUERY_ID, \'\') AS QUERY_ID FROM TABLE(INFORMATION_SCHEMA.TASK_HISTORY()) \ 109 | WHERE SCHEMA_NAME = \'PUBLIC\' AND NAME = \'MULTITHREAD_CHILD_12345678\' AND QUERY_START_TIME IS NOT NULL AND SCHEDULED_TIME > \'' + v_Now + '\' \ 110 | ORDER BY SCHEDULED_TIME DESC LIMIT 1;'; 111 | 112 | 113 | v_Statement = snowflake.createStatement( {sqlText: v_Query} ); 114 | rs = v_Statement.execute(); 115 | 116 | while (rs.next()) 117 | { 118 | v_QueryID = rs.getColumnValue('QUERY_ID'); 119 | }; 120 | 121 | sleep(10000); 122 | }; 123 | 124 | // Suspend the main task & clean up all tasks 125 | v_Query = 'ALTER TASK RAMU.PUBLIC.MULTITHREAD_CHILD_12345678 SUSPEND;'; 126 | snowflake.execute( {sqlText: v_Query} ); 127 | v_Statement = snowflake.createStatement( {sqlText: v_Query} ); 128 | v_Statement.execute(); 129 | return "Success" 130 | $$ 131 | ; 132 | 133 | 134 | call dummy_executoryt123(); 135 | 136 | 137 | select * from video_demo; 138 | -------------------------------------------------------------------------------- /Snowflake logging (1).txt: -------------------------------------------------------------------------------- 1 | DROP DATABASE IF EXISTS RAMU; 2 | 3 | CREATE DATABASE RAMU; 4 | 5 | USE RAMU; 6 | 7 | CREATE or replace TABLE log_storer(log_key number ,Stored_procedure_name text,Success_status VARCHAR(1),log_message Text,Start_time String,End_time String); 8 | 9 | CREATE OR REPLACE PROCEDURE open_sp_process(Stored_procedure_name STRING) 10 | RETURNS STRING 11 | LANGUAGE JAVASCRIPT 12 | AS 13 | $$ 14 | 15 | var my_sql_command_date = "select to_char(current_timestamp(2)) as curr_time"; 16 | var statement1 = snowflake.createStatement( {sqlText: my_sql_command_date} ); 17 | var result_set1 = statement1.execute(); 18 | result_set1.next(); 19 | var time_info= result_set1.getColumnValue(1); 20 | 21 | var my_sql_command_2 = "select coalesce( max(log_key),0 ) from log_storer"; 22 | var statement2 = snowflake.createStatement( {sqlText: my_sql_command_2} ); 23 | var result_set2 = statement2.execute(); 24 | result_set2.next(); 25 | var log_key_for_the_entry= result_set2.getColumnValue(1)+1; 26 | 27 | var my_sql_command_3 = "INSERT INTO log_storer (log_key,Stored_procedure_name,Success_status,Start_time) values ("+log_key_for_the_entry+",'"+STORED_PROCEDURE_NAME+"','"+'I'+"','"+time_info+"')"; 28 | var statement3 = snowflake.createStatement( {sqlText: my_sql_command_3} ); 29 | 
var result_set3 = statement3.execute(); 30 | 31 | 32 | return log_key_for_the_entry; 33 | $$; 34 | 35 | 36 | CREATE OR REPLACE PROCEDURE close_sp_process(log_key String,success_status String,log_message STRING) 37 | RETURNS String 38 | LANGUAGE JAVASCRIPT 39 | AS 40 | $$ 41 | 42 | var my_sql_command_date = "select to_char(current_timestamp(2)) as curr_time"; 43 | var statement1 = snowflake.createStatement( {sqlText: my_sql_command_date} ); 44 | var result_set1 = statement1.execute(); 45 | result_set1.next(); 46 | var time_info= result_set1.getColumnValue(1); 47 | 48 | 49 | var my_sql_command_3 = "UPDATE log_storer set Success_status='"+SUCCESS_STATUS+"',log_message='"+LOG_MESSAGE+"',End_time='"+time_info+"' where log_key="+LOG_KEY; 50 | var statement3 = snowflake.createStatement( {sqlText: my_sql_command_3} ); 51 | var result_set3 = statement3.execute(); 52 | return 'Done' 53 | $$; 54 | 55 | 56 | 57 | 58 | --Case 1: Testing correct Procedure 59 | 60 | CREATE OR REPLACE PROCEDURE my_test1() 61 | RETURNS STRING 62 | LANGUAGE JAVASCRIPT 63 | AS 64 | $$ 65 | function sql_runner_with_return(sql_to_be_executed) 66 | { 67 | var my_sql_command = sql_to_be_executed; 68 | var statement1 = snowflake.createStatement( {sqlText: my_sql_command} ); 69 | var result_set1 = statement1.execute(); 70 | result_set1.next(); 71 | var log_key= result_set1.getColumnValue(1); 72 | return log_key 73 | } 74 | 75 | function sql_runner_without_return(sql_to_be_executed) 76 | { 77 | var my_sql_command = sql_to_be_executed; 78 | var statement1 = snowflake.createStatement( {sqlText: my_sql_command} ); 79 | var result_set1 = statement1.execute(); 80 | } 81 | 82 | try 83 | { 84 | var log_key=sql_runner_with_return("call open_sp_process('RAMU.PUBLIC.my_test')") 85 | var x=10/10; 86 | sql_runner_with_return("call system$wait(10)"); 87 | var closing_command ="call close_sp_process("+log_key+",'C','Stored Procedure Successfully completed')" 88 | sql_runner_without_return(closing_command) 89 | return log_key; 90 | } 91 | catch(ERROR) 92 | { 93 | var closing_command ="call close_sp_process("+log_key+",'E','"+ERROR+"')" 94 | sql_runner_without_return(closing_command) 95 | return ERROR 96 | } 97 | $$; 98 | 99 | select * from log_storer; 100 | 101 | call my_test1(); 102 | 103 | select * from log_storer; 104 | 105 | --Case 2: Testing wrong Procedure 106 | 107 | CREATE OR REPLACE PROCEDURE my_test2() 108 | RETURNS STRING 109 | LANGUAGE JAVASCRIPT 110 | AS 111 | $$ 112 | function sql_runner_with_return(sql_to_be_executed) 113 | { 114 | var my_sql_command = sql_to_be_executed; 115 | var statement1 = snowflake.createStatement( {sqlText: my_sql_command} ); 116 | var result_set1 = statement1.execute(); 117 | result_set1.next(); 118 | var log_key= result_set1.getColumnValue(1); 119 | return log_key 120 | } 121 | function sql_runner_without_return(sql_to_be_executed) 122 | { 123 | var my_sql_command = sql_to_be_executed; 124 | var statement1 = snowflake.createStatement( {sqlText: my_sql_command} ); 125 | var result_set1 = statement1.execute(); 126 | } 127 | try 128 | { 129 | var log_key=sql_runner_with_return("call open_sp_process('RAMU.PUBLIC.my_test')") 130 | var x=10/10; 131 | sql_runner_with_return("call system$wait(10)"); 132 | SUM(2,3) 133 | var closing_command ="call close_sp_process("+log_key+",'C','Stored Procedure Successfully completed')" 134 | sql_runner_without_return(closing_command) 135 | return log_key; 136 | } 137 | catch(ERROR) 138 | { 139 | var closing_command ="call close_sp_process("+log_key+",'E','"+ERROR+"')" 140 | 
sql_runner_without_return(closing_command) 141 | return ERROR 142 | } 143 | $$; 144 | 145 | 146 | call my_test2(); 147 | -------------------------------------------------------------------------------- /Snowflake_SP_Util.py: -------------------------------------------------------------------------------- 1 | 2 | --Create Database 3 | Create or replace database snowpark_helper; 4 | use snowpark_helper; 5 | 6 | -- Create a sample table 7 | CREATE OR REPLACE TABLE sample_table ( 8 | id INTEGER, 9 | name STRING, 10 | score INTEGER 11 | ); 12 | 13 | -- Insert sample data into the table 14 | INSERT INTO sample_table (id, name, score) 15 | VALUES 16 | (1, 'ALICE', 85), 17 | (2, 'BOB', 90), 18 | (3, 'CHARLIE', 75), 19 | (4, 'DAVID', 60), 20 | (5, 'EVE', 95); 21 | 22 | select * from sample_table; 23 | 24 | 25 | CREATE OR REPLACE PROCEDURE process_data() 26 | RETURNS STRING 27 | LANGUAGE PYTHON 28 | RUNTIME_VERSION = '3.8' 29 | PACKAGES = ('snowflake-snowpark-python') 30 | Handler='run' 31 | EXECUTE AS CALLER 32 | AS 33 | $$ 34 | from snowflake.snowpark import Session 35 | from snowflake.snowpark.functions import * 36 | 37 | def convert_string_to_lowercase(df): 38 | return df.with_column("name", lower(df["name"])) 39 | 40 | def filter_based_on_score(df, threshold): 41 | return df.filter(df["score"] > threshold) 42 | 43 | def write_data_to_table(df, target_table): 44 | df.write.save_as_table(target_table, mode="overwrite") 45 | 46 | def run(session: Session) -> str: 47 | # Load the data from the source table 48 | df = session.table("sample_table") 49 | 50 | # Apply the transformations 51 | df_lowercase = convert_string_to_lowercase(df) 52 | df_filtered = filter_based_on_score(df_lowercase, 80) 53 | 54 | # Write the transformed data to the target table 55 | target_table = "transformed_table" 56 | write_data_to_table(df_filtered, target_table) 57 | 58 | return "Data transformation and write successful!" 59 | 60 | $$; 61 | 62 | call process_data(); 63 | 64 | select * from transformed_table; 65 | 66 | 67 | --concept of imports 68 | Create or replace stage snowpark_helper.PUBLIC.snowpark_reusable_code url="s3://{}/" 69 | credentials=(aws_key_id='' 70 | aws_secret_key=''); 71 | 72 | list @snowpark_helper.PUBLIC.snowpark_reusable_code; 73 | 74 | 75 | 76 | CREATE OR REPLACE PROCEDURE process_data_with_util() 77 | RETURNS STRING 78 | LANGUAGE PYTHON 79 | RUNTIME_VERSION = '3.8' 80 | PACKAGES = ('snowflake-snowpark-python') 81 | IMPORTS = ('@snowpark_helper.PUBLIC.snowpark_reusable_code/snowpark_modules.zip') 82 | Handler='run' 83 | EXECUTE AS CALLER 84 | AS 85 | $$ 86 | from snowflake.snowpark import Session 87 | from snowflake.snowpark.functions import * 88 | from snowpark_modules.transformation import * 89 | from snowpark_modules.write_data import * 90 | 91 | def run(session: Session) -> str: 92 | # Load the data from the source table 93 | df = session.table("sample_table") 94 | 95 | # Apply the transformations 96 | df_lowercase = convert_string_to_lowercase(df) 97 | df_filtered = filter_based_on_score(df_lowercase, 80) 98 | 99 | # Write the transformed data to the target table 100 | target_table = "transformed_table_with_util" 101 | write_data_to_table(df_filtered, target_table) 102 | 103 | return "Data transformation and write successful!" 
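    # (Clarifying comment, not in the original procedure) snowpark_modules.zip staged above is
    # assumed to package transformation.py (convert_string_to_lowercase, filter_based_on_score)
    # and write_data.py (write_data_to_table) -- the same helpers defined inline in process_data().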
104 | 105 | $$; 106 | 107 | call process_data_with_util(); 108 | 109 | select * from SNOWPARK_HELPER.PUBLIC.TRANSFORMED_TABLE_WITH_UTIL; 110 | 111 | 112 | 113 | 114 | 115 | CREATE OR REPLACE PROCEDURE process_data_with_util_and_param(threshold_value Integer) 116 | RETURNS STRING 117 | LANGUAGE PYTHON 118 | RUNTIME_VERSION = '3.8' 119 | PACKAGES = ('snowflake-snowpark-python') 120 | IMPORTS = ('@snowpark_helper.PUBLIC.snowpark_reusable_code/tf.zip') 121 | Handler='run' 122 | EXECUTE AS CALLER 123 | AS 124 | $$ 125 | from snowflake.snowpark import Session 126 | from snowflake.snowpark.functions import * 127 | from tf.transformation import * 128 | from tf.write_data import * 129 | 130 | def run(session: Session,threshold_value) -> str: 131 | # Load the data from the source table 132 | df = session.table("sample_table") 133 | 134 | # Apply the transformations 135 | df_lowercase = convert_string_to_lowercase(df) 136 | df_filtered = filter_based_on_score(df_lowercase, threshold_value) 137 | 138 | # Write the transformed data to the target table 139 | target_table = "transformed_table_with_util_and_param" 140 | write_data_to_table(df_filtered, target_table) 141 | 142 | return "Data transformation and write successful!" 143 | 144 | $$; 145 | 146 | call process_data_with_util_and_param(70); 147 | 148 | select * from SNOWPARK_HELPER.PUBLIC.TRANSFORMED_TABLE_WITH_UTIL_AND_PARAM; -------------------------------------------------------------------------------- /Snyk Code for Github Action.yml: -------------------------------------------------------------------------------- 1 | name: Example workflow for Python using Snyk 2 | on: push 3 | 4 | jobs: 5 | security: 6 | runs-on: ubuntu-latest 7 | steps: 8 | - uses: actions/checkout@master 9 | - name: Run Snyk to check for vulnerabilities 10 | uses: snyk/actions/python@master 11 | continue-on-error: true # To make sure that SARIF upload gets called 12 | env: 13 | SNYK_TOKEN: ${{ secrets.SNYK_TOKEN }} 14 | with: 15 | command: code test 16 | args: --sarif-file-output=snyk.sarif 17 | - name: Count total number of vulnerabilities 18 | run: | 19 | RESULTS_LENGTH=$(jq '.runs[0].results | length' snyk.sarif) 20 | echo "RESULTS_LENGTH=$RESULTS_LENGTH" >> $GITHUB_ENV 21 | echo $RESULTS_LENGTH 22 | - name: Pass_or_Fail_the_job 23 | run: | 24 | if [ "$RESULTS_LENGTH" != 0 ]; then 25 | echo "Job Failed" 26 | exit 1 27 | else 28 | echo "Pass" 29 | fi 30 | 31 | - name: Send notification on Slack using Webhooks 32 | uses: slackapi/slack-github-action@v1.24.0 33 | if: always() 34 | with: 35 | payload: | 36 | { 37 | "text": "*The Snyk scan result for repo is : ${{ job.status }}* \n*Number of vulnerabilities : ${{ env.RESULTS_LENGTH }}* \n*Detail*: https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}" 38 | } 39 | env: 40 | SLACK_WEBHOOK_URL: ${{ secrets.slack_webhook_url }} -------------------------------------------------------------------------------- /Talend with EMR & Snowflake.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SatadruMukherjee/Data-Preprocessing-Models/3aeeda4e68aafd47fe784a71c4aa02a34be76820/Talend with EMR & Snowflake.png -------------------------------------------------------------------------------- /Time Traven in Snowflake.txt: -------------------------------------------------------------------------------- 1 | use database Ramu; 2 | 3 | create or replace TABLE BUSINESSES ( 4 | BUSINESS_ID VARCHAR(100), 5 | NAME VARCHAR(100), 6 | CITY VARCHAR(50), 7 | 
STATE VARCHAR(2), 8 | REVIEW_COUNT NUMBER(38,4), 9 | STARS NUMBER(38,4) 10 | ); 11 | 12 | INSERT INTO BUSINESSES VALUES 13 | ('QNcv3mwnHJ5w4YB4giqkWw','Preferred Veterinary Care','Pittsburgh','PA',4,3.5), 14 | ('oZG8sxDL54ki9pmDfyL7rA','Not My Dog','Toronto','ON',9,3.5), 15 | ('S06JfRM3ICESOHc1pr3LOA','Chase Bank','Las Vegas','NV',3,5.0), 16 | ('NL_BfZ4BkQXJSYAFouJqsQ','24 hr lockouts','Las Vegas','NV',3,1.0), 17 | ('AnUyv2zHq_35gCeHr8555w','Soma Restaurant','Las Vegas','NV',12,3.0), 18 | ('jjBTBObnHrY87qQIMybjzQ','Blue Jade','Cleveland','OH',24,3.5), 19 | ('PhL85G9Y6OstQzThDIllMQ','Animalerie Little Bear','Westmount','QC',9,4.0), 20 | ('SkRqx-hxVPLgV4K5hxNa9g','Parkview Dental Associates','Sun Prairie','WI',4,3.0), 21 | ('tWX7j4Qg4cXofQqmoNKH3A','Sir Hobbs','Sun Prairie','WI',35,3.0), 22 | ('4a9Rypytzdz9NZuGMS2ZYw','Rogue Bar','Scottsdale','AZ',80,3.5), 23 | ('oYWy-hOTCOF7h8DCAZ_Mxw','Cool Girl','Toronto','ON',48,3.5), 24 | ('AMxxi7jyxhcdNF7FIRbUVA','Remington''s Restaurant','Scottsdale','AZ',103,3.0), 25 | ('d01d-w7pxHrMCX5mDwaaHQ','D Liche','Montréal','QC',89,4.5), 26 | ('66DKb6APF96InEKrUVIbZw','Allo Inde','Montréal','QC',3,3.5); 27 | 28 | 29 | SELECT * from BUSINESSES 30 | WHERE City='Las Vegas'; 31 | 32 | ALTER SESSION SET TIMEZONE = 'UTC'; 33 | select getdate(); 34 | --2021-02-12 10:55:49.190 +0000 35 | 36 | SELECT * from BUSINESSES 37 | WHERE City='Las Vegas'; 38 | 39 | SELECT * from BUSINESSES at(timestamp => '2021-02-12 10:55:49.190 +0000'::timestamp) 40 | WHERE City='Las Vegas'; 41 | 42 | INSERT INTO BUSINESSES 43 | SELECT * from BUSINESSES at(timestamp => '2021-02-12 10:55:49.190 +0000'::timestamp) 44 | WHERE City='Las Vegas'; 45 | 46 | SELECT * from BUSINESSES 47 | WHERE City='Las Vegas'; 48 | -------------------------------------------------------------------------------- /Unstructured Data processing with Snowflake.txt: -------------------------------------------------------------------------------- 1 | --drop database if exists 2 | drop database if exists s3_to_snowflake; 3 | 4 | --Database Creation 5 | create database if not exists s3_to_snowflake; 6 | 7 | --Use the database 8 | use s3_to_snowflake; 9 | 10 | 11 | --create the external stage 12 | create or replace stage s3_to_snowflake.PUBLIC.Snow_stage url="{s3 location}" 13 | credentials=(aws_key_id='{AWS Access Key}' 14 | aws_secret_key='{AWS Secret Key}'); 15 | 16 | 17 | list @s3_to_snowflake.PUBLIC.Snow_stage; 18 | 19 | 20 | CREATE OR REPLACE PROCEDURE count_no_of_pages_sp(file_name string) 21 | RETURNS integer 22 | LANGUAGE PYTHON 23 | RUNTIME_VERSION = '3.8' 24 | PACKAGES = ('snowflake-snowpark-python','PyPDF2') 25 | HANDLER = 'main_fn' 26 | AS 27 | $$ 28 | from snowflake.snowpark.files import SnowflakeFile 29 | import PyPDF2 30 | def main_fn(session,file_name): 31 | f=SnowflakeFile.open(file_name,'rb') 32 | pdf_object=PyPDF2.PdfReader(f); 33 | return len(pdf_object.pages) 34 | $$; 35 | 36 | call count_no_of_pages_sp(BUILD_SCOPED_FILE_URL( @s3_to_snowflake.PUBLIC.Snow_stage , 'crm_system.pdf' )); 37 | 38 | --select BUILD_SCOPED_FILE_URL( @s3_to_snowflake.PUBLIC.Snow_stage , 'upgrades-white-paper-final.pdf' ); 39 | 40 | 41 | CREATE OR REPLACE function count_no_of_pages_udf(file_name string) 42 | RETURNS integer 43 | LANGUAGE PYTHON 44 | RUNTIME_VERSION = '3.8' 45 | PACKAGES = ('snowflake-snowpark-python','PyPDF2') 46 | HANDLER = 'main_fn' 47 | AS 48 | $$ 49 | from snowflake.snowpark.files import SnowflakeFile 50 | import PyPDF2 51 | def main_fn(file_name): 52 | f=SnowflakeFile.open(file_name,'rb') 53 | 
pdf_object=PyPDF2.PdfReader(f); 54 | return len(pdf_object.pages) 55 | $$; 56 | 57 | select count_no_of_pages_udf(BUILD_SCOPED_FILE_URL( @s3_to_snowflake.PUBLIC.Snow_stage , 'sample_pdf_1.pdf' )); 58 | 59 | list @s3_to_snowflake.PUBLIC.Snow_stage; 60 | SELECT "name",count_no_of_pages_udf(BUILD_SCOPED_FILE_URL( @s3_to_snowflake.PUBLIC.Snow_stage , file_name )) as pdf_page_count FROM 61 | (select "name",split_part("name",'/',-1) as file_name from TABLE(RESULT_SCAN(LAST_QUERY_ID()))); 62 | 63 | 64 | 65 | 66 | --create the external stage with directory table enabled 67 | create or replace stage s3_to_snowflake.PUBLIC.Snow_stage_directory_table_yt url="{s3 location}" 68 | credentials=(aws_key_id='{AWS Access Key}' 69 | aws_secret_key='{AWS Secret Key}') 70 | Directory=(ENABLE=TRUE); 71 | 72 | list @s3_to_snowflake.PUBLIC.Snow_stage_directory_table_yt; 73 | 74 | select * from directory(@s3_to_snowflake.PUBLIC.Snow_stage_directory_table_yt); 75 | 76 | SELECT RELATIVE_PATH,count_no_of_pages_udf(BUILD_SCOPED_FILE_URL( @s3_to_snowflake.PUBLIC.Snow_stage_directory_table_yt , RELATIVE_PATH )) as pdf_page_count FROM directory(@s3_to_snowflake.PUBLIC.Snow_stage_directory_table_yt); 77 | 78 | alter stage s3_to_snowflake.PUBLIC.Snow_stage_directory_table_yt refresh; 79 | 80 | --create the external stage with directory table enabled & automatic refresh 81 | create or replace stage s3_to_snowflake.PUBLIC.Snow_stage_directory_table_yt_auto_refresh url="{s3 location}" 82 | credentials=(aws_key_id='{AWS Access Key}' 83 | aws_secret_key='{AWS Secret Key}') 84 | Directory=(ENABLE=TRUE AUTO_REFRESH=TRUE); 85 | 86 | desc stage s3_to_snowflake.PUBLIC.Snow_stage_directory_table_yt_auto_refresh; 87 | 88 | 89 | SELECT RELATIVE_PATH,count_no_of_pages_udf(BUILD_SCOPED_FILE_URL( @s3_to_snowflake.PUBLIC.Snow_stage_directory_table_yt_auto_refresh , RELATIVE_PATH )) as pdf_page_count FROM directory(@s3_to_snowflake.PUBLIC.Snow_stage_directory_table_yt_auto_refresh); -------------------------------------------------------------------------------- /Updated GenAI Notes.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SatadruMukherjee/Data-Preprocessing-Models/3aeeda4e68aafd47fe784a71c4aa02a34be76820/Updated GenAI Notes.pdf -------------------------------------------------------------------------------- /_Manual Offset Commits & At Least Once Processing in Kafka Consumer.drawio.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SatadruMukherjee/Data-Preprocessing-Models/3aeeda4e68aafd47fe784a71c4aa02a34be76820/_Manual Offset Commits & At Least Once Processing in Kafka Consumer.drawio.png -------------------------------------------------------------------------------- /airflow_emr_s3_snowflake_setup.txt: -------------------------------------------------------------------------------- 1 | 2 | 3 | sudo apt update 4 | sudo apt install -y python3-pip 5 | sudo apt install -y sqlite3 6 | sudo apt-get install -y libpq-dev 7 | pip3 install --upgrade awscli 8 | pip3 install boto3 9 | sudo pip3 install virtualenv 10 | virtualenv venv 11 | source venv/bin/activate 12 | pip install "apache-airflow[postgres]==2.5.0" --constraint "https://raw.githubusercontent.com/apache/airflow/constraints-2.5.0/constraints-3.7.txt" 13 | pip install pandas apache-airflow-providers-snowflake==2.1.0 snowflake-connector-python==2.5.1 snowflake-sqlalchemy==1.2.5 14 | airflow db init 15 | sudo apt-get install postgresql 
postgresql-contrib 16 | sudo -i -u postgres 17 | psql 18 | CREATE DATABASE airflow; 19 | CREATE USER airflow WITH PASSWORD 'airflow'; 20 | GRANT ALL PRIVILEGES ON DATABASE airflow TO airflow; 21 | exit 22 | exit 23 | ls 24 | cd airflow 25 | sed -i 's#sqlite:////home/ubuntu/airflow/airflow.db#postgresql+psycopg2://airflow:airflow@localhost/airflow#g' airflow.cfg 26 | sed -i 's#SequentialExecutor#LocalExecutor#g' airflow.cfg 27 | airflow db init 28 | airflow users create -u airflow -f airflow -l airflow -r Admin -e airflow1@gmail.com 29 | User id --airflow 30 | password--admin@123! 31 | mkdir /home/ubuntu/dags 32 | cd airflow 33 | vi airflow.cfg 34 | change the below properties -- 35 | dags_folder = /home/ubuntu/dags 36 | load_examples = False 37 | 38 | 39 | 40 | 41 | airflow db init 42 | airflow webserver 43 | 44 | source venv/bin/activate 45 | airflow scheduler 46 | 47 | Create external stage & tables in Snowflake 48 | 49 | Snowflake Queries: 50 | ------------------------ 51 | drop database if exists s3_to_snowflake; 52 | 53 | use role accountadmin; 54 | --Database Creation 55 | create database if not exists s3_to_snowflake; 56 | 57 | --Specify the active/current database for the session. 58 | use s3_to_snowflake; 59 | 60 | 61 | 62 | 63 | create or replace stage s3_to_snowflake.PUBLIC.snow_simple url="s3://irisseta/output_folder/" 64 | credentials=(aws_key_id='' 65 | aws_secret_key=''); 66 | 67 | 68 | 69 | list @s3_to_snowflake.PUBLIC.snow_simple; 70 | 71 | --File Format Creation 72 | create or replace file format my_parquet_format 73 | type = parquet; 74 | 75 | 76 | 77 | --Table Creation 78 | create or replace external table s3_to_snowflake.PUBLIC.Iris_dataset (CLASS_NAME varchar(20) as (Value:CLASS_NAME::varchar), 79 | Count_Value Number as (Value:count::Number)) with location = @s3_to_snowflake.PUBLIC.snow_simple 80 | file_format ='my_parquet_format'; 81 | 82 | 83 | select * from s3_to_snowflake.PUBLIC.Iris_dataset; 84 | 85 | 86 | 87 | Create Snowflake Connection 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | -------------------------------------------------------------------------------- /airflow_install.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -xe 2 | upload_log() { 3 | aws s3 cp /tmp/userdata_execution.log s3://demoytuserdataairflow/logs/ 4 | } 5 | 6 | trap 'upload_log' EXIT 7 | 8 | sudo -u ubuntu -i <<'EOF' 9 | 10 | exec &>> /tmp/userdata_execution.log 11 | 12 | 13 | sudo apt update 14 | sudo apt -y install awscli 15 | sudo apt --yes install python3-pip 16 | sudo apt --yes install sqlite3 17 | sudo apt-get --yes install libpq-dev 18 | pip3 install --upgrade awscli 19 | sudo pip3 install virtualenv 20 | python3 -m virtualenv /home/ubuntu/venv 21 | source /home/ubuntu/venv/bin/activate 22 | pip install "apache-airflow[postgres]==2.5.0" --constraint "https://raw.githubusercontent.com/apache/airflow/constraints-2.5.0/constraints-3.7.txt" pandas boto3 23 | airflow db init 24 | sudo apt-get --yes install postgresql postgresql-contrib 25 | sudo -i -u postgres <<'EOpostgres' 26 | psql -U postgres -c "CREATE DATABASE airflow;" 27 | psql -U postgres -c "CREATE USER airflow WITH PASSWORD 'airflow';" 28 | psql -U postgres -c "GRANT ALL PRIVILEGES ON DATABASE airflow TO airflow;" 29 | EOpostgres 30 | sed -i 's#sqlite:////home/ubuntu/airflow/airflow.db#postgresql+psycopg2://airflow:airflow@localhost/airflow#g' /home/ubuntu/airflow/airflow.cfg 31 | sed -i 's#SequentialExecutor#LocalExecutor#g' 
/home/ubuntu/airflow/airflow.cfg 32 | airflow db init 33 | airflow users create -u airflow -f airflow -l airflow -r Admin -e airflow@gmail.com -p admin@123! 34 | mkdir /home/ubuntu/dags 35 | aws s3 cp s3://demoytuserdataairflow/dags/hello_world.py /home/ubuntu/dags 36 | sed -i 's/^dags_folder = .*/dags_folder = \/home\/ubuntu\/dags/' /home/ubuntu/airflow/airflow.cfg 37 | sed -i 's/^load_examples = .*/load_examples = False/' /home/ubuntu/airflow/airflow.cfg 38 | airflow db init 39 | EOF -------------------------------------------------------------------------------- /airflow_news_data_pipeline.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import airflow 3 | from airflow import DAG 4 | from airflow.operators.python_operator import PythonOperator 5 | from airflow.contrib.operators.snowflake_operator import SnowflakeOperator 6 | from airflow.contrib.hooks.snowflake_hook import SnowflakeHook 7 | from airflow.operators.bash_operator import BashOperator 8 | from datetime import datetime, timedelta 9 | from news_fetcher_etl import runner 10 | logging.basicConfig(level=logging.INFO) 11 | logger = logging.getLogger(__name__) 12 | 13 | 14 | args = {"owner": "Airflow", "start_date": airflow.utils.dates.days_ago(2)} 15 | 16 | dag = DAG( 17 | dag_id="snowflake_automation_dag", default_args=args, schedule_interval=None 18 | ) 19 | 20 | 21 | 22 | 23 | with dag: 24 | 25 | extract_news_info = PythonOperator( 26 | task_id='extract_news_info', 27 | python_callable=runner, 28 | dag=dag, 29 | ) 30 | 31 | move_file_to_s3 = BashOperator( 32 | task_id="move_file_to_s3", 33 | bash_command='aws s3 mv {{ ti.xcom_pull("extract_news_info")}} s3://irisseta', 34 | ) 35 | 36 | snowflake_create_table=SnowflakeOperator( 37 | task_id="snowflake_create_table", 38 | sql="""create table if not exists helloparquet using template(select ARRAY_AGG(OBJECT_CONSTRUCT(*)) from TABLE(INFER_SCHEMA (LOCATION=>'@ramu.PUBLIC.snow_simple',FILE_FORMAT=>'parquet_format'))) 39 | """ , 40 | snowflake_conn_id="snowflake_conn" 41 | ) 42 | 43 | 44 | snowflake_copy=SnowflakeOperator( 45 | task_id="snowflake_copy", 46 | sql="""copy into ramu.PUBLIC.helloparquet from @ramu.PUBLIC.snow_simple MATCH_BY_COLUMN_NAME=CASE_INSENSITIVE FILE_FORMAT=parquet_format 47 | """ , 48 | snowflake_conn_id="snowflake_conn" 49 | ) 50 | 51 | 52 | extract_news_info >> move_file_to_s3 >> snowflake_create_table >> snowflake_copy -------------------------------------------------------------------------------- /airflow_talend_runner.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import airflow 3 | from airflow import DAG 4 | from airflow.operators.python_operator import PythonOperator 5 | from airflow.operators.bash_operator import BashOperator 6 | from datetime import datetime, timedelta 7 | from datetime import datetime, timedelta 8 | from airflow.models import DAG 9 | from airflow.providers.amazon.aws.sensors.s3 import S3KeySensor 10 | import boto3 11 | logging.basicConfig(level=logging.INFO) 12 | logger = logging.getLogger(__name__) 13 | 14 | 15 | args = {"owner": "Airflow", "start_date": airflow.utils.dates.days_ago(2)} 16 | 17 | dag = DAG( 18 | dag_id="Hello_World", default_args=args, schedule_interval=None 19 | ) 20 | 21 | 22 | s3_bucketname = 's3datacapturetestconnector' 23 | s3_loc = 'landing_directory/success.txt' 24 | 25 | 26 | 27 | with dag: 28 | 29 | s3_sensor = S3KeySensor( 30 | task_id='success_flg_check', 31 | bucket_name=s3_bucketname, 
32 | bucket_key=s3_loc, 33 | aws_conn_id='aws_default', 34 | mode='poke', 35 | poke_interval=5, 36 | timeout=15, 37 | soft_fail=False 38 | ) 39 | 40 | load_data_sf_table=BashOperator( 41 | task_id="move_file_from_s3_to_snowflake", 42 | bash_command='sh /home/ubuntu/dags/elt_runner/elt_runner_run.sh ', 43 | ) 44 | 45 | s3_sensor >> load_data_sf_table -------------------------------------------------------------------------------- /airflow_talend_success_file_snesor.bash: -------------------------------------------------------------------------------- 1 | #!/bin/bash -xe 2 | upload_log() { 3 | aws s3 cp /tmp/userdata_execution.log s3://s3datacapturetestconnector/logs/ 4 | } 5 | 6 | trap 'upload_log' EXIT 7 | 8 | sudo -u ubuntu -i <<'EOF' 9 | 10 | exec &>> /tmp/userdata_execution.log 11 | 12 | 13 | sudo apt update 14 | sudo apt-get install -y openjdk-8-jdk 15 | sudo apt-get install -y unzip 16 | sudo apt -y install awscli 17 | sudo apt --yes install python3-pip 18 | sudo apt --yes install sqlite3 19 | sudo apt-get --yes install libpq-dev 20 | pip3 install --upgrade awscli 21 | sudo pip3 install virtualenv 22 | python3 -m virtualenv /home/ubuntu/venv 23 | source /home/ubuntu/venv/bin/activate 24 | pip install "apache-airflow[postgres]==2.5.0" --constraint "https://raw.githubusercontent.com/apache/airflow/constraints-2.5.0/constraints-3.7.txt" pandas boto3 25 | airflow db init 26 | sudo apt-get --yes install postgresql postgresql-contrib 27 | sudo -i -u postgres <<'EOpostgres' 28 | psql -U postgres -c "CREATE DATABASE airflow;" 29 | psql -U postgres -c "CREATE USER airflow WITH PASSWORD 'airflow';" 30 | psql -U postgres -c "GRANT ALL PRIVILEGES ON DATABASE airflow TO airflow;" 31 | EOpostgres 32 | sed -i 's#sqlite:////home/ubuntu/airflow/airflow.db#postgresql+psycopg2://airflow:airflow@localhost/airflow#g' /home/ubuntu/airflow/airflow.cfg 33 | sed -i 's#SequentialExecutor#LocalExecutor#g' /home/ubuntu/airflow/airflow.cfg 34 | airflow db init 35 | airflow users create -u airflow -f airflow -l airflow -r Admin -e airflow@gmail.com -p admin@123! 
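# (Clarifying comment, not in the original script) The remaining steps pull the DAG code and the
# packaged Talend job (elt_runner_0.1.zip) from S3 into /home/ubuntu/dags, unzip it, point
# Airflow's dags_folder at that directory, and then re-run airflow db init.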
36 | mkdir /home/ubuntu/dags 37 | pip install apache-airflow-providers-amazon[apache.hive] 38 | aws s3 cp s3://s3datacapturetestconnector/codebase /home/ubuntu/dags --recursive 39 | unzip -d /home/ubuntu/dags /home/ubuntu/dags/elt_runner_0.1.zip 40 | sed -i 's/^dags_folder = .*/dags_folder = \/home\/ubuntu\/dags/' /home/ubuntu/airflow/airflow.cfg 41 | sed -i 's/^load_examples = .*/load_examples = False/' /home/ubuntu/airflow/airflow.cfg 42 | airflow db init 43 | EOF -------------------------------------------------------------------------------- /algolia_layer1.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SatadruMukherjee/Data-Preprocessing-Models/3aeeda4e68aafd47fe784a71c4aa02a34be76820/algolia_layer1.zip -------------------------------------------------------------------------------- /aws-eventbridge-kinesisfirehose-s3.drawio.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SatadruMukherjee/Data-Preprocessing-Models/3aeeda4e68aafd47fe784a71c4aa02a34be76820/aws-eventbridge-kinesisfirehose-s3.drawio.png -------------------------------------------------------------------------------- /dbscan_visualization.txt: -------------------------------------------------------------------------------- 1 | clc 2 | clear all 3 | close all 4 | warning off 5 | load matlab.mat 6 | x=[x';x2';x3';t1']; 7 | y=[y';y2';y3';y1']; 8 | for i=1:length(x) 9 | plot(x(i),y(i),'.','MarkerSize',10); 10 | axis([-3 3 -1.5 3.5]); 11 | drawnow limitrate; 12 | hold on; 13 | end 14 | hold on; 15 | data=[x y]; 16 | idx=dbscan(data,0.2,15); 17 | for i=1:length(x) 18 | if(idx(i)==1) 19 | plot(x(i),y(i),'r.','MarkerSize',20); 20 | elseif(idx(i)==2) 21 | plot(x(i),y(i),'g.','MarkerSize',20); 22 | elseif(idx(i)==3) 23 | plot(x(i),y(i),'b.','MarkerSize',20); 24 | elseif(idx(i)==4) 25 | plot(x(i),y(i),'m.','MarkerSize',20); 26 | else 27 | plot(x(i),y(i),'c.','MarkerSize',20); 28 | end 29 | drawnow limitrate; 30 | hold on; 31 | end 32 | -------------------------------------------------------------------------------- /generate_smiling_face_cluster.txt: -------------------------------------------------------------------------------- 1 | clc 2 | clear all 3 | close all 4 | warning off 5 | t=0:0.01:2*pi; 6 | x=2*cos(t)+0.08*randn(length(t),1)'; 7 | y=1+2*sin(t)+0.08*randn(length(t),1)'; 8 | scatter(x,y); 9 | axis square 10 | hold on 11 | t=0:0.08:2*pi; 12 | x2=-1+0.1*cos(t)+0.08*randn(length(t),1)'; 13 | y2=2+0.1*sin(t)+0.08*randn(length(t),1)'; 14 | scatter(x2,y2,'k'); 15 | hold on; 16 | x3=1+0.1*cos(t)+0.08*randn(length(t),1)'; 17 | y3=2+0.1*sin(t)+0.08*randn(length(t),1)'; 18 | scatter(x3,y3,'k') 19 | t1=-0.6:0.01:0.6; 20 | y1=t1.^2+0.08*rand(length(t1),1)'; 21 | scatter(t1,y1) -------------------------------------------------------------------------------- /generic_job_s3_to_snowflake_using_copy_command.py: -------------------------------------------------------------------------------- 1 | """ 2 | pip install pandas 3 | pip install -r https://raw.githubusercontent.com/snowflakedb/snowflake-connector-python/v2.3.10/tested_requirements/requirements_38.reqs -t . 4 | pip install snowflake-connector-python==2.3.10 -t . 
5 | """ 6 | import pandas as pd 7 | import snowflake.connector as sf 8 | 9 | user="" 10 | password="@" 11 | account="" 12 | conn=sf.connect(user=user,password=password,account=account) 13 | 14 | 15 | def run_query(query): 16 | print("Executing the query : {}".format(query)) 17 | cursor = conn.cursor() 18 | cursor.execute(query) 19 | cursor.close() 20 | 21 | 22 | def run_query_single_value_return(query): 23 | print("Executing the query : {}".format(query)) 24 | cursor = conn.cursor() 25 | cursor.execute(query) 26 | records = cursor.fetchone()[0] 27 | cursor.close() 28 | return records 29 | 30 | 31 | def run_copy_command(query): 32 | print("Executing the query : {}".format(query)) 33 | cursor = conn.cursor() 34 | cursor.execute(query) 35 | query_id=cursor.sfqid 36 | cursor.close() 37 | return query_id 38 | 39 | 40 | def execute_copy_cmd(): 41 | print("Reading the metadata file...") 42 | 43 | df=pd.read_csv('{}') 44 | 45 | for index,row in df.iterrows(): 46 | database=row['DATABASE'] 47 | schema=row['SCHEMA'] 48 | table=row['TABLE_NAME'] 49 | external_stage_object=row['STAGE_OBJECT'] 50 | s3_file_path=row['S3_FILE_PATH_TO_BE_APPENDED_WITH_STAGE_OBJECT'] 51 | warehouse=row['WAREHOUSE'] 52 | snowflake_role=row['SNOWFLAKE_ROLE'] 53 | file_format=row['FILE_FORMAT'] 54 | pattern=row['PATTERN'] 55 | 56 | 57 | #set up the env of execution 58 | statement_1 = 'use warehouse ' + warehouse 59 | statement2 = 'alter warehouse ' + warehouse + " resume IF SUSPENDED" 60 | statement3 = "use database " + database 61 | statement4 = "use role " + snowflake_role 62 | statement5= "use schema " + schema 63 | run_query(statement_1) 64 | run_query(statement2) 65 | run_query(statement3) 66 | run_query(statement4) 67 | run_query(statement5) 68 | 69 | #executing the copy command 70 | copy_command="""copy into {}.{}.{} from @{}/{}/ FILE_FORMAT={} PATTERN='{}' ON_ERROR=CONTINUE""".format(database,schema,table,external_stage_object,s3_file_path,file_format,pattern) 71 | query_id_of_the_copy_command=run_copy_command(copy_command) 72 | 73 | #check whether copy command picked up any file or not 74 | detecting_copy_command_picked_up_file_or_not="""SELECT "status" FROM TABLE(RESULT_SCAN('{}')) limit 1;""".format(query_id_of_the_copy_command) 75 | first_value_of_status_in_copy_command_output= run_query_single_value_return(detecting_copy_command_picked_up_file_or_not) 76 | print("First value of result-set of the above copy command execution : {}".format(first_value_of_status_in_copy_command_output)) 77 | count_no_of_rows_inserted_due_to_copy_command=0 78 | 79 | if(first_value_of_status_in_copy_command_output!='Copy executed with 0 files processed.'): 80 | #rows inserted by copy command 81 | command_to_get_no_of_rows_inserted_due_to_copy_command="""SELECT sum("rows_loaded") FROM TABLE(RESULT_SCAN('{}'));""".format(query_id_of_the_copy_command) 82 | count_no_of_rows_inserted_due_to_copy_command=run_query_single_value_return(command_to_get_no_of_rows_inserted_due_to_copy_command) 83 | print("No. 
of rows inserted due to this copy command execution : {}".format(count_no_of_rows_inserted_due_to_copy_command)) 84 | 85 | #Capture the rejected records 86 | rejects_collector = """insert into {}.{}.copy_cmd_rejects select '{}' QUERY_ID,'{}' TABLE_NAME, CURRENT_TIMESTAMP(),A.* from table(validate({}.{}.{},job_id=>'{}')) A""".format(database,schema,query_id_of_the_copy_command,table,database,schema,table,query_id_of_the_copy_command) 87 | run_query(rejects_collector) 88 | 89 | 90 | #get total number of rejected records 91 | rejected_records="select count(distinct ROW_NUMBER) from {}.{}.copy_cmd_rejects where QUERY_ID='{}'".format(database,schema,query_id_of_the_copy_command) 92 | count_of_rejected_records=run_query_single_value_return(rejected_records) 93 | 94 | #audit the records 95 | audit_copy="""insert into {}.{}.COPY_AUDIT select QUERY_ID,QUERY_TEXT,DATABASE_NAME,'{}' ROWS_INSERTED,'{}' 96 | ROWS_REJECTED,SCHEMA_NAME,ROLE_NAME,WAREHOUSE_NAME,WAREHOUSE_SIZE,EXECUTION_STATUS,ERROR_MESSAGE,EXECUTION_TIME ,current_timestamp() ETL_TS 97 | FROM table(information_schema.query_history()) where query_type='COPY' AND QUERY_ID='{}' """.format(database,schema,count_no_of_rows_inserted_due_to_copy_command, 98 | count_of_rejected_records,query_id_of_the_copy_command) 99 | run_query(audit_copy) 100 | 101 | 102 | execute_copy_cmd() -------------------------------------------------------------------------------- /incremental_etl.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SatadruMukherjee/Data-Preprocessing-Models/3aeeda4e68aafd47fe784a71c4aa02a34be76820/incremental_etl.zip -------------------------------------------------------------------------------- /ingest.sh: -------------------------------------------------------------------------------- 1 | wget -O - https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data | aws s3 cp - s3://irisseta/input_folder/hello_world.csv -------------------------------------------------------------------------------- /iris_partitioned_Data.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SatadruMukherjee/Data-Preprocessing-Models/3aeeda4e68aafd47fe784a71c4aa02a34be76820/iris_partitioned_Data.zip -------------------------------------------------------------------------------- /kafka snowflake integration.txt: -------------------------------------------------------------------------------- 1 | Kafka Snowflake Integration: 2 | -------------------------------------------------------- 3 | Download the required jar file -- https://mvnrepository.com/artifact/com.snowflake/snowflake-kafka-connector/1.5.0 4 | 5 | Put this jar in libs folders 6 | 7 | Update the plugin.path in kafka connect-standalone properties. 
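For example, a minimal sketch of that change (the path is an assumption based on the Windows install referenced later in this note; point plugin.path at the folder that holds the snowflake-kafka-connector jar):

plugin.path=F:/kafka_2.12-3.2.0/libs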
8 | 9 | Create Private & Public key-pair: 10 | -------------------------------------------------------------- 11 | openssl genrsa -out rsa_key.pem 2048 12 | openssl rsa -in rsa_key.pem -pubout -out rsa_key.pub 13 | 14 | 15 | Configure the public key in Snowflake: 16 | ---------------------------------------------------------------- 17 | 18 | alter user {User_name} set rsa_public_key='{Put the Public key content here}'; 19 | 20 | Verify the public key is configured properly or not -- 21 | desc user {User_name}; 22 | 23 | 24 | 25 | Create a SF_connect.properties file with below properties in config folder -- 26 | 27 | connector.class=com.snowflake.kafka.connector.SnowflakeSinkConnector 28 | tasks.max=8 29 | topics={topic_name} 30 | snowflake.topic2table.map={topic_name}:{snowflake_table_name} 31 | buffer.count.records=10000 32 | buffer.flush.time=60 33 | buffer.size.bytes=5000000 34 | snowflake.url.name={Snowflake URL} 35 | snowflake.user.name={Snowflake User Name} 36 | snowflake.private.key={Put the Private key content here} 37 | snowflake.database.name={Snowflake Database Name} 38 | snowflake.schema.name={Snowflake Schema Name} 39 | key.converter=com.snowflake.kafka.connector.records.SnowflakeJsonConverter 40 | value.converter=com.snowflake.kafka.connector.records.SnowflakeJsonConverter 41 | name={} 42 | 43 | Create the topic if not already exists & run the python code to ingest the data in the topic. 44 | 45 | 46 | Start the Kafka Connector: 47 | --------------------------------------------------------- 48 | F:/kafka_2.12-3.2.0/bin/windows/connect-standalone.bat F:/kafka_2.12-3.2.0/config/connect-standalone.properties F:/kafka_2.12-3.2.0/config/SF_connect.properties 49 | 50 | To unset the Public Key in Snowflake: 51 | ---------------------------------------------------------------------- 52 | alter user {User_name} unset rsa_public_key; 53 | -------------------------------------------------------------------------------- /kafka_producer_with_topic_partitioning.py: -------------------------------------------------------------------------------- 1 | from time import sleep 2 | from json import dumps 3 | from kafka import KafkaProducer 4 | 5 | 6 | #Lab 1: Write message to a partition (mentioning the partition number while publishing the message) 7 | 8 | topic_name='hello_world1' 9 | producer = KafkaProducer(bootstrap_servers=['localhost:9092'],value_serializer=lambda x: dumps(x).encode('utf-8')) 10 | data1 = {'number' : 1} 11 | data2 = {'number' : 2} 12 | data3 = {'number' : 3} 13 | data4 = {'number' : 4} 14 | data5 = {'number' : 5} 15 | data6 = {'number' : 6} 16 | producer.send(topic_name, value=data1,partition=1) 17 | producer.send(topic_name, value=data2,partition=1) 18 | producer.send(topic_name, value=data3,partition=1) 19 | producer.send(topic_name, value=data4,partition=2) 20 | producer.send(topic_name, value=data5,partition=2) 21 | producer.send(topic_name, value=data6,partition=0) 22 | producer.close() 23 | 24 | #Lab 2: Pass key value pair 25 | from json import dumps 26 | from kafka import KafkaProducer 27 | producer = KafkaProducer(bootstrap_servers=['localhost:9092']) 28 | topic_name='hello_world2' 29 | producer.send(topic_name, key=b'foo', value=b'bar') #Note :key & value serialization we are doing while publishing the message 30 | #itself , so explicitly not mentioning the key or value serializer 31 | producer.send(topic_name, key=b'foo', value=b'bar') 32 | producer.close() 33 | 34 | #Lab 3: Pass key value pair with key & value serialization with key or value serializer 
explicitly mentioned 35 | from json import dumps 36 | from kafka import KafkaProducer 37 | producer = KafkaProducer(bootstrap_servers=['localhost:9092'],key_serializer=str.encode,value_serializer=lambda x: dumps(x).encode('utf-8')) 38 | topic_name='hello_world3' 39 | data1 = {'number' : 1} 40 | data2 = {'number' : 2} 41 | data3 = {'number' : 3} 42 | data4 = {'number' : 4} 43 | data5 = {'number' : 5} 44 | data6 = {'number' : 6} 45 | producer.send(topic_name, key='ping',value=data1) 46 | producer.send(topic_name, key='ping',value=data2) 47 | producer.send(topic_name, key='ping',value=data3) 48 | producer.send(topic_name, key='pong',value=data4) 49 | producer.send(topic_name, key='pong',value=data5) 50 | producer.send(topic_name, key='pong',value=data6) 51 | producer.close() 52 | 53 | 54 | 55 | #Lab 4: Customize a partitioner 56 | from time import sleep 57 | from json import dumps 58 | from kafka import KafkaProducer 59 | 60 | 61 | def custom_partitioner(key, all_partitions, available): 62 | """ 63 | Customer Kafka partitioner to get the partition corresponding to key 64 | :param key: partitioning key 65 | :param all_partitions: list of all partitions sorted by partition ID 66 | :param available: list of available partitions in no particular order 67 | :return: one of the values from all_partitions or available 68 | """ 69 | print("The key is : {}".format(key)) 70 | print("All partitions : {}".format(all_partitions)) 71 | print("After decoding of the key : {}".format(key.decode('UTF-8'))) 72 | return int(key.decode('UTF-8'))%len(all_partitions) 73 | 74 | 75 | producer = KafkaProducer(bootstrap_servers=['localhost:9092'],partitioner=custom_partitioner) 76 | topic_name='hello_world4' 77 | producer.send(topic_name, key=b'3', value=b'Hello Partitioner') 78 | producer.send(topic_name, key=b'2', value=b'Hello Partitioner123') 79 | producer.send(topic_name, key=b'369', value=b'Hello Partitioner') 80 | producer.send(topic_name, key=b'301', value=b'Hello Partitioner') 81 | 82 | 83 | -------------------------------------------------------------------------------- /kafka_yt_demo.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SatadruMukherjee/Data-Preprocessing-Models/3aeeda4e68aafd47fe784a71c4aa02a34be76820/kafka_yt_demo.zip -------------------------------------------------------------------------------- /key monitor.txt: -------------------------------------------------------------------------------- 1 | import json 2 | import boto3 3 | import datetime 4 | from datetime import datetime,timedelta 5 | from dateutil import tz 6 | 7 | def lambda_handler(event, context): 8 | client=boto3.client('iam') 9 | time_utc=datetime.now(tz.tzutc()) 10 | response=client.list_users(); 11 | print("List of users : ",response) 12 | for user in response['Users']: 13 | print("Access keys for user : {}".format(user['UserName'])) 14 | access_key_info=client.list_access_keys(UserName=user['UserName']) 15 | print(access_key_info) 16 | access_key_metadata=access_key_info['AccessKeyMetadata'] 17 | for key in access_key_metadata: 18 | if(key['CreateDate']<(time_utc-timedelta(days=90))): 19 | print("Alert!!") 20 | -------------------------------------------------------------------------------- /lambda_powertools.py: -------------------------------------------------------------------------------- 1 | import json 2 | from aws_lambda_powertools.event_handler.exceptions import NotFoundError 3 | from aws_lambda_powertools.event_handler import ( 4 | 
APIGatewayRestResolver, 5 | Response, 6 | content_types, 7 | ) 8 | app = APIGatewayRestResolver() 9 | 10 | @app.not_found 11 | def handle_not_found_errors(exc: NotFoundError) -> Response: 12 | return Response(status_code=418, content_type=content_types.TEXT_PLAIN, body="No such resource path found") 13 | 14 | @app.get("/v1") 15 | def v1_call(): 16 | print("Inside v1") 17 | return {"path":1} 18 | 19 | @app.get("/v2") 20 | def v2_call(): 21 | print("Inside v2") 22 | return {"path":2} 23 | 24 | @app.get("/v4/<animal_name>") 25 | def v4_call(animal_name: str): 26 | print("Inside v4") 27 | return {"value":animal_name} 28 | 29 | 30 | @app.post("/v3") 31 | def v3_call(): 32 | print("Inside v3 post endpoint") 33 | todo_data: dict = app.current_event.json_body 34 | return f"I love my {todo_data['country']}" 35 | 36 | def lambda_handler(event, context): 37 | # TODO implement 38 | print("Input Event: ",event) 39 | return app.resolve(event, context) 40 | -------------------------------------------------------------------------------- /mysql_cdc_fetcher_runner.py: -------------------------------------------------------------------------------- 1 | import json 2 | import boto3 3 | 4 | from pymysqlreplication import BinLogStreamReader 5 | from pymysqlreplication.row_event import ( 6 | DeleteRowsEvent, 7 | UpdateRowsEvent, 8 | WriteRowsEvent, 9 | ) 10 | 11 | def main(): 12 | kinesis = boto3.client("kinesis",region_name='{}') 13 | stream = BinLogStreamReader( 14 | connection_settings= { 15 | "host": "{}", 16 | "port":{} , 17 | "user": "{}", 18 | "passwd": "{}"}, 19 | server_id=100, 20 | blocking=True, 21 | resume_stream=True, 22 | only_events=[DeleteRowsEvent, WriteRowsEvent, UpdateRowsEvent]) 23 | for binlogevent in stream: 24 | for row in binlogevent.rows: 25 | event = {"schema": binlogevent.schema, 26 | "table": binlogevent.table, 27 | "type": type(binlogevent).__name__, 28 | "row": row 29 | } 30 | kinesis.put_record(StreamName="{}", Data=str(event), PartitionKey="1") 31 | print(json.dumps(event)) 32 | 33 | main() -------------------------------------------------------------------------------- /news_fetcher.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import json 3 | import requests 4 | import os 5 | from base64 import b64decode 6 | import datetime 7 | from datetime import date 8 | import uuid 9 | import os 10 | 11 | 12 | def runner(): 13 | today = (date.today()) 14 | api_key = '' 15 | 16 | base_url = "https://newsapi.org/v2/everything?q={}&from={}&to={}&sortBy=popularity&apiKey={}&language=en" 17 | print(base_url) 18 | start_date_value = str(today - datetime.timedelta(days=1)) 19 | end_date_value = str(today) 20 | 21 | df = pd.DataFrame(columns=['newsTitle', 'timestamp', 'url_source', 'content', 'source', 'author', 'urlToImage']) 22 | 23 | url_extractor = base_url.format('India', start_date_value, end_date_value, api_key) 24 | print(url_extractor) 25 | response = requests.get(url_extractor) 26 | d = response.json() 27 | 28 | for i in d['articles']: 29 | newsTitle = i['title'] 30 | timestamp = i['publishedAt'] 31 | trimmed_part = "None" 32 | url_source = i['url'] 33 | source = i['source'] 34 | author = i['author'] 35 | urlToImage = i['urlToImage'] 36 | partial_content = "" 37 | if (str(i['content']) != 'None'): 38 | partial_content = i['content'] 39 | if (len(partial_content) >= 200): 40 | partial_content = partial_content[0:199] 41 | if ('.' 
in partial_content): 42 | trimmed_part = partial_content[:partial_content.rindex('.')] 43 | else: 44 | trimmed_part = partial_content 45 | df = pd.concat([df, pd.DataFrame( 46 | {'newsTitle': newsTitle, 'timestamp': timestamp, 'url_source': url_source, 'content': trimmed_part, 47 | 'source': source, 'author': author, 'urlToImage': urlToImage})], ignore_index=True) 48 | 49 | 50 | output_file = "/home/ubuntu/news_data.parquet" 51 | df1 = df.drop_duplicates() 52 | df1.to_parquet(output_file) 53 | 54 | runner() -------------------------------------------------------------------------------- /news_fetcher_etl.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import json 3 | import requests 4 | import os 5 | from base64 import b64decode 6 | import datetime 7 | from datetime import date 8 | import uuid 9 | import os 10 | 11 | 12 | def runner(): 13 | today = (date.today()) 14 | api_key = '{}' 15 | 16 | base_url = "https://newsapi.org/v2/everything?q={}&from={}&to={}&sortBy=popularity&apiKey={}&language=en" 17 | print(base_url) 18 | start_date_value = str(today - datetime.timedelta(days=1)) 19 | end_date_value = str(today) 20 | 21 | df = pd.DataFrame(columns=['newsTitle', 'timestamp', 'url_source', 'content', 'source', 'author', 'urlToImage']) 22 | 23 | url_extractor = base_url.format('Covid', start_date_value, end_date_value, api_key) 24 | print(url_extractor) 25 | response = requests.get(url_extractor) 26 | d = response.json() 27 | 28 | for i in d['articles']: 29 | newsTitle = i['title'] 30 | timestamp = i['publishedAt'] 31 | trimmed_part = "None" 32 | url_source = i['url'] 33 | source = i['source'] 34 | author = i['author'] 35 | urlToImage = i['urlToImage'] 36 | partial_content = "" 37 | if (str(i['content']) != 'None'): 38 | partial_content = i['content'] 39 | if (len(partial_content) >= 200): 40 | partial_content = partial_content[0:199] 41 | if ('.' 
in partial_content): 42 | trimmed_part = partial_content[:partial_content.rindex('.')] 43 | else: 44 | trimmed_part = partial_content 45 | df = pd.concat([df, pd.DataFrame( 46 | {'newsTitle': newsTitle, 'timestamp': timestamp, 'url_source': url_source, 'content': trimmed_part, 47 | 'source': source, 'author': author, 'urlToImage': urlToImage})], ignore_index=True) 48 | 49 | filename = str(uuid.uuid4()) 50 | output_file = "/home/ubuntu/{}.parquet".format(filename) 51 | df1 = df.drop_duplicates() 52 | df1.to_parquet(output_file) 53 | return output_file -------------------------------------------------------------------------------- /otp system.drawio.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SatadruMukherjee/Data-Preprocessing-Models/3aeeda4e68aafd47fe784a71c4aa02a34be76820/otp system.drawio.png -------------------------------------------------------------------------------- /scd type 1 Snowflake.txt: -------------------------------------------------------------------------------- 1 | create or replace database ramu; 2 | use ramu; 3 | 4 | create or replace sequence seq_01 start = 1 increment = 1; 5 | 6 | create or replace table source_table( emp_no int,emp_name text,salary int, hra int ); 7 | 8 | 9 | 10 | INSERT INTO source_table VALUES (100, 'A' ,2000, 100), 11 | (101, 'B' ,5000, 300), 12 | (102, 'C' ,6000, 400), 13 | (103, 'D' ,500, 50), 14 | (104, 'E' ,15000, 3000), 15 | (105, 'F' ,150000, 20050); 16 | 17 | select * from source_table; 18 | 19 | 20 | create or replace table target_table(surrogate_key int default seq_01.nextval,emp_no int,emp_name text,salary int, 21 | hra int); 22 | 23 | 24 | select * from target_table; 25 | 26 | 27 | INSERT INTO PUBLIC.target_table(emp_no, emp_name, salary, hra) 28 | SELECT t.emp_no, t.emp_name, t.salary, t.hra FROM PUBLIC.source_table t 29 | LEFT JOIN PUBLIC.target_table d ON d.emp_no = t.emp_no WHERE ( d.emp_no IS NULL); 30 | 31 | 32 | select * from target_table; 33 | 34 | 35 | 36 | update source_table set salary=5690 where emp_name='A'; 37 | 38 | select * from source_table; 39 | select * from target_table; 40 | 41 | 42 | UPDATE PUBLIC.target_table d SET emp_name = t.emp_name, salary = t.salary, hra = t.hra 43 | FROM PUBLIC.source_table t WHERE d.emp_no = t.emp_no AND (( d.emp_name <> t.emp_name) OR ( d.salary <> t.salary) OR ( d.hra <> t.hra)); 44 | 45 | 46 | select * from target_table; 47 | 48 | update source_table set salary=6000 where emp_name='B'; 49 | update source_table set HRA=3000 where emp_name='B'; 50 | INSERT INTO source_table VALUES (1001, 'MG' ,2000, 100); 51 | 52 | select * from source_table; 53 | select * from target_table; 54 | 55 | 56 | INSERT INTO PUBLIC.target_table(emp_no, emp_name, salary, hra) 57 | SELECT t.emp_no, t.emp_name, t.salary, t.hra FROM PUBLIC.source_table t 58 | LEFT JOIN PUBLIC.target_table d ON d.emp_no = t.emp_no WHERE ( d.emp_no IS NULL); 59 | UPDATE PUBLIC.target_table d SET emp_name = t.emp_name, salary = t.salary, hra = t.hra 60 | FROM PUBLIC.source_table t WHERE d.emp_no = t.emp_no AND (( d.emp_name <> t.emp_name) OR ( d.salary <> t.salary) OR ( d.hra <> t.hra)); 61 | 62 | select * from source_table; 63 | select * from target_table; -------------------------------------------------------------------------------- /scd_type_2_snowflake_queries.sql: -------------------------------------------------------------------------------- 1 | create or replace table source_table( emp_no int,emp_name text,salary int, hra int ); 2 | 3 | 4 | 5 | 
INSERT INTO source_table VALUES (100, 'A' ,2000, 100), 6 | (101, 'B' ,5000, 300), 7 | (102, 'C' ,6000, 400), 8 | (103, 'D' ,500, 50), 9 | (104, 'E' ,15000, 3000), 10 | (105, 'F' ,150000, 20050); 11 | 12 | 13 | create or replace sequence seq_01 start = 1 increment = 1; 14 | 15 | 16 | create or replace table target_table( surrogate_key int default seq_01.nextval,emp_no int,emp_name text,salary int, 17 | hra int,start_date string default current_timestamp()::string ,end_date string,activeflag text default 'Y' ); 18 | 19 | SELECT * FROM source_table; 20 | 21 | select * from target_table; 22 | 23 | 24 | select * from updated_emp_id; 25 | 26 | 27 | 28 | 29 | 30 | INSERT INTO source_table VALUES (110, 'AB' ,5600, 180); 31 | INSERT INTO source_table VALUES (115, 'CD' ,5670, 185); 32 | 33 | 34 | update source_table set salary=5690 where emp_name='A'; 35 | update source_table set HRA=645 where emp_name='CD'; 36 | 37 | delete from source_table where emp_name='B'; 38 | 39 | 40 | INSERT INTO source_table VALUES (1010, 'B' ,5600, 180); 41 | update source_table set salary=7000 where emp_name='A'; 42 | delete from source_table where emp_name='C'; 43 | 44 | select * from target_table; 45 | 46 | 47 | -------------------------------------------------------------------------------- /snowflake cortex fine tuning.txt: -------------------------------------------------------------------------------- 1 | --Lab section: 2 | DROP DATABASE IF EXISTS snowflake_llm_poc; 3 | CREATE Database snowflake_llm_poc; 4 | use snowflake_llm_poc; 5 | 6 | CREATE or REPLACE file format csvformat 7 | SKIP_HEADER = 1 8 | FIELD_OPTIONALLY_ENCLOSED_BY = '"' 9 | type = 'CSV'; 10 | 11 | CREATE or REPLACE stage support_tickets_data_stage 12 | file_format = csvformat 13 | url = 's3://sfquickstarts/finetuning_llm_using_snowflake_cortex_ai/'; 14 | 15 | CREATE or REPLACE TABLE SUPPORT_TICKETS ( 16 | ticket_id VARCHAR(60), 17 | customer_name VARCHAR(60), 18 | customer_email VARCHAR(60), 19 | service_type VARCHAR(60), 20 | request VARCHAR, 21 | contact_preference VARCHAR(60) 22 | ); 23 | 24 | COPY into SUPPORT_TICKETS 25 | from @support_tickets_data_stage; 26 | 27 | select * from SUPPORT_TICKETS; 28 | 29 | --with mistral-large 30 | select *,snowflake.cortex.complete('mistral-large',concat('You are an agent that helps organize requests that come to our support team. 31 | 32 | The request category is the reason why the customer reached out. These are the possible types of request categories: 33 | 34 | Roaming fees 35 | Slow data speed 36 | Lost phone 37 | Add new line 38 | Closing account 39 | 40 | Try doing it for this request and return only the request category only. 41 | ',REQUEST,'')) as classification_result from SUPPORT_TICKETS; 42 | 43 | 44 | 45 | --with mistral-7b 46 | select *,snowflake.cortex.complete('mistral-7b',concat('You are an agent that helps organize requests that come to our support team. 47 | 48 | The request category is the reason why the customer reached out. These are the possible types of request categories: 49 | 50 | Roaming fees 51 | Slow data speed 52 | Lost phone 53 | Add new line 54 | Closing account 55 | 56 | Try doing it for this request and return only the request category only. 57 | ',REQUEST,'')) as classification_result from SUPPORT_TICKETS; 58 | 59 | 60 | --step 1 : create the data format 61 | Create or replace table snowflake_llm_poc.public.annotated_data_for_finetuning as 62 | (select *,concat('You are an agent that helps organize requests that come to our support team. 
63 | 64 | The request category is the reason why the customer reached out. These are the possible types of request categories: 65 | 66 | Roaming fees 67 | Slow data speed 68 | Lost phone 69 | Add new line 70 | Closing account 71 | 72 | Try doing it for this request and return only the request category only. 73 | ',REQUEST,'') as prompt,snowflake.cortex.complete('mistral-large',concat('You are an agent that helps organize requests that come to our support team. 74 | 75 | The request category is the reason why the customer reached out. These are the possible types of request categories: 76 | 77 | Roaming fees 78 | Slow data speed 79 | Lost phone 80 | Add new line 81 | Closing account 82 | 83 | Try doing it for this request and return only the request category only. 84 | ',REQUEST,'')) as classification_result from SUPPORT_TICKETS); 85 | 86 | select * from snowflake_llm_poc.public.annotated_data_for_finetuning; 87 | 88 | --splitting into training & test dataset 89 | create or replace table snowflake_llm_poc.public.trainig_data as select * from snowflake_llm_poc.public.annotated_data_for_finetuning sample(80); 90 | 91 | select * from snowflake_llm_poc.public.trainig_data; 92 | 93 | create or replace table snowflake_llm_poc.public.validation_data as select * from snowflake_llm_poc.public.annotated_data_for_finetuning minus 94 | select * from snowflake_llm_poc.public.trainig_data; 95 | 96 | select * from snowflake_llm_poc.public.validation_data; 97 | 98 | select * from snowflake_llm_poc.public.trainig_data 99 | intersect 100 | select * from snowflake_llm_poc.public.validation_data; 101 | 102 | --fine-tuning 103 | select snowflake.cortex.finetune( 104 | 'CREATE', 105 | 'snowflake_llm_poc.PUBLIC.SUPPORT_TICKETS_FINETUNED_MISTRAL_7B', 'mistral-7b', 106 | 'SELECT prompt, CLASSIFICATION_RESULT as completion from snowflake_llm_poc.PUBLIC.trainig_data', 107 | 'SELECT prompt, CLASSIFICATION_RESULT as completion from snowflake_llm_poc.PUBLIC.validation_data' 108 | ); 109 | 110 | --check the fine-tune job completed or not 111 | select SNOWFLAKE.CORTEX.FINETUNE( 112 | 'DESCRIBE', 113 | 'CortexFineTuningWorkflow_398c6ef0-afcf-4934-913c-546285e53ec7' 114 | ); 115 | 116 | --Inferencing the fine-tuned model 117 | select *,snowflake.cortex.complete('snowflake_llm_poc.PUBLIC.SUPPORT_TICKETS_FINETUNED_MISTRAL_7B',concat('You are an agent that helps organize requests that come to our support team. 118 | 119 | The request category is the reason why the customer reached out. These are the possible types of request categories: 120 | 121 | Roaming fees 122 | Slow data speed 123 | Lost phone 124 | Add new line 125 | Closing account 126 | 127 | Try doing it for this request and return only the request category only. 
128 | ',REQUEST,'')) as classification_result from SUPPORT_TICKETS; 129 | -------------------------------------------------------------------------------- /snowflake_connector_python-2.3.8-py3-none-any.whl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SatadruMukherjee/Data-Preprocessing-Models/3aeeda4e68aafd47fe784a71c4aa02a34be76820/snowflake_connector_python-2.3.8-py3-none-any.whl -------------------------------------------------------------------------------- /snowflake_dq_framework.py: -------------------------------------------------------------------------------- 1 | #code explanation: https://youtu.be/Rp0RHsT0jIA?si=zFrYgPChEJn4aWv4 2 | #pip install snowflake-connector-python 3 | #pip install "snowflake-connector-python[pandas]" -t . 4 | #pip install pandas -t . 5 | 6 | 7 | from snowflake.connector import connect 8 | import pandas as pd 9 | import os 10 | 11 | 12 | def run_query(conn, query): 13 | cursor = conn.cursor() 14 | cursor.execute(query) 15 | cursor.close() 16 | 17 | def run_query1(conn, query): 18 | cursor = conn.cursor() 19 | cursor.execute(query) 20 | records=cursor.fetchone()[0] 21 | cursor.close() 22 | return records 23 | 24 | def execute_test( 25 | db_conn, 26 | script_1, 27 | script_2, 28 | comp_operator): 29 | print("1st SQL Query : ",script_1) 30 | result_1=run_query1(db_conn,script_1) 31 | print("2nd SQL Query : ", script_2) 32 | result_2 = run_query1(db_conn, script_2) 33 | print("result 1 = " + str(result_1)) 34 | print("result 2 = " + str(result_2)) 35 | # compare values based on the comp_operator 36 | if comp_operator == "equals": 37 | return result_1 == result_2 38 | elif comp_operator == "greater_equals": 39 | return result_1 >= result_2 40 | elif comp_operator == "greater": 41 | return result_1 > result_2 42 | elif comp_operator == "less_equals": 43 | return result_1 <= result_2 44 | elif comp_operator == "less": 45 | return result_1 < result_2 46 | elif comp_operator == "not_equal": 47 | return result_1 != result_2 48 | # if we made it here, something went wrong 49 | return False 50 | 51 | 52 | 53 | user='' 54 | password='' 55 | account="" 56 | database="" 57 | warehouse="" 58 | schema="" 59 | role="" 60 | conn = connect( 61 | user=user, 62 | password=password, 63 | account=account, 64 | database=database, 65 | schema=schema, 66 | warehouse=warehouse, 67 | role=role 68 | ) 69 | 70 | 71 | 72 | sql_query ="""select * from dq_check where table_name='dummy_table'""" 73 | cursor = conn.cursor() 74 | cursor.execute(sql_query) 75 | 76 | df=cursor.fetch_pandas_all() 77 | cursor.close() 78 | 79 | test_case_output_df=pd.DataFrame(columns=['Check_Description','Status']) 80 | 81 | for index,row in df.iterrows(): 82 | table_name=row["TABLE_NAME"] 83 | description=row['DESCRIPTION'] 84 | print('*'*100) 85 | print("Performing check : ",description) 86 | sql_query_1=row['SQL_QUERY_1'] 87 | sql_query_2=row['SQL_QUERY_2'] 88 | comparison_type=row['COMPARISON_TYPE'] 89 | outcome=execute_test( 90 | conn, 91 | sql_query_1, 92 | sql_query_2, 93 | comparison_type) 94 | testcase_pass_fail= "Pass" if outcome else "Failed" 95 | print("Testcase Results : ",testcase_pass_fail) 96 | new_row=({'Check_Description': description, 'Status': testcase_pass_fail}) 97 | test_case_output_df = pd.concat([test_case_output_df, pd.DataFrame([new_row])], ignore_index=True) 98 | print('*' * 100) 99 | 100 | print(test_case_output_df) -------------------------------------------------------------------------------- /study_data.csv: 
-------------------------------------------------------------------------------- 1 | repetition_time,study_time,knowledge_level 2 | 0,0,Low 3 | 0.24,0.9,High 4 | 0.25,0.33,Low 5 | 0.65,0.3,High 6 | 0.98,0.24,Low 7 | 0.1,0.66,High 8 | 0.29,0.56,High 9 | 0.4,0.01,Low 10 | 0.72,0.25,Low 11 | 0.2,0.85,High 12 | 0.3,0.81,High 13 | 0.41,0.3,Low 14 | 0.78,0.34,High 15 | 0.15,0.9,High 16 | 0.3,0.6,High 17 | 0.35,0.8,High 18 | 0.01,0.05,Low 19 | 0.08,0.33,Low 20 | 0.27,0.29,Low 21 | 0.49,0.56,High 22 | 0.78,0.2,Low 23 | 0.12,0.66,High 24 | 0.29,0.65,High 25 | 0.42,0.28,Low 26 | 0.76,0.25,Low 27 | 0.18,0.85,High 28 | 0.25,0.1,Low 29 | 0.45,0.25,Low 30 | 0.94,0.56,High 31 | 0.21,0.81,High 32 | 0.31,0.59,High 33 | 0.65,0.24,Low 34 | 0.76,0.16,Low 35 | 0.19,0.82,High 36 | 0.31,0.78,High 37 | 0.43,0.29,Low 38 | 0.72,0.26,Low 39 | 0.08,0.33,Low 40 | 0.26,0,Low 41 | 0.49,0.45,High 42 | 0.76,0.1,Low 43 | 0.2,0.78,High 44 | 0.29,0.6,High 45 | 0.64,0.25,High 46 | 0.27,0.04,Low 47 | 0.14,0.66,High 48 | 0.31,0.62,High 49 | 0.38,0.77,High 50 | 0.71,0.9,High 51 | 0.18,0.67,High 52 | 0.28,0.25,Low 53 | 0.51,0.45,High 54 | 0.78,0.05,Low 55 | 0.18,0.86,High 56 | 0.29,0.55,High 57 | 0.42,0.26,Low 58 | 0.84,0.25,High 59 | 0.19,0.59,High 60 | 0.33,0.82,High 61 | 0.64,0.1,Low 62 | 0.75,0.01,Low 63 | 0.19,0.56,High 64 | 0.3,0.51,High 65 | 0.48,0.28,Low 66 | 0.8,0.28,High 67 | 0.09,0.15,Low 68 | 0.26,0.24,Low 69 | 0.55,0.51,High 70 | 0.28,0.32,Low 71 | 0.02,0.67,High 72 | 0.29,0.58,High 73 | 0.42,0.75,High 74 | 0.66,0.08,Low 75 | 0.3,0.53,High 76 | 0.33,0.78,High 77 | 0.4,0.1,Low 78 | 0.75,0.1,Low 79 | 0.1,0.3,Low 80 | 0.26,0.2,Low 81 | 0.44,0.28,Low 82 | 0.76,0.24,Low 83 | 0.12,0.67,High 84 | 0.29,0.62,High 85 | 0.48,0.26,Low 86 | 0.7,0.25,Low 87 | 0.2,0.03,Low 88 | 0.31,0.79,High 89 | 0.41,0.28,Low 90 | 0.78,0.18,Low 91 | 0.09,0.67,High 92 | 0.29,0.56,High 93 | 0.78,0.34,High 94 | 0.6,0.09,Low 95 | 0.23,0.24,Low 96 | 0.32,0.8,High 97 | 0.62,0.15,Low 98 | 0.77,0.19,Low 99 | 0.33,0.77,High 100 | 0.29,0.57,High 101 | 0.42,0.29,Low 102 | 0.48,0.26,Low 103 | 0.33,0.87,High 104 | 0.31,0.54,High 105 | 0.49,0.27,Low 106 | 0.76,0.1,Low 107 | 0.25,0.67,High 108 | 0.29,0.59,High 109 | 0.4,0.54,High 110 | 0.81,0.3,High 111 | 0.37,0.84,High 112 | 0.27,0.33,Low 113 | 0.4,0.3,Low 114 | 0.89,0.58,High 115 | 0.4,0.79,High 116 | 0.31,0.55,High 117 | 0.61,0.45,High 118 | 0.66,0.07,Low 119 | 0.8,0.7,High 120 | 0.17,0.66,High 121 | 0.32,0.81,High 122 | 0.65,0.13,Low 123 | 0.72,0.25,Low 124 | 0.11,0.333,Low 125 | 0.25,0.83,High 126 | 0.49,0.76,High 127 | 0.92,0.5,High 128 | 0.22,0.66,High 129 | 0.28,0.28,Low 130 | 0.63,0.14,Low 131 | 0.88,0.28,High 132 | 0.06,0.34,Low 133 | 0.26,0.67,High 134 | 0.55,0.07,Low 135 | 0.7,0.71,High 136 | 0.1,0.65,High 137 | 0.31,0.5,High 138 | 0.48,0.26,Low 139 | 0.78,0.1,Low 140 | 0.18,0.58,High 141 | 0.27,0.3,Low 142 | 0.55,0.1,Low 143 | 0.78,0.4,High 144 | 0.22,0.56,High 145 | 0.22,0.29,Low 146 | 0.56,0.48,High 147 | 0.95,0.65,High 148 | 0.24,0.35,Low 149 | 0.33,0.1,Low 150 | 0.64,0.13,Low 151 | 0.65,0.77,High 152 | 0.14,0.86,High 153 | 0.32,0.3,Low 154 | 0.48,0.13,Low 155 | 0.77,0.14,Low 156 | 0.09,0.64,High 157 | 0.33,0.52,High 158 | 0.36,0.51,High 159 | 0.77,0.83,High 160 | 0.18,0.59,High 161 | 0.31,0.54,High 162 | 0.61,0.18,Low 163 | 0.84,0.3,High 164 | 0.24,0.88,High 165 | 0.27,0.89,High 166 | 0.49,0.12,Low 167 | 0.3,0.9,High 168 | 0.2,0.61,High 169 | 0.49,0.78,High 170 | 0.6,0.16,Low 171 | 0.21,0.92,High 172 | 0.04,0.25,Low 173 | 0.33,0.49,High 174 | 0.53,0.85,High 175 | 0.75,0.16,Low 176 | 
0.12,0.66,High 177 | 0.33,0.3,Low 178 | 0.65,0.19,Low 179 | 0.75,0.71,High 180 | 0.22,0.6,High 181 | 0.26,0.83,High 182 | 0.63,0.18,Low 183 | 0.99,0.55,High 184 | 0.24,0.89,High 185 | 0.29,0.3,Low 186 | 0.62,0.2,Low 187 | 0.78,0.21,Low 188 | 0.01,0.93,High 189 | 0.29,0.57,High 190 | 0.55,0.25,Low 191 | 0.9,0.47,High 192 | 0.16,0.64,High 193 | 0.3,0.8,High 194 | 0.4,0.5,High 195 | 0.88,0.67,High 196 | 0.11,0.66,High 197 | 0.25,0.29,Low 198 | 0.48,0.1,Low 199 | 0.72,0.26,Low 200 | 0.18,0.63,High 201 | 0.3,0.1,Low 202 | 0.55,0.09,Low 203 | 0.65,0.5,High 204 | 0.08,0.1,Low 205 | 0.3,0.29,Low 206 | 0.65,0.75,High 207 | 0.81,0.15,Low 208 | 0.09,0.66,High 209 | 0.31,0.53,High 210 | 0.48,0.11,Low 211 | 0.8,0.68,High 212 | 0.14,0.62,High 213 | 0.31,0.51,High 214 | 0.58,0.79,High 215 | 0.83,0.34,High 216 | 0.2,0.6,High 217 | 0.29,0.3,Low 218 | 0.5,0.3,Low 219 | 0.87,0.58,High 220 | 0.17,0.64,High 221 | 0.28,0.3,Low 222 | 0.62,0.24,Low 223 | 0.78,0.28,High 224 | 0.2,0.66,High 225 | 0.31,0.57,High 226 | 0.63,0.21,Low 227 | 0.82,0.68,High 228 | 0.23,0.59,High 229 | 0.29,0.31,Low 230 | 0.55,0.78,High 231 | 0.7,0.69,High 232 | 0.12,0.65,High 233 | 0.28,0.28,Low 234 | 0.59,0.23,Low 235 | 0.91,0.66,High 236 | 0.15,0.62,High 237 | 0.27,0.3,Low 238 | 0.6,0.22,Low 239 | 0.4,0.83,High 240 | 0.13,0.64,High 241 | 0.3,0.52,High 242 | 0.62,0.2,Low 243 | 0.78,0.86,High 244 | 0.18,0.63,High 245 | 0.27,0.25,Low 246 | 0.65,0.25,High 247 | 0.89,0.88,High 248 | 0.1,0.66,High 249 | 0.29,0.29,Low 250 | 0.65,0.9,High 251 | 0.79,0.45,High 252 | 0.09,0.66,High 253 | 0.31,0.5,High 254 | 0.64,0.19,Low 255 | 0.92,0.58,High 256 | 0.19,0.6,High 257 | 0.29,0.77,High 258 | 0.61,0.26,High 259 | 0.87,0.74,High 260 | -------------------------------------------------------------------------------- /test123.txt: -------------------------------------------------------------------------------- 1 | Getting Started with AWS Managed Streaming for Kafka with in-depth service setup 2 | https://youtube.com/watch?v=BFKmQAafE_c&feature=shares 3 | Capturing client events using Amazon API Gateway and Amazon EventBridge 4 | https://youtube.com/watch?v=mcpnhZThZ7s&feature=shares 5 | End to End Streaming Data Pipeline Using AWS MSK & AWS Serverless Services 6 | https://youtube.com/watch?v=l5ypWBHMsNY&feature=shares -------------------------------------------------------------------------------- /testa: -------------------------------------------------------------------------------- 1 | service_name = 's3' 2 | region_name = 'us-east-2' 3 | aws_access_key_id = '' 4 | aws_secret_access_key = '' 5 | 6 | s3_resource = boto3.resource( 7 | service_name=service_name, 8 | region_name=region_name, 9 | aws_access_key_id=aws_access_key_id, 10 | aws_secret_access_key=aws_secret_access_key 11 | ) 12 | bucket='destinationbucketdemoshow'; 13 | df = initial_df[(initial_df.species == "setosa")]; 14 | csv_buffer = StringIO() 15 | df.to_csv(csv_buffer,index=False); 16 | s3_resource.Object(bucket, s3_file_key).put(Body=csv_buffer.getvalue()) 17 | -------------------------------------------------------------------------------- /transform.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | 4 | from pyspark.sql import SparkSession 5 | from pyspark import SparkContext 6 | from pyspark.sql.types import * 7 | 8 | 9 | spark = SparkSession \ 10 | .builder \ 11 | .appName("airflow_with_emr") \ 12 | .getOrCreate() 13 | 14 | 15 | 16 | def main(): 17 | s3_location="s3://irisseta/input_folder/"; 18 | iris = 
spark.read.format("csv").option("inferSchema","true").load(s3_location).toDF('SEPAL_LENGTH','SEPAL_WIDTH','PETAL_LENGTH','PETAL_WIDTH','CLASS_NAME'); 19 | ms=iris.groupBy("CLASS_NAME").count() 20 | ms.coalesce(1).write.format("parquet").mode('overwrite').save("s3://irisseta/output_folder/") 21 | 22 | main() 23 | 24 | 25 | -------------------------------------------------------------------------------- /translator_with_polly.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | import boto3 3 | from botocore.exceptions import BotoCoreError, ClientError 4 | import os 5 | import sys 6 | from tempfile import gettempdir 7 | from contextlib import closing 8 | 9 | 10 | aws_access_key="{}" 11 | aws_secret_key="{}" 12 | 13 | session=boto3.Session(aws_access_key_id =aws_access_key,aws_secret_access_key = aws_secret_key,region_name ='{}') 14 | 15 | translate = session.client('translate') 16 | 17 | polly = session.client("polly") 18 | 19 | 20 | 21 | def translate_text(text, source_language,destination_language): 22 | """ 23 | 24 | :param text: Input text which has to be translated 25 | :param source_language: The Input Language 26 | :param destination_language: The desired output Language 27 | :return: 28 | """ 29 | result = translate.translate_text( 30 | Text=text, 31 | SourceLanguageCode=source_language, 32 | TargetLanguageCode=destination_language 33 | ) 34 | return result['TranslatedText'] 35 | 36 | 37 | def text_to_speech(text_part): 38 | """ 39 | :param text_part: The text which has to be converted to Hindi audio 40 | :return: temporary path where the audio is stored 41 | """ 42 | 43 | print("Input text part for text to speech conversion : ",text_part) 44 | try: 45 | # Request speech synthesis 46 | response = polly.synthesize_speech(Text=text_part, LanguageCode="hi-IN",OutputFormat="mp3", 47 | VoiceId="Joanna") 48 | except (BotoCoreError, ClientError) as error: 49 | # The service returned an error, exit gracefully 50 | print(error) 51 | sys.exit(-1) 52 | # Access the audio stream from the response 53 | if "AudioStream" in response: 54 | # Note: Closing the stream is important because the service throttles on the 55 | # number of parallel connections. Here we are using contextlib.closing to 56 | # ensure the close method of the stream object will be called automatically 57 | # at the end of the with statement's scope. 
58 | with closing(response["AudioStream"]) as stream: 59 | output = os.path.join(gettempdir(), "speech.mp3") 60 | 61 | try: 62 | # Open a file for writing the output as a binary stream 63 | with open(output, "wb") as file: 64 | file.write(stream.read()) 65 | except IOError as error: 66 | # Could not write to file, exit gracefully 67 | print(error) 68 | sys.exit(-1) 69 | print("Output Path where audio is stored :",output) 70 | return output 71 | 72 | 73 | 74 | 75 | 76 | def runner(): 77 | col11, col22 = st.columns(2) 78 | with col11: 79 | st.title("Language Translation") 80 | st.markdown('Feel the power of Neural Machine Translation') 81 | with col22: 82 | st.image('Capture.PNG', use_column_width=True); 83 | col1, col2 = st.columns(2) 84 | conversion_list={"English":"en","Bengali":"bn","Hindi":"hi","French":"fr"} 85 | with col1: 86 | source_language = st.selectbox('Select the Source Language', ['Default', 'English', 'Bengali','Hindi','French']) 87 | input_text = st.text_input('Enter the Input text', 'Enter text here') 88 | with col2: 89 | destination_language = st.selectbox('Select the Destination', ['Default', 'English', 'Bengali','Hindi','French']) 90 | 91 | with col1: 92 | button_value = st.checkbox(label='Translate') 93 | translated_text="" 94 | if button_value: 95 | if source_language!=destination_language: 96 | print("The Source Language is {}".format(conversion_list[source_language])) 97 | print("The Destination Language is {}".format(conversion_list[destination_language])) 98 | translated_text=translate_text(input_text, conversion_list[source_language],conversion_list[destination_language]) 99 | else: 100 | translated_text=input_text 101 | with col2: 102 | st.text_input('Translated Text', translated_text) 103 | print("Translated Text : ",translated_text) 104 | if (destination_language == 'Hindi'): 105 | button_value_text_to_speech = st.checkbox(label='Audio Form') 106 | if(button_value_text_to_speech): 107 | audio_path = text_to_speech(translated_text) 108 | audio_file = open(audio_path, 'rb') 109 | audio_bytes = audio_file.read() 110 | st.audio(audio_bytes, format='audio / ogg') 111 | 112 | 113 | 114 | runner() 115 | 116 | 117 | 118 | 119 | -------------------------------------------------------------------------------- /user_data_yt.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -xe 2 | exec &>> /tmp/userdata_execution.log 3 | 4 | upload_log() { 5 | aws s3 cp /tmp/userdata_execution.log s3://demoytuserdata/logs/ 6 | sudo shutdown now -h 7 | } 8 | 9 | trap 'upload_log' EXIT 10 | 11 | sudo apt update 12 | sudo apt -y install awscli 13 | sudo apt -y install python3-pip 14 | pip3 install --upgrade awscli 15 | pip3 install boto3 pandas pyarrow fastparquet 16 | aws s3 cp s3://demoytuserdata/script/news_fetcher.py . 17 | python3 news_fetcher.py 18 | aws s3 mv /home/ubuntu/news_data.parquet s3://demoytuserdata/outputdirectory/ --------------------------------------------------------------------------------