├── outuput.PNG
├── Zylotech-Data-Engineer-Assessment.pdf
├── sql_queries.py
├── create_tables.py
├── README.md
├── etl.py
└── etl.ipynb

/outuput.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chandu-muthyala/Zylotech-Data-Engineer-Assessment/master/outuput.PNG
--------------------------------------------------------------------------------
/Zylotech-Data-Engineer-Assessment.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chandu-muthyala/Zylotech-Data-Engineer-Assessment/master/Zylotech-Data-Engineer-Assessment.pdf
--------------------------------------------------------------------------------
/sql_queries.py:
--------------------------------------------------------------------------------
# DROP TABLES
users_table_drop = "DROP TABLE IF EXISTS users"
ac_users_table_drop = "DROP TABLE IF EXISTS ac_users"

# CREATE TABLES
users_table_create = ("""CREATE TABLE IF NOT EXISTS users(
    id SERIAL PRIMARY KEY,
    user_id int NOT NULL,
    first_name varchar NOT NULL,
    last_name varchar NOT NULL,
    email varchar NOT NULL,
    avatar text,
    time_stamp timestamp);
""")

# Cassandra table: user_id is the partition key; first_name and last_name
# are clustering columns.
ac_users_table_create = '''CREATE TABLE IF NOT EXISTS ac_users
    (user_id int,
    first_name text,
    last_name text,
    email text,
    avatar text,
    PRIMARY KEY((user_id), first_name, last_name))'''

# INSERT INTO TABLES
user_table_insert = ("""INSERT INTO users (user_id, first_name, last_name, email, avatar, time_stamp)
    VALUES (%s, %s, %s, %s, %s, %s)""")

ac_users_table_insert = '''INSERT INTO ac_users(user_id, first_name, last_name, email, avatar)
    VALUES(%s, %s, %s, %s, %s)'''


# Aggregation metrics queries: count users grouped by the first letter of
# the first/last name.
fn_agg = ("""SELECT LEFT(first_name, 1) first_char, COUNT(*)
    FROM users
    GROUP BY first_char
    ORDER BY first_char ASC;""")
ln_agg = ("""SELECT LEFT(last_name, 1) first_char, COUNT(*)
    FROM users
    GROUP BY first_char
    ORDER BY first_char ASC;""")

# SELECT queries (CQL has no LEFT(); the first letters are counted in Python)
fn_query = """SELECT first_name as first_char
    FROM ac_users
    """
ln_query = """SELECT last_name as first_char
    FROM ac_users
    """

drop_table_queries = [users_table_drop]
create_table_queries = [users_table_create]
--------------------------------------------------------------------------------
/create_tables.py:
--------------------------------------------------------------------------------
import psycopg2
from cassandra.cluster import Cluster
from sql_queries import drop_table_queries, create_table_queries, ac_users_table_drop

def create_database():
    '''
    This function is used to create the RDBMS database.
    return: cursor and connection to the zylotechdb database
    '''
    # connect to default database
    conn = psycopg2.connect("host=127.0.0.1 dbname=studentdb user=student password=student")
    conn.set_session(autocommit=True)
    cur = conn.cursor()

    # create zylotechdb database with UTF8 encoding
    cur.execute("DROP DATABASE IF EXISTS zylotechdb")
    cur.execute("CREATE DATABASE zylotechdb WITH ENCODING 'utf8' TEMPLATE template0")

    # close connection to default database
    conn.close()

    # connect to zylotech database
    conn = psycopg2.connect("host=127.0.0.1 dbname=zylotechdb user=student password=student")
    cur = conn.cursor()

    return cur, conn

def ac_create_database():
    '''
    This function is used to create the NoSQL database (keyspace).
    return: session and cluster connections
    '''
    try:
        cluster = Cluster(['127.0.0.1'])
        session = cluster.connect()
    except Exception as e:
        print(e)
    try:
        session.execute("""
            CREATE KEYSPACE IF NOT EXISTS zylotechdb2
            WITH REPLICATION =
            { 'class' : 'SimpleStrategy', 'replication_factor' : 1 }"""
        )
        return session, cluster
    except Exception as e:
        print(e)

def drop_tables(cur, conn):
    '''
    This function is used to drop the tables in the RDBMS.
    args:
        cur: cursor connection
        conn: database connection
    return: None
    '''
    for query in drop_table_queries:
        try:
            cur.execute(query)
            conn.commit()
        except psycopg2.Error as e:
            print("Error: Issue in dropping table")
            print(e)

def ac_drop_tables(session, query):
    '''
    This function is used to drop tables in the NoSQL database.
    args:
        session: holds the connection
        query: string, query statement to drop a table.
    return: None
    '''
    try:
        res = session.execute(query)
    except Exception as e:
        print(e)

def create_tables(cur, conn):
    '''
    This function is used to create tables.
    args:
        cur: cursor connection
        conn: database connection
    return: None
    '''
    for query in create_table_queries:
        try:
            cur.execute(query)
            conn.commit()
        except psycopg2.Error as e:
            print("Error: Issue in creating table")
            print(e)


def main():
    cur, conn = create_database()
    session, cluster = ac_create_database()
    drop_tables(cur, conn)
    create_tables(cur, conn)

    ac_drop_tables(session, ac_users_table_drop)

    conn.close()
    session.shutdown()
    cluster.shutdown()

if __name__ == "__main__":
    main()
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Zylotech-Data-Engineer-Assessment

## Installation
Please install the packages below to run the code:
1. Install the PostgreSQL database driver
> pip install psycopg2
2. Install the requests package
> pip install requests
3. Install the Cassandra driver to run the Apache Cassandra queries
> pip install cassandra-driver
4. Install the schedule library, a simple library for scheduling jobs
> pip install schedule


## Dataset
```
API endpoints to collect data
https://reqres.in/api/users?page=1
https://reqres.in/api/users?page=2
https://reqres.in/api/users?page=3
https://reqres.in/api/users?page=4
```
Below is a sample JSON response from the endpoint for the parameter page=1:
```
{
   "page":1,
   "per_page":3,
   "total":12,
   "total_pages":4,
   "data":[
      {
         "id":1,
         "email":"george.bluth@reqres.in",
         "first_name":"George",
         "last_name":"Bluth",
         "avatar":"https://s3.amazonaws.com/uifaces/faces/twitter/calebogden/128.jpg"
      },
      {
         "id":2,
         "email":"janet.weaver@reqres.in",
         "first_name":"Janet",
         "last_name":"Weaver",
         "avatar":"https://s3.amazonaws.com/uifaces/faces/twitter/josephstein/128.jpg"
      },
      {
         "id":3,
         "email":"emma.wong@reqres.in",
         "first_name":"Emma",
         "last_name":"Wong",
         "avatar":"https://s3.amazonaws.com/uifaces/faces/twitter/olegpogodaev/128.jpg"
      }
   ]
}
```
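As a quick illustration of how this payload can be consumed (a minimal sketch; the full pipeline lives in etl.py), one page can be fetched and parsed with the requests library:
```
import requests

# Fetch a single page of users and print the fields the pipeline loads.
# Minimal sketch: etl.py iterates over pages 1-4 and inserts the records.
res = requests.get("https://reqres.in/api/users", params={"page": 1})
res.raise_for_status()
payload = res.json()
for user in payload["data"]:
    print(user["id"], user["first_name"], user["last_name"], user["email"])
```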
## Project Template
The assessment workspace includes 4 files:
1. ***create_tables.py***: This script is used to drop and create the tables. Run it prior to executing the ETL scripts.
2. ***sql_queries.py***: Contains all SQL and CQL queries, and is imported wherever required.
3. ***etl.ipynb***: This notebook contains a step-by-step execution of the ETL process without the scheduler job.
4. ***etl.py***: This script fetches data from the API endpoints to build the **SQL** and **NoSQL** data models.

## Assessment Steps

Below are the steps I followed to complete the assessment:

### Database Selection:
Reasons for selecting the databases.

1. NoSQL: Apache Cassandra
    - High Availability:
        * Supports a multi-master model; when a single node is lost, the cluster's ability to accept writes is not affected.
        * Designed for continuous uptime, with no single point of failure.
    - Scalability:
        * The multi-master model allows writes on any node in the cluster.
        * Throughput scales roughly linearly with the number of nodes in the cluster.
    - The learning curve for CQL is minimal, as it is similar to SQL.
2. SQL: PostgreSQL
    - Performance: PostgreSQL performs well in OLTP/OLAP systems where fast read/write speeds and extensive data analysis are required.
    - It is well suited for data warehousing and data analysis applications that require fast read/write speeds.
    - Supports a wide variety of programming languages.

### Create Tables
1. Created tables using the CREATE statements in sql_queries.py.
2. Existing tables are dropped using the DROP statements in sql_queries.py.
3. Ran create_tables.py to create the database and tables.
4. Inserted records into the tables using the INSERT statements in sql_queries.py.
5. Aggregation is achieved through the SELECT statements in sql_queries.py.

### ETL Pipeline
#### RDBMS (PostgreSQL) data modeling
1. Connected to the created zylotechdb database.
2. Collected the data from the API endpoints using the requests library and inserted it into the relational database.
3. Validated the data after insertion.
4. Ran the aggregation metrics queries to get the required result.
#### NoSQL (Apache Cassandra) data modeling
1. Connected to the created zylotechdb2 KEYSPACE.
2. Created tables in Apache Cassandra.
3. Collected the data from the API endpoints using the requests library and inserted it into the NoSQL tables.
4. Ran the SELECT statements in sql_queries.py for the aggregation metrics.

### Scheduling Job
Scheduling a job can be done in multiple ways:
- CRON: Schedule jobs to run the ETL scripts periodically at fixed times, dates, or intervals using `crontab` (an example entry is shown at the end of this section).
- Flask server: Can schedule the ETL script periodically at certain intervals using `threading.Timer`. This works for smaller data sets.
- Schedule: Can schedule the ETL pipeline periodically at pre-determined intervals using the `schedule` library.
- Jenkins: The ETL process can be scheduled as a Jenkins job; the cron expression is set in the Jenkinsfile of the ETL project. Mostly used in production environments.
- Apache Airflow: Schedules, monitors, and visualizes workflows; also mostly used in production environments.

As this is a small dataset, I used the schedule library to run the ETL script every 12 hours.
1. The job function that runs the ETL pipeline is registered with the schedule library to run every 12 hours.
2. Every 12 hours the ETL pipeline is executed and the aggregation metrics results can be viewed.
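For reference, an equivalent CRON entry could look like the sketch below. The interpreter and script paths are illustrative, and this assumes the ETL logic is invoked once per run rather than through the blocking schedule loop at the bottom of etl.py:
```
0 */12 * * * /usr/bin/python3 /path/to/etl.py >> /var/log/etl.log 2>&1
```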
### Data Check:
1. Data Type Check: Column data types are defined as per the data model design specification.
2. Data Length Check: Database column lengths are as per the data model design specifications.
3. Index/Constraint Check:
    - Added 'NOT NULL' constraints for the required columns.
    - Unique key columns are indexed for the required columns to avoid duplicate entries (see the sketch after this list).
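The shipped schema relies on the application-level `check_user()`/`ac_user_check()` lookups to skip duplicates; as a sketch (an assumption, not part of the current schema), the same guarantee could be enforced in PostgreSQL itself with a unique index:
```
import psycopg2

# Hypothetical hardening step: enforce uniqueness of user_id in the
# database itself instead of only checking for duplicates in Python.
conn = psycopg2.connect("host=127.0.0.1 dbname=zylotechdb user=student password=student")
conn.set_session(autocommit=True)
cur = conn.cursor()
cur.execute("CREATE UNIQUE INDEX IF NOT EXISTS users_user_id_idx ON users (user_id)")
conn.close()
```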
**DATA COMPLETENESS CHECK:**

1. Record Count Validation: Compared the record count reported by the endpoint to the number of inserted records (see the sketch below).
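A minimal sketch of this check, assuming the connection settings used elsewhere in this project: compare the `total` field reported by the API with a row count from the users table.
```
import psycopg2
import requests

# The API reports the total number of users; compare it with the number
# of rows that actually landed in the users table.
total_expected = requests.get("https://reqres.in/api/users", params={"page": 1}).json()["total"]

conn = psycopg2.connect("host=127.0.0.1 dbname=zylotechdb user=student password=student")
cur = conn.cursor()
cur.execute("SELECT COUNT(*) FROM users")
total_inserted = cur.fetchone()[0]
conn.close()

print("API total = {0}, inserted = {1}, complete = {2}".format(
    total_expected, total_inserted, total_expected == total_inserted))
```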
## Execute files in the below order each time before running the ETL pipeline
1. create_tables.py
> python3 create_tables.py
2. etl.py
> python3 etl.py

### Final Output:

| ![Aggregation metric and status](outuput.PNG) |
|:---:|
| Aggregation metric and status |

## References:
1. [StackOverflow: SQL select first letter of a word](https://stackoverflow.com/questions/8856384/sql-select-first-letter-of-a-word)
2. [HTTP requests and JSON parsing in Python](https://stackoverflow.com/questions/6386308/http-requests-and-json-parsing-in-python)
3. [Scheduling](https://pypi.org/project/schedule/)
4. [Adding generic logging to scheduled jobs](https://schedule.readthedocs.io/en/stable/faq.html#what-if-my-task-throws-an-exception)
5. [ETL Testing](http://www.datagaps.com/concepts/etl-testing)
6. [PostgreSQL](https://www.2ndquadrant.com/en/postgresql/postgresql-vs-mysql/)
7. [Apache Cassandra](https://scalegrid.io/blog/cassandra-vs-mongodb/)
--------------------------------------------------------------------------------
/etl.py:
--------------------------------------------------------------------------------
import functools
import time

import psycopg2
import requests
import schedule
from cassandra.cluster import Cluster
from sql_queries import user_table_insert, fn_agg, ln_agg, fn_query, ln_query
from sql_queries import ac_users_table_create, ac_users_table_insert


try:
    conn = psycopg2.connect("host=127.0.0.1 dbname=zylotechdb user=student password=student")
    cur = conn.cursor()
    # autocommit so that inserts persist across runs (etl.ipynb sets this too)
    conn.set_session(autocommit=True)
except psycopg2.Error as e:
    print("Error: Could not make connection to the Postgres database")
    print(e)

try:
    cluster = Cluster(['127.0.0.1'])
    session = cluster.connect()
except Exception as e:
    print(e)

try:
    session.set_keyspace('zylotechdb2')
except Exception as e:
    print(e)

try:
    session.execute(ac_users_table_create)
except Exception as e:
    print(e)

# api-endpoint
URL = "https://reqres.in/api/users?page="
# pages to request from the endpoint
pages = [1, 2, 3, 4]

def ac_user_check(user_id):
    '''Return True if user_id is not yet present in the Cassandra table.'''
    try:
        rows = session.execute("SELECT COUNT(*) FROM ac_users WHERE user_id = %s", (user_id,))
        for row in rows:
            if row.count:
                return False
            else:
                return True
    except Exception as e:
        print(e)

def check_user(id1):
    '''Return True if id1 is not yet present in the Postgres users table.'''
    if not isinstance(id1, int):
        return False
    try:
        cur.execute("SELECT * FROM users WHERE user_id = %s", (id1,))
        result = cur.fetchall()
        if len(result):
            return False
        else:
            return True
    except psycopg2.Error as e:
        print("Error: select *")
        print(e)

def ac_aggregation_metrics(query):
    '''Count first letters client-side, since CQL offers no LEFT()/GROUP BY here.'''
    fc_dic = {}
    try:
        rows = session.execute(query)
        for row in rows:
            fn_temp = row.first_char[0].upper()
            if fn_temp in fc_dic:
                fc_dic[fn_temp] += 1
            else:
                fc_dic[fn_temp] = 1
    except Exception as e:
        print(e)
    return fc_dic
def insert_data(session, query):
    '''
    This function is used to insert records into the Cassandra table.
    args:
        session: holds the connection
        query: string, query statement to insert into the table.
    return: None
    '''
    for page in pages:
        URL = "https://reqres.in/api/users?page=" + str(page)
        try:
            res = requests.get(url=URL)
            res_json = res.json()
        except requests.exceptions.RequestException as e:
            print(e)
            continue

        rows_affected = 0
        rows_duplicate = 0
        per_page = res_json['per_page']
        if res_json['data']:
            users = res_json['data']
            for user in users:
                id1 = user['id']
                email = user['email']
                first_name = user['first_name']
                last_name = user['last_name']
                avatar = user['avatar']
                if ac_user_check(id1):
                    try:
                        session.execute(query, (int(id1), str(first_name), str(last_name), str(email), str(avatar)))
                        rows_affected += 1
                    except Exception as e:
                        print(e)
                else:
                    # record already exists in the DB
                    rows_duplicate += 1
            if rows_affected == per_page:
                print("Page = {0}, Status = All records inserted.".format(page))
            else:
                print("Page = {0}, Status = Missing few records, per_page = {1}, Failed = {2}, Duplicate_rows = {3}"\
                    .format(page, per_page, (per_page - rows_affected), rows_duplicate))
        else:
            print("No users list in the API end point.")

# Count of first/last names starting with the same letter:
def aggregation_metrics(query):
    try:
        cur.execute(query)
        result = cur.fetchall()
        return result
    except psycopg2.Error as e:
        print("Error: ", query)
        print(e)

# This decorator adds start/finish logging around a scheduled job.
def with_logging(func):
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        print('LOG: Running job "%s"' % func.__name__)
        result = func(*args, **kwargs)
        print('LOG: Job "%s" completed' % func.__name__)
        return result
    return wrapper

@with_logging
def job():
    print("--- SQL datamodeling ---")
    for page in pages:
        URL = "https://reqres.in/api/users?page=" + str(page)
        try:
            res = requests.get(url=URL)
            res_json = res.json()
        except requests.exceptions.RequestException as e:
            print(e)
            continue

        rows_affected = 0
        rows_duplicate = 0
        per_page = res_json['per_page']
        if res_json['data']:
            users = res_json['data']
            for user in users:
                id1 = user['id']
                email = user['email']
                first_name = user['first_name']
                last_name = user['last_name']
                avatar = user['avatar']
                # 'now' is resolved by Postgres to the current timestamp
                values = (id1, first_name, last_name, email, avatar, "now")

                try:
                    if check_user(id1):
                        cur.execute(user_table_insert, values)
                        rows_affected += 1
                    else:
                        # record already exists in the DB
                        rows_duplicate += 1
                except psycopg2.Error as e:
                    print("Error: Inserting Rows")
                    print(e)
            if rows_affected == per_page:
                print("Page = {0}, Status = All records inserted.".format(page))
            else:
                print("Page = {0}, Status = Missing few records, per_page = {1}, Failed = {2}, Duplicate_rows = {3}"\
                    .format(page, per_page, (per_page - rows_affected), rows_duplicate))
        else:
            print("No users list in the API end point.")

    fn_agg_res = aggregation_metrics(fn_agg)
    ln_agg_res = aggregation_metrics(ln_agg)
    print("RDBMS Aggregation Results: ")
    print("First Name: ", end='')
    for row in fn_agg_res:
        print(row, end=' ')
    print("\nLast Name: ", end='')
    for row in ln_agg_res:
        print(row, end=' ')
    print()

    # NoSQL datamodeling

    # INSERT into the table
    print("--- NoSQL datamodeling ---")
    insert_data(session, ac_users_table_insert)
    fn_result = ac_aggregation_metrics(fn_query)
    ln_result = ac_aggregation_metrics(ln_query)

    print("NoSQL Aggregation Results: ")
    print("First Name:", fn_result)
    print("Last Name:", ln_result)

# run once at startup, then every 12 hours
job()
schedule.every(12).hours.do(job)
while True:
    schedule.run_pending()
    time.sleep(1)
--------------------------------------------------------------------------------
/etl.ipynb:
--------------------------------------------------------------------------------
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## RDBMS data modeling"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [],
   "source": [
    "import psycopg2\n",
    "from sql_queries import user_table_insert, fn_agg, ln_agg\n",
    "import requests"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [],
   "source": [
    "try:\n",
    "    conn = psycopg2.connect(\"host=127.0.0.1 dbname=zylotechdb user=student password=student\")\n",
    "    cur = conn.cursor()\n",
    "except psycopg2.Error as e:\n",
    "    print(\"Error: Could not make connection to the Postgres database\")\n",
    "    print(e)\n",
    "conn.set_session(autocommit=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [],
   "source": [
    "# api-endpoint\n",
    "URL = \"https://reqres.in/api/users?page=\"\n",
    "# data requesting from endpoint pages\n",
    "pages = [1, 2, 3, 4]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [],
   "source": [
    "def check_user(id1):\n",
    "    try:\n",
    "        cur.execute(\"SELECT * FROM users where user_id ='{0}'\".format(id1))\n",
    "        result = cur.fetchall()\n",
    "        if len(result):\n",
    "            return False\n",
    "        else:\n",
    "            return True\n",
    "    except psycopg2.Error as e:\n",
    "        print(\"Error: select *\")\n",
    "        print(e)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "https://reqres.in/api/users?page=1\n",
      "Record already exists in the DB\n",
      "Record already exists in the DB\n",
      "Record already exists in the DB\n",
      "Page = 1, Status = Missing few records, per_page = 3, Failed = 3\n",
      "https://reqres.in/api/users?page=2\n",
      "Record already exists in the DB\n",
      "Record already exists in the DB\n",
      "Record already exists in the DB\n",
      "Page = 2, Status = Missing few records, per_page = 3, Failed = 3\n",
      "https://reqres.in/api/users?page=3\n",
      "Record already exists in the DB\n",
      "Record already exists in the DB\n",
      "Record already exists in the DB\n",
      "Page = 3, Status = Missing few records, per_page = 3, Failed = 3\n",
      "https://reqres.in/api/users?page=4\n",
      "Record already exists in the DB\n",
DB\n", 101 | "Record already exists in the DB\n", 102 | "Page = 4, Status = Missing few records, per_page = 3, Failed = 3\n" 103 | ] 104 | } 105 | ], 106 | "source": [ 107 | "for page in pages:\n", 108 | " URL = \"https://reqres.in/api/users?page=\"+str(page)\n", 109 | " print(URL)\n", 110 | " try:\n", 111 | " res = requests.get(url = URL)\n", 112 | " res_json = res.json()\n", 113 | " except requests.exceptions.HTTPError as e:\n", 114 | " print(e)\n", 115 | " rows_affected = 0\n", 116 | " per_page = res_json['per_page']\n", 117 | " if res_json['data']:\n", 118 | " users = res_json['data']\n", 119 | " for user in users:\n", 120 | " id1 = user['id']\n", 121 | " email = user['email']\n", 122 | " first_name = user['first_name']\n", 123 | " last_name = user['last_name']\n", 124 | " avatar = user['avatar']\n", 125 | "# print(id1, email, first_name, last_name, avatar)\n", 126 | " values = (id1, first_name, last_name,email, avatar, \"now()\")\n", 127 | " \n", 128 | " try:\n", 129 | " if check_user(id1):\n", 130 | " cur.execute(user_table_insert, values)\n", 131 | " rows_affected += 1\n", 132 | " else:\n", 133 | " print(\"Record already exists in the DB\")\n", 134 | " except psycopg2.Error as e: \n", 135 | " print(\"Error: Inserting Rows\")\n", 136 | " print (e) \n", 137 | " if rows_affected == per_page:\n", 138 | " print(\"Page = {0}, Status = All records inserted.\".format(page))\n", 139 | " else:\n", 140 | " print(\"Page = {0}, Status = Missing few records, per_page = {1}, Failed = {2}\".format(page, per_page, (per_page-rows_affected)))\n", 141 | " \n", 142 | "# print(\"affected rows = {0}, per_page = {1}\".format(rows_affected, per_page))\n", 143 | " else:\n", 144 | " print(\"No users list in the API end point.\")\n" 145 | ] 146 | }, 147 | { 148 | "cell_type": "code", 149 | "execution_count": 31, 150 | "metadata": {}, 151 | "outputs": [ 152 | { 153 | "data": { 154 | "text/plain": [ 155 | "False" 156 | ] 157 | }, 158 | "execution_count": 31, 159 | "metadata": {}, 160 | "output_type": "execute_result" 161 | } 162 | ], 163 | "source": [ 164 | "check_user(1)" 165 | ] 166 | }, 167 | { 168 | "cell_type": "code", 169 | "execution_count": 32, 170 | "metadata": {}, 171 | "outputs": [ 172 | { 173 | "name": "stdout", 174 | "output_type": "stream", 175 | "text": [ 176 | "George Bluth\n", 177 | "Janet Weaver\n", 178 | "Emma Wong\n", 179 | "Eve Holt\n", 180 | "Charles Morris\n", 181 | "Tracey Ramos\n", 182 | "Michael Lawson\n", 183 | "Lindsay Ferguson\n", 184 | "Tobias Funke\n", 185 | "Byron Fields\n", 186 | "George Edwards\n", 187 | "Rachel Howell\n" 188 | ] 189 | } 190 | ], 191 | "source": [ 192 | "try: \n", 193 | " cur.execute(\"SELECT * FROM users;\")\n", 194 | "except psycopg2.Error as e: \n", 195 | " print(\"Error: select *\")\n", 196 | " print (e)\n", 197 | "\n", 198 | "row = cur.fetchone()\n", 199 | "while row:\n", 200 | " #print first_name and last_name\n", 201 | " print(row[2], row[3])\n", 202 | " row = cur.fetchone()" 203 | ] 204 | }, 205 | { 206 | "cell_type": "code", 207 | "execution_count": 33, 208 | "metadata": {}, 209 | "outputs": [], 210 | "source": [ 211 | "# Count of last name starting with same letter:\n", 212 | "def aggregation_metrics(query):\n", 213 | " try: \n", 214 | " cur.execute(query)\n", 215 | " rows = cur.fetchall()\n", 216 | " return rows\n", 217 | " except psycopg2.Error as e: \n", 218 | " print(\"Error: \", query)\n", 219 | " print (e)" 220 | ] 221 | }, 222 | { 223 | "cell_type": "code", 224 | "execution_count": 34, 225 | "metadata": {}, 226 | "outputs": [], 227 | "source": [ 
228 | "fn_agg_res = aggregation_metrics(fn_agg)\n", 229 | "ln_agg_res = aggregation_metrics(ln_agg)" 230 | ] 231 | }, 232 | { 233 | "cell_type": "code", 234 | "execution_count": 35, 235 | "metadata": {}, 236 | "outputs": [], 237 | "source": [ 238 | "# try: \n", 239 | "# cur.execute(\"select * from users\")\n", 240 | "# rows = cur.fetchall()\n", 241 | "# print( rows)\n", 242 | "# except psycopg2.Error as e: \n", 243 | "# print(\"Error: \", query)\n", 244 | "# print (e)" 245 | ] 246 | }, 247 | { 248 | "cell_type": "code", 249 | "execution_count": 36, 250 | "metadata": {}, 251 | "outputs": [ 252 | { 253 | "name": "stdout", 254 | "output_type": "stream", 255 | "text": [ 256 | "('B', 1)\n", 257 | "('C', 1)\n", 258 | "('E', 2)\n", 259 | "('G', 2)\n", 260 | "('J', 1)\n", 261 | "('L', 1)\n", 262 | "('M', 1)\n", 263 | "('R', 1)\n", 264 | "('T', 2)\n" 265 | ] 266 | } 267 | ], 268 | "source": [ 269 | "for row in fn_agg_res:\n", 270 | " print(row) " 271 | ] 272 | }, 273 | { 274 | "cell_type": "code", 275 | "execution_count": 37, 276 | "metadata": {}, 277 | "outputs": [ 278 | { 279 | "name": "stdout", 280 | "output_type": "stream", 281 | "text": [ 282 | "('B', 1)\n", 283 | "('E', 1)\n", 284 | "('F', 3)\n", 285 | "('H', 2)\n", 286 | "('L', 1)\n", 287 | "('M', 1)\n", 288 | "('R', 1)\n", 289 | "('W', 2)\n" 290 | ] 291 | } 292 | ], 293 | "source": [ 294 | "for row in ln_agg_res:\n", 295 | " print(row)" 296 | ] 297 | }, 298 | { 299 | "cell_type": "markdown", 300 | "metadata": {}, 301 | "source": [ 302 | "## NoSQL data modeling" 303 | ] 304 | }, 305 | { 306 | "cell_type": "code", 307 | "execution_count": 38, 308 | "metadata": {}, 309 | "outputs": [], 310 | "source": [ 311 | "import cassandra\n", 312 | "from cassandra.cluster import Cluster\n", 313 | "from sql_queries import ac_users_table_create, ac_users_table_insert" 314 | ] 315 | }, 316 | { 317 | "cell_type": "code", 318 | "execution_count": 39, 319 | "metadata": {}, 320 | "outputs": [], 321 | "source": [ 322 | "try: \n", 323 | " cluster = Cluster(['127.0.0.1'])\n", 324 | " session = cluster.connect()\n", 325 | "except Exception as e:\n", 326 | " print(e)" 327 | ] 328 | }, 329 | { 330 | "cell_type": "code", 331 | "execution_count": 40, 332 | "metadata": {}, 333 | "outputs": [], 334 | "source": [ 335 | "try:\n", 336 | " session.set_keyspace('zylotechdb2')\n", 337 | "except Exception as e:\n", 338 | " print(e)" 339 | ] 340 | }, 341 | { 342 | "cell_type": "code", 343 | "execution_count": 41, 344 | "metadata": {}, 345 | "outputs": [], 346 | "source": [ 347 | "try:\n", 348 | " session.execute(ac_users_table_create)\n", 349 | "except Exception as e:\n", 350 | " print(e)" 351 | ] 352 | }, 353 | { 354 | "cell_type": "code", 355 | "execution_count": null, 356 | "metadata": {}, 357 | "outputs": [], 358 | "source": [] 359 | }, 360 | { 361 | "cell_type": "code", 362 | "execution_count": 42, 363 | "metadata": {}, 364 | "outputs": [], 365 | "source": [ 366 | "def insert_data(session, query):\n", 367 | " '''\n", 368 | " This function is used to insert records into tables\n", 369 | " args:\n", 370 | " session: holds connection\n", 371 | " query: string, query statement to insert into table.\n", 372 | " return: None\n", 373 | " '''\n", 374 | " for page in pages:\n", 375 | " URL = \"https://reqres.in/api/users?page=\"+str(page)\n", 376 | " print(URL)\n", 377 | " try:\n", 378 | " res = requests.get(url = URL)\n", 379 | " res_json = res.json()\n", 380 | " except requests.exceptions.HTTPError as e:\n", 381 | " print(e)\n", 382 | " \n", 383 | " if res_json['data']:\n", 384 | " 
    "            users = res_json['data']\n",
    "            for user in users:\n",
    "                id1 = user['id']\n",
    "                email = user['email']\n",
    "                first_name = user['first_name']\n",
    "                last_name = user['last_name']\n",
    "                avatar = user['avatar']\n",
    "                try:\n",
    "                    session.execute(query, (int(id1), str(first_name), str(last_name), str(email), str(avatar)))\n",
    "                except Exception as e:\n",
    "                    print(e)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "https://reqres.in/api/users?page=1\n",
      "https://reqres.in/api/users?page=2\n",
      "https://reqres.in/api/users?page=3\n",
      "https://reqres.in/api/users?page=4\n"
     ]
    }
   ],
   "source": [
    "# INSERT into the table\n",
    "insert_data(session, ac_users_table_insert)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 63,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "False"
      ]
     },
     "execution_count": 63,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "def ac_user_check(user_id):\n",
    "    try:\n",
    "        rows = session.execute(\"select count(*) from ac_users where user_id = {0}\".format(user_id))\n",
    "        for row in rows:\n",
    "            if row.count:\n",
    "                return False\n",
    "            else:\n",
    "                return True\n",
    "    except Exception as e:\n",
    "        print(e)\n",
    "ac_user_check(12)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "def ac_aggregation_metrics(query):\n",
    "\n",
    "    fc_dic = {}\n",
    "    try:\n",
    "        rows = session.execute(query)\n",
    "        for row in rows:\n",
    "#             print(row.first_char)\n",
    "            fn_temp = row.first_char[0].upper()\n",
    "            if fn_temp in fc_dic:\n",
    "                fc_dic[fn_temp] += 1\n",
    "            else:\n",
    "                fc_dic[fn_temp] = 1\n",
    "    except Exception as e:\n",
    "        print(e)\n",
    "    return fc_dic"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "# select query\n",
    "fn_query = \"\"\"SELECT first_name as first_char\n",
    "    FROM ac_users\n",
    "    \"\"\"\n",
    "ln_query = \"\"\"SELECT last_name as first_char\n",
    "    FROM ac_users\n",
    "    \"\"\"\n",
    "fn_result = ac_aggregation_metrics(fn_query)\n",
    "ln_result = ac_aggregation_metrics(ln_query)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "fn: {'C': 1, 'B': 1, 'G': 2, 'L': 1, 'J': 1, 'E': 2, 'M': 1, 'T': 2, 'R': 1}\n",
      "ln: {'M': 1, 'F': 3, 'E': 1, 'B': 1, 'W': 2, 'H': 2, 'L': 1, 'R': 1}\n"
     ]
    }
   ],
   "source": [
    "print(\"fn:\", fn_result)\n",
    "print(\"ln:\", ln_result)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Requirement already satisfied: schedule in /opt/conda/lib/python3.6/site-packages (0.6.0)\n"
     ]
    }
   ],
   "source": [
pip install schedule" 523 | ] 524 | }, 525 | { 526 | "cell_type": "code", 527 | "execution_count": 24, 528 | "metadata": {}, 529 | "outputs": [], 530 | "source": [ 531 | "session.shutdown()\n", 532 | "cluster.shutdown()" 533 | ] 534 | } 535 | ], 536 | "metadata": { 537 | "kernelspec": { 538 | "display_name": "Python 3", 539 | "language": "python", 540 | "name": "python3" 541 | }, 542 | "language_info": { 543 | "codemirror_mode": { 544 | "name": "ipython", 545 | "version": 3 546 | }, 547 | "file_extension": ".py", 548 | "mimetype": "text/x-python", 549 | "name": "python", 550 | "nbconvert_exporter": "python", 551 | "pygments_lexer": "ipython3", 552 | "version": "3.6.3" 553 | } 554 | }, 555 | "nbformat": 4, 556 | "nbformat_minor": 2 557 | } 558 | --------------------------------------------------------------------------------