├── outuput.PNG
├── Zylotech-Data-Engineer-Assessment.pdf
├── sql_queries.py
├── create_tables.py
├── README.md
├── etl.py
└── etl.ipynb

/outuput.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chandu-muthyala/Zylotech-Data-Engineer-Assessment/master/outuput.PNG
--------------------------------------------------------------------------------
/Zylotech-Data-Engineer-Assessment.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chandu-muthyala/Zylotech-Data-Engineer-Assessment/master/Zylotech-Data-Engineer-Assessment.pdf
--------------------------------------------------------------------------------
/sql_queries.py:
--------------------------------------------------------------------------------
# DROP TABLES
users_table_drop = "DROP TABLE IF EXISTS users"
ac_users_table_drop = "DROP TABLE IF EXISTS ac_users"

# CREATE TABLES
users_table_create = ("""CREATE TABLE IF NOT EXISTS users(
    id SERIAL PRIMARY KEY,
    user_id int NOT NULL,
    first_name varchar NOT NULL,
    last_name varchar NOT NULL,
    email varchar NOT NULL,
    avatar text,
    time_stamp timestamp);
""")

# Cassandra table: user_id is the partition key; first_name and last_name
# are clustering columns.
ac_users_table_create = '''CREATE TABLE IF NOT EXISTS ac_users
    (user_id int,
    first_name text,
    last_name text,
    email text,
    avatar text,
    PRIMARY KEY((user_id), first_name, last_name))'''

# INSERT INTO TABLES
user_table_insert = ("""INSERT INTO users (user_id, first_name, last_name, email, avatar, time_stamp)
    VALUES (%s, %s, %s, %s, %s, %s)""")

ac_users_table_insert = '''INSERT INTO ac_users(user_id, first_name, last_name, email, avatar)
    VALUES(%s, %s, %s, %s, %s)'''


# Aggregation metrics queries: count users grouped by the first letter of
# the first/last name.
fn_agg = ("""SELECT LEFT(first_name, 1) first_char, COUNT(*)
    FROM users
    GROUP BY first_char
    ORDER BY first_char ASC;""")
ln_agg = ("""SELECT LEFT(last_name, 1) first_char, COUNT(*)
    FROM users
    GROUP BY first_char
    ORDER BY first_char ASC;""")

# SELECT queries (CQL has no LEFT(); the first letters are counted in Python)
fn_query = """SELECT first_name as first_char
    FROM ac_users
    """
ln_query = """SELECT last_name as first_char
    FROM ac_users
    """

drop_table_queries = [users_table_drop]
create_table_queries = [users_table_create]
--------------------------------------------------------------------------------
/create_tables.py:
--------------------------------------------------------------------------------
import psycopg2
from cassandra.cluster import Cluster
from sql_queries import drop_table_queries, create_table_queries, ac_users_table_drop

def create_database():
    '''
    This function is used to create the RDBMS database.
    return: cursor and connection to the zylotechdb database
    '''
    # connect to default database
    conn = psycopg2.connect("host=127.0.0.1 dbname=studentdb user=student password=student")
    conn.set_session(autocommit=True)
    cur = conn.cursor()

    # create zylotechdb database with UTF8 encoding
    cur.execute("DROP DATABASE IF EXISTS zylotechdb")
    cur.execute("CREATE DATABASE zylotechdb WITH ENCODING 'utf8' TEMPLATE template0")

    # close connection to default database
    conn.close()

    # connect to zylotech database
    conn = psycopg2.connect("host=127.0.0.1 dbname=zylotechdb user=student password=student")
    cur = conn.cursor()

    return cur, conn

def ac_create_database():
    '''
    This function is used to create the NoSQL database (keyspace).
    return: session and cluster connections
    '''
    try:
        cluster = Cluster(['127.0.0.1'])
        session = cluster.connect()
    except Exception as e:
        print(e)
    try:
        session.execute("""
            CREATE KEYSPACE IF NOT EXISTS zylotechdb2
            WITH REPLICATION =
            { 'class' : 'SimpleStrategy', 'replication_factor' : 1 }"""
        )
        return session, cluster
    except Exception as e:
        print(e)

def drop_tables(cur, conn):
    '''
    This function is used to drop the tables in the RDBMS.
    args:
        cur: cursor connection
        conn: database connection
    return: None
    '''
    for query in drop_table_queries:
        try:
            cur.execute(query)
            conn.commit()
        except psycopg2.Error as e:
            print("Error: Issue in dropping table")
            print(e)

def ac_drop_tables(session, query):
    '''
    This function is used to drop tables in the NoSQL database.
    args:
        session: holds the connection
        query: string, query statement to drop a table.
    return: None
    '''
    try:
        res = session.execute(query)
    except Exception as e:
        print(e)

def create_tables(cur, conn):
    '''
    This function is used to create tables.
    args:
        cur: cursor connection
        conn: database connection
    return: None
    '''
    for query in create_table_queries:
        try:
            cur.execute(query)
            conn.commit()
        except psycopg2.Error as e:
            print("Error: Issue in creating table")
            print(e)


def main():
    cur, conn = create_database()
    session, cluster = ac_create_database()
    drop_tables(cur, conn)
    create_tables(cur, conn)

    ac_drop_tables(session, ac_users_table_drop)

    conn.close()
    session.shutdown()
    cluster.shutdown()

if __name__ == "__main__":
    main()
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Zylotech-Data-Engineer-Assessment

## Installation
Please install the packages below to run the code:
1. Install the PostgreSQL database driver
> pip install psycopg2
2. Install the requests package
> pip install requests
3. Install the Cassandra driver to run the Apache Cassandra queries
> pip install cassandra-driver
4. Install the schedule library, a simple library for scheduling jobs
> pip install schedule


## Dataset
```
API endpoints to collect data
https://reqres.in/api/users?page=1
https://reqres.in/api/users?page=2
https://reqres.in/api/users?page=3
https://reqres.in/api/users?page=4
```
Below is a sample JSON response from the endpoint for the parameter page=1:
```
{
   "page":1,
   "per_page":3,
   "total":12,
   "total_pages":4,
   "data":[
      {
         "id":1,
         "email":"george.bluth@reqres.in",
         "first_name":"George",
         "last_name":"Bluth",
         "avatar":"https://s3.amazonaws.com/uifaces/faces/twitter/calebogden/128.jpg"
      },
      {
         "id":2,
         "email":"janet.weaver@reqres.in",
         "first_name":"Janet",
         "last_name":"Weaver",
         "avatar":"https://s3.amazonaws.com/uifaces/faces/twitter/josephstein/128.jpg"
      },
      {
         "id":3,
         "email":"emma.wong@reqres.in",
         "first_name":"Emma",
         "last_name":"Wong",
         "avatar":"https://s3.amazonaws.com/uifaces/faces/twitter/olegpogodaev/128.jpg"
      }
   ]
}
```
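As a quick illustration of how this payload can be consumed (a minimal sketch; the full pipeline lives in etl.py), one page can be fetched and parsed with the requests library:
```
import requests

# Fetch a single page of users and print the fields the pipeline loads.
# Minimal sketch: etl.py iterates over pages 1-4 and inserts the records.
res = requests.get("https://reqres.in/api/users", params={"page": 1})
res.raise_for_status()
payload = res.json()
for user in payload["data"]:
    print(user["id"], user["first_name"], user["last_name"], user["email"])
```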
## Project Template
The assessment workspace includes 4 files:
1. ***create_tables.py***: This script is used to drop and create the tables. Run it prior to executing the ETL scripts.
2. ***sql_queries.py***: Contains all SQL and CQL queries, and is imported wherever required.
3. ***etl.ipynb***: This notebook contains a step-by-step execution of the ETL process without the scheduler job.
4. ***etl.py***: This script fetches data from the API endpoints to build the **SQL** and **NoSQL** data models.

## Assessment Steps

Below are the steps I followed to complete the assessment:

### Database Selection:
Reasons for selecting the databases.

1. NoSQL: Apache Cassandra
    - High Availability:
        * Supports a multi-master model; when a single node is lost, the cluster's ability to accept writes is not affected.
        * Designed for continuous uptime, with no single point of failure.
    - Scalability:
        * The multi-master model allows writes on any node in the cluster.
        * Throughput scales roughly linearly with the number of nodes in the cluster.
    - The learning curve for CQL is minimal, as it is similar to SQL.
2. SQL: PostgreSQL
    - Performance: PostgreSQL performs well in OLTP/OLAP systems where fast read/write speeds and extensive data analysis are required.
    - It is well suited for data warehousing and data analysis applications that require fast read/write speeds.
    - Supports a wide variety of programming languages.

### Create Tables
1. Created tables using the CREATE statements in sql_queries.py.
2. Existing tables are dropped using the DROP statements in sql_queries.py.
3. Ran create_tables.py to create the database and tables.
4. Inserted records into the tables using the INSERT statements in sql_queries.py.
5. Aggregation is achieved through the SELECT statements in sql_queries.py.

### ETL Pipeline
#### RDBMS (PostgreSQL) data modeling
1. Connected to the created zylotechdb database.
2. Collected the data from the API endpoints using the requests library and inserted it into the relational database.
3. Validated the data after insertion.
4. Ran the aggregation metrics queries to get the required result.
#### NoSQL (Apache Cassandra) data modeling
1. Connected to the created zylotechdb2 KEYSPACE.
2. Created tables in Apache Cassandra.
3. Collected the data from the API endpoints using the requests library and inserted it into the NoSQL tables.
4. Ran the SELECT statements in sql_queries.py for the aggregation metrics.

### Scheduling Job
Scheduling a job can be done in multiple ways:
- CRON: Schedule jobs to run the ETL scripts periodically at fixed times, dates, or intervals using `crontab` (an example entry is shown at the end of this section).
- Flask server: Can schedule the ETL script periodically at certain intervals using `threading.Timer`. This works for smaller data sets.
- Schedule: Can schedule the ETL pipeline periodically at pre-determined intervals using the `schedule` library.
- Jenkins: The ETL process can be scheduled as a Jenkins job; the cron expression is set in the Jenkinsfile of the ETL project. Mostly used in production environments.
- Apache Airflow: Schedules, monitors, and visualizes workflows; also mostly used in production environments.

As this is a small dataset, I used the schedule library to run the ETL script every 12 hours.
1. The job function that runs the ETL pipeline is registered with the schedule library to run every 12 hours.
2. Every 12 hours the ETL pipeline is executed and the aggregation metrics results can be viewed.
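For reference, an equivalent CRON entry could look like the sketch below. The interpreter and script paths are illustrative, and this assumes the ETL logic is invoked once per run rather than through the blocking schedule loop at the bottom of etl.py:
```
0 */12 * * * /usr/bin/python3 /path/to/etl.py >> /var/log/etl.log 2>&1
```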
### Data Check:
1. Data Type Check: Column data types are defined as per the data model design specification.
2. Data Length Check: Database column lengths are as per the data model design specifications.
3. Index/Constraint Check:
    - Added 'NOT NULL' constraints for the required columns.
    - Unique key columns are indexed for the required columns to avoid duplicate entries (see the sketch after this list).
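The shipped schema relies on the application-level `check_user()`/`ac_user_check()` lookups to skip duplicates; as a sketch (an assumption, not part of the current schema), the same guarantee could be enforced in PostgreSQL itself with a unique index:
```
import psycopg2

# Hypothetical hardening step: enforce uniqueness of user_id in the
# database itself instead of only checking for duplicates in Python.
conn = psycopg2.connect("host=127.0.0.1 dbname=zylotechdb user=student password=student")
conn.set_session(autocommit=True)
cur = conn.cursor()
cur.execute("CREATE UNIQUE INDEX IF NOT EXISTS users_user_id_idx ON users (user_id)")
conn.close()
```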
**DATA COMPLETENESS CHECK:**

1. Record Count Validation: Compared the record count reported by the endpoint to the number of inserted records (see the sketch below).
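A minimal sketch of this check, assuming the connection settings used elsewhere in this project: compare the `total` field reported by the API with a row count from the users table.
```
import psycopg2
import requests

# The API reports the total number of users; compare it with the number
# of rows that actually landed in the users table.
total_expected = requests.get("https://reqres.in/api/users", params={"page": 1}).json()["total"]

conn = psycopg2.connect("host=127.0.0.1 dbname=zylotechdb user=student password=student")
cur = conn.cursor()
cur.execute("SELECT COUNT(*) FROM users")
total_inserted = cur.fetchone()[0]
conn.close()

print("API total = {0}, inserted = {1}, complete = {2}".format(
    total_expected, total_inserted, total_expected == total_inserted))
```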
## Execute files in the below order each time before running the ETL pipeline
1. create_tables.py
> python3 create_tables.py
2. etl.py
> python3 etl.py

### Final Output:

| ![Aggregation metric and status](outuput.PNG) |
|:---:|
| Aggregation metric and status |

## References:
1. [StackOverflow: SQL select first letter of a word](https://stackoverflow.com/questions/8856384/sql-select-first-letter-of-a-word)
2. [HTTP requests and JSON parsing in Python](https://stackoverflow.com/questions/6386308/http-requests-and-json-parsing-in-python)
3. [Scheduling](https://pypi.org/project/schedule/)
4. [Adding generic logging to scheduled jobs](https://schedule.readthedocs.io/en/stable/faq.html#what-if-my-task-throws-an-exception)
5. [ETL Testing](http://www.datagaps.com/concepts/etl-testing)
6. [PostgreSQL](https://www.2ndquadrant.com/en/postgresql/postgresql-vs-mysql/)
7. [Apache Cassandra](https://scalegrid.io/blog/cassandra-vs-mongodb/)
--------------------------------------------------------------------------------
/etl.py:
--------------------------------------------------------------------------------
import functools
import time

import psycopg2
import requests
import schedule
from cassandra.cluster import Cluster
from sql_queries import user_table_insert, fn_agg, ln_agg, fn_query, ln_query
from sql_queries import ac_users_table_create, ac_users_table_insert


try:
    conn = psycopg2.connect("host=127.0.0.1 dbname=zylotechdb user=student password=student")
    cur = conn.cursor()
    # autocommit so that inserts persist across runs (etl.ipynb sets this too)
    conn.set_session(autocommit=True)
except psycopg2.Error as e:
    print("Error: Could not make connection to the Postgres database")
    print(e)

try:
    cluster = Cluster(['127.0.0.1'])
    session = cluster.connect()
except Exception as e:
    print(e)

try:
    session.set_keyspace('zylotechdb2')
except Exception as e:
    print(e)

try:
    session.execute(ac_users_table_create)
except Exception as e:
    print(e)

# api-endpoint
URL = "https://reqres.in/api/users?page="
# pages to request from the endpoint
pages = [1, 2, 3, 4]

def ac_user_check(user_id):
    '''Return True if user_id is not yet present in the Cassandra table.'''
    try:
        rows = session.execute("SELECT COUNT(*) FROM ac_users WHERE user_id = %s", (user_id,))
        for row in rows:
            if row.count:
                return False
            else:
                return True
    except Exception as e:
        print(e)

def check_user(id1):
    '''Return True if id1 is not yet present in the Postgres users table.'''
    if not isinstance(id1, int):
        return False
    try:
        cur.execute("SELECT * FROM users WHERE user_id = %s", (id1,))
        result = cur.fetchall()
        if len(result):
            return False
        else:
            return True
    except psycopg2.Error as e:
        print("Error: select *")
        print(e)

def ac_aggregation_metrics(query):
    '''Count first letters client-side, since CQL offers no LEFT()/GROUP BY here.'''
    fc_dic = {}
    try:
        rows = session.execute(query)
        for row in rows:
            fn_temp = row.first_char[0].upper()
            if fn_temp in fc_dic:
                fc_dic[fn_temp] += 1
            else:
                fc_dic[fn_temp] = 1
    except Exception as e:
        print(e)
    return fc_dic
def insert_data(session, query):
    '''
    This function is used to insert records into the Cassandra table.
    args:
        session: holds the connection
        query: string, query statement to insert into the table.
    return: None
    '''
    for page in pages:
        URL = "https://reqres.in/api/users?page=" + str(page)
        try:
            res = requests.get(url=URL)
            res_json = res.json()
        except requests.exceptions.RequestException as e:
            print(e)
            continue

        rows_affected = 0
        rows_duplicate = 0
        per_page = res_json['per_page']
        if res_json['data']:
            users = res_json['data']
            for user in users:
                id1 = user['id']
                email = user['email']
                first_name = user['first_name']
                last_name = user['last_name']
                avatar = user['avatar']
                if ac_user_check(id1):
                    try:
                        session.execute(query, (int(id1), str(first_name), str(last_name), str(email), str(avatar)))
                        rows_affected += 1
                    except Exception as e:
                        print(e)
                else:
                    # record already exists in the DB
                    rows_duplicate += 1
            if rows_affected == per_page:
                print("Page = {0}, Status = All records inserted.".format(page))
            else:
                print("Page = {0}, Status = Missing few records, per_page = {1}, Failed = {2}, Duplicate_rows = {3}"\
                    .format(page, per_page, (per_page - rows_affected), rows_duplicate))
        else:
            print("No users list in the API end point.")

# Count of first/last names starting with the same letter:
def aggregation_metrics(query):
    try:
        cur.execute(query)
        result = cur.fetchall()
        return result
    except psycopg2.Error as e:
        print("Error: ", query)
        print(e)

# This decorator adds start/finish logging around a scheduled job.
def with_logging(func):
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        print('LOG: Running job "%s"' % func.__name__)
        result = func(*args, **kwargs)
        print('LOG: Job "%s" completed' % func.__name__)
        return result
    return wrapper

@with_logging
def job():
    print("--- SQL datamodeling ---")
    for page in pages:
        URL = "https://reqres.in/api/users?page=" + str(page)
        try:
            res = requests.get(url=URL)
            res_json = res.json()
        except requests.exceptions.RequestException as e:
            print(e)
            continue

        rows_affected = 0
        rows_duplicate = 0
        per_page = res_json['per_page']
        if res_json['data']:
            users = res_json['data']
            for user in users:
                id1 = user['id']
                email = user['email']
                first_name = user['first_name']
                last_name = user['last_name']
                avatar = user['avatar']
                # 'now' is resolved by Postgres to the current timestamp
                values = (id1, first_name, last_name, email, avatar, "now")

                try:
                    if check_user(id1):
                        cur.execute(user_table_insert, values)
                        rows_affected += 1
                    else:
                        # record already exists in the DB
                        rows_duplicate += 1
                except psycopg2.Error as e:
                    print("Error: Inserting Rows")
                    print(e)
            if rows_affected == per_page:
                print("Page = {0}, Status = All records inserted.".format(page))
            else:
                print("Page = {0}, Status = Missing few records, per_page = {1}, Failed = {2}, Duplicate_rows = {3}"\
                    .format(page, per_page, (per_page - rows_affected), rows_duplicate))
        else:
            print("No users list in the API end point.")

    fn_agg_res = aggregation_metrics(fn_agg)
    ln_agg_res = aggregation_metrics(ln_agg)
    print("RDBMS Aggregation Results: ")
    print("First Name: ", end='')
    for row in fn_agg_res:
        print(row, end=' ')
    print("\nLast Name: ", end='')
    for row in ln_agg_res:
        print(row, end=' ')
    print()

    # NoSQL datamodeling

    # INSERT into the table
    print("--- NoSQL datamodeling ---")
    insert_data(session, ac_users_table_insert)
    fn_result = ac_aggregation_metrics(fn_query)
    ln_result = ac_aggregation_metrics(ln_query)

    print("NoSQL Aggregation Results: ")
    print("First Name:", fn_result)
    print("Last Name:", ln_result)

# run once at startup, then every 12 hours
job()
schedule.every(12).hours.do(job)
while True:
    schedule.run_pending()
    time.sleep(1)
--------------------------------------------------------------------------------
/etl.ipynb:
--------------------------------------------------------------------------------
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## RDBMS data modeling"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [],
   "source": [
    "import psycopg2\n",
    "from sql_queries import user_table_insert, fn_agg, ln_agg\n",
    "import requests"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [],
   "source": [
    "try:\n",
    "    conn = psycopg2.connect(\"host=127.0.0.1 dbname=zylotechdb user=student password=student\")\n",
    "    cur = conn.cursor()\n",
    "except psycopg2.Error as e:\n",
    "    print(\"Error: Could not make connection to the Postgres database\")\n",
    "    print(e)\n",
    "conn.set_session(autocommit=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [],
   "source": [
    "# api-endpoint\n",
    "URL = \"https://reqres.in/api/users?page=\"\n",
    "# data requesting from endpoint pages\n",
    "pages = [1, 2, 3, 4]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [],
   "source": [
    "def check_user(id1):\n",
    "    try:\n",
    "        cur.execute(\"SELECT * FROM users where user_id ='{0}'\".format(id1))\n",
    "        result = cur.fetchall()\n",
    "        if len(result):\n",
    "            return False\n",
    "        else:\n",
    "            return True\n",
    "    except psycopg2.Error as e:\n",
    "        print(\"Error: select *\")\n",
    "        print(e)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "https://reqres.in/api/users?page=1\n",
      "Record already exists in the DB\n",
      "Record already exists in the DB\n",
      "Record already exists in the DB\n",
      "Page = 1, Status = Missing few records, per_page = 3, Failed = 3\n",
      "https://reqres.in/api/users?page=2\n",
      "Record already exists in the DB\n",
      "Record already exists in the DB\n",
      "Record already exists in the DB\n",
      "Page = 2, Status = Missing few records, per_page = 3, Failed = 3\n",
      "https://reqres.in/api/users?page=3\n",
      "Record already exists in the DB\n",
      "Record already exists in the DB\n",
      "Record already exists in the DB\n",
      "Page = 3, Status = Missing few records, per_page = 3, Failed = 3\n",
      "https://reqres.in/api/users?page=4\n",
      "Record already exists in the DB\n",
DB\n", 101 | "Record already exists in the DB\n", 102 | "Page = 4, Status = Missing few records, per_page = 3, Failed = 3\n" 103 | ] 104 | } 105 | ], 106 | "source": [ 107 | "for page in pages:\n", 108 | " URL = \"https://reqres.in/api/users?page=\"+str(page)\n", 109 | " print(URL)\n", 110 | " try:\n", 111 | " res = requests.get(url = URL)\n", 112 | " res_json = res.json()\n", 113 | " except requests.exceptions.HTTPError as e:\n", 114 | " print(e)\n", 115 | " rows_affected = 0\n", 116 | " per_page = res_json['per_page']\n", 117 | " if res_json['data']:\n", 118 | " users = res_json['data']\n", 119 | " for user in users:\n", 120 | " id1 = user['id']\n", 121 | " email = user['email']\n", 122 | " first_name = user['first_name']\n", 123 | " last_name = user['last_name']\n", 124 | " avatar = user['avatar']\n", 125 | "# print(id1, email, first_name, last_name, avatar)\n", 126 | " values = (id1, first_name, last_name,email, avatar, \"now()\")\n", 127 | " \n", 128 | " try:\n", 129 | " if check_user(id1):\n", 130 | " cur.execute(user_table_insert, values)\n", 131 | " rows_affected += 1\n", 132 | " else:\n", 133 | " print(\"Record already exists in the DB\")\n", 134 | " except psycopg2.Error as e: \n", 135 | " print(\"Error: Inserting Rows\")\n", 136 | " print (e) \n", 137 | " if rows_affected == per_page:\n", 138 | " print(\"Page = {0}, Status = All records inserted.\".format(page))\n", 139 | " else:\n", 140 | " print(\"Page = {0}, Status = Missing few records, per_page = {1}, Failed = {2}\".format(page, per_page, (per_page-rows_affected)))\n", 141 | " \n", 142 | "# print(\"affected rows = {0}, per_page = {1}\".format(rows_affected, per_page))\n", 143 | " else:\n", 144 | " print(\"No users list in the API end point.\")\n" 145 | ] 146 | }, 147 | { 148 | "cell_type": "code", 149 | "execution_count": 31, 150 | "metadata": {}, 151 | "outputs": [ 152 | { 153 | "data": { 154 | "text/plain": [ 155 | "False" 156 | ] 157 | }, 158 | "execution_count": 31, 159 | "metadata": {}, 160 | "output_type": "execute_result" 161 | } 162 | ], 163 | "source": [ 164 | "check_user(1)" 165 | ] 166 | }, 167 | { 168 | "cell_type": "code", 169 | "execution_count": 32, 170 | "metadata": {}, 171 | "outputs": [ 172 | { 173 | "name": "stdout", 174 | "output_type": "stream", 175 | "text": [ 176 | "George Bluth\n", 177 | "Janet Weaver\n", 178 | "Emma Wong\n", 179 | "Eve Holt\n", 180 | "Charles Morris\n", 181 | "Tracey Ramos\n", 182 | "Michael Lawson\n", 183 | "Lindsay Ferguson\n", 184 | "Tobias Funke\n", 185 | "Byron Fields\n", 186 | "George Edwards\n", 187 | "Rachel Howell\n" 188 | ] 189 | } 190 | ], 191 | "source": [ 192 | "try: \n", 193 | " cur.execute(\"SELECT * FROM users;\")\n", 194 | "except psycopg2.Error as e: \n", 195 | " print(\"Error: select *\")\n", 196 | " print (e)\n", 197 | "\n", 198 | "row = cur.fetchone()\n", 199 | "while row:\n", 200 | " #print first_name and last_name\n", 201 | " print(row[2], row[3])\n", 202 | " row = cur.fetchone()" 203 | ] 204 | }, 205 | { 206 | "cell_type": "code", 207 | "execution_count": 33, 208 | "metadata": {}, 209 | "outputs": [], 210 | "source": [ 211 | "# Count of last name starting with same letter:\n", 212 | "def aggregation_metrics(query):\n", 213 | " try: \n", 214 | " cur.execute(query)\n", 215 | " rows = cur.fetchall()\n", 216 | " return rows\n", 217 | " except psycopg2.Error as e: \n", 218 | " print(\"Error: \", query)\n", 219 | " print (e)" 220 | ] 221 | }, 222 | { 223 | "cell_type": "code", 224 | "execution_count": 34, 225 | "metadata": {}, 226 | "outputs": [], 227 | "source": [ 
228 | "fn_agg_res = aggregation_metrics(fn_agg)\n", 229 | "ln_agg_res = aggregation_metrics(ln_agg)" 230 | ] 231 | }, 232 | { 233 | "cell_type": "code", 234 | "execution_count": 35, 235 | "metadata": {}, 236 | "outputs": [], 237 | "source": [ 238 | "# try: \n", 239 | "# cur.execute(\"select * from users\")\n", 240 | "# rows = cur.fetchall()\n", 241 | "# print( rows)\n", 242 | "# except psycopg2.Error as e: \n", 243 | "# print(\"Error: \", query)\n", 244 | "# print (e)" 245 | ] 246 | }, 247 | { 248 | "cell_type": "code", 249 | "execution_count": 36, 250 | "metadata": {}, 251 | "outputs": [ 252 | { 253 | "name": "stdout", 254 | "output_type": "stream", 255 | "text": [ 256 | "('B', 1)\n", 257 | "('C', 1)\n", 258 | "('E', 2)\n", 259 | "('G', 2)\n", 260 | "('J', 1)\n", 261 | "('L', 1)\n", 262 | "('M', 1)\n", 263 | "('R', 1)\n", 264 | "('T', 2)\n" 265 | ] 266 | } 267 | ], 268 | "source": [ 269 | "for row in fn_agg_res:\n", 270 | " print(row) " 271 | ] 272 | }, 273 | { 274 | "cell_type": "code", 275 | "execution_count": 37, 276 | "metadata": {}, 277 | "outputs": [ 278 | { 279 | "name": "stdout", 280 | "output_type": "stream", 281 | "text": [ 282 | "('B', 1)\n", 283 | "('E', 1)\n", 284 | "('F', 3)\n", 285 | "('H', 2)\n", 286 | "('L', 1)\n", 287 | "('M', 1)\n", 288 | "('R', 1)\n", 289 | "('W', 2)\n" 290 | ] 291 | } 292 | ], 293 | "source": [ 294 | "for row in ln_agg_res:\n", 295 | " print(row)" 296 | ] 297 | }, 298 | { 299 | "cell_type": "markdown", 300 | "metadata": {}, 301 | "source": [ 302 | "## NoSQL data modeling" 303 | ] 304 | }, 305 | { 306 | "cell_type": "code", 307 | "execution_count": 38, 308 | "metadata": {}, 309 | "outputs": [], 310 | "source": [ 311 | "import cassandra\n", 312 | "from cassandra.cluster import Cluster\n", 313 | "from sql_queries import ac_users_table_create, ac_users_table_insert" 314 | ] 315 | }, 316 | { 317 | "cell_type": "code", 318 | "execution_count": 39, 319 | "metadata": {}, 320 | "outputs": [], 321 | "source": [ 322 | "try: \n", 323 | " cluster = Cluster(['127.0.0.1'])\n", 324 | " session = cluster.connect()\n", 325 | "except Exception as e:\n", 326 | " print(e)" 327 | ] 328 | }, 329 | { 330 | "cell_type": "code", 331 | "execution_count": 40, 332 | "metadata": {}, 333 | "outputs": [], 334 | "source": [ 335 | "try:\n", 336 | " session.set_keyspace('zylotechdb2')\n", 337 | "except Exception as e:\n", 338 | " print(e)" 339 | ] 340 | }, 341 | { 342 | "cell_type": "code", 343 | "execution_count": 41, 344 | "metadata": {}, 345 | "outputs": [], 346 | "source": [ 347 | "try:\n", 348 | " session.execute(ac_users_table_create)\n", 349 | "except Exception as e:\n", 350 | " print(e)" 351 | ] 352 | }, 353 | { 354 | "cell_type": "code", 355 | "execution_count": null, 356 | "metadata": {}, 357 | "outputs": [], 358 | "source": [] 359 | }, 360 | { 361 | "cell_type": "code", 362 | "execution_count": 42, 363 | "metadata": {}, 364 | "outputs": [], 365 | "source": [ 366 | "def insert_data(session, query):\n", 367 | " '''\n", 368 | " This function is used to insert records into tables\n", 369 | " args:\n", 370 | " session: holds connection\n", 371 | " query: string, query statement to insert into table.\n", 372 | " return: None\n", 373 | " '''\n", 374 | " for page in pages:\n", 375 | " URL = \"https://reqres.in/api/users?page=\"+str(page)\n", 376 | " print(URL)\n", 377 | " try:\n", 378 | " res = requests.get(url = URL)\n", 379 | " res_json = res.json()\n", 380 | " except requests.exceptions.HTTPError as e:\n", 381 | " print(e)\n", 382 | " \n", 383 | " if res_json['data']:\n", 384 | " 
    "            users = res_json['data']\n",
    "            for user in users:\n",
    "                id1 = user['id']\n",
    "                email = user['email']\n",
    "                first_name = user['first_name']\n",
    "                last_name = user['last_name']\n",
    "                avatar = user['avatar']\n",
    "                try:\n",
    "                    session.execute(query, (int(id1), str(first_name), str(last_name), str(email), str(avatar)))\n",
    "                except Exception as e:\n",
    "                    print(e)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "https://reqres.in/api/users?page=1\n",
      "https://reqres.in/api/users?page=2\n",
      "https://reqres.in/api/users?page=3\n",
      "https://reqres.in/api/users?page=4\n"
     ]
    }
   ],
   "source": [
    "# INSERT into the table\n",
    "insert_data(session, ac_users_table_insert)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 63,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "False"
      ]
     },
     "execution_count": 63,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "def ac_user_check(user_id):\n",
    "    try:\n",
    "        rows = session.execute(\"select count(*) from ac_users where user_id = {0}\".format(user_id))\n",
    "        for row in rows:\n",
    "            if row.count:\n",
    "                return False\n",
    "            else:\n",
    "                return True\n",
    "    except Exception as e:\n",
    "        print(e)\n",
    "ac_user_check(12)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "def ac_aggregation_metrics(query):\n",
    "\n",
    "    fc_dic = {}\n",
    "    try:\n",
    "        rows = session.execute(query)\n",
    "        for row in rows:\n",
    "#             print(row.first_char)\n",
    "            fn_temp = row.first_char[0].upper()\n",
    "            if fn_temp in fc_dic:\n",
    "                fc_dic[fn_temp] += 1\n",
    "            else:\n",
    "                fc_dic[fn_temp] = 1\n",
    "    except Exception as e:\n",
    "        print(e)\n",
    "    return fc_dic"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "# select query\n",
    "fn_query = \"\"\"SELECT first_name as first_char\n",
    "    FROM ac_users\n",
    "    \"\"\"\n",
    "ln_query = \"\"\"SELECT last_name as first_char\n",
    "    FROM ac_users\n",
    "    \"\"\"\n",
    "fn_result = ac_aggregation_metrics(fn_query)\n",
    "ln_result = ac_aggregation_metrics(ln_query)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "fn: {'C': 1, 'B': 1, 'G': 2, 'L': 1, 'J': 1, 'E': 2, 'M': 1, 'T': 2, 'R': 1}\n",
      "ln: {'M': 1, 'F': 3, 'E': 1, 'B': 1, 'W': 2, 'H': 2, 'L': 1, 'R': 1}\n"
     ]
    }
   ],
   "source": [
    "print(\"fn:\", fn_result)\n",
    "print(\"ln:\", ln_result)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Requirement already satisfied: schedule in /opt/conda/lib/python3.6/site-packages (0.6.0)\n"
     ]
    }
   ],
   "source": [
pip install schedule" 523 | ] 524 | }, 525 | { 526 | "cell_type": "code", 527 | "execution_count": 24, 528 | "metadata": {}, 529 | "outputs": [], 530 | "source": [ 531 | "session.shutdown()\n", 532 | "cluster.shutdown()" 533 | ] 534 | } 535 | ], 536 | "metadata": { 537 | "kernelspec": { 538 | "display_name": "Python 3", 539 | "language": "python", 540 | "name": "python3" 541 | }, 542 | "language_info": { 543 | "codemirror_mode": { 544 | "name": "ipython", 545 | "version": 3 546 | }, 547 | "file_extension": ".py", 548 | "mimetype": "text/x-python", 549 | "name": "python", 550 | "nbconvert_exporter": "python", 551 | "pygments_lexer": "ipython3", 552 | "version": "3.6.3" 553 | } 554 | }, 555 | "nbformat": 4, 556 | "nbformat_minor": 2 557 | } 558 | --------------------------------------------------------------------------------