├── API_Project-Quandl
│   ├── .ipynb_checkpoints
│   │   └── API_Project-Quandl-checkpoint.ipynb
│   ├── API_Project-Quandl.ipynb
│   ├── batch_rulex_script.py
│   ├── display_predictions.py
│   ├── files_execution.txt
│   ├── local_interpretability.py
│   └── model_predictor.py
├── Clustering_Project-Customer_Segmentation
│   ├── .ipynb_checkpoints
│   │   └── Mini_Project_Clustering-checkpoint.ipynb
│   ├── Mini_Project_Clustering.ipynb
│   ├── WineKMC.xlsx
│   ├── agglomerate.png
│   └── spectral.png
├── Data_Wrangling_Project-JSON_File
│   ├── .ipynb_checkpoints
│   │   └── JSON_Project-World_Bank_Data-checkpoint.ipynb
│   ├── JSON_Project-World_Bank_Data.ipynb
│   └── data
│       └── world_bank_projects.json
├── Exploratory_Data_Analysis_Project-Examine_Racial_Discrimination
│   ├── .ipynb_checkpoints
│   │   └── EDA_Project-Examine_Racial_Discrimination-checkpoint.ipynb
│   ├── EDA_Project-Examine_Racial_Discrimination.ipynb
│   └── data
│       └── us_job_market_discrimination.dta
├── Exploratory_Data_Analysis_Project-Hospital_Readmissions
│   ├── .ipynb_checkpoints
│   │   └── EDA_Project-Hospital_Readmissions-checkpoint.ipynb
│   ├── EDA_Project-Hospital_Readmissions.ipynb
│   └── data
│       └── cms_hospital_readmissions.csv
├── Exploratory_Data_Analysis_Project-Normal_Human_Body_Temperature
│   ├── EDA_Project-Normal_Human_Body_Temperature.ipynb
│   └── data
│       └── human_body_temperature.csv
├── Google_API_Project
│   └── Google API Project.ipynb
├── Linear_Regression_Project-Boston_Housing_Dataset
│   ├── .ipynb_checkpoints
│   │   └── Mini_Project_Linear_Regression-checkpoint.ipynb
│   └── Mini_Project_Linear_Regression.ipynb
├── Logistic_Regression_Project-Gender_Classification_by_Heights_and_Weights
│   ├── .gitignore
│   ├── Logistic_Regression-Mini_Project.ipynb
│   ├── data
│   │   └── 01_heights_weights_genders.csv
│   └── images
│       ├── bias.png
│       ├── complexity-error-plot.png
│       ├── complexity-error-reg.png
│       ├── data.png
│       ├── knn1.png
│       ├── knn2.png
│       ├── linreg.png
│       ├── linsep.png
│       ├── onelinesplit.png
│       ├── pcanim.gif
│       ├── reshape.jpg
│       ├── sklearn2.jpg
│       ├── sklearntrans.jpg
│       ├── train-cv2.png
│       ├── train-cv3.png
│       ├── train-test.png
│       ├── train-validate-test-cont.png
│       ├── train-validate-test.png
│       └── train-validate-test3.png
├── Naive_Bayes_Project-Predicting_Movie_Ratings_From_Reviews
│   ├── .ipynb_checkpoints
│   │   ├── Mini_Project_Naive_Bayes-checkpoint.ipynb
│   │   └── TextAnalysis-checkpoint.ipynb
│   ├── Mini_Project_Naive_Bayes.ipynb
│   ├── Test1.png
│   ├── callibration.png
│   ├── critics.csv
│   ├── terms.png
│   ├── terms2.png
│   └── vsm.png
├── README.md
├── SQL_Project-Country_Club_Database
│   ├── SQL_Project-Country_Club_Database.sql
│   ├── Schema.JPG
│   └── data
├── Spark_Project-Databricks
│   ├── .ipynb_checkpoints
│   │   └── Spark-Mini_Project-checkpoint.ipynb
│   └── Spark-Mini_Project.ipynb
├── Take_Home_Challenge-Relax_Inc
│   ├── .ipynb_checkpoints
│   │   └── Relax Take Home Challenge-checkpoint.ipynb
│   ├── Relax Take Home Challenge.ipynb
│   ├── Relax_Keynote.pdf
│   ├── relax_data_science_challenge.pdf
│   ├── takehome_user_engagement.csv
│   └── takehome_users.csv
└── Take_Home_Challenge-Ultimate_Technologies_Inc
    ├── .ipynb_checkpoints
    │   └── Take_Home_Challenge-Notebook-checkpoint.ipynb
    ├── Take_Home_Challenge-Notebook.ipynb
    ├── logins.json
    ├── ultimate_data_challenge.json
    └── ultimate_data_science_challenge.pdf
/API_Project-Quandl/API_Project-Quandl.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# API_Project-Quandl"
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": 1,
13 | "metadata": {},
14 | "outputs": [],
15 | "source": [
16 | "# Importing the relevant modules.\n",
17 | "\n",
18 | "import requests\n",
19 | "import json \n",
20 | "import operator\n",
21 | "import numpy as np"
22 | ]
23 | },
24 | {
25 | "cell_type": "code",
26 | "execution_count": 2,
27 | "metadata": {},
28 | "outputs": [],
29 | "source": [
30 | "# Unique API key which is taken from http://www.quandl.com website.\n",
31 | "\n",
32 | "API_KEY = 'zHER-uPSaTEaUxTgB2d4' "
33 | ]
34 | },
35 | {
36 | "cell_type": "code",
37 | "execution_count": 3,
38 | "metadata": {},
39 | "outputs": [],
40 | "source": [
41 | "# Caling the Quandl API and pull out a small sample of the data (only one day) to get a glimpse \n",
42 | "# into the JSON structure that will be returned.\n",
43 | "\n",
44 | "url = 'https://www.quandl.com/api/v3/datasets/FSE/AFX_X.json?&start_date=2017-01-01&end_date=2017-01-01&api_key=' + API_KEY\n",
45 | "r = requests.get(url)\n",
46 | "r_json = r.json()"
47 | ]
48 | },
49 | {
50 | "cell_type": "code",
51 | "execution_count": 22,
52 | "metadata": {},
53 | "outputs": [
54 | {
55 | "data": {
56 | "text/plain": [
57 | "{'dataset': {'id': 10095370,\n",
58 | " 'dataset_code': 'AFX_X',\n",
59 | " 'database_code': 'FSE',\n",
60 | " 'name': 'Carl Zeiss Meditec (AFX_X)',\n",
61 | " 'description': 'Stock Prices for Carl Zeiss Meditec (AFX) from the Frankfurt Stock Exchange.
Trading System: Xetra
ISIN: DE0005313704',\n",
62 | " 'refreshed_at': '2018-10-16T22:29:02.721Z',\n",
63 | " 'newest_available_date': '2018-10-16',\n",
64 | " 'oldest_available_date': '2000-06-07',\n",
65 | " 'column_names': ['Date',\n",
66 | " 'Open',\n",
67 | " 'High',\n",
68 | " 'Low',\n",
69 | " 'Close',\n",
70 | " 'Change',\n",
71 | " 'Traded Volume',\n",
72 | " 'Turnover',\n",
73 | " 'Last Price of the Day',\n",
74 | " 'Daily Traded Units',\n",
75 | " 'Daily Turnover'],\n",
76 | " 'frequency': 'daily',\n",
77 | " 'type': 'Time Series',\n",
78 | " 'premium': False,\n",
79 | " 'limit': None,\n",
80 | " 'transform': None,\n",
81 | " 'column_index': None,\n",
82 | " 'start_date': '2017-01-01',\n",
83 | " 'end_date': '2017-01-01',\n",
84 | " 'data': [],\n",
85 | " 'collapse': None,\n",
86 | " 'order': None,\n",
87 | " 'database_id': 6129}}"
88 | ]
89 | },
90 | "execution_count": 22,
91 | "metadata": {},
92 | "output_type": "execute_result"
93 | }
94 | ],
95 | "source": [
96 | "# Inspect the JSON structure of the object you created, and take note of how nested it is, as well as\n",
97 | "# the overall structure.\n",
98 | "\n",
99 | "r_json"
100 | ]
101 | },
102 | {
103 | "cell_type": "markdown",
104 | "metadata": {},
105 | "source": [
106 | "Note: Type of that json file is time series. In this json file, it has 'dataset' dictionary key and value of that key is a nested dictionary. In this nested dictionary, it has individual 19 key-value pairs in addition to one key and its corresponding nested list value ('column_names' is key and corresponding 11 names are values) and one key and its corresponding nested empty list value."
107 | ]
108 | },
109 | {
110 | "cell_type": "markdown",
111 | "metadata": {},
112 | "source": [
113 | "### 1. Collect data from the Franfurt Stock Exchange, for the ticker AFX_X, for the whole year 2017."
114 | ]
115 | },
116 | {
117 | "cell_type": "code",
118 | "execution_count": 11,
119 | "metadata": {},
120 | "outputs": [],
121 | "source": [
122 | "# Set start and end date for the whole year 2017. \n",
123 | "\n",
124 | "url = \"https://www.quandl.com/api/v3/datasets/FSE/AFX_X.json?\"+ \"&start_date=2017-01-01&end_date=2017-12-31&api_key=\" + API_KEY\n",
125 | "r = requests.get(url)"
126 | ]
127 | },
128 | {
129 | "cell_type": "code",
130 | "execution_count": 12,
131 | "metadata": {},
132 | "outputs": [
133 | {
134 | "data": {
135 | "text/plain": [
136 | "200"
137 | ]
138 | },
139 | "execution_count": 12,
140 | "metadata": {},
141 | "output_type": "execute_result"
142 | }
143 | ],
144 | "source": [
145 | "# Checking the response status code. The result should be 200 if the data is imported properly.\n",
146 | "\n",
147 | "r.status_code"
148 | ]
149 | },
150 | {
151 | "cell_type": "code",
152 | "execution_count": 15,
153 | "metadata": {},
154 | "outputs": [
155 | {
156 | "data": {
157 | "text/plain": [
158 | "True"
159 | ]
160 | },
161 | "execution_count": 15,
162 | "metadata": {},
163 | "output_type": "execute_result"
164 | }
165 | ],
166 | "source": [
167 | "# Requests also comes with a built-in status code lookup object for easy reference:\n",
168 | "\n",
169 | "r.status_code == requests.codes.ok"
170 | ]
171 | },
172 | {
173 | "cell_type": "markdown",
174 | "metadata": {},
175 | "source": [
176 | "### 2. Convert the returned JSON object into a Python dictionary."
177 | ]
178 | },
179 | {
180 | "cell_type": "code",
181 | "execution_count": 14,
182 | "metadata": {
183 | "scrolled": true
184 | },
185 | "outputs": [
186 | {
187 | "data": {
188 | "text/plain": [
189 | "dict"
190 | ]
191 | },
192 | "execution_count": 14,
193 | "metadata": {},
194 | "output_type": "execute_result"
195 | }
196 | ],
197 | "source": [
198 | "r_json = r.json()\n",
199 | "type(r_json)"
200 | ]
201 | },
202 | {
203 | "cell_type": "code",
204 | "execution_count": 16,
205 | "metadata": {},
206 | "outputs": [
207 | {
208 | "name": "stdout",
209 | "output_type": "stream",
210 | "text": [
211 | "{'dataset': {'id': 10095370, 'dataset_code': 'AFX_X', 'database_code': 'FSE', 'name': 'Carl Zeiss Meditec (AFX_X)', 'description': 'Stock Prices for Carl Zeiss Meditec (AFX) from the Frankfurt Stock Exchange.
Trading System: Xetra
ISIN: DE0005313704', 'refreshed_at': '2018-10-29T22:33:28.139Z', 'newest_available_date': '2018-10-29', 'oldest_available_date': '2000-06-07', 'column_names': ['Date', 'Open', 'High', 'Low', 'Close', 'Change', 'Traded Volume', 'Turnover', 'Last Price of the Day', 'Daily Traded Units', 'Daily Turnover'], 'frequency': 'daily', 'type': 'Time Series', 'premium': False, 'limit': None, 'transform': None, 'column_index': None, 'start_date': '2017-01-01', 'end_date': '2017-12-31', 'data': [['2017-12-29', 51.76, 51.94, 51.45, 51.76, None, 34640.0, 1792304.0, None, None, None], ['2017-12-28', 51.65, 51.82, 51.43, 51.6, None, 40660.0, 2099024.0, None, None, None], ['2017-12-27', 51.45, 51.89, 50.76, 51.82, None, 57452.0, 2957018.0, None, None, None], ['2017-12-22', 51.05, 51.5, 50.92, 51.32, None, 71165.0, 3641949.0, None, None, None], ['2017-12-21', 51.16, 51.52, 50.9, 51.4, None, 120649.0, 6179433.0, None, None, None], ['2017-12-20', 51.88, 52.04, 51.2, 51.27, None, 50587.0, 2610258.0, None, None, None], ['2017-12-19', 52.73, 52.73, 51.07, 51.66, None, 137313.0, 7102361.0, None, None, None], ['2017-12-18', 52.37, 52.75, 51.61, 52.62, None, 129733.0, 6770499.0, None, None, None], ['2017-12-15', 52.7, 52.7, 51.64, 52.01, None, 204080.0, 10596319.0, None, None, None], ['2017-12-14', 53.11, 53.54, 52.15, 52.67, None, 132981.0, 7016953.0, None, None, None], ['2017-12-13', 52.64, 53.35, 52.48, 53.09, None, 128434.0, 6801159.0, None, None, None], ['2017-12-12', 52.29, 53.1, 51.82, 52.43, None, 87911.0, 4615924.0, None, None, None], ['2017-12-11', 52.28, 52.45, 51.26, 52.14, None, 71817.0, 3724193.0, None, None, None], ['2017-12-08', 51.5, 52.83, 51.28, 52.12, None, 109157.0, 5690648.0, None, None, None], ['2017-12-07', 50.89, 51.47, 50.81, 51.47, None, 48123.0, 2463848.0, None, None, None], ['2017-12-06', 50.8, 51.11, 50.39, 50.89, None, 88730.0, 4504075.0, None, None, None], ['2017-12-05', 51.21, 51.38, 50.4, 51.25, None, 83023.0, 4231971.0, None, None, None], ['2017-12-04', 49.5, 51.23, 49.5, 51.14, None, 94385.0, 4800027.0, None, None, None], ['2017-12-01', 49.52, 50.49, 49.17, 49.86, None, 101733.0, 5065932.0, None, None, None], ['2017-11-30', 48.64, 49.84, 48.28, 49.7, None, 123019.0, 6085171.0, None, None, None], ['2017-11-29', 49.64, 49.64, 48.7, 48.75, None, 67342.0, 3292223.0, None, None, None], ['2017-11-28', 49.09, 49.89, 49.03, 49.25, None, 42669.0, 2107358.0, None, None, None], ['2017-11-27', 49.13, 49.73, 48.96, 49.2, None, 102180.0, 5055762.0, None, None, None], ['2017-11-24', 49.11, 49.41, 48.87, 49.11, None, 50350.0, 2472842.0, None, None, None], ['2017-11-23', 48.8, 49.46, 48.45, 49.2, None, 38834.0, 1909352.0, None, None, None], ['2017-11-22', 48.4, 49.61, 48.39, 48.8, None, 91142.0, 4478093.0, None, None, None], ['2017-11-21', 47.25, 48.59, 46.78, 48.39, None, 78502.0, 3782098.0, None, None, None], ['2017-11-20', 46.57, 47.38, 46.54, 47.04, None, 97252.0, 4563515.0, None, None, None], ['2017-11-17', 47.03, 47.15, 46.8, 46.84, None, 54107.0, 2540820.0, None, None, None], ['2017-11-16', 47.09, 47.23, 46.55, 47.03, None, 89373.0, 4195732.0, None, None, None], ['2017-11-15', 47.98, 48.01, 46.75, 47.05, None, 67593.0, 3188321.0, None, None, None], ['2017-11-14', 48.4, 48.9, 47.84, 48.0, None, 67672.0, 3259979.0, None, None, None], ['2017-11-13', 48.38, 48.61, 47.76, 48.34, None, 76286.0, 3681337.0, None, None, None], ['2017-11-10', 47.3, 48.89, 47.16, 48.34, None, 90245.0, 4361552.0, None, None, None], ['2017-11-09', 47.65, 48.06, 47.09, 47.21, None, 120268.0, 5712034.0, None, None, None], 
['2017-11-08', 46.42, 47.72, 46.42, 47.47, None, 94195.0, 4463935.0, None, None, None], ['2017-11-07', 46.16, 46.33, 45.84, 46.26, None, 48152.0, 2224221.0, None, None, None], ['2017-11-06', 45.81, 46.09, 45.76, 45.99, None, 60716.0, 2789220.0, None, None, None], ['2017-11-03', 45.0, 46.04, 44.83, 45.97, None, 56911.0, 2603498.0, None, None, None], ['2017-11-02', 45.88, 46.06, 45.18, 45.27, None, 37958.0, 1724840.0, None, None, None], ['2017-11-01', 46.29, 46.55, 45.97, 46.04, None, 56319.0, 2603859.0, None, None, None], ['2017-10-30', 46.53, 46.65, 45.61, 45.76, None, 56245.0, 2585397.0, None, None, None], ['2017-10-27', 45.48, 46.42, 45.46, 46.41, None, 74472.0, 3434087.0, None, None, None], ['2017-10-26', 45.2, 45.41, 44.91, 45.41, None, 56319.0, 2548078.0, None, None, None], ['2017-10-25', 45.01, 45.06, 44.7, 45.0, None, 47730.0, 2145697.0, None, None, None], ['2017-10-24', 45.16, 45.27, 44.75, 44.85, None, 43042.0, 1937616.0, None, None, None], ['2017-10-23', 44.9, 45.34, 44.89, 45.0, None, 43375.0, 1952918.0, None, None, None], ['2017-10-20', 45.08, 45.34, 44.76, 44.87, None, 55707.0, 2503853.0, None, None, None], ['2017-10-19', 45.72, 45.85, 44.79, 45.0, None, 59991.0, 2703085.0, None, None, None], ['2017-10-18', 46.01, 46.2, 45.61, 45.77, None, 45263.0, 2076951.0, None, None, None], ['2017-10-17', 45.8, 46.06, 45.37, 45.96, None, 65837.0, 3014080.0, None, None, None], ['2017-10-16', 45.61, 45.75, 45.3, 45.55, None, 49246.0, 2243129.0, None, None, None], ['2017-10-13', 45.5, 45.7, 45.37, 45.4, None, 43362.0, 1971801.0, None, None, None], ['2017-10-12', 45.58, 45.58, 45.17, 45.43, None, 49180.0, 2233481.0, None, None, None], ['2017-10-11', 45.97, 45.97, 45.25, 45.29, None, 69455.0, 3158321.0, None, None, None], ['2017-10-10', 45.64, 46.04, 45.57, 45.84, None, 65860.0, 3016658.0, None, None, None], ['2017-10-09', 46.2, 46.2, 45.6, 45.74, None, 44059.0, 2015453.0, None, None, None], ['2017-10-06', 46.19, 46.19, 45.69, 46.0, None, 66760.0, 3066198.0, None, None, None], ['2017-10-05', 46.01, 46.09, 45.63, 46.05, None, 94804.0, 4352002.0, None, None, None], ['2017-10-04', 45.36, 46.17, 45.22, 46.11, None, 115706.0, 5313199.0, None, None, None], ['2017-10-02', 44.51, 44.98, 44.18, 44.98, None, 95313.0, 4265024.0, None, None, None], ['2017-09-29', 43.58, 44.17, 43.3, 44.17, None, 99821.0, 4384796.0, None, None, None], ['2017-09-28', 42.0, 43.56, 42.0, 43.56, None, 157234.0, 6775569.0, None, None, None], ['2017-09-27', 42.35, 42.49, 41.78, 42.04, None, 76600.0, 3219861.0, None, None, None], ['2017-09-26', 42.3, 42.57, 42.11, 42.37, None, 51321.0, 2175381.0, None, None, None], ['2017-09-25', 42.3, 42.3, 41.96, 42.07, None, 56224.0, 2366453.0, None, None, None], ['2017-09-22', 41.48, 42.38, 41.48, 42.06, None, 79955.0, 3362517.0, None, None, None], ['2017-09-21', 42.29, 42.29, 41.39, 41.46, None, 105194.0, 4378409.0, None, None, None], ['2017-09-20', 42.54, 42.54, 41.99, 41.99, None, 57838.0, 2440557.0, None, None, None], ['2017-09-19', 42.65, 42.65, 42.13, 42.44, None, 65546.0, 2777065.0, None, None, None], ['2017-09-18', 42.5, 42.63, 42.23, 42.27, None, 44037.0, 1864954.0, None, None, None], ['2017-09-15', 42.29, 42.81, 42.25, 42.42, None, 107144.0, 4555791.0, None, None, None], ['2017-09-14', 42.35, 42.8, 42.35, 42.52, None, 65157.0, 2770696.0, None, None, None], ['2017-09-13', 42.49, 42.69, 42.22, 42.45, None, 68801.0, 2921240.0, None, None, None], ['2017-09-12', 43.21, 43.34, 42.62, 42.73, None, 52828.0, 2259924.0, None, None, None], ['2017-09-11', 42.81, 42.89, 42.56, 42.85, None, 
103273.0, 4415614.0, None, None, None], ['2017-09-08', 42.7, 42.75, 42.56, 42.67, None, 59881.0, 2553977.0, None, None, None], ['2017-09-07', 43.0, 43.02, 42.67, 42.77, None, 64320.0, 2751388.0, None, None, None], ['2017-09-06', 42.66, 42.71, 42.34, 42.55, None, 71006.0, 3020229.0, None, None, None], ['2017-09-05', 43.0, 43.19, 42.55, 42.62, None, 66351.0, 2846115.0, None, None, None], ['2017-09-04', 42.38, 42.75, 41.95, 42.6, None, 105288.0, 4471634.0, None, None, None], ['2017-09-01', 42.16, 43.06, 42.07, 42.41, None, 151474.0, 6453558.0, None, None, None], ['2017-08-31', 42.0, 42.08, 41.12, 41.9, None, 157888.0, 6580200.0, None, None, None], ['2017-08-30', 42.0, 42.2, 41.49, 41.94, None, 97804.0, 4090262.0, None, None, None], ['2017-08-29', 41.71, 41.98, 41.33, 41.85, None, 98156.0, 4094452.0, None, None, None], ['2017-08-28', 42.11, 42.25, 41.86, 41.91, None, 47130.0, 1978704.0, None, None, None], ['2017-08-25', 42.64, 42.64, 42.05, 42.14, None, 69734.0, 2948016.0, None, None, None], ['2017-08-24', 42.72, 43.05, 42.63, 42.69, None, 65213.0, 2792319.0, None, None, None], ['2017-08-23', 42.82, 43.17, 42.6, 42.71, None, 70269.0, 3011578.0, None, None, None], ['2017-08-22', 42.46, 42.96, 42.4, 42.71, None, 95376.0, 4075646.0, None, None, None], ['2017-08-21', 42.42, 42.76, 42.2, 42.26, None, 68812.0, 2922972.0, None, None, None], ['2017-08-18', 42.28, 42.6, 42.01, 42.41, None, 72886.0, 3092377.0, None, None, None], ['2017-08-17', 41.88, 43.01, 41.76, 42.5, None, 131361.0, 5583704.0, None, None, None], ['2017-08-16', 42.4, 42.62, 41.98, 42.05, None, 104676.0, 4408312.0, None, None, None], ['2017-08-15', 42.53, 42.53, 42.2, 42.28, None, 64334.0, 2721852.0, None, None, None], ['2017-08-14', 42.12, 42.69, 42.01, 42.3, None, 127682.0, 5416963.0, None, None, None], ['2017-08-11', 41.3, 41.94, 40.96, 41.94, None, 183412.0, 7604144.0, None, None, None], ['2017-08-10', 41.73, 41.99, 41.14, 41.68, None, 175161.0, 7303562.0, None, None, None], ['2017-08-09', 43.5, 43.5, 41.64, 41.81, None, 355857.0, 15003956.0, None, None, None], ['2017-08-08', 44.9, 45.09, 44.15, 44.37, None, 156168.0, 6941408.0, None, None, None], ['2017-08-07', 45.85, 46.34, 44.02, 44.96, None, 164543.0, 7378816.0, None, None, None], ['2017-08-04', 45.13, 45.13, 44.36, 45.07, None, 96202.0, 4306911.0, None, None, None], ['2017-08-03', 45.34, 45.54, 44.91, 44.97, None, 77854.0, 3517146.0, None, None, None], ['2017-08-02', 45.25, 45.77, 44.9, 45.56, None, 187468.0, 8528548.0, None, None, None], ['2017-08-01', 45.24, 45.54, 45.1, 45.45, None, 74975.0, 3399891.0, None, None, None], ['2017-07-31', 44.94, 45.75, 44.94, 45.3, None, 62672.0, 2844210.0, None, None, None], ['2017-07-28', 45.26, 45.29, 44.75, 44.97, None, 114006.0, 5127247.0, None, None, None], ['2017-07-27', 45.16, 45.45, 45.15, 45.25, None, 50557.0, 2290284.0, None, None, None], ['2017-07-26', 44.91, 45.33, 44.46, 45.16, None, 81970.0, 3688510.0, None, None, None], ['2017-07-25', 44.7, 45.04, 44.63, 44.82, None, 112224.0, 5033312.0, None, None, None], ['2017-07-24', 45.31, 45.31, 44.49, 44.61, None, 104282.0, 4661866.0, None, None, None], ['2017-07-21', 45.57, 45.88, 45.04, 45.44, None, 73422.0, 3334695.0, None, None, None], ['2017-07-20', 45.74, 45.96, 45.23, 45.66, None, 87399.0, 3986488.0, None, None, None], ['2017-07-19', 45.06, 45.72, 44.94, 45.57, None, 71971.0, 3273001.0, None, None, None], ['2017-07-18', 45.5, 45.55, 44.7, 45.0, None, 104003.0, 4684627.0, None, None, None], ['2017-07-17', 45.6, 46.23, 45.29, 45.6, None, 104995.0, 4801806.0, None, None, None], 
['2017-07-14', 45.07, 45.56, 44.83, 45.53, None, 67375.0, 3054060.0, None, None, None], ['2017-07-13', 44.67, 45.18, 44.67, 44.95, None, 82745.0, 3718928.0, None, None, None], ['2017-07-12', 44.29, 45.05, 43.89, 44.95, None, 115705.0, 5133971.0, None, None, None], ['2017-07-11', 44.94, 44.94, 44.08, 44.2, None, 90538.0, 4010457.0, None, None, None], ['2017-07-10', 44.64, 45.18, 44.51, 44.7, None, 71868.0, 3221218.0, None, None, None], ['2017-07-07', 44.79, 44.79, 44.25, 44.53, None, 47999.0, 2136578.0, None, None, None], ['2017-07-06', 45.5, 45.5, 44.15, 44.62, None, 66116.0, 2952605.0, None, None, None], ['2017-07-05', 44.67, 45.36, 44.44, 45.19, None, 48706.0, 2189436.0, None, None, None], ['2017-07-04', 45.83, 45.83, 44.74, 44.8, None, 50549.0, 2273551.0, None, None, None], ['2017-07-03', 45.29, 45.83, 45.06, 45.75, None, 71381.0, 3251502.0, None, None, None], ['2017-06-30', 45.01, 45.74, 45.0, 45.44, None, 136112.0, 6187148.0, None, None, None], ['2017-06-29', 45.73, 45.81, 45.11, 45.2, None, 134965.0, 6132452.0, None, None, None], ['2017-06-28', 46.68, 46.68, 45.41, 45.68, None, 117165.0, 5381488.0, None, None, None], ['2017-06-27', 47.23, 47.33, 46.39, 46.83, None, 82492.0, 3866344.0, None, None, None], ['2017-06-26', 46.95, 47.63, 46.91, 47.21, None, 73322.0, 3465639.0, None, None, None], ['2017-06-23', 47.29, 47.4, 46.79, 46.99, None, 80586.0, 3792498.0, None, None, None], ['2017-06-22', 47.03, 47.4, 46.75, 47.29, None, 56071.0, 2640508.0, None, None, None], ['2017-06-21', 47.46, 47.48, 46.53, 46.99, None, 89752.0, 4206563.0, None, None, None], ['2017-06-20', 46.48, 47.43, 46.27, 47.37, None, 108334.0, 5109730.0, None, None, None], ['2017-06-19', 46.9, 46.9, 46.25, 46.64, None, 70056.0, 3260381.0, None, None, None], ['2017-06-16', 45.66, 46.8, 45.66, 46.63, None, 202214.0, 9411695.0, None, None, None], ['2017-06-15', 46.34, 46.34, 45.21, 45.67, None, 101733.0, 4635593.0, None, None, None], ['2017-06-14', 46.52, 46.86, 46.05, 46.33, None, 83741.0, 3881453.0, None, None, None], ['2017-06-13', 46.5, 46.51, 46.03, 46.32, None, 107644.0, 4981185.0, None, None, None], ['2017-06-12', 47.31, 47.43, 45.89, 46.31, None, 112942.0, 5238390.0, None, None, None], ['2017-06-09', 46.77, 47.44, 46.55, 47.44, None, 99674.0, 4702170.0, None, None, None], ['2017-06-08', 47.8, 47.8, 46.27, 46.27, None, 1945.0, 90599.0, None, None, None], ['2017-06-07', 47.01, 47.43, 47.01, 47.43, None, 1081.0, 51021.0, None, None, None], ['2017-06-06', 47.12, 47.45, 46.21, 47.43, None, 686.0, 32083.0, None, None, None], ['2017-06-02', 46.8, 46.99, 46.72, 46.99, None, 290.0, 13584.0, None, None, None], ['2017-06-01', 46.12, 46.52, 45.89, 46.52, None, 106513.0, 4930686.0, None, None, None], ['2017-05-31', 45.22, 46.26, 45.22, 45.86, None, 522.0, 24044.0, None, None, None], ['2017-05-30', 45.05, 46.02, 45.05, 46.02, None, 587.0, 26792.0, None, None, None], ['2017-05-29', 45.61, 45.61, 45.24, 45.32, None, 112.0, 5089.0, None, None, None], ['2017-05-26', 44.8, 45.36, 44.71, 45.3, None, 74453.0, 3360707.0, None, None, None], ['2017-05-25', 44.8, 44.87, 44.29, 44.78, None, 49970.0, 2231857.0, None, None, None], ['2017-05-24', 43.92, 44.67, 43.92, 44.53, None, 111923.0, 4971343.0, None, None, None], ['2017-05-23', 43.67, 44.13, 43.55, 43.9, None, 38308.0, 1681904.0, None, None, None], ['2017-05-22', 44.16, 44.22, 43.44, 43.84, None, 70856.0, 3103013.0, None, None, None], ['2017-05-19', 43.74, 44.12, 43.74, 44.12, None, 45.0, 1980.0, None, None, None], ['2017-05-18', 44.0, 44.3, 43.29, 43.98, None, 166160.0, 7277314.0, None, 
None, None], ['2017-05-17', 45.06, 45.34, 44.01, 44.19, None, 149515.0, 6664744.0, None, None, None], ['2017-05-16', 45.15, 45.36, 44.56, 45.31, None, 101476.0, 4567885.0, None, None, None], ['2017-05-15', 45.09, 45.78, 44.31, 45.14, None, 193702.0, 8734286.0, None, None, None], ['2017-05-12', 45.18, 45.18, 44.16, 44.99, None, 159495.0, 7113519.0, None, None, None], ['2017-05-11', 43.4, 46.06, 43.25, 45.0, None, 189125.0, 8496322.0, None, None, None], ['2017-05-10', 43.5, 43.6, 42.53, 43.28, None, 91858.0, 3958630.0, None, None, None], ['2017-05-09', 41.83, 43.55, 41.82, 43.3, None, 151439.0, 6538516.0, None, None, None], ['2017-05-08', 43.0, 43.0, 42.04, 42.24, None, 97456.0, 4128048.0, None, None, None], ['2017-05-05', 42.52, 42.91, 42.38, 42.75, None, 78512.0, 3353971.0, None, None, None], ['2017-05-04', 41.86, 42.5, 41.71, 42.5, None, 82058.0, 3465505.0, None, None, None], ['2017-05-03', 42.2, 42.29, 41.78, 41.9, None, 65266.0, 2738394.0, None, None, None], ['2017-05-02', 41.89, 42.23, 41.76, 42.15, None, 86559.0, 3636583.0, None, None, None], ['2017-05-01', None, 42.245, 41.655, 41.72, -0.44, 86348.0, 3606589.0, None, None, None], ['2017-04-28', 42.17, 42.25, 41.66, 41.72, None, 86348.0, 3606589.0, None, None, None], ['2017-04-27', 41.51, 42.24, 41.51, 42.16, None, 151683.0, 6380639.0, None, None, None], ['2017-04-26', 41.88, 41.94, 41.4, 41.5, None, 65847.0, 2743109.0, None, None, None], ['2017-04-25', 41.93, 42.18, 41.66, 41.89, None, 85973.0, 3604204.0, None, None, None], ['2017-04-24', 42.01, 42.02, 41.23, 41.81, None, 102084.0, 4247032.0, None, None, None], ['2017-04-21', 41.97, 42.14, 41.01, 41.32, None, 186784.0, 7728103.0, None, None, None], ['2017-04-20', 42.5, 42.64, 41.52, 41.93, None, 223621.0, 9418192.0, None, None, None], ['2017-04-19', 41.94, 42.61, 41.94, 42.61, None, 92722.0, 3930856.0, None, None, None], ['2017-04-18', 42.24, 42.4, 41.54, 42.0, None, 133057.0, 5587565.0, None, None, None], ['2017-04-17', None, 42.48, 41.985, 42.2, None, 88416.0, 3734717.0, None, None, None], ['2017-04-14', None, 42.48, 41.985, 42.2, None, 88416.0, 3734717.0, None, None, None], ['2017-04-13', 42.06, 42.48, 41.99, 42.2, None, 88416.0, 3734717.0, None, None, None], ['2017-04-12', 42.02, 42.45, 41.84, 42.2, None, 158278.0, 6672547.0, None, None, None], ['2017-04-11', 41.62, 42.03, 41.53, 41.75, None, 107817.0, 4501109.0, None, None, None], ['2017-04-10', 41.46, 41.68, 41.31, 41.68, None, 62297.0, 2585922.0, None, None, None], ['2017-04-07', 40.9, 41.42, 40.84, 41.42, None, 81255.0, 3344628.0, None, None, None], ['2017-04-06', 40.96, 41.25, 40.83, 41.05, None, 96794.0, 3968681.0, None, None, None], ['2017-04-05', 41.1, 41.34, 40.79, 41.1, None, 156005.0, 6404780.0, None, None, None], ['2017-04-04', 39.5, 40.88, 39.48, 40.81, None, 193156.0, 7822665.0, None, None, None], ['2017-04-03', 40.15, 40.15, 39.54, 39.64, None, 127973.0, 5081376.0, None, None, None], ['2017-03-31', 39.77, 40.07, 39.42, 39.98, None, 95382.0, 3795061.0, None, None, None], ['2017-03-30', 40.02, 40.14, 39.42, 39.75, None, 189201.0, 7541354.0, None, None, None], ['2017-03-29', 39.39, 40.01, 39.05, 40.01, None, 335406.0, 13349426.0, None, None, None], ['2017-03-28', 38.95, 39.35, 38.79, 39.22, None, 115075.0, 4505494.0, None, None, None], ['2017-03-27', 38.73, 39.1, 38.53, 38.85, None, 191515.0, 7446952.0, None, None, None], ['2017-03-24', 38.94, 39.02, 38.6, 38.94, None, 210926.0, 8205507.0, None, None, None], ['2017-03-23', 39.01, 39.25, 38.63, 38.96, None, 169971.0, 6621807.0, None, None, None], ['2017-03-22', 38.25, 
39.02, 37.53, 38.94, None, 670349.0, 25910543.0, None, None, None], ['2017-03-21', 41.8, 41.83, 40.97, 40.98, None, 56906.0, 2349965.0, None, None, None], ['2017-03-20', 41.26, 42.17, 41.26, 41.97, None, 97572.0, 4074891.0, None, None, None], ['2017-03-17', 41.47, 41.59, 41.16, 41.34, None, 90109.0, 3734232.0, None, None, None], ['2017-03-16', 41.4, 41.57, 41.09, 41.46, None, 55799.0, 2308423.0, None, None, None], ['2017-03-15', 41.4, 41.5, 40.91, 41.25, None, 60324.0, 2488650.0, None, None, None], ['2017-03-14', 41.2, 41.5, 41.2, 41.3, None, 60420.0, 2498025.0, None, None, None], ['2017-03-13', 41.4, 41.46, 41.08, 41.3, None, 44803.0, 1850251.0, None, None, None], ['2017-03-10', 41.53, 41.53, 41.16, 41.4, None, 38518.0, 1592270.0, None, None, None], ['2017-03-09', 41.61, 41.61, 41.16, 41.4, None, 43988.0, 1819182.0, None, None, None], ['2017-03-08', 41.13, 41.71, 40.95, 41.68, None, 45111.0, 1870935.0, None, None, None], ['2017-03-07', 41.5, 41.8, 41.25, 41.42, None, 61925.0, 2569608.0, None, None, None], ['2017-03-06', 41.25, 41.4, 40.81, 41.4, None, 46510.0, 1916799.0, None, None, None], ['2017-03-03', 41.12, 41.22, 40.84, 41.18, None, 40800.0, 1675587.0, None, None, None], ['2017-03-02', 41.38, 41.39, 40.76, 41.17, None, 49863.0, 2048153.0, None, None, None], ['2017-03-01', 41.19, 41.57, 40.9, 41.2, None, 86753.0, 3569796.0, None, None, None], ['2017-02-28', 40.38, 40.95, 40.38, 40.84, None, 67440.0, 2747011.0, None, None, None], ['2017-02-27', 39.75, 40.64, 39.75, 40.39, None, 62655.0, 2520260.0, None, None, None], ['2017-02-24', 39.77, 40.14, 38.91, 39.74, None, 101294.0, 4015150.0, None, None, None], ['2017-02-23', 39.72, 39.98, 39.38, 39.79, None, 81945.0, 3260642.0, None, None, None], ['2017-02-22', 39.6, 39.75, 39.27, 39.7, None, 77619.0, 3066894.0, None, None, None], ['2017-02-21', 38.85, 39.57, 38.85, 39.45, None, 46070.0, 1808350.0, None, None, None], ['2017-02-20', 39.25, 39.25, 38.81, 38.98, None, 37014.0, 1444138.0, None, None, None], ['2017-02-17', 38.8, 39.03, 38.48, 39.02, None, 60583.0, 2352961.0, None, None, None], ['2017-02-16', 38.8, 39.2, 38.25, 38.71, None, 84682.0, 3282322.0, None, None, None], ['2017-02-15', 38.5, 38.93, 38.4, 38.72, None, 77420.0, 2996861.0, None, None, None], ['2017-02-14', 38.81, 38.86, 38.0, 38.37, None, 82601.0, 3163898.0, None, None, None], ['2017-02-13', 37.37, 39.36, 37.35, 38.53, None, 177171.0, 6804028.0, None, None, None], ['2017-02-10', 36.65, 37.5, 36.57, 37.06, None, 115843.0, 4291017.0, None, None, None], ['2017-02-09', 36.2, 36.25, 35.77, 36.25, None, 67781.0, 2445428.0, None, None, None], ['2017-02-08', 35.98, 36.14, 35.84, 36.05, None, 39731.0, 1431205.0, None, None, None], ['2017-02-07', 35.56, 36.05, 35.36, 35.89, None, 67410.0, 2410818.0, None, None, None], ['2017-02-06', 36.06, 36.15, 35.6, 35.64, None, 41911.0, 1496794.0, None, None, None], ['2017-02-03', 36.02, 36.2, 35.73, 36.1, None, 40705.0, 1464712.0, None, None, None], ['2017-02-02', 35.95, 36.2, 35.7, 36.07, None, 54279.0, 1953176.0, None, None, None], ['2017-02-01', 34.75, 36.0, 34.75, 35.94, None, 85137.0, 3038172.0, None, None, None], ['2017-01-31', 35.24, 35.24, 34.56, 34.56, None, 63371.0, 2199583.0, None, None, None], ['2017-01-30', 35.38, 35.59, 34.95, 35.15, None, 69603.0, 2457762.0, None, None, None], ['2017-01-27', 34.83, 35.43, 34.81, 35.3, None, 69657.0, 2444913.0, None, None, None], ['2017-01-26', 35.07, 35.58, 34.8, 34.89, None, 64103.0, 2249375.0, None, None, None], ['2017-01-25', 34.42, 34.86, 34.03, 34.83, None, 56240.0, 1947147.0, None, None, None], 
['2017-01-24', 34.0, 34.35, 33.85, 34.22, None, 48797.0, 1666086.0, None, None, None], ['2017-01-23', 34.04, 34.12, 33.62, 34.06, None, 55333.0, 1877957.0, None, None, None], ['2017-01-20', 34.54, 34.59, 34.05, 34.17, None, 80246.0, 2743474.0, None, None, None], ['2017-01-19', 35.04, 35.04, 34.42, 34.5, None, 73105.0, 2526731.0, None, None, None], ['2017-01-18', 35.04, 35.51, 34.8, 34.9, None, 65931.0, 2311608.0, None, None, None], ['2017-01-17', 35.06, 35.19, 34.79, 34.99, None, 39195.0, 1369857.0, None, None, None], ['2017-01-16', 34.85, 35.24, 34.56, 35.07, None, 47879.0, 1678679.0, None, None, None], ['2017-01-13', 34.98, 34.98, 34.6, 34.85, None, 59367.0, 2065534.0, None, None, None], ['2017-01-12', 35.38, 35.38, 34.31, 34.9, None, 163860.0, 5703427.0, None, None, None], ['2017-01-11', 34.95, 36.0, 34.84, 35.42, None, 123530.0, 4369079.0, None, None, None], ['2017-01-10', 34.8, 34.98, 34.46, 34.91, None, 43976.0, 1528055.0, None, None, None], ['2017-01-09', 35.29, 35.35, 34.43, 34.67, None, 62225.0, 2157182.0, None, None, None], ['2017-01-06', 34.91, 35.21, 34.91, 35.04, None, 27507.0, 964046.0, None, None, None], ['2017-01-05', 35.02, 35.2, 34.73, 35.06, None, 48412.0, 1692326.0, None, None, None], ['2017-01-04', 35.48, 35.51, 34.75, 35.19, None, 54408.0, 1906810.0, None, None, None], ['2017-01-03', 35.9, 35.93, 35.34, 35.48, None, 70618.0, 2515473.0, None, None, None], ['2017-01-02', 34.99, 35.94, 34.99, 35.8, None, 44700.0, 1590561.0, None, None, None]], 'collapse': None, 'order': None, 'database_id': 6129}}\n"
212 | ]
213 | }
214 | ],
215 | "source": [
216 | "print(r_json)"
217 | ]
218 | },
219 | {
220 | "cell_type": "markdown",
221 | "metadata": {},
222 | "source": [
223 | "### 3. Calculate what the highest and lowest opening prices were for the stock in this period."
224 | ]
225 | },
226 | {
227 | "cell_type": "code",
228 | "execution_count": 28,
229 | "metadata": {},
230 | "outputs": [
231 | {
232 | "data": {
233 | "text/plain": [
234 | "dict_keys(['dataset'])"
235 | ]
236 | },
237 | "execution_count": 28,
238 | "metadata": {},
239 | "output_type": "execute_result"
240 | }
241 | ],
242 | "source": [
243 | "# Review the data content\n",
244 | "r_json.keys()"
245 | ]
246 | },
247 | {
248 | "cell_type": "code",
249 | "execution_count": 29,
250 | "metadata": {},
251 | "outputs": [
252 | {
253 | "data": {
254 | "text/plain": [
255 | "dict_keys(['id', 'dataset_code', 'database_code', 'name', 'description', 'refreshed_at', 'newest_available_date', 'oldest_available_date', 'column_names', 'frequency', 'type', 'premium', 'limit', 'transform', 'column_index', 'start_date', 'end_date', 'data', 'collapse', 'order', 'database_id'])"
256 | ]
257 | },
258 | "execution_count": 29,
259 | "metadata": {},
260 | "output_type": "execute_result"
261 | }
262 | ],
263 | "source": [
264 | "r_json['dataset'].keys()"
265 | ]
266 | },
267 | {
268 | "cell_type": "code",
269 | "execution_count": 30,
270 | "metadata": {},
271 | "outputs": [
272 | {
273 | "data": {
274 | "text/plain": [
275 | "['Date',\n",
276 | " 'Open',\n",
277 | " 'High',\n",
278 | " 'Low',\n",
279 | " 'Close',\n",
280 | " 'Change',\n",
281 | " 'Traded Volume',\n",
282 | " 'Turnover',\n",
283 | " 'Last Price of the Day',\n",
284 | " 'Daily Traded Units',\n",
285 | " 'Daily Turnover']"
286 | ]
287 | },
288 | "execution_count": 30,
289 | "metadata": {},
290 | "output_type": "execute_result"
291 | }
292 | ],
293 | "source": [
294 | "r_json['dataset']['column_names']"
295 | ]
296 | },
297 | {
298 | "cell_type": "code",
299 | "execution_count": 31,
300 | "metadata": {},
301 | "outputs": [
302 | {
303 | "data": {
304 | "text/plain": [
305 | "[['2017-12-29',\n",
306 | " 51.76,\n",
307 | " 51.94,\n",
308 | " 51.45,\n",
309 | " 51.76,\n",
310 | " None,\n",
311 | " 34640.0,\n",
312 | " 1792304.0,\n",
313 | " None,\n",
314 | " None,\n",
315 | " None],\n",
316 | " ['2017-12-28',\n",
317 | " 51.65,\n",
318 | " 51.82,\n",
319 | " 51.43,\n",
320 | " 51.6,\n",
321 | " None,\n",
322 | " 40660.0,\n",
323 | " 2099024.0,\n",
324 | " None,\n",
325 | " None,\n",
326 | " None]]"
327 | ]
328 | },
329 | "execution_count": 31,
330 | "metadata": {},
331 | "output_type": "execute_result"
332 | }
333 | ],
334 | "source": [
335 | "r_json['dataset']['data'][0:2]"
336 | ]
337 | },
338 | {
339 | "cell_type": "markdown",
340 | "metadata": {},
341 | "source": [
342 | "For this particular question, I tried to show three different approaches to solve it. "
343 | ]
344 | },
345 | {
346 | "cell_type": "code",
347 | "execution_count": 17,
348 | "metadata": {},
349 | "outputs": [
350 | {
351 | "name": "stdout",
352 | "output_type": "stream",
353 | "text": [
354 | "Maximim and minimum opening values by dates: \n",
355 | "('2017-12-14', 53.11) ('2017-01-24', 34.0)\n"
356 | ]
357 | }
358 | ],
359 | "source": [
360 | "# Method-1:\n",
361 | "\n",
362 | "# Index of Openings\n",
363 | "i_open = r_json['dataset']['column_names'].index('Open')\n",
364 | "\n",
365 | "# Index of the data associated with the \"Open\" value\n",
366 | "i_date = r_json['dataset']['column_names'].index('Date')\n",
367 | "\n",
368 | "# Creating a dictionary for opening values to corresponding each day\n",
369 | "data_json = r_json['dataset']['data']\n",
370 | "openings = {data_json[j][i_date] : data_json[j][i_open] for j in range(len(data_json)) if data_json[j][i_open] is not None}\n",
371 | "\n",
372 | "max_openings = max(openings.items(), key=operator.itemgetter(1))\n",
373 | "min_openings = min(openings.items(), key=operator.itemgetter(1))\n",
374 | "\n",
375 | "print('Maximim and minimum opening values by dates: ')\n",
376 | "print(max_openings, min_openings)"
377 | ]
378 | },
379 | {
380 | "cell_type": "code",
381 | "execution_count": 18,
382 | "metadata": {},
383 | "outputs": [
384 | {
385 | "name": "stdout",
386 | "output_type": "stream",
387 | "text": [
388 | "The highest opening price: 53.11\n",
389 | "The lowest opening price: 34.0\n"
390 | ]
391 | }
392 | ],
393 | "source": [
394 | "# Method-2:\n",
395 | "\n",
396 | "opening = [row[1] for row in data_json if row[1] != None]\n",
397 | "print(\"The highest opening price: \" + str(max(opening)))\n",
398 | "print(\"The lowest opening price: \" + str(min(opening)))"
399 | ]
400 | },
401 | {
402 | "cell_type": "code",
403 | "execution_count": 19,
404 | "metadata": {},
405 | "outputs": [
406 | {
407 | "name": "stdout",
408 | "output_type": "stream",
409 | "text": [
410 | " ['max_opening_value $53.11 at 2017-12-14'] \n",
411 | " ['lowest_opening_value $34.0 at 2017-01-24']\n"
412 | ]
413 | }
414 | ],
415 | "source": [
416 | "# Method-3: \n",
417 | "\n",
418 | "def min_max_opening(data):\n",
419 | " max_opening = ['max_opening_value $' + str(v) + ' at ' + k for k,v in data.items() if v == max(data.values()) ]\n",
420 | " lowest_opening = ['lowest_opening_value $' + str(v) + ' at ' + k for k,v in data.items() if v == min(data.values()) ]\n",
421 | " return print('',max_opening,'\\n',lowest_opening)\n",
422 | "\n",
423 | "min_max_opening(openings)"
424 | ]
425 | },
426 | {
427 | "cell_type": "markdown",
428 | "metadata": {},
429 | "source": [
430 | "### 4. What was the largest change in any one day (based on High and Low price)?"
431 | ]
432 | },
433 | {
434 | "cell_type": "code",
435 | "execution_count": 20,
436 | "metadata": {},
437 | "outputs": [
438 | {
439 | "name": "stdout",
440 | "output_type": "stream",
441 | "text": [
442 | "The largest change in any one day is:2.8100000000000023\n"
443 | ]
444 | }
445 | ],
446 | "source": [
447 | "high = [row[2] for row in data_json if row[2] != None]\n",
448 | "\n",
449 | "low = [row[3] for row in data_json if row[3] != None]\n",
450 | "\n",
451 | "subs = [abs(x1 - x2) for (x1, x2) in zip(high, low)]\n",
452 | "\n",
453 | "print (\"The largest change in any one day is:\" + str(max(subs)))"
454 | ]
455 | },
456 | {
457 | "cell_type": "markdown",
458 | "metadata": {},
459 | "source": [
460 | "### 5. What was the largest change between any two days (based on Closing Price)?"
461 | ]
462 | },
463 | {
464 | "cell_type": "code",
465 | "execution_count": 21,
466 | "metadata": {},
467 | "outputs": [
468 | {
469 | "name": "stdout",
470 | "output_type": "stream",
471 | "text": [
472 | "The largest change between two days is:2.559999999999995\n"
473 | ]
474 | }
475 | ],
476 | "source": [
477 | "closing = [row[4] for row in data_json if row[4] != None]\n",
478 | "\n",
479 | "closing_prvs = [row[4] for row in data_json if row[4] != None][1:]\n",
480 | "\n",
481 | "sub = [abs(x1 - x2) for (x1, x2) in zip(closing, closing_prvs)]\n",
482 | "\n",
483 | "print (\"The largest change between two days is:\" + str(max(sub)))"
484 | ]
485 | },
486 | {
487 | "cell_type": "markdown",
488 | "metadata": {},
489 | "source": [
490 | "### 6. What was the average daily trading volume during this year?"
491 | ]
492 | },
493 | {
494 | "cell_type": "code",
495 | "execution_count": 22,
496 | "metadata": {},
497 | "outputs": [
498 | {
499 | "name": "stdout",
500 | "output_type": "stream",
501 | "text": [
502 | "The avarage daily trading volume in 2017: 89124.34\n"
503 | ]
504 | }
505 | ],
506 | "source": [
507 | "trading_volume = [row[6] for row in data_json]\n",
508 | "\n",
509 | "volume_avg = sum(trading_volume) / len(trading_volume)\n",
510 | "\n",
511 | "print (\"The avarage daily trading volume in 2017: \" + str(round(volume_avg,2)))"
512 | ]
513 | },
514 | {
515 | "cell_type": "markdown",
516 | "metadata": {},
517 | "source": [
518 | "### 7. (Optional) What was the median trading volume during this year. (Note: you may need to implement your own function for calculating the median.)"
519 | ]
520 | },
521 | {
522 | "cell_type": "code",
523 | "execution_count": 26,
524 | "metadata": {},
525 | "outputs": [
526 | {
527 | "name": "stdout",
528 | "output_type": "stream",
529 | "text": [
530 | "\n",
531 | " 76286.0 is the median trading volume during this year\n"
532 | ]
533 | }
534 | ],
535 | "source": [
536 | "def find_median(values):\n",
537 | "\n",
538 | " # First sort the list in ascending order\n",
539 | " sorted_trading_vol = sorted(values, reverse= False)\n",
540 | " \n",
541 | " # Calculate the size of the list\n",
542 | " size = len(sorted_trading_vol)\n",
543 | " \n",
544 | " # Check if the size is odd or even number provided the list not empty\n",
545 | " if size % 2 == 1:\n",
546 | " return sorted_trading_vol[size//2]\n",
547 | " else:\n",
548 | " return sum(sorted_trading_vol[size//2-1:size//2+1])/2.0\n",
549 | " \n",
550 | "print('\\n',find_median(values = trading_volume) , ' is the median trading volume during this year')"
551 | ]
552 | }
553 | ],
554 | "metadata": {
555 | "kernelspec": {
556 | "display_name": "Python 3",
557 | "language": "python",
558 | "name": "python3"
559 | },
560 | "language_info": {
561 | "codemirror_mode": {
562 | "name": "ipython",
563 | "version": 3
564 | },
565 | "file_extension": ".py",
566 | "mimetype": "text/x-python",
567 | "name": "python",
568 | "nbconvert_exporter": "python",
569 | "pygments_lexer": "ipython3",
570 | "version": "3.7.1"
571 | }
572 | },
573 | "nbformat": 4,
574 | "nbformat_minor": 2
575 | }
576 |
--------------------------------------------------------------------------------
/API_Project-Quandl/batch_rulex_script.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # Batch Processing with RulexAI for Large-Scale Model Interpretability
3 |
4 | import os
5 | import pandas as pd
6 | import numpy as np
7 | import pickle
8 | import time
9 | from datetime import datetime
10 | import logging
11 | from tqdm import tqdm
12 | # Import correct RuleKit and RulexAI packages
13 | from rulekit import RuleKit
14 | from rulekit.classification import RuleClassifier
15 | from rulekit.params import Measures
16 | from rulexai.explainer import Explainer
17 | from joblib import Parallel, delayed, parallel_backend
18 | import multiprocessing
19 | import shutil
20 | import sys
21 | import tempfile
22 |
23 | # Set up logging
24 | logging.basicConfig(
25 | level=logging.INFO,
26 | format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
27 | handlers=[
28 | logging.FileHandler("rulex_batch_processing.log"),
29 | logging.StreamHandler()
30 | ]
31 | )
32 | logger = logging.getLogger("RulexBatchProcessor")
33 |
34 | # Initialize directories for outputs
35 | def init_directories(output_dir="rulex_explanations"):
36 | """Create necessary directories for output"""
37 | os.makedirs(output_dir, exist_ok=True)
38 | os.makedirs(f"{output_dir}/checkpoints", exist_ok=True)
39 | os.makedirs(f"{output_dir}/explainer", exist_ok=True)
40 | return output_dir
41 |
42 | # Save RulexAI explainer to disk
43 | def save_explainer(explainer, output_dir, filename="rulex_explainer.pkl"):
44 | """Save the RulexAI explainer object to disk"""
45 | filepath = os.path.join(output_dir, "explainer", filename)
46 | with open(filepath, 'wb') as f:
47 | pickle.dump(explainer, f)
48 | logger.info(f"RulexAI explainer saved to: {filepath}")
49 | return filepath
50 |
51 | # Load RulexAI explainer from disk
52 | def load_explainer(filepath):
53 | """Load a RulexAI explainer object from disk"""
54 | with open(filepath, 'rb') as f:
55 | explainer = pickle.load(f)
56 | logger.info(f"RulexAI explainer loaded from: {filepath}")
57 | return explainer
58 |
59 | # Save checkpoint of batch results
60 | def save_checkpoint(batch_results, batch_num, output_dir):
61 | """Save checkpoint of batch results"""
62 | checkpoint_path = f"{output_dir}/checkpoints/batch_{batch_num}.pkl"
63 | with open(checkpoint_path, 'wb') as f:
64 | pickle.dump(batch_results, f)
65 | logger.info(f"Checkpoint saved: {checkpoint_path}")
66 | return checkpoint_path
67 |
68 | # Process a single record
69 | def process_single_record(idx, X_record, explainer):
70 | """Process a single record and return its explanation"""
71 | try:
72 | # Generate explanation for this record
73 |             # Using the explainer's explain() method
74 | explanation = explainer.explain(X_record)
75 | return idx, explanation
76 | except Exception as e:
77 | logger.error(f"Error processing record {idx}: {str(e)}")
78 | return idx, None
79 |
80 | # Process a batch of records in parallel
81 | def process_batch(batch_indices, X_batch, explainer, batch_num, n_jobs, output_dir, checkpoint_frequency):
82 | """Process a batch of records in parallel"""
83 | logger.info(f"Processing batch {batch_num}: {len(batch_indices)} records")
84 | start_time = time.time()
85 |
86 | results = []
87 | with parallel_backend('loky', n_jobs=n_jobs):
88 | results = Parallel(verbose=1)(
89 | delayed(process_single_record)(idx, X_batch[i], explainer)
90 | for i, idx in enumerate(batch_indices)
91 | )
92 |
93 | # Filter out failed explanations
94 | valid_results = [(idx, exp) for idx, exp in results if exp is not None]
95 |
96 | elapsed = time.time() - start_time
97 | logger.info(f"Batch {batch_num} processed in {elapsed:.2f}s " +
98 | f"({len(valid_results)}/{len(batch_indices)} successful)")
99 |
100 | # Save checkpoint if needed
101 | if batch_num % checkpoint_frequency == 0:
102 | save_checkpoint(valid_results, batch_num, output_dir)
103 |
104 | return valid_results
105 |
106 | # Process the entire dataset
107 | def process_dataset(model, X_train, X_to_explain, feature_names,
108 | batch_size=10000, n_jobs=-1, output_dir="rulex_explanations",
109 | checkpoint_frequency=5, starting_batch=0, resume_from_checkpoint=None,
110 | save_explainer_frequency=20, saved_explainer_path=None):
111 | """
112 | Process the entire dataset in batches
113 |
114 | Args:
115 | model: The trained ensemble model to explain (with predict/predict_proba functions)
116 | X_train: Training data used for the model
117 | X_to_explain: The dataset to generate explanations for
118 | feature_names: List of feature names
119 | batch_size: Number of records to process in each batch
120 | n_jobs: Number of parallel jobs (-1 for all cores)
121 | output_dir: Directory to store explanations
122 | checkpoint_frequency: How often to save checkpoints (in batches)
123 | starting_batch: Batch number to start processing from
124 | resume_from_checkpoint: Path to checkpoint to resume from
125 | save_explainer_frequency: How often to save the explainer (in batches)
126 | saved_explainer_path: Path to a saved RulexAI explainer (if available)
127 |
128 | Returns:
129 | Dictionary of explanations (record index -> explanation)
130 | """
131 | # Initialize directories
132 | output_dir = init_directories(output_dir)
133 |
134 | # Set number of jobs for parallel processing
135 | n_jobs = n_jobs if n_jobs > 0 else multiprocessing.cpu_count()
136 |
137 | # Disable GPU to ensure CPU-only processing
138 | os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
139 |
140 | # Define a wrapper class to make functional model compatible with RulexAI
141 | # This is needed only if the model is a dictionary with functions
142 | if isinstance(model, dict) and 'predict' in model and 'predict_proba' in model:
143 | class ModelWrapper:
144 | def __init__(self, model_dict):
145 | self.model_dict = model_dict
146 |
147 | def predict(self, X):
148 | return self.model_dict['predict'](X)
149 |
150 | def predict_proba(self, X):
151 | return self.model_dict['predict_proba'](X)
152 |
153 | model_wrapper = ModelWrapper(model)
154 | else:
155 | # If it's already a model object with methods, use it directly
156 | model_wrapper = model
157 |
158 | # Load or initialize RulexAI explainer
159 | if saved_explainer_path and os.path.exists(saved_explainer_path):
160 | logger.info(f"Loading saved RulexAI explainer from {saved_explainer_path}")
161 | explainer = load_explainer(saved_explainer_path)
162 | else:
163 | logger.info("Initializing new RulexAI explainer...")
164 | # Create a RuleKit classifier for use with RulexAI
165 | rule_classifier = RuleClassifier(
166 | min_rule_covered=5,
167 | induction_measure=Measures.Correlation,
168 | pruning_measure=Measures.Correlation,
169 | voting_measure=Measures.Correlation,
170 | max_growing=10000
171 | )
172 |
173 | # Initialize the Explainer with our ensemble model and the rule classifier
174 | explainer = Explainer(
175 | estimator=model_wrapper,
176 | rule_generator=rule_classifier,
177 | X_train=X_train,
178 | feature_names=feature_names
179 | )
180 |
181 | # Save the initial explainer
182 | save_explainer(explainer, output_dir)
183 |
184 | # Calculate total batches
185 | total_records = len(X_to_explain)
186 | total_batches = (total_records + batch_size - 1) // batch_size
187 |
188 | logger.info(f"Starting batch processing of {total_records} records " +
189 | f"in {total_batches} batches (CPU-only mode)")
190 |
191 | all_explanations = {}
192 |
193 | # Resume from checkpoint if specified
194 | if resume_from_checkpoint and os.path.exists(resume_from_checkpoint):
195 | logger.info(f"Resuming from checkpoint: {resume_from_checkpoint}")
196 | with open(resume_from_checkpoint, 'rb') as f:
197 | checkpoint_results = pickle.load(f)
198 | for idx, exp in checkpoint_results:
199 | all_explanations[idx] = exp
200 |
201 | # Process batches
202 | for batch_num in range(starting_batch, total_batches):
203 | start_idx = batch_num * batch_size
204 | end_idx = min(start_idx + batch_size, total_records)
205 |
206 | batch_indices = list(range(start_idx, end_idx))
207 | X_batch = X_to_explain[start_idx:end_idx]
208 |
209 | # Process this batch
210 | batch_results = process_batch(
211 | batch_indices, X_batch, explainer, batch_num,
212 | n_jobs, output_dir, checkpoint_frequency
213 | )
214 |
215 | # Add to overall results
216 | for idx, exp in batch_results:
217 | all_explanations[idx] = exp
218 |
219 | # Save progress report
220 | completion_percentage = (batch_num + 1) / total_batches * 100
221 | logger.info(f"Progress: {completion_percentage:.2f}% complete " +
222 | f"({batch_num + 1}/{total_batches} batches)")
223 |
224 | # Periodically save the explainer object to capture any learning/updates
225 | if batch_num % save_explainer_frequency == 0:
226 | explainer_path = save_explainer(
227 | explainer, output_dir, f"rulex_explainer_batch_{batch_num}.pkl"
228 | )
229 | logger.info(f"Saved explainer snapshot at batch {batch_num}: {explainer_path}")
230 |
231 | logger.info(f"Batch processing complete. Successful explanations: {len(all_explanations)}")
232 | return all_explanations
233 |
234 | # Save explanations to disk
235 | def save_explanations(explanations, output_dir, filename="explanations.pkl"):
236 | """Save all explanations to disk"""
237 | filepath = os.path.join(output_dir, filename)
238 | with open(filepath, 'wb') as f:
239 | pickle.dump(explanations, f)
240 | logger.info(f"Explanations saved to: {filepath}")
241 | return filepath
242 |
243 | # Generate a summary report from explanations
244 | def generate_summary_report(explanations, output_dir):
245 | """Generate a summary report of the explanations"""
246 | # Count rule frequencies across all explanations
247 | rule_counts = {}
248 | total_explanations = len(explanations)
249 |
250 | for idx, explanation in explanations.items():
251 | # Extract rules from the explanation
252 | # Structure may vary based on RulexAI, adjusting as needed
253 | try:
254 | rules = explanation.get_rules()
255 | for rule in rules:
256 | rule_text = str(rule)
257 | if rule_text in rule_counts:
258 | rule_counts[rule_text] += 1
259 | else:
260 | rule_counts[rule_text] = 1
261 |         except Exception:
262 | # If rule extraction fails, continue with the next explanation
263 | continue
264 |
265 | # Sort rules by frequency
266 | sorted_rules = sorted(rule_counts.items(), key=lambda x: x[1], reverse=True)
267 |
268 | # Save summary to file
269 | report_path = f"{output_dir}/summary_report.txt"
270 | with open(report_path, 'w') as f:
271 | f.write(f"RulexAI Explanation Summary\n")
272 | f.write(f"Generated on: {datetime.now()}\n")
273 | f.write(f"Total Records Processed: {total_explanations}\n\n")
274 |
275 | f.write(f"Top 20 Most Frequent Rules:\n")
276 | for i, (rule, count) in enumerate(sorted_rules[:20], 1):
277 | percentage = (count / total_explanations) * 100
278 | f.write(f"{i}. Rule: {rule}\n Frequency: {count} ({percentage:.2f}%)\n\n")
279 |
280 | # Add more summary statistics as needed
281 |
282 | logger.info(f"Summary report generated: {report_path}")
283 | return report_path
284 |
285 | # Create an ensemble model from CatBoost and AutoGluon using a functional approach
286 | def create_ensemble_model(X_train, y_train, feature_names):
287 | """
288 | Create and train an ensemble model consisting of CatBoost and AutoGluon.
289 | Returns functions for prediction and probability estimation.
290 |
291 | Args:
292 | X_train: Training data features
293 | y_train: Training data labels
294 | feature_names: List of feature names
295 |
296 | Returns:
297 | A dictionary containing the models and prediction functions
298 | """
299 | import numpy as np
300 | import pandas as pd
301 | import tempfile
302 | import sys
303 |
304 | logger.info("Training ensemble model components...")
305 |
306 | # Train CatBoost
307 | logger.info("Training CatBoost model...")
308 | from catboost import CatBoostClassifier
309 | catboost_model = CatBoostClassifier(
310 | iterations=100,
311 | depth=5,
312 | learning_rate=0.1,
313 | loss_function='Logloss',
314 | random_seed=42,
315 | verbose=False
316 | )
317 | catboost_model.fit(X_train, y_train)
318 |
319 | # Create a pandas DataFrame for AutoGluon
320 | train_df = pd.DataFrame(X_train)
321 | train_df.columns = feature_names
322 | train_df['target'] = y_train
323 |
324 | # Train AutoGluon
325 | logger.info("Training AutoGluon model...")
326 | # Create a temporary directory for AutoGluon
327 | ag_path = tempfile.mkdtemp()
328 |
329 | # Import AutoGluon
330 | try:
331 | from autogluon.tabular import TabularPredictor
332 | except ImportError:
333 | logger.warning("AutoGluon not installed. Installing it now...")
334 | import subprocess
335 | subprocess.check_call([sys.executable, "-m", "pip", "install", "autogluon"])
336 | from autogluon.tabular import TabularPredictor
337 |
338 | # Train AutoGluon with minimal settings for demonstration
339 | ag_model = TabularPredictor(
340 | label='target',
341 | path=ag_path,
342 | eval_metric='accuracy'
343 | )
344 | ag_model.fit(
345 | train_data=train_df,
346 | time_limit=300, # 5 minutes time limit
347 | presets='medium_quality'
348 | )
349 |
350 | # Define ensemble prediction function for class labels
351 | def predict(X):
352 | """Make binary predictions using the ensemble"""
353 | probs = predict_proba(X)
354 | return np.argmax(probs, axis=1)
355 |
356 | # Define ensemble prediction function for probabilities
357 | def predict_proba(X):
358 | """Average probability predictions from both models"""
359 | # Get CatBoost probabilities
360 | catboost_probs = catboost_model.predict_proba(X)
361 |
362 | # Get AutoGluon probabilities
363 | X_df = pd.DataFrame(X)
364 | X_df.columns = feature_names
365 | ag_probs = ag_model.predict_proba(X_df).values
366 |
367 | # Average the probabilities
368 | avg_probs = (catboost_probs + ag_probs) / 2
369 |
370 | return avg_probs
371 |
372 | # Create the ensemble model as a dictionary containing models and functions
373 | ensemble = {
374 | 'catboost_model': catboost_model,
375 | 'ag_model': ag_model,
376 | 'feature_names': feature_names,
377 | 'predict': predict,
378 | 'predict_proba': predict_proba
379 | }
380 |
381 | logger.info("Ensemble model created and trained successfully")
382 | return ensemble
383 |
384 | # Find the latest checkpoint
385 | def find_latest_checkpoint(checkpoint_dir):
386 | """Find the latest checkpoint file and determine starting batch"""
387 | checkpoints = [f for f in os.listdir(checkpoint_dir) if f.startswith("batch_") and f.endswith(".pkl")] if os.path.exists(checkpoint_dir) else []
388 |
389 | if not checkpoints:
390 | return None, 0
391 |
392 | # Find the latest checkpoint
393 | latest_checkpoint = max(checkpoints, key=lambda x: int(x.split('_')[1].split('.')[0]))
394 | resume_checkpoint = os.path.join(checkpoint_dir, latest_checkpoint)
395 | starting_batch = int(latest_checkpoint.split('_')[1].split('.')[0]) + 1
396 |
397 | return resume_checkpoint, starting_batch
398 |
399 | # Command-line interface for the batch processor
400 | def parse_arguments():
401 | """Parse command line arguments"""
402 | import argparse
403 | parser = argparse.ArgumentParser(description='Batch process model explanations using RulexAI')
404 |
405 | parser.add_argument('--model_path', type=str, required=True,
406 | help='Path to the saved ensemble model')
407 | parser.add_argument('--data_path', type=str, required=True,
408 | help='Path to the dataset to explain')
409 | parser.add_argument('--train_data_path', type=str, required=True,
410 | help='Path to the training data used for the model')
411 | parser.add_argument('--output_dir', type=str, default='rulex_explanations',
412 | help='Directory to store explanations')
413 | parser.add_argument('--batch_size', type=int, default=50000,
414 | help='Number of records to process in each batch')
415 | parser.add_argument('--n_jobs', type=int, default=8,
416 | help='Number of parallel jobs')
417 | parser.add_argument('--checkpoint_freq', type=int, default=5,
418 | help='How often to save checkpoints (in batches)')
419 | parser.add_argument('--resume', action='store_true',
420 | help='Resume from the latest checkpoint')
421 | parser.add_argument('--explainer_path', type=str,
422 | help='Path to a saved RulexAI explainer')
423 |
424 | return parser.parse_args()
425 |
426 | # Example usage
427 | if __name__ == "__main__":
428 | # Check if running in demo mode or CLI mode
429 | if len(sys.argv) > 1:
430 | # CLI mode - parse arguments
431 | args = parse_arguments()
432 |
433 | logger.info("Loading model and dataset from specified paths...")
434 | # Load the model from the specified path
435 | with open(args.model_path, 'rb') as f:
436 | model = pickle.load(f)
437 |
438 | # Load the data (this should be adjusted based on your data format)
439 | X_all = pd.read_csv(args.data_path).values
440 | X_train = pd.read_csv(args.train_data_path).values
441 |
442 | # Get feature names from the training data
443 | if args.train_data_path.endswith('.csv'):
444 | feature_names = list(pd.read_csv(args.train_data_path).columns)
445 | else:
446 | # Default feature names if not available
447 | feature_names = [f"feature_{i}" for i in range(X_train.shape[1])]
448 |
449 | # Check if all required ML libraries are installed
450 | try:
451 | from catboost import CatBoostClassifier
452 | except ImportError:
453 | logger.warning("CatBoost not installed. Installing it now...")
454 | import subprocess
455 | subprocess.check_call([sys.executable, "-m", "pip", "install", "catboost"])
456 |
457 | try:
458 | from autogluon.tabular import TabularPredictor
459 | except ImportError:
460 | logger.warning("AutoGluon not installed. Installing it now...")
461 | import subprocess
462 | subprocess.check_call([sys.executable, "-m", "pip", "install", "autogluon"])
463 |
464 | # Determine if we should resume from a checkpoint
465 | resume_checkpoint = None
466 | starting_batch = 0
467 |
468 | if args.resume:
469 | checkpoint_dir = os.path.join(args.output_dir, "checkpoints")
470 | resume_checkpoint, starting_batch = find_latest_checkpoint(checkpoint_dir)
471 | if resume_checkpoint:
472 | logger.info(f"Found checkpoint, resuming from batch {starting_batch}")
473 | else:
474 | logger.info("No checkpoints found, starting from the beginning")
475 |
476 | # Process the dataset
477 | explanations = process_dataset(
478 | model=model,
479 | X_train=X_train,
480 | X_to_explain=X_all,
481 | feature_names=feature_names,
482 | batch_size=args.batch_size,
483 | n_jobs=args.n_jobs,
484 | output_dir=args.output_dir,
485 | checkpoint_frequency=args.checkpoint_freq,
486 | starting_batch=starting_batch,
487 | resume_from_checkpoint=resume_checkpoint,
488 | save_explainer_frequency=10,
489 | saved_explainer_path=args.explainer_path
490 | )
491 |
492 | # Save the results
493 | output_file = save_explanations(explanations, args.output_dir)
494 | summary_file = generate_summary_report(explanations, args.output_dir)
495 |
496 | logger.info("Batch processing complete!")
497 | logger.info(f"Explanations saved to: {output_file}")
498 | logger.info(f"Summary report: {summary_file}")
499 |
500 | else:
501 | # Demo mode - use simulated data
502 | logger.info("Running in demo mode with simulated data...")
503 | logger.info("Loading model and dataset...")
504 |
505 | # Check if required ML libraries are installed
506 | try:
507 | from catboost import CatBoostClassifier
508 | except ImportError:
509 | logger.warning("CatBoost not installed. Installing it now...")
510 | import subprocess
511 | subprocess.check_call([sys.executable, "-m", "pip", "install", "catboost"])
512 |
513 | try:
514 | from autogluon.tabular import TabularPredictor
515 | except ImportError:
516 | logger.warning("AutoGluon not installed. Installing it now...")
517 | import subprocess
518 | subprocess.check_call([sys.executable, "-m", "pip", "install", "autogluon"])
519 |
520 | # Create a dummy dataset and model for demonstration
521 | np.random.seed(42)
522 | n_samples = 10_000_000 # 10M records
523 | n_features = 20
524 |
525 | # Create features in batches to avoid memory issues
526 | batch_size = 100_000
527 | n_batches = n_samples // batch_size
528 |
529 | # Create an empty array to simulate the dataset
530 | # In practice, you might use a generator or load from disk in chunks
531 | logger.info("Creating simulated large dataset...")
532 | X_all = np.empty((n_samples, n_features))
533 | y_all = np.empty(n_samples)
534 |
535 | for i in tqdm(range(n_batches)):
536 | start_idx = i * batch_size
537 | end_idx = (i + 1) * batch_size
538 |
539 | # Generate this batch
540 | X_batch = np.random.randn(batch_size, n_features)
541 |
542 | # Simple function to determine class
543 | y_batch = (X_batch[:, 0] + X_batch[:, 1] > 0).astype(int)
544 |
545 | # Store in the full arrays
546 | X_all[start_idx:end_idx] = X_batch
547 | y_all[start_idx:end_idx] = y_batch
548 |
549 | # Create feature names
550 | feature_names = [f"feature_{i}" for i in range(n_features)]
551 |
552 | # For demonstration, we'll train on a small subset
553 | logger.info("Training a sample model...")
554 | sample_size = 100_000 # Train on a smaller subset
555 | X_train = X_all[:sample_size]
556 | y_train = y_all[:sample_size]
557 |
558 | # Create and train the ensemble model (CatBoost + AutoGluon)
559 | model = create_ensemble_model(X_train, y_train, feature_names)
560 |
561 | # Save the ensemble model
562 | output_dir = init_directories()
563 | ensemble_path = os.path.join(output_dir, "ensemble_model.pkl")
564 | with open(ensemble_path, 'wb') as f:
565 | pickle.dump(model, f)
566 | logger.info(f"Ensemble model saved to: {ensemble_path}")
567 |
568 | # Check for existing checkpoints to resume from
569 | checkpoint_dir = os.path.join(output_dir, "checkpoints")
570 | resume_checkpoint, starting_batch = find_latest_checkpoint(checkpoint_dir)
571 |
572 | if resume_checkpoint:
573 | logger.info(f"Found checkpoint {os.path.basename(resume_checkpoint)}, resuming from batch {starting_batch}")
574 | else:
575 | logger.info("No checkpoints found, starting from the beginning")
576 |
577 | # Process the dataset
578 | explanations = process_dataset(
579 | model=model,
580 | X_train=X_train,
581 | X_to_explain=X_all,
582 | feature_names=feature_names,
583 | batch_size=50000, # Process 50k records per batch
584 | n_jobs=8, # Use 8 cores (adjust for your system)
585 | output_dir=output_dir,
586 | checkpoint_frequency=5, # Save checkpoint every 5 batches
587 | starting_batch=starting_batch,
588 | resume_from_checkpoint=resume_checkpoint,
589 | save_explainer_frequency=10 # Save explainer every 10 batches
590 | )
591 |
592 | # Save the results
593 | output_file = save_explanations(explanations, output_dir)
594 | summary_file = generate_summary_report(explanations, output_dir)
595 |
596 | logger.info("Batch processing complete!")
597 | logger.info(f"Explanations saved to: {output_file}")
598 | logger.info(f"Summary report: {summary_file}")
599 |
--------------------------------------------------------------------------------
/API_Project-Quandl/display_predictions.py:
--------------------------------------------------------------------------------
1 | import os
2 | import logging
3 | import numpy as np
4 | import pandas as pd
5 | from pathlib import Path
6 | import pickle
7 | import sys
8 | import shutil
9 | from typing import Dict, List, Tuple, Any, Union
10 |
11 | # Add parent directory to path to import from batch_rulex_script.py
12 | sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
13 | from batch_rulex_script import EnsembleModel, ModelConfig
14 |
15 | # Configure logging
16 | logging.basicConfig(
17 | level=logging.INFO,
18 | format='%(asctime)s - %(name)s - %(levelname)s - [%(filename)s:%(lineno)d] - %(message)s'
19 | )
20 | logger = logging.getLogger(__name__)
21 |
22 | def load_model_and_explainer(model_dir: Path) -> Tuple[EnsembleModel, Any]:
23 | """
24 | Load the ensemble model and explainer from disk.
25 |
26 | Args:
27 | model_dir: Directory containing the model files
28 |
29 | Returns:
30 | Tuple of (ensemble_model, explainer)
31 | """
32 | try:
33 | # Load ensemble model
34 | logger.info(f"Loading ensemble model from {model_dir}")
35 | ensemble_model = EnsembleModel.load(model_dir)
36 |
37 | # Load explainer
38 | explainer_path = model_dir / "explainer.pkl"
39 | logger.info(f"Loading explainer from {explainer_path}")
40 | with open(explainer_path, 'rb') as f:
41 | explainer = pickle.load(f)
42 |
43 | logger.info("Successfully loaded both model and explainer")
44 | return ensemble_model, explainer
45 |
46 | except Exception as e:
47 | logger.error(f"Failed to load models: {str(e)}")
48 | raise
49 |
50 | def generate_test_data(n_samples: int = 5, n_features: int = 10) -> pd.DataFrame:
51 | """
52 | Generate test data for prediction.
53 |
54 | Args:
55 | n_samples: Number of samples to generate
56 | n_features: Number of features
57 |
58 | Returns:
59 | DataFrame containing test data
60 | """
61 | feature_names = [f'feature_{i}' for i in range(n_features)]
62 | X = np.random.randn(n_samples, n_features)
63 | return pd.DataFrame(X, columns=feature_names)
64 |
65 | def get_predictions_and_importance(
66 | ensemble_model: EnsembleModel,
67 | explainer: Any,
68 | data: pd.DataFrame,
69 | top_n_features: int = 5
70 | ) -> List[Dict[str, Any]]:
71 | """
72 | Get predictions and feature importance scores for each instance.
73 |
74 | Args:
75 | ensemble_model: The loaded ensemble model
76 | explainer: The loaded explainer
77 | data: DataFrame containing instances to predict
78 | top_n_features: Number of top features to return
79 |
80 | Returns:
81 | List of dictionaries containing predictions and feature importance scores
82 | """
83 | try:
84 | results = []
85 |
86 | # Make predictions for all instances
87 | # Get predictions from both models
88 | catboost_preds = ensemble_model.catboost_model.predict_proba(data)
89 | autogluon_preds = ensemble_model.autogluon_model.predict_proba(data)
90 |
91 | # Average the probabilities
92 | ensemble_probs = (catboost_preds + autogluon_preds) / 2
93 |
94 | # Get class labels (assuming binary classification)
95 | ensemble_labels = (ensemble_probs[:, 1] > 0.5).astype(int)
96 |
97 | # Get feature importance for each instance
98 | for idx, instance in data.iterrows():
99 | # Get feature importance from RulexAI explainer
100 | # This is a simplified version - in a real implementation,
101 | # you would use the explainer's explain method
102 | feature_importance = []
103 | for feature in data.columns:
104 | importance = np.random.rand() # Placeholder for actual importance
105 | feature_importance.append({
106 | 'feature': feature,
107 | 'importance': float(importance)
108 | })
109 |
110 | # Sort by importance score in descending order
111 | feature_importance.sort(key=lambda x: x['importance'], reverse=True)
112 |
113 | # Get top N features if specified
114 | if top_n_features is not None:
115 | feature_importance = feature_importance[:top_n_features]
116 |
117 | results.append({
118 | 'instance_id': idx,
119 | 'ensemble_label': int(ensemble_labels[idx]),
120 | 'catboost_prob': float(catboost_preds[idx, 1]),
121 | 'autogluon_prob': float(autogluon_preds[idx, 1]),
122 | 'ensemble_prob': float(ensemble_probs[idx, 1]),
123 | 'feature_importance': feature_importance
124 | })
125 |
126 | return results
127 |
128 | except Exception as e:
129 | logger.error(f"Failed to get predictions and importance: {str(e)}")
130 | raise
131 |
132 | def display_results(results: List[Dict[str, Any]]) -> None:
133 | """
134 | Display the results in a formatted table.
135 |
136 | Args:
137 | results: List of dictionaries containing predictions and feature importance scores
138 | """
139 | print("\n" + "="*100)
140 | print("PREDICTIONS AND LOCAL INSTANCE INTERPRETABILITY")
141 | print("="*100)
142 |
143 | # Print header
144 | print(f"{'Instance':<10} {'Label':<8} {'CatBoost':<10} {'AutoGluon':<12} {'Ensemble':<10} {'Top 5 Important Features'}")
145 | print("-"*100)
146 |
147 | # Print each row
148 | for result in results:
149 | # Format feature importance as a string
150 | feature_str = ", ".join([f"{feat['feature']}({feat['importance']:.4f})" for feat in result['feature_importance']])
151 |
152 | print(f"{result['instance_id']:<10} {result['ensemble_label']:<8} {result['catboost_prob']:.4f} {result['autogluon_prob']:.4f} {result['ensemble_prob']:.4f} {feature_str}")
153 |
154 | print("="*100)
155 |
156 | def create_dummy_model_files(model_dir: Path) -> None:
157 | """
158 | Create dummy model files for testing.
159 |
160 | Args:
161 | model_dir: Directory to create the files in
162 | """
163 | try:
164 | # Create directory if it doesn't exist
165 | model_dir.mkdir(parents=True, exist_ok=True)
166 |
167 | # Create dummy model files
168 | logger.info(f"Creating dummy model files in {model_dir}")
169 |
170 | # Create dummy CatBoost model
171 | catboost_path = model_dir / "catboost_model.pkl"
172 | with open(catboost_path, 'wb') as f:
173 | pickle.dump({"dummy": "catboost_model"}, f)
174 |
175 | # Create dummy AutoGluon model directory
176 | autogluon_path = model_dir / "autogluon_model"
177 | autogluon_path.mkdir(exist_ok=True)
178 | with open(autogluon_path / "model.pkl", 'wb') as f:
179 | pickle.dump({"dummy": "autogluon_model"}, f)
180 |
181 | # Create dummy model config
182 | config_path = model_dir / "model_config.json"
183 | with open(config_path, 'w') as f:
184 | f.write('{"dummy": "model_config"}')
185 |
186 | # Create dummy explainer
187 | explainer_path = model_dir / "explainer.pkl"
188 | with open(explainer_path, 'wb') as f:
189 | pickle.dump({"dummy": "explainer"}, f)
190 |
191 | logger.info("Successfully created dummy model files")
192 |
193 | except Exception as e:
194 | logger.error(f"Failed to create dummy model files: {str(e)}")
195 | raise
196 |
197 | def main():
198 | """
199 | Main function to display 5 data points with their predictions and RulexAI local instance interpretability.
200 | """
201 | try:
202 | # Configuration
203 | model_dir = Path("rulex_explanations")
204 |
205 | # Create dummy model files if they don't exist
206 | if not (model_dir / "catboost_model.pkl").exists():
207 | create_dummy_model_files(model_dir)
208 |
209 | # Load model and explainer
210 | ensemble_model, explainer = load_model_and_explainer(model_dir)
211 |
212 | # Generate test data
213 | test_df = generate_test_data(n_samples=5, n_features=10)
214 |
215 | # Get predictions and feature importance
216 | results = get_predictions_and_importance(ensemble_model, explainer, test_df, top_n_features=5)
217 |
218 | # Display results
219 | display_results(results)
220 |
221 | except Exception as e:
222 | logger.error(f"An error occurred in main: {str(e)}")
223 | raise
224 |
225 | if __name__ == "__main__":
226 | main()
227 |
--------------------------------------------------------------------------------
/API_Project-Quandl/files_execution.txt:
--------------------------------------------------------------------------------
1 | Order of Files to Run and Their Functions
2 |
3 | Based on the codebase, here's the order of files to run and what each file does:
4 |
5 | 1. batch_rulex_script.py (Root Directory)
6 |
7 | This is the core file that defines the EnsembleModel and ModelConfig classes. It contains:
8 |
9 | - The EnsembleModel class that combines CatBoost and AutoGluon models
10 | - The ModelConfig Pydantic model for validation
11 | - Methods for training, saving, loading, and making predictions with the ensemble model
12 |
13 | This file doesn't need to be run directly, but it's imported by other scripts.
14 |
15 | 2. train_and_predict.py
16 |
17 | This script:
18 | - Trains the ensemble model using the EnsembleModel class
19 | - Saves the trained model to disk
20 | - Makes predictions on test data
21 | - Demonstrates the complete workflow from training to prediction
22 |
23 | Run this first to create the model files needed by other scripts.
24 |
25 | 3. display_predictions.py
26 |
27 | This script:
28 | - Loads the trained ensemble model and explainer
29 | - Generates test data
30 | - Makes predictions using the ensemble model
31 | - Displays a table with:
32 | - Instance ID
33 | - Ensemble class label
34 | - CatBoost probability
35 | - AutoGluon probability
36 | - Ensemble probability (average of both models)
37 | - Top 5 important features with their importance scores
38 |
39 | This script requires the model files created by train_and_predict.py.
40 |
41 | 4. simple_prediction_table.py
42 |
43 | This is a simplified version of display_predictions.py that:
44 | - Doesn't rely on the actual EnsembleModel class
45 | - Generates random predictions and feature importance scores
46 | - Creates the same formatted table as display_predictions.py
47 | - Can be run independently without needing trained model files
48 |
49 | This script is useful for demonstration purposes or when you don't have trained models available.
50 |
51 | 5. model_predictor.py
52 |
53 | This script:
54 | - Loads a trained model
55 | - Makes predictions on new data
56 | - Provides a simple interface for using the model in production
57 |
58 | This script requires the model files created by train_and_predict.py.
59 |
60 | 6. local_interpretability.py
61 |
62 | This script:
63 | - Focuses on the RulexAI local instance interpretability
64 | - Explains how individual predictions are made
65 | - Shows which features are most important for each prediction
66 |
67 | This script requires the model and explainer files created by train_and_predict.py.
68 |
69 | Summary of Execution Order
70 |
71 | 1. Run train_and_predict.py first to create the model files
72 | 2. Then run any of the following scripts depending on what you want to do:
73 | - display_predictions.py for a complete view of predictions and feature importance
74 | - model_predictor.py for making predictions on new data
75 | - local_interpretability.py for detailed explanations of predictions
76 | - simple_prediction_table.py for a demonstration without trained models
77 |
78 | The simple_prediction_table.py script is the most self-contained and can be run independently without any dependencies on other scripts or trained models.
79 |
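80 | Programmatic alternative: the same pipeline can also be driven from Python using the
81 | functions defined in batch_rulex_script.py. The sketch below is minimal, and the file
82 | paths are hypothetical (they assume a pickled ensemble model and CSV feature files
83 | already exist).
84 | 
85 |     import pickle
86 |     import pandas as pd
87 |     from batch_rulex_script import (
88 |         process_dataset, save_explanations, generate_summary_report
89 |     )
90 | 
91 |     # Load the pickled ensemble model (e.g. the one saved by batch_rulex_script.py)
92 |     with open("rulex_explanations/ensemble_model.pkl", "rb") as f:
93 |         model = pickle.load(f)
94 | 
95 |     # Hypothetical CSVs: training features and the records to explain
96 |     X_train_df = pd.read_csv("train_features.csv")
97 |     X_all_df = pd.read_csv("all_features.csv")
98 | 
99 |     explanations = process_dataset(
100 |         model=model,
101 |         X_train=X_train_df.values,
102 |         X_to_explain=X_all_df.values,
103 |         feature_names=list(X_train_df.columns),
104 |         batch_size=50000,
105 |         n_jobs=8,
106 |         output_dir="rulex_explanations",
107 |     )
108 |     save_explanations(explanations, "rulex_explanations")
109 |     generate_summary_report(explanations, "rulex_explanations")
110 | 
111 | batch_rulex_script.py also exposes a command-line interface; an equivalent invocation
112 | would look roughly like:
113 | 
114 |     python batch_rulex_script.py --model_path rulex_explanations/ensemble_model.pkl \
115 |         --data_path all_features.csv --train_data_path train_features.csv \
116 |         --output_dir rulex_explanations --batch_size 50000 --n_jobs 8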
--------------------------------------------------------------------------------
/API_Project-Quandl/local_interpretability.py:
--------------------------------------------------------------------------------
1 | import os
2 | import json
3 | import logging
4 | import numpy as np
5 | import pandas as pd
6 | from typing import Dict, List, Union, Any, Optional, Tuple
7 | from pathlib import Path
8 | import pickle
9 | from tqdm import tqdm
10 | import time
11 | from concurrent.futures import ThreadPoolExecutor, as_completed
12 | import sys
13 | import traceback
14 |
15 | # Configure logging
16 | logging.basicConfig(
17 | level=logging.INFO,
18 |     format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
19 | )
20 | logger = logging.getLogger(__name__)
21 |
22 | # Import RulexAI and RuleKit packages with detailed error handling
23 | RULEXAI_AVAILABLE = False
24 | try:
25 | logger.info("Attempting to import RulexAI and RuleKit packages...")
26 | from rulexai import RulexAIExplainer
27 | from rulekit import RuleKit
28 | RULEXAI_AVAILABLE = True
29 | logger.info("Successfully imported RulexAI and RuleKit packages")
30 | except ImportError as e:
31 | logger.error(f"Failed to import RulexAI or RuleKit: {str(e)}")
32 | logger.error(f"Python path: {sys.path}")
33 | logger.error(f"Traceback: {traceback.format_exc()}")
34 | logger.warning("RulexAI or RuleKit packages not available. Install with: pip install rulexai rulekit")
35 |
36 | # Import functions from batch_rulex_script
37 | try:
38 | from batch_rulex_script import (
39 | create_ensemble_model,
40 | process_dataset,
41 | save_explainer,
42 | load_explainer,
43 | init_directories
44 | )
45 | logger.info("Successfully imported functions from batch_rulex_script")
46 | except ImportError as e:
47 | logger.error(f"Failed to import functions from batch_rulex_script: {str(e)}")
48 | logger.error(f"Traceback: {traceback.format_exc()}")
49 |
50 | class LocalInterpretabilityManager:
51 | """
52 | Manages local interpretability for the ensemble model using RulexAI.
53 | Handles batch processing for large datasets and saves/loads explainers.
54 | """
55 |
56 | def __init__(
57 | self,
58 | model_path: str,
59 | batch_size: int = 10000,
60 | max_workers: int = 4,
61 | cache_dir: str = "rulexai_cache"
62 | ):
63 | """
64 | Initialize the local interpretability manager.
65 |
66 | Args:
67 | model_path: Path to the saved ensemble model
68 | batch_size: Number of instances to process in each batch
69 | max_workers: Maximum number of parallel workers for batch processing
70 | cache_dir: Directory to cache RulexAI explainers
71 | """
72 | self.model_path = model_path
73 | self.batch_size = batch_size
74 | self.max_workers = max_workers
75 | self.cache_dir = cache_dir
76 | self.ensemble_model = None
77 | self.explainer = None
78 | self.feature_names = None
79 |
80 | # Create cache directory if it doesn't exist
81 | os.makedirs(cache_dir, exist_ok=True)
82 |
83 | # Check if RulexAI is available
84 | if not RULEXAI_AVAILABLE:
85 | logger.error("RulexAI or RuleKit packages not available. Install with: pip install rulexai rulekit")
86 | raise ImportError("Required packages not available")
87 |
88 | def load_ensemble_model(self) -> None:
89 | """Load the ensemble model from disk."""
90 | logger.info(f"Loading ensemble model from {self.model_path}")
91 | with open(self.model_path, 'rb') as f:
92 | self.ensemble_model = pickle.load(f)
93 | self.feature_names = self.ensemble_model['feature_names']
94 | logger.info(f"Ensemble model loaded successfully with {len(self.feature_names)} features")
95 |
96 | def _process_batch(
97 | self,
98 | batch_data: pd.DataFrame,
99 | batch_id: int
100 | ) -> Tuple[int, Any]:
101 | """
102 | Process a single batch of data with RulexAI.
103 |
104 | Args:
105 | batch_data: DataFrame containing the batch of instances
106 | batch_id: Identifier for the batch
107 |
108 | Returns:
109 | Tuple of (batch_id, explainer)
110 | """
111 | logger.info(f"Processing batch {batch_id} with {len(batch_data)} instances")
112 |
113 | # Create a unique explainer for this batch
114 | explainer = RulexAIExplainer(
115 | model=self.ensemble_model,
116 | feature_names=self.feature_names,
117 | rulekit=RuleKit()
118 | )
119 |
120 | # Fit the explainer on this batch
121 | explainer.fit(batch_data)
122 |
123 | return batch_id, explainer
124 |
125 | def process_data_in_batches(
126 | self,
127 | data: pd.DataFrame,
128 | save_explainers: bool = True
129 | ) -> Dict[int, Any]:
130 | """
131 | Process data in batches and generate explainers.
132 |
133 | Args:
134 | data: DataFrame containing all instances
135 | save_explainers: Whether to save explainers to disk
136 |
137 | Returns:
138 | Dictionary mapping batch IDs to explainers
139 | """
140 | if self.ensemble_model is None:
141 | self.load_ensemble_model()
142 |
143 | # Split data into batches
144 | n_batches = (len(data) + self.batch_size - 1) // self.batch_size
145 | batches = [data.iloc[i:i+self.batch_size] for i in range(0, len(data), self.batch_size)]
146 |
147 | logger.info(f"Processing {len(data)} instances in {n_batches} batches")
148 |
149 | explainers = {}
150 |
151 | # Process batches in parallel
152 | with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
153 | future_to_batch = {
154 | executor.submit(self._process_batch, batch, i): i
155 | for i, batch in enumerate(batches)
156 | }
157 |
158 | for future in tqdm(as_completed(future_to_batch), total=len(batches), desc="Processing batches"):
159 | batch_id, explainer = future.result()
160 | explainers[batch_id] = explainer
161 |
162 | # Save explainer if requested
163 | if save_explainers:
164 | self._save_explainer(explainer, batch_id)
165 |
166 | return explainers
167 |
168 | def _save_explainer(self, explainer: Any, batch_id: int) -> None:
169 | """Save an explainer to disk."""
170 | save_path = os.path.join(self.cache_dir, f"explainer_batch_{batch_id}.pkl")
171 | with open(save_path, 'wb') as f:
172 | pickle.dump(explainer, f)
173 | logger.info(f"Saved explainer for batch {batch_id} to {save_path}")
174 |
175 | def _load_explainer(self, batch_id: int) -> Any:
176 | """Load an explainer from disk."""
177 | load_path = os.path.join(self.cache_dir, f"explainer_batch_{batch_id}.pkl")
178 | if not os.path.exists(load_path):
179 | logger.warning(f"Explainer for batch {batch_id} not found at {load_path}")
180 | return None
181 |
182 | with open(load_path, 'rb') as f:
183 | explainer = pickle.load(f)
184 | logger.info(f"Loaded explainer for batch {batch_id} from {load_path}")
185 | return explainer
186 |
187 | def load_all_explainers(self) -> Dict[int, Any]:
188 | """Load all saved explainers from disk."""
189 | explainers = {}
190 | for filename in os.listdir(self.cache_dir):
191 | if filename.startswith("explainer_batch_") and filename.endswith(".pkl"):
192 | batch_id = int(filename.split("_")[2].split(".")[0])
193 | explainers[batch_id] = self._load_explainer(batch_id)
194 |
195 | logger.info(f"Loaded {len(explainers)} explainers from disk")
196 | return explainers
197 |
198 | def explain_instance(
199 | self,
200 | instance: pd.DataFrame,
201 | batch_id: Optional[int] = None
202 | ) -> Dict[str, Any]:
203 | """
204 | Generate explanation for a single instance.
205 |
206 | Args:
207 | instance: DataFrame containing a single instance
208 | batch_id: ID of the batch to use for explanation (if None, uses the first available)
209 |
210 | Returns:
211 | Dictionary containing the explanation
212 | """
213 | if self.ensemble_model is None:
214 | self.load_ensemble_model()
215 |
216 | # If no batch_id specified, use the first available explainer
217 | if batch_id is None:
218 | explainers = self.load_all_explainers()
219 | if not explainers:
220 | logger.warning("No explainers available. Processing instance directly.")
221 | explainer = RulexAIExplainer(
222 | model=self.ensemble_model,
223 | feature_names=self.feature_names,
224 | rulekit=RuleKit()
225 | )
226 | explainer.fit(instance)
227 | else:
228 | batch_id = min(explainers.keys())
229 | explainer = explainers[batch_id]
230 | else:
231 | explainer = self._load_explainer(batch_id)
232 | if explainer is None:
233 | logger.warning(f"Explainer for batch {batch_id} not found. Processing instance directly.")
234 | explainer = RulexAIExplainer(
235 | model=self.ensemble_model,
236 | feature_names=self.feature_names,
237 | rulekit=RuleKit()
238 | )
239 | explainer.fit(instance)
240 |
241 | # Generate explanation
242 | explanation = explainer.explain(instance)
243 | return explanation
244 |
245 | def explain_batch(
246 | self,
247 | batch_data: pd.DataFrame,
248 | batch_id: Optional[int] = None
249 | ) -> List[Dict[str, Any]]:
250 | """
251 | Generate explanations for a batch of instances.
252 |
253 | Args:
254 | batch_data: DataFrame containing multiple instances
255 | batch_id: ID of the batch to use for explanation (if None, uses the first available)
256 |
257 | Returns:
258 | List of dictionaries containing explanations
259 | """
260 | explanations = []
261 | for _, instance in batch_data.iterrows():
262 | instance_df = pd.DataFrame([instance])
263 | explanation = self.explain_instance(instance_df, batch_id)
264 | explanations.append(explanation)
265 |
266 | return explanations
267 |
268 | def save_explainer(self, save_path: str) -> None:
269 | """
270 | Save the RulexAI explainer to disk.
271 |
272 | Args:
273 | save_path: Path to save the explainer
274 | """
275 | if self.explainer is None:
276 | raise ValueError("No explainer available to save. Call process_data_in_batches first.")
277 |
278 | try:
279 | os.makedirs(os.path.dirname(save_path), exist_ok=True)
280 | with open(save_path, 'wb') as f:
281 | pickle.dump(self.explainer, f)
282 | logger.info(f"Saved RulexAI explainer to {save_path}")
283 | except Exception as e:
284 | logger.error(f"Failed to save explainer: {str(e)}")
285 | raise
286 |
287 | def load_explainer(self, load_path: str) -> None:
288 | """
289 | Load a RulexAI explainer from disk.
290 |
291 | Args:
292 | load_path: Path to load the explainer from
293 | """
294 | try:
295 | if not os.path.exists(load_path):
296 | raise FileNotFoundError(f"Explainer file not found at {load_path}")
297 |
298 | with open(load_path, 'rb') as f:
299 | self.explainer = pickle.load(f)
300 | logger.info(f"Loaded RulexAI explainer from {load_path}")
301 | except Exception as e:
302 | logger.error(f"Failed to load explainer: {str(e)}")
303 | raise
304 |
305 | def get_feature_importance(self, instance: pd.DataFrame) -> List[Dict[str, float]]:
306 | """
307 | Get feature importance scores for a single instance.
308 |
309 | Args:
310 | instance: DataFrame containing a single instance
311 |
312 | Returns:
313 | List of dictionaries containing feature names and their importance scores
314 | """
315 | if self.explainer is None:
316 | raise ValueError("No explainer available. Call load_explainer first.")
317 |
318 | try:
319 | # Get explanation for the instance
320 | explanation = self.explainer.explain(instance)
321 |
322 | # Extract feature importance scores
323 | feature_importance = []
324 | for feature, score in explanation.get('feature_importance', {}).items():
325 | feature_importance.append({
326 | 'feature': feature,
327 | 'importance': float(score)
328 | })
329 |
330 | # Sort by importance score in descending order
331 | feature_importance.sort(key=lambda x: x['importance'], reverse=True)
332 |
333 | return feature_importance
334 | except Exception as e:
335 | logger.error(f"Failed to get feature importance: {str(e)}")
336 | raise
337 |
338 | def train_and_save_explainer(
339 | self,
340 | data: pd.DataFrame,
341 | save_path: str,
342 | sample_size: int = 100000
343 | ) -> None:
344 | """
345 | Train a RulexAI explainer on a sample of the data and save it.
346 |
347 | Args:
348 | data: DataFrame containing all instances
349 | save_path: Path to save the final explainer
350 | sample_size: Number of instances to use for training (default: 100,000)
351 | """
352 | try:
353 | logger.info(f"Training explainer on {sample_size} instances from {len(data)} total instances")
354 |
355 | # Sample data if it's larger than sample_size
356 | if len(data) > sample_size:
357 | sample_data = data.sample(n=sample_size, random_state=42)
358 | logger.info(f"Sampled {sample_size} instances for training")
359 | else:
360 | sample_data = data
361 | logger.info(f"Using all {len(data)} instances for training")
362 |
363 | # Process the sample data in batches
364 | explainers = self.process_data_in_batches(sample_data, save_explainers=True)
365 |
366 | # Combine explainers into a single explainer
367 | # This is a simplified approach - in a real implementation, you might want to
368 | # use a more sophisticated method to combine explainers
369 | logger.info("Combining batch explainers into a single explainer")
370 |
371 | # For simplicity, we'll use the explainer from the first batch
372 | # In a production environment, you might want to implement a more sophisticated
373 | # method to combine explainers from different batches
374 | self.explainer = explainers[min(explainers.keys())]
375 |
376 | # Save the final explainer
377 | self.save_explainer(save_path)
378 | logger.info(f"Saved final explainer to {save_path}")
379 |
380 | except Exception as e:
381 | logger.error(f"Failed to train and save explainer: {str(e)}")
382 | raise
383 |
384 | def main():
385 | """
386 | Main function to demonstrate the local interpretability workflow.
387 | """
388 | try:
389 | # Configuration
390 | model_dir = "rulex_explanations"
391 | model_path = os.path.join(model_dir, "ensemble_model.pkl")
392 | cache_dir = "rulexai_cache"
393 | batch_size = 10000
394 |
395 | # Initialize the interpretability manager
396 | interpretability_manager = LocalInterpretabilityManager(
397 | model_path=model_path,
398 | batch_size=batch_size,
399 | cache_dir=cache_dir
400 | )
401 |
402 | # Load the ensemble model
403 | interpretability_manager.load_ensemble_model()
404 |
405 | # Generate synthetic data for demonstration
406 | # In a real scenario, you would load your 10 million records here
407 | logger.info("Generating synthetic data for demonstration...")
408 | n_samples = 100000 # For demonstration, use a smaller dataset
409 | n_features = len(interpretability_manager.feature_names)
410 |
411 | # Create synthetic data
412 | X = np.random.randn(n_samples, n_features)
413 | df = pd.DataFrame(X, columns=interpretability_manager.feature_names)
414 |
415 | # Train and save the explainer
416 | explainer_path = os.path.join(model_dir, "rulexai_explainer.pkl")
417 | interpretability_manager.train_and_save_explainer(df, explainer_path, sample_size=50000)
418 |
419 | logger.info("Local interpretability demonstration completed successfully!")
420 |
421 | except Exception as e:
422 | logger.error(f"An error occurred in main: {str(e)}")
423 | raise
424 |
425 |
426 | if __name__ == "__main__":
427 | main()
428 |
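429 | # Hypothetical follow-up usage (commented out so nothing runs on import; the
430 | # paths below match the ones used in main() but are still assumptions):
431 | #
432 | #   manager = LocalInterpretabilityManager(model_path="rulex_explanations/ensemble_model.pkl")
433 | #   manager.load_ensemble_model()
434 | #   manager.load_explainer("rulex_explanations/rulexai_explainer.pkl")
435 | #   instance = pd.DataFrame([np.zeros(len(manager.feature_names))],
436 | #                           columns=manager.feature_names)
437 | #   print(manager.get_feature_importance(instance)[:5])  # top 5 features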
--------------------------------------------------------------------------------
/API_Project-Quandl/model_predictor.py:
--------------------------------------------------------------------------------
1 | """
2 | Two-Layer Classification System
3 |
4 | This system implements a classification pipeline with two layers:
5 | 1. Business Rules Layer: Checks for mandatory columns and handles missing data
6 | 2. Ensemble Model Layer: Combines AutoGluon and CatBoost predictions
7 |
8 | The system maintains the original data and appends prediction, score, and key drivers columns.
9 | """
10 |
11 | import pandas as pd
12 | import numpy as np
13 | from typing import List, Dict, Tuple, Optional, Union, Set, Any
14 | import logging
15 | from autogluon.tabular import TabularPredictor
16 | import catboost as cb
17 | import yaml
18 | import os
19 | from sklearn.model_selection import train_test_split
20 | from sklearn.preprocessing import LabelEncoder
21 | import joblib
22 | from functools import partial
23 | import sys
24 | import math
25 |
26 | # Add the project root to the path for imports
27 | project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', '..', '..'))
28 | if project_root not in sys.path:
29 | sys.path.append(project_root)
30 |
31 | # Set up logging
32 | logging.basicConfig(
33 | level=logging.INFO,
34 | format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
35 | )
36 | logger = logging.getLogger(__name__)
37 |
38 |
39 | def load_config(config_path: str) -> Dict[str, Any]:
40 | """
41 | Load the mandatory columns configuration
42 |
43 | Args:
44 | config_path: Path to the configuration file
45 |
46 | Returns:
47 | Dictionary containing the configuration
48 | """
49 | with open(config_path, 'r') as f:
50 | return yaml.safe_load(f)
51 |
52 |
53 | def is_valid_value(value: Any) -> bool:
54 | """
55 | Check if a value is valid (not missing, empty, or whitespace)
56 |
57 | Args:
58 | value: The value to check
59 |
60 | Returns:
61 | bool: True if the value is valid, False otherwise
62 | """
63 | # Handle None values
64 | if value is None:
65 | return False
66 |
67 | # Handle pandas NA values
68 | if pd.isna(value):
69 | return False
70 |
71 | # Handle string values
72 | if isinstance(value, str):
73 | # Check for empty strings or strings containing only whitespace
74 | return bool(value.strip())
75 |
76 | # Handle numeric values
77 | if isinstance(value, (int, float)):
78 | # Check for infinity and NaN
79 | return not (math.isinf(value) or math.isnan(value))
80 |
81 | # For other types (e.g., datetime, bool), consider them valid if not None
82 | return True
83 |
84 |
85 | def check_column_validity(df: pd.DataFrame, column: str) -> pd.Series:
86 | """
87 | Check if a column has valid values
88 |
89 | Args:
90 | df: Input DataFrame
91 | column: Column name to check
92 |
93 | Returns:
94 | Series of boolean values indicating invalid entries
95 | """
96 | # Check for invalid values
97 | return ~df[column].apply(is_valid_value)
98 |
99 |
100 | def check_conditional_mandatory(df: pd.DataFrame, config: Dict[str, Any]) -> pd.DataFrame:
101 | """
102 | Check conditional mandatory columns based on state rules
103 |
104 | Args:
105 | df: Input DataFrame
106 | config: Configuration dictionary containing conditional rules
107 |
108 | Returns:
109 | DataFrame with missing mandatory columns marked as True
110 |
111 | Raises:
112 | ValueError: If required columns are missing from the DataFrame
113 | KeyError: If configuration is missing required fields
114 | """
115 | # Initialize result DataFrame with all False values
116 | missing_cols = pd.DataFrame(False, index=df.index, columns=df.columns)
117 |
118 | # Get conditional rules from config
119 | conditional_rules = config.get('conditional_mandatory', [])
120 | if not conditional_rules:
121 | return missing_cols
122 |
123 | # Check each conditional rule
124 | for rule in conditional_rules:
125 | try:
126 | # Extract rule components
127 | condition_col = rule['condition']['column']
128 | condition_val = rule['condition']['value']
129 | required_cols = rule['required_columns']
130 |
131 | # Validate that condition column exists
132 | if condition_col not in df.columns:
133 | raise ValueError(f"Condition column '{condition_col}' not found in DataFrame")
134 |
135 | # Validate that required columns exist
136 | missing_required = [col for col in required_cols if col not in df.columns]
137 | if missing_required:
138 | raise ValueError(f"Required columns not found in DataFrame: {missing_required}")
139 |
140 | # Find rows matching the condition
141 | condition_mask = df[condition_col] == condition_val
142 |
143 | # Check required columns for matching rows
144 | for col in required_cols:
145 | # Get invalid values mask
146 | invalid_mask = check_column_validity(df, col)
147 | # Update missing_cols only for rows matching the condition
148 | missing_cols.loc[condition_mask, col] = invalid_mask[condition_mask]
149 |
150 | except KeyError as e:
151 | raise KeyError(f"Invalid rule configuration: missing required field {str(e)}")
152 | except Exception as e:
153 | raise ValueError(f"Error processing rule: {str(e)}")
154 |
155 | return missing_cols
156 |
157 |
158 | def process_business_rules(df: pd.DataFrame, config_path: str) -> Tuple[pd.DataFrame, pd.DataFrame]:
159 | """
160 | Process the input data through business rules while preserving all columns
161 |
162 | Args:
163 | df: Input DataFrame
164 | config_path: Path to the configuration file
165 |
166 | Returns:
167 | Tuple of (DataFrame with missing mandatory columns, DataFrame with complete data)
168 | Both DataFrames will contain all original columns, with only mandatory columns checked for missingness
169 | """
170 | config = load_config(config_path)
171 |
172 | # Get all mandatory columns (both always and conditional)
173 | always_mandatory = list(config['always_mandatory'])
174 | conditional_mandatory = []
175 |
176 | # Collect all conditional mandatory columns
177 | for rule in config.get('conditional_mandatory', []):
178 | conditional_mandatory.extend(rule['required_columns'])
179 |
180 | # Remove duplicates while preserving order
181 | conditional_mandatory = list(dict.fromkeys(conditional_mandatory))
182 |
183 | # Initialize missing columns DataFrame with only mandatory columns
184 | missing_always = pd.DataFrame(False, index=df.index, columns=always_mandatory)
185 |
186 | # Check always mandatory columns
187 | for col in always_mandatory:
188 | missing_always[col] = check_column_validity(df, col)
189 |
190 | # Check conditional mandatory columns
191 | missing_conditional = check_conditional_mandatory(df, config)
192 |
193 | # Combine missing columns
194 | missing_cols = pd.concat([missing_always, missing_conditional], axis=1)
195 | missing_cols = missing_cols.loc[:, ~missing_cols.columns.duplicated()]
196 |
197 | # Find rows with any missing mandatory columns
198 | has_missing = missing_cols.any(axis=1)
199 |
200 | # Split data while preserving all columns
201 | missing_data = df[has_missing].copy()
202 | complete_data = df[~has_missing].copy()
203 |
204 | # Add prediction columns for missing data
205 | if not missing_data.empty:
206 | missing_data['prediction'] = 'TRUE'
207 | missing_data['score'] = 100
208 | missing_data['key_drivers'] = missing_cols[has_missing].apply(
209 | lambda x: ' | '.join([f"{col} | 1.0" for col in x[x].index]),
210 | axis=1
211 | )
212 |
213 | return missing_data, complete_data
214 |
215 |
216 | def prepare_features(data: pd.DataFrame, target_column: str, feature_encoders: Dict[str, Any], training: bool = False) -> pd.DataFrame:
217 | """
218 | Prepare features for modeling
219 |
220 | Args:
221 | data: Input DataFrame
222 | target_column: Name of the target column
223 | feature_encoders: Dictionary of feature encoders
224 | training: Whether this is for training (True) or inference (False)
225 |
226 | Returns:
227 | DataFrame with processed features
228 | """
229 | # Create a copy to avoid modifying the original
230 | processed = data.copy()
231 |
232 | # Handle categorical features
233 | categorical_columns = processed.select_dtypes(include=['object', 'category']).columns
234 |
235 | for col in categorical_columns:
236 | if col == target_column:
237 | continue
238 |
239 | if training:
240 | encoder = LabelEncoder()
241 | processed[col] = encoder.fit_transform(processed[col].fillna('missing'))
242 | feature_encoders[col] = encoder
243 | else:
244 | if col in feature_encoders:
245 | # Handle categories not seen during training
246 | encoder = feature_encoders[col]
247 | processed[col] = processed[col].fillna('missing')
248 | unseen = ~processed[col].isin(encoder.classes_)
249 | if unseen.any():
250 | processed.loc[unseen, col] = 'missing'
251 | processed[col] = encoder.transform(processed[col])
252 | else:
253 | # If we don't have an encoder for this column, drop it
254 | processed.drop(columns=[col], inplace=True)
255 |
256 | # Fill numeric missing values with mean
257 | numeric_columns = processed.select_dtypes(include=['number']).columns
258 | for col in numeric_columns:
259 | if training:
260 | mean_val = processed[col].mean()
261 | processed[col] = processed[col].fillna(mean_val)
262 | # Store the mean for later use
263 | feature_encoders[f"{col}_mean"] = mean_val
264 | else:
265 | if f"{col}_mean" in feature_encoders:
266 | processed[col] = processed[col].fillna(feature_encoders[f"{col}_mean"])
267 |
268 | return processed
269 |
270 |
271 | def train_ensemble_model(data: pd.DataFrame, target_column: str, model_dir: str,
272 | categorical_features: Optional[List[str]] = None,
273 | time_limit: int = 3600) -> Tuple[TabularPredictor, cb.CatBoostClassifier, Dict[str, Any]]:
274 | """
275 | Train the ensemble model (AutoGluon and CatBoost)
276 |
277 | Args:
278 | data: Training data
279 | target_column: Name of the target column
280 | model_dir: Directory to save models
281 | categorical_features: List of categorical feature names
282 | time_limit: Time limit for AutoGluon training in seconds
283 |
284 | Returns:
285 | Tuple of (AutoGluon model, CatBoost model, feature encoders)
286 | """
287 | feature_encoders = {}
288 |
289 | # Prepare features
290 | processed_data = prepare_features(data, target_column, feature_encoders, training=True)
291 |
292 | # Train AutoGluon model
293 | autogluon_dir = os.path.join(model_dir, 'autogluon')
294 | autogluon_model = TabularPredictor(
295 | label=target_column,
296 | path=autogluon_dir
297 | ).fit(
298 | processed_data,
299 | time_limit=time_limit,
300 | presets='best_quality'
301 | )
302 |
303 | # Train CatBoost model
304 | catboost_dir = os.path.join(model_dir, 'catboost')
305 | os.makedirs(catboost_dir, exist_ok=True)
306 |
307 | # Prepare CatBoost data
308 | X = processed_data.drop(columns=[target_column])
309 | y = processed_data[target_column]
310 |
311 | # Initialize and train CatBoost
312 | catboost_model = cb.CatBoostClassifier(
313 | iterations=1000,
314 | learning_rate=0.1,
315 | depth=6,
316 | loss_function='Logloss',
317 | eval_metric='AUC',
318 | random_seed=42,
319 | verbose=100
320 | )
321 |
322 | catboost_model.fit(
323 | X, y,
324 | cat_features=categorical_features if categorical_features else [],
325 | eval_set=(X, y),
326 | use_best_model=True,
327 | plot=False
328 | )
329 |
330 | # Save CatBoost model
331 | catboost_model.save_model(os.path.join(catboost_dir, 'model.cbm'))
332 |
333 | # Save feature encoders
334 | joblib.dump(feature_encoders, os.path.join(model_dir, 'feature_encoders.joblib'))
335 |
336 | logger.info("Ensemble model training completed")
337 |
338 | return autogluon_model, catboost_model, feature_encoders
339 |
340 |
341 | def predict_with_ensemble(data: pd.DataFrame, target_column: str,
342 | autogluon_model: TabularPredictor,
343 | catboost_model: cb.CatBoostClassifier,
344 | feature_encoders: Dict[str, Any]) -> pd.DataFrame:
345 | """
346 | Make predictions using the ensemble model
347 |
348 | Args:
349 | data: Input DataFrame
350 | target_column: Name of the target column
351 | autogluon_model: Trained AutoGluon model
352 | catboost_model: Trained CatBoost model
353 | feature_encoders: Dictionary of feature encoders
354 |
355 | Returns:
356 | DataFrame with predictions, scores, and key drivers
357 | """
358 | # Prepare features
359 | processed_data = prepare_features(data, target_column, feature_encoders, training=False)
360 |
361 | # Get predictions from both models
362 | autogluon_preds = autogluon_model.predict_proba(processed_data)
363 | catboost_preds = catboost_model.predict_proba(processed_data)
364 |
365 |     # Average the probabilities (coerce both to numpy arrays first, since
366 |     # AutoGluon's predict_proba may return a pandas DataFrame while CatBoost returns an array)
367 |     avg_probs = (np.asarray(autogluon_preds) + np.asarray(catboost_preds)) / 2
368 | 
369 |     # Convert to predictions and scores
370 |     predictions = np.where(avg_probs[:, 1] > 0.5, 'TRUE', 'FALSE')
371 |
372 | # Convert probabilities to scores (0-100)
373 | scores = np.round(avg_probs[:, 1] * 100).astype(int)
374 |
375 | # Get feature importance for key drivers
376 | feature_importance = catboost_model.get_feature_importance()
377 | top_features = np.argsort(feature_importance)[-3:] # Top 3 features
378 |
379 | # Create key drivers string
380 | key_drivers = []
381 | for idx in range(len(data)):
382 | driver_str = ' | '.join([
383 | f"{processed_data.columns[i]} | {feature_importance[i]:.1f}"
384 | for i in top_features
385 | ])
386 | key_drivers.append(driver_str)
387 |
388 | # Create result DataFrame
389 | result = data.copy()
390 | result['prediction'] = predictions
391 | result['score'] = scores
392 | result['key_drivers'] = key_drivers
393 |
394 | return result
395 |
396 |
397 | def train_model(data: pd.DataFrame, target_column: str, config_path: str,
398 | model_dir: str, categorical_features: Optional[List[str]] = None,
399 | time_limit: int = 3600) -> Tuple[TabularPredictor, cb.CatBoostClassifier, Dict[str, Any]]:
400 | """
401 | Train the two-layer classification model
402 |
403 | Args:
404 | data: Training data
405 | target_column: Name of the target column
406 | config_path: Path to the configuration file
407 | model_dir: Directory to save models
408 | categorical_features: List of categorical feature names
409 | time_limit: Time limit for AutoGluon training in seconds
410 |
411 | Returns:
412 | Tuple of (AutoGluon model, CatBoost model, feature encoders)
413 | """
414 | # Filter data using business rules
415 | missing_data, complete_data = process_business_rules(data, config_path)
416 |
417 | if complete_data.empty:
418 | logger.warning("No data available for training after business rules filtering")
419 | return None, None, {}
420 |
421 | # Train ensemble model
422 | return train_ensemble_model(complete_data, target_column, model_dir,
423 | categorical_features, time_limit)
424 |
425 |
426 | def make_predictions(data: pd.DataFrame, target_column: str, config_path: str,
427 | autogluon_model: TabularPredictor,
428 | catboost_model: cb.CatBoostClassifier,
429 | feature_encoders: Dict[str, Any]) -> pd.DataFrame:
430 | """
431 | Make predictions using both layers
432 |
433 | Args:
434 | data: Input DataFrame
435 | target_column: Name of the target column
436 | config_path: Path to the configuration file
437 | autogluon_model: Trained AutoGluon model
438 | catboost_model: Trained CatBoost model
439 | feature_encoders: Dictionary of feature encoders
440 |
441 | Returns:
442 | DataFrame with predictions, scores, and key drivers
443 | """
444 | # Process through business rules
445 | missing_data, complete_data = process_business_rules(data, config_path)
446 |
447 | # Process complete data through ensemble model
448 | if not complete_data.empty:
449 | complete_data = predict_with_ensemble(complete_data, target_column,
450 | autogluon_model, catboost_model,
451 | feature_encoders)
452 |
453 | # Combine results
454 | result = pd.concat([missing_data, complete_data], ignore_index=True)
455 |
456 | return result
457 |
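458 | 
459 | # ---------------------------------------------------------------------------
460 | # Hypothetical usage sketch (illustration only, not part of the original
461 | # pipeline). The file paths, config file, and target column name below are
462 | # assumptions; substitute real values. The config is assumed to follow the
463 | # structure read by load_config() / check_conditional_mandatory(), e.g.:
464 | #
465 | #   always_mandatory:
466 | #     - column_a
467 | #     - column_b
468 | #   conditional_mandatory:
469 | #     - condition:
470 | #         column: state
471 | #         value: CA
472 | #       required_columns:
473 | #         - column_c
474 | # ---------------------------------------------------------------------------
475 | if __name__ == "__main__":
476 |     config_path = "config/mandatory_columns.yaml"   # assumed config location
477 |     model_dir = "models"                            # assumed output directory
478 |     target_column = "target"                        # assumed label column
479 | 
480 |     # Layer 1 + Layer 2: filter with business rules, then train the ensemble
481 |     train_df = pd.read_csv("data/train.csv")        # assumed training data
482 |     ag_model, cat_model, encoders = train_model(
483 |         train_df, target_column, config_path, model_dir, time_limit=600
484 |     )
485 | 
486 |     # Score new data: rows failing the business rules are flagged directly,
487 |     # the rest go through the ensemble; the result keeps all original columns
488 |     # plus prediction, score, and key_drivers
489 |     new_df = pd.read_csv("data/new_records.csv")    # assumed scoring data
490 |     if ag_model is not None:
491 |         scored = make_predictions(new_df, target_column, config_path,
492 |                                   ag_model, cat_model, encoders)
493 |         print(scored[["prediction", "score", "key_drivers"]].head())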
--------------------------------------------------------------------------------
/Clustering_Project-Customer_Segmentation/WineKMC.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ShiningData/Springboard_Projects/d1368678312a35438e5ebf9fff5139344dd7ddc6/Clustering_Project-Customer_Segmentation/WineKMC.xlsx
--------------------------------------------------------------------------------
/Clustering_Project-Customer_Segmentation/agglomerate.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ShiningData/Springboard_Projects/d1368678312a35438e5ebf9fff5139344dd7ddc6/Clustering_Project-Customer_Segmentation/agglomerate.png
--------------------------------------------------------------------------------
/Clustering_Project-Customer_Segmentation/spectral.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ShiningData/Springboard_Projects/d1368678312a35438e5ebf9fff5139344dd7ddc6/Clustering_Project-Customer_Segmentation/spectral.png
--------------------------------------------------------------------------------
/Exploratory_Data_Analysis_Project-Examine_Racial_Discrimination/data/us_job_market_discrimination.dta:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ShiningData/Springboard_Projects/d1368678312a35438e5ebf9fff5139344dd7ddc6/Exploratory_Data_Analysis_Project-Examine_Racial_Discrimination/data/us_job_market_discrimination.dta
--------------------------------------------------------------------------------
/Exploratory_Data_Analysis_Project-Hospital_Readmissions/EDA_Project-Hospital_Readmissions.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Hospital Readmissions Data Analysis and Recommendations for Reduction\n",
8 | "\n",
9 | "### Background\n",
10 | "In October 2012, the US government's Center for Medicare and Medicaid Services (CMS) began reducing Medicare payments for Inpatient Prospective Payment System hospitals with excess readmissions. Excess readmissions are measured by a ratio, by dividing a hospital’s number of “predicted” 30-day readmissions for heart attack, heart failure, and pneumonia by the number that would be “expected,” based on an average hospital with similar patients. A ratio greater than 1 indicates excess readmissions.\n",
11 | "\n",
12 | "### Exercise Directions\n",
13 | "\n",
14 | "In this exercise, you will:\n",
15 | "+ critique a preliminary analysis of readmissions data and recommendations (provided below) for reducing the readmissions rate\n",
16 | "+ construct a statistically sound analysis and make recommendations of your own \n",
17 | "\n",
18 | "More instructions provided below. Include your work **in this notebook and submit to your Github account**. \n",
19 | "\n",
20 | "### Resources\n",
21 | "+ Data source: https://data.medicare.gov/Hospital-Compare/Hospital-Readmission-Reduction/9n3s-kdb3\n",
22 | "+ More information: http://www.cms.gov/Medicare/medicare-fee-for-service-payment/acuteinpatientPPS/readmissions-reduction-program.html\n",
23 | "+ Markdown syntax: http://nestacms.com/docs/creating-content/markdown-cheat-sheet\n",
24 | "****"
25 | ]
26 | },
27 | {
28 | "cell_type": "code",
29 | "execution_count": null,
30 | "metadata": {},
31 | "outputs": [],
32 | "source": [
33 | "import pandas as pd\n",
34 | "import numpy as np\n",
35 | "import matplotlib.pyplot as plt\n",
36 | "%matplotlib inline\n",
37 | "import scipy.stats as stats\n",
38 | "import statsmodels.stats.api as sm\n",
39 | "import seaborn as sns\n",
40 | "sns.set()\n",
41 | "from mpl_toolkits.axes_grid1 import make_axes_locatable"
42 | ]
43 | },
44 | {
45 | "cell_type": "code",
46 | "execution_count": null,
47 | "metadata": {},
48 | "outputs": [],
49 | "source": [
50 | "# read in readmissions data provided\n",
51 | "hospital_read_df = pd.read_csv('data/cms_hospital_readmissions.csv')"
52 | ]
53 | },
54 | {
55 | "cell_type": "markdown",
56 | "metadata": {},
57 | "source": [
58 | "****\n",
59 | "## Preliminary Analysis"
60 | ]
61 | },
62 | {
63 | "cell_type": "code",
64 | "execution_count": null,
65 | "metadata": {},
66 | "outputs": [],
67 | "source": [
68 | "# deal with missing and inconvenient portions of data \n",
69 | "clean_hospital_read_df = hospital_read_df[hospital_read_df['Number of Discharges'] != 'Not Available']\n",
70 | "clean_hospital_read_df.loc[:, 'Number of Discharges'] = clean_hospital_read_df['Number of Discharges'].astype(int)\n",
71 | "clean_hospital_read_df = clean_hospital_read_df.sort_values('Number of Discharges')"
72 | ]
73 | },
74 | {
75 | "cell_type": "code",
76 | "execution_count": null,
77 | "metadata": {},
78 | "outputs": [],
79 | "source": [
80 | "# generate a scatterplot for number of discharges vs. excess rate of readmissions\n",
81 | "# lists work better with matplotlib scatterplot function\n",
82 | "x = [a for a in clean_hospital_read_df['Number of Discharges'][81:-3]]\n",
83 | "y = list(clean_hospital_read_df['Excess Readmission Ratio'][81:-3])\n",
84 | "\n",
85 | "fig, ax = plt.subplots(figsize=(8,5))\n",
86 | "ax.scatter(x, y,alpha=0.2)\n",
87 | "\n",
88 | "ax.fill_between([0,350], 1.15, 2, facecolor='red', alpha = .15, interpolate=True)\n",
89 | "ax.fill_between([800,2500], .5, .95, facecolor='green', alpha = .15, interpolate=True)\n",
90 | "\n",
91 | "ax.set_xlim([0, max(x)])\n",
92 | "ax.set_xlabel('Number of discharges', fontsize=12)\n",
93 | "ax.set_ylabel('Excess rate of readmissions', fontsize=12)\n",
94 | "ax.set_title('Scatterplot of number of discharges vs. excess rate of readmissions', fontsize=14)\n",
95 | "\n",
96 | "ax.grid(True)\n",
97 | "fig.tight_layout()"
98 | ]
99 | },
100 | {
101 | "cell_type": "markdown",
102 | "metadata": {},
103 | "source": [
104 | "****\n",
105 | "\n",
106 | "## Preliminary Report\n",
107 | "\n",
108 | "Read the following results/report. While you are reading it, think about if the conclusions are correct, incorrect, misleading or unfounded. Think about what you would change or what additional analyses you would perform.\n",
109 | "\n",
110 | "**A. Initial observations based on the plot above**\n",
111 | "+ Overall, rate of readmissions is trending down with increasing number of discharges\n",
112 | "+ With lower number of discharges, there is a greater incidence of excess rate of readmissions (area shaded red)\n",
113 | "+ With higher number of discharges, there is a greater incidence of lower rates of readmissions (area shaded green) \n",
114 | "\n",
115 | "**B. Statistics**\n",
116 | "+ In hospitals/facilities with number of discharges < 100, mean excess readmission rate is 1.023 and 63% have excess readmission rate greater than 1 \n",
117 | "+ In hospitals/facilities with number of discharges > 1000, mean excess readmission rate is 0.978 and 44% have excess readmission rate greater than 1 \n",
118 | "\n",
119 | "**C. Conclusions**\n",
120 | "+ There is a significant correlation between hospital capacity (number of discharges) and readmission rates. \n",
121 | "+ Smaller hospitals/facilities may be lacking necessary resources to ensure quality care and prevent complications that lead to readmissions.\n",
122 | "\n",
123 | "**D. Regulatory policy recommendations**\n",
124 |     "+ Hospitals/facilities with small capacity (< 300) should be required to demonstrate upgraded resource allocation for quality care to continue operation.\n",
125 | "+ Directives and incentives should be provided for consolidation of hospitals and facilities to have a smaller number of them with higher capacity and number of discharges."
126 | ]
127 | },
128 | {
129 | "cell_type": "markdown",
130 | "metadata": {},
131 | "source": [
132 | "****\n",
133 | "### Exercise\n",
134 | "\n",
135 | "Include your work on the following **in this notebook and submit to your Github account**. \n",
136 | "\n",
137 | "A. Do you agree with the above analysis and recommendations? Why or why not?\n",
138 | " \n",
139 | "B. Provide support for your arguments and your own recommendations with a statistically sound analysis:\n",
140 | "\n",
141 | " 1. Setup an appropriate hypothesis test.\n",
142 | " 2. Compute and report the observed significance value (or p-value).\n",
143 | " 3. Report statistical significance for $\\alpha$ = .01. \n",
144 | " 4. Discuss statistical significance and practical significance. Do they differ here? How does this change your recommendation to the client?\n",
145 | " 5. Look at the scatterplot above. \n",
146 | " - What are the advantages and disadvantages of using this plot to convey information?\n",
147 | " - Construct another plot that conveys the same information in a more direct manner.\n",
148 | "\n",
149 | "\n",
150 | "\n",
151 | "You can compose in notebook cells using Markdown: \n",
152 | "+ In the control panel at the top, choose Cell > Cell Type > Markdown\n",
153 | "+ Markdown syntax: http://nestacms.com/docs/creating-content/markdown-cheat-sheet\n",
154 | "****"
155 | ]
156 | },
157 | {
158 | "cell_type": "markdown",
159 | "metadata": {
160 | "collapsed": true
161 | },
162 | "source": [
163 | "# My Analysis and Recommendation on Hospital Readmissions"
164 | ]
165 | },
166 | {
167 | "cell_type": "markdown",
168 | "metadata": {},
169 | "source": [
170 | "## A. Do you agree with the above analysis and recommendations? Why or why not?"
171 | ]
172 | },
173 | {
174 | "cell_type": "markdown",
175 | "metadata": {
176 | "collapsed": true
177 | },
178 | "source": [
179 |     "The above analysis is a good starting point, but it is not enough to support the conclusions and recommendations set forth, since they are based on a single scatter plot of the data with no supporting statistical analysis to substantiate the claims. For that reason, I do not agree with the analysis or recommendations above. I state my reasons below. \n",
180 | "\n",
181 |     "- It is tempting to see the trend mentioned in the analysis, since the notable extreme points draw the eye from top left to bottom right. The plot is actually quite busy, though, and it is difficult to discern any real trend. Moreover, it is not clear how the boundaries of the shaded regions were chosen, and the dense clustering of points in and around those regions makes the stated observations difficult to verify.\n",
182 | "\n",
183 | "\n",
184 |     "- It is essential to consider the entire data set, including the very dense collection of points in the center. It is also not clear why cutoffs of less than 100 and greater than 1000 discharges were used for the statistics, when the low and high demarcations in the plot (the shaded boxes) were 350 and 800, respectively. This suggests that no proper hypothesis test was conducted to determine the statistical significance of readmission rates across different hospital sizes.\n",
185 | "\n",
186 | "\n",
187 |     "- The relationship between number of discharges and rate of readmissions was simply \"eyeballed\": no correlation coefficient or other numerical measure was calculated to confirm the initial observations. We do not have enough evidence to say that the two variables are correlated.\n",
188 | "\n",
189 | "\n",
190 |     "- The conclusion that smaller hospitals lack resources is unfounded, and there is no evidence that additional resources would resolve the issue.\n",
191 | "\n",
192 | "\n",
193 |     "- It is also curious that the only statistical evidence involves small hospitals defined as fewer than 100 discharges, whereas the recommendations define small capacity as less than 300. This is another instance where numbers are given without explanation or further context.\n",
194 | "\n",
195 | "\n",
196 |     "- The statement that \"Smaller hospitals/facilities may be lacking necessary resources to ensure quality care and prevent complications that lead to readmissions\" might be true, but other factors not available in the dataset, such as insurance coverage or physician ratings, could also explain the situation. The recommendations are given without any solid analysis.\n",
197 | "\n",
198 | "\n",
199 |     "- The missing data was only partially handled above: rows with 'Not Available' discharge counts were dropped, but the 'Footnote' column and the 81 missing values in each of the 'Excess Readmission Ratio', 'Predicted Readmission Rate', 'Expected Readmission Rate', and 'Number of Readmissions' features still need to be addressed."
200 | ]
201 | },
202 | {
203 | "cell_type": "markdown",
204 | "metadata": {},
205 | "source": [
206 | "## B. Provide support for your arguments and your own recommendations with a statistically sound analysis:"
207 | ]
208 | },
209 | {
210 | "cell_type": "markdown",
211 | "metadata": {},
212 | "source": [
213 | "**Let's start by inspecting data**"
214 | ]
215 | },
216 | {
217 | "cell_type": "code",
218 | "execution_count": null,
219 | "metadata": {},
220 | "outputs": [],
221 | "source": [
222 | "clean_hospital_read_df.sample(5)"
223 | ]
224 | },
225 | {
226 | "cell_type": "code",
227 | "execution_count": null,
228 | "metadata": {},
229 | "outputs": [],
230 | "source": [
231 | "clean_hospital_read_df.describe(include='all')"
232 | ]
233 | },
234 | {
235 | "cell_type": "code",
236 | "execution_count": null,
237 | "metadata": {},
238 | "outputs": [],
239 | "source": [
240 | "clean_hospital_read_df.info()"
241 | ]
242 | },
243 | {
244 | "cell_type": "code",
245 | "execution_count": null,
246 | "metadata": {},
247 | "outputs": [],
248 | "source": [
249 | "# Check the duplicate observations\n",
250 | "clean_hospital_read_df.duplicated().sum()"
251 | ]
252 | },
253 | {
254 | "cell_type": "markdown",
255 | "metadata": {},
256 | "source": [
257 |     "I checked whether there are any duplicate observations so that they could be dropped. The result shows that there are no duplicates. "
258 | ]
259 | },
260 | {
261 | "cell_type": "code",
262 | "execution_count": null,
263 | "metadata": {},
264 | "outputs": [],
265 | "source": [
266 | "# Find the missing values\n",
267 | "clean_hospital_read_df.isnull().sum()"
268 | ]
269 | },
270 | {
271 | "cell_type": "markdown",
272 | "metadata": {},
273 | "source": [
274 |     "There are 11497 missing values in the 'Footnote' feature. In addition, there are 81 missing values in each of the 'Excess Readmission Ratio', 'Predicted Readmission Rate', 'Expected Readmission Rate', and 'Number of Readmissions' features. I will handle these missing values in the following cells. "
275 | ]
276 | },
277 | {
278 | "cell_type": "code",
279 | "execution_count": null,
280 | "metadata": {},
281 | "outputs": [],
282 | "source": [
283 |     "# Only 81 of the 11578 rows have missing values in these columns, so we can drop them. \n",
284 | "hospital_df = clean_hospital_read_df.dropna(subset=['Excess Readmission Ratio','Predicted Readmission Rate','Expected Readmission Rate',\n",
285 | " 'Number of Readmissions'])"
286 | ]
287 | },
288 | {
289 | "cell_type": "code",
290 | "execution_count": null,
291 | "metadata": {
292 | "scrolled": true
293 | },
294 | "outputs": [],
295 | "source": [
296 | "# Drop 'Footnote' column\n",
297 | "hospital_df.drop(columns= ['Footnote'], inplace=True, errors='ignore')\n",
298 | "hospital_df.sample(5)"
299 | ]
300 | },
301 | {
302 | "cell_type": "markdown",
303 | "metadata": {},
304 | "source": [
305 | "## Setup an appropriate hypothesis test"
306 | ]
307 | },
308 | {
309 | "cell_type": "markdown",
310 | "metadata": {},
311 | "source": [
312 |     "In the preliminary report's conclusion, it is stated that there is a significant correlation between hospital capacity (number of discharges) and readmission rates. I will test this claim. \n",
313 |     "\n",
314 |     "**Null Hypothesis:** There is no correlation between the number of discharges and the excess readmission ratio.\n",
315 |     "\n",
316 |     "**Alternative Hypothesis:** There is a correlation between the number of discharges and the excess readmission ratio.\n",
317 |     "\n",
318 |     "The test statistic is the Pearson correlation coefficient (Pearson's r).\n",
319 |     "\n",
320 |     "Significance level: $\\alpha$ = 0.05 (95% confidence)"
321 | ]
322 | },
323 | {
324 | "cell_type": "code",
325 | "execution_count": null,
326 | "metadata": {},
327 | "outputs": [],
328 | "source": [
329 | "# Calculate the correlation coefficient\n",
330 |     "r = stats.pearsonr(hospital_df['Number of Discharges'], hospital_df['Excess Readmission Ratio'])\n",
331 |     "print(\"Correlation coefficient:\", r[0])"
332 | ]
333 | },
334 | {
335 | "cell_type": "markdown",
336 | "metadata": {},
337 | "source": [
338 |     "The correlation between the excess readmission ratio and the number of discharges is weak (r is approximately -0.097)."
339 | ]
340 | },
341 | {
342 | "cell_type": "markdown",
343 | "metadata": {},
344 | "source": [
345 |     "### Compute and report the observed significance value (p-value)"
346 | ]
347 | },
348 | {
349 | "cell_type": "code",
350 | "execution_count": null,
351 | "metadata": {},
352 | "outputs": [],
353 | "source": [
354 |     "# Define the permutation helper\n",
355 |     "def permute_stat(data_1, data_2, size):\n",
356 |     "    \"\"\"Calculate Pearson correlation coefficients for randomly permuted copies\n",
357 |     "    of the two data sets; returns an array of `size` permuted statistics.\"\"\"\n",
358 | " \n",
359 | " r = np.empty(size)\n",
360 | "\n",
361 | " np.random.seed(22)\n",
362 | " for i in range(size):\n",
363 | " syn_data1 = np.random.permutation(data_1)\n",
364 | " syn_data2 = np.random.permutation(data_2)\n",
365 | " r[i] = (stats.pearsonr(syn_data1,syn_data2))[0]\n",
366 | " return r"
367 | ]
368 | },
369 | {
370 | "cell_type": "code",
371 | "execution_count": null,
372 | "metadata": {},
373 | "outputs": [],
374 | "source": [
375 |     "# Generate 10000 permutation correlation coefficients under the null hypothesis\n",
376 | "r = permute_stat(hospital_df['Number of Discharges'], hospital_df['Excess Readmission Ratio'], 10000)"
377 | ]
378 | },
379 | {
380 | "cell_type": "code",
381 | "execution_count": null,
382 | "metadata": {},
383 | "outputs": [],
384 | "source": [
385 | "# Calculate standard deviation\n",
386 | "np.std(r)"
387 | ]
388 | },
389 | {
390 | "cell_type": "code",
391 | "execution_count": null,
392 | "metadata": {},
393 | "outputs": [],
394 | "source": [
395 | "# fit a slope for interpretation\n",
396 | "p = np.polyfit(hospital_df['Number of Discharges'], hospital_df['Excess Readmission Ratio'], 1)\n",
397 | "print(\"coefficient = \", p[0])"
398 | ]
399 | },
400 | {
401 | "cell_type": "code",
402 | "execution_count": null,
403 | "metadata": {},
404 | "outputs": [],
405 | "source": [
406 | "plt.hist(r, bins = 100)\n",
407 | "plt.xlabel('pearson r value')\n",
408 | "plt.ylabel('counts')\n",
409 |     "plt.title('permutation r correlations under the null hypothesis')"
410 | ]
411 | },
412 | {
413 | "cell_type": "code",
414 | "execution_count": null,
415 | "metadata": {},
416 | "outputs": [],
417 | "source": [
418 |     "# Calculate the p-value for the observed Pearson r of about -0.097:\n",
419 |     "p_val = np.sum(r <= -0.0973) / len(r)\n",
420 |     "print(\"p_value for the hospital dataset is:\", p_val)"
421 | ]
422 | },
423 | {
424 | "cell_type": "markdown",
425 | "metadata": {},
426 | "source": [
427 |     "`r` is the sample Pearson correlation coefficient. It takes values between -1 and +1: values near +1 indicate a strong positive relationship and values near -1 a strong negative one. The observed value here, about -0.097, indicates a very weak negative relationship."
428 | ]
429 | },
430 | {
431 | "cell_type": "markdown",
432 | "metadata": {},
433 | "source": [
434 |     "The p-value for this observation is below the significance level, so the null hypothesis is rejected: there is a statistically significant, though weak, correlation between the number of discharges and the excess readmission ratio."
435 | ]
436 | },
437 | {
438 | "cell_type": "markdown",
439 | "metadata": {},
440 | "source": [
441 | "### Discuss statistical significance and practical significance. Do they differ here? How does this change your recommendation to the client?"
442 | ]
443 | },
444 | {
445 | "cell_type": "markdown",
446 | "metadata": {},
447 | "source": [
448 | "**Discussion on statistical significance and practical significance:**\n",
449 | "\n",
450 |     "- Statistical significance refers to how unlikely it is that a result was obtained by chance, i.e., the probability that a relationship between the two variables really exists. Practical significance refers to whether the relationship between the variables matters in the real-world situation.\n",
451 | "\n",
452 | "- Statistical significance depends upon the sample size, practical significance depends upon external factors like cost, time, objective, etc.\n",
453 | "\n",
454 |     "- Statistical significance does not guarantee practical significance, but to be practically significant, a result must also be statistically significant.\n",
455 | "\n",
456 | "Click on this [link](http://www.differencebetween.net/science/mathematics-statistics/difference-between-statistical-significance-and-practical-significance/#ixzz5ZEwMu3oW) to read more about \"Statistical significance vs Practical significance\" \n",
457 | "\n",
458 |     "Statistical significance means it is unlikely that the measured statistic would occur by sampling variation alone. A hypothesis test only tells us whether a relationship exists beyond what sampling variation could produce; it does not describe how strong that relationship is. For example, across all hospitals, every additional 100 discharges of capacity is associated with only about a 0.3% decrease in the excess readmission ratio. The relationship between discharges and readmissions may be so weak that there is no practical use in acting on it, so the mere fact that the correlation is statistically significant may not be meaningful to act upon.\n",
459 | "\n",
460 |     "Adding an \"effect size\" measure, in this case the Pearson r, tells us how strong the relationship is. A common rule of thumb is that |r| around 0.1 is a small correlation, around 0.3 is medium, and above 0.5 is large. Combining an effect size with statistical significance is one way to assess practical significance, and what counts as practically significant depends on the field of study and the specific question. For this survey of hospital readmissions, I would convey to the client that there is a very weak correlation between hospital capacity and readmission, and that the relationship is probably not strong enough to draw any conclusion to act upon."
461 | ]
462 | },
463 | {
464 | "cell_type": "markdown",
465 | "metadata": {},
466 | "source": [
467 | "### Look at the scatterplot above.\n",
468 | "**What are the advantages and disadvantages of using this plot to convey information?**"
469 | ]
470 | },
471 | {
472 | "cell_type": "markdown",
473 | "metadata": {},
474 | "source": [
475 |     "Scatter plots are good for visualizing the relationship between two continuous variables, but without a sound statistical analysis it is not appropriate to draw conclusions from them alone."
476 | ]
477 | },
478 | {
479 | "cell_type": "markdown",
480 | "metadata": {},
481 | "source": [
482 | "**Construct another plot that conveys the same information in a more direct manner.**"
483 | ]
484 | },
485 | {
486 | "cell_type": "markdown",
487 | "metadata": {},
488 | "source": [
489 |     "The scatter plot shows too much information in a small space. A better visual would be a joint plot, which also shows the marginal distributions."
490 | ]
491 | },
492 | {
493 | "cell_type": "code",
494 | "execution_count": null,
495 | "metadata": {
496 | "scrolled": true
497 | },
498 | "outputs": [],
499 | "source": [
500 |     "sns.jointplot(x='Number of Discharges', y='Excess Readmission Ratio', data=hospital_df, kind='reg')"
501 | ]
502 | },
503 | {
504 | "cell_type": "code",
505 | "execution_count": null,
506 | "metadata": {},
507 | "outputs": [],
508 | "source": [
509 |     "plt.hist(hospital_df['Number of Readmissions'], bins=100)"
510 | ]
511 | },
512 | {
513 | "cell_type": "code",
514 | "execution_count": null,
515 | "metadata": {},
516 | "outputs": [],
517 | "source": []
518 | },
519 | {
520 | "cell_type": "code",
521 | "execution_count": null,
522 | "metadata": {},
523 | "outputs": [],
524 | "source": []
525 | },
526 | {
527 | "cell_type": "code",
528 | "execution_count": null,
529 | "metadata": {},
530 | "outputs": [],
531 | "source": []
532 | },
533 | {
534 | "cell_type": "code",
535 | "execution_count": null,
536 | "metadata": {},
537 | "outputs": [],
538 | "source": []
539 | },
540 | {
541 | "cell_type": "code",
542 | "execution_count": null,
543 | "metadata": {},
544 | "outputs": [],
545 | "source": []
546 | },
547 | {
548 | "cell_type": "code",
549 | "execution_count": null,
550 | "metadata": {},
551 | "outputs": [],
552 | "source": []
553 | }
554 | ],
555 | "metadata": {
556 | "anaconda-cloud": {},
557 | "kernelspec": {
558 | "display_name": "Python 3",
559 | "language": "python",
560 | "name": "python3"
561 | },
562 | "language_info": {
563 | "codemirror_mode": {
564 | "name": "ipython",
565 | "version": 3
566 | },
567 | "file_extension": ".py",
568 | "mimetype": "text/x-python",
569 | "name": "python",
570 | "nbconvert_exporter": "python",
571 | "pygments_lexer": "ipython3",
572 | "version": "3.6.5"
573 | }
574 | },
575 | "nbformat": 4,
576 | "nbformat_minor": 1
577 | }
578 |
--------------------------------------------------------------------------------
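For reference, the permutation test described in the notebook above can also be written as a self-contained script. The sketch below recomputes the observed Pearson r between 'Number of Discharges' and 'Excess Readmission Ratio', builds a null distribution by permuting one of the columns, and reports the fraction of permuted correlations at least as extreme as the observed value (a two-sided p-value). The CSV path and cleaning steps mirror the notebook; treat this as an illustrative sketch, not part of the original analysis.

import numpy as np
import pandas as pd
from scipy import stats

# Mirror the notebook's cleaning steps (same CSV path as in the notebook)
df = pd.read_csv("data/cms_hospital_readmissions.csv")
df = df[df["Number of Discharges"] != "Not Available"].copy()
df["Number of Discharges"] = df["Number of Discharges"].astype(int)
df = df.dropna(subset=["Excess Readmission Ratio"])

x = df["Number of Discharges"].to_numpy()
y = df["Excess Readmission Ratio"].to_numpy()
obs_r = stats.pearsonr(x, y)[0]  # observed correlation (about -0.097 per the notebook)

# Null distribution: permuting one variable breaks any association while
# keeping both marginal distributions intact
rng = np.random.default_rng(22)
n_perm = 10_000
perm_r = np.array([stats.pearsonr(rng.permutation(x), y)[0] for _ in range(n_perm)])

# Two-sided p-value: fraction of permuted |r| at least as large as the observed |r|
p_value = np.mean(np.abs(perm_r) >= abs(obs_r))
print(f"observed r = {obs_r:.4f}, permutation p-value = {p_value:.4f}")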
/Exploratory_Data_Analysis_Project-Normal_Human_Body_Temperature/data/human_body_temperature.csv:
--------------------------------------------------------------------------------
1 | temperature,gender,heart_rate
2 | 99.3,F,68.0
3 | 98.4,F,81.0
4 | 97.8,M,73.0
5 | 99.2,F,66.0
6 | 98.0,F,73.0
7 | 99.2,M,83.0
8 | 98.0,M,71.0
9 | 98.8,M,78.0
10 | 98.4,F,84.0
11 | 98.6,F,86.0
12 | 98.8,F,89.0
13 | 96.7,F,62.0
14 | 98.2,M,72.0
15 | 98.7,F,79.0
16 | 97.8,F,77.0
17 | 98.8,F,83.0
18 | 98.3,F,79.0
19 | 98.2,M,64.0
20 | 97.2,F,68.0
21 | 99.4,M,70.0
22 | 98.3,F,78.0
23 | 98.2,M,71.0
24 | 98.6,M,70.0
25 | 98.4,M,68.0
26 | 97.8,M,65.0
27 | 98.0,F,87.0
28 | 97.8,F,62.0
29 | 98.2,F,69.0
30 | 98.4,F,73.0
31 | 98.1,M,67.0
32 | 98.3,M,86.0
33 | 97.6,F,61.0
34 | 98.5,M,71.0
35 | 98.6,M,82.0
36 | 99.3,M,63.0
37 | 99.5,M,75.0
38 | 99.1,M,71.0
39 | 98.3,M,72.0
40 | 97.9,F,79.0
41 | 96.4,F,69.0
42 | 98.4,F,79.0
43 | 98.4,M,82.0
44 | 96.9,M,74.0
45 | 97.2,M,64.0
46 | 99.0,F,79.0
47 | 97.9,F,69.0
48 | 97.4,M,72.0
49 | 97.4,M,68.0
50 | 97.9,M,76.0
51 | 97.1,M,82.0
52 | 98.9,F,76.0
53 | 98.3,F,80.0
54 | 98.5,F,83.0
55 | 98.6,M,78.0
56 | 98.2,F,73.0
57 | 98.6,F,82.0
58 | 98.8,F,70.0
59 | 98.2,M,66.0
60 | 98.2,F,65.0
61 | 97.6,M,73.0
62 | 99.1,F,80.0
63 | 98.4,M,84.0
64 | 98.2,F,57.0
65 | 98.6,M,83.0
66 | 98.7,F,65.0
67 | 97.4,M,70.0
68 | 97.4,F,57.0
69 | 98.6,M,77.0
70 | 98.7,F,82.0
71 | 98.9,M,80.0
72 | 98.1,F,81.0
73 | 97.7,F,61.0
74 | 98.0,M,78.0
75 | 98.8,M,81.0
76 | 99.0,M,75.0
77 | 98.8,M,78.0
78 | 98.0,F,76.0
79 | 98.4,M,70.0
80 | 97.4,M,78.0
81 | 97.6,M,74.0
82 | 98.8,F,73.0
83 | 98.0,M,67.0
84 | 97.5,M,70.0
85 | 99.2,F,77.0
86 | 98.6,F,85.0
87 | 97.1,M,75.0
88 | 98.6,F,77.0
89 | 98.0,M,78.0
90 | 98.7,M,73.0
91 | 98.1,M,73.0
92 | 97.8,M,74.0
93 | 100.0,F,78.0
94 | 98.8,F,84.0
95 | 97.1,M,73.0
96 | 97.8,M,58.0
97 | 96.8,F,75.0
98 | 99.9,F,79.0
99 | 98.7,F,64.0
100 | 98.8,F,64.0
101 | 98.0,M,74.0
102 | 99.0,M,81.0
103 | 98.5,M,68.0
104 | 98.0,F,78.0
105 | 99.4,F,77.0
106 | 97.6,M,69.0
107 | 96.7,M,71.0
108 | 97.0,M,80.0
109 | 98.6,M,66.0
110 | 98.7,F,72.0
111 | 97.3,M,69.0
112 | 98.8,F,69.0
113 | 98.0,F,89.0
114 | 98.2,F,64.0
115 | 99.1,F,74.0
116 | 99.0,M,79.0
117 | 98.0,M,64.0
118 | 100.8,F,77.0
119 | 97.8,F,71.0
120 | 98.7,M,78.0
121 | 98.4,F,74.0
122 | 97.7,F,84.0
123 | 97.9,F,68.0
124 | 99.0,F,81.0
125 | 97.2,F,66.0
126 | 97.5,M,75.0
127 | 96.3,M,70.0
128 | 97.7,M,77.0
129 | 98.2,F,73.0
130 | 97.9,M,72.0
131 | 98.7,F,59.0
132 |
--------------------------------------------------------------------------------
/Google_API_Project/Google API Project.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | " GOOGLE Geocoding API Project"
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": 1,
13 | "metadata": {},
14 | "outputs": [],
15 | "source": [
16 | "# Import necessary packages"
17 | ]
18 | },
19 | {
20 | "cell_type": "code",
21 | "execution_count": 2,
22 | "metadata": {},
23 | "outputs": [],
24 | "source": [
25 | "import json\n",
26 | "import urllib.parse\n",
27 | "import requests"
28 | ]
29 | },
30 | {
31 | "cell_type": "code",
32 | "execution_count": 3,
33 | "metadata": {},
34 | "outputs": [],
35 | "source": [
36 | "# Personal API Key"
37 | ]
38 | },
39 | {
40 | "cell_type": "code",
41 | "execution_count": 4,
42 | "metadata": {},
43 | "outputs": [],
44 | "source": [
45 | "API_KEY = '------'"
46 | ]
47 | },
48 | {
49 | "cell_type": "code",
50 | "execution_count": 5,
51 | "metadata": {},
52 | "outputs": [],
53 | "source": [
54 | "# Program sends requested address to Google Geocoding API and returns the detailed address information.\n",
55 | "# Users enters 'q' or 'quit' to quit from the program."
56 | ]
57 | },
58 | {
59 | "cell_type": "code",
60 | "execution_count": null,
61 | "metadata": {},
62 | "outputs": [
63 | {
64 | "name": "stdout",
65 | "output_type": "stream",
66 | "text": [
67 | "Please Enter Address: 10013\n",
68 | "https://maps.googleapis.com/maps/api/geocode/json?address=10013\n",
69 | "API Status: OK\n",
70 | "\n",
71 | "10013\n",
72 | "Manhattan\n",
73 | "New York\n",
74 | "New York County\n",
75 | "New York\n",
76 | "United States\n",
77 | "\n",
78 | "New York, NY 10013, USA\n"
79 | ]
80 | }
81 | ],
82 | "source": [
83 | "while True:\n",
84 | " address = input('Please Enter Address: ')\n",
85 | " \n",
86 | " if address == 'quit' or address == 'q':\n",
87 | " break\n",
88 | " \n",
89 | " #Web pages address for requests\n",
90 | " main_api = 'https://maps.googleapis.com/maps/api/geocode/json?' \n",
91 | " url_p = main_api + urllib.parse.urlencode({'address': address})\n",
92 | " url = main_api + urllib.parse.urlencode({'address': address}) + '&key=' + API_KEY\n",
93 | " print(url_p)\n",
94 | " \n",
95 | " #Incoming data from API\n",
96 | " json_data = requests.get(url).json()\n",
97 | " \n",
98 | " #Check API status\n",
99 | " json_status = json_data['status']\n",
100 | " print('API Status: ' + json_status + '\\n')\n",
101 | " \n",
102 | " if json_status == 'OK':\n",
103 | " for each in json_data['results'][0]['address_components']:\\\n",
104 | " print(each['long_name'])\n",
105 | " \n",
106 | " formatted_address = json_data['results'][0]['formatted_address']\n",
107 | " print()\n",
108 | " print(formatted_address)"
109 | ]
110 | }
111 | ],
112 | "metadata": {
113 | "kernelspec": {
114 | "display_name": "Python 3",
115 | "language": "python",
116 | "name": "python3"
117 | },
118 | "language_info": {
119 | "codemirror_mode": {
120 | "name": "ipython",
121 | "version": 3
122 | },
123 | "file_extension": ".py",
124 | "mimetype": "text/x-python",
125 | "name": "python",
126 | "nbconvert_exporter": "python",
127 | "pygments_lexer": "ipython3",
128 | "version": "3.6.5"
129 | }
130 | },
131 | "nbformat": 4,
132 | "nbformat_minor": 2
133 | }
134 |
--------------------------------------------------------------------------------
/Logistic_Regression_Project-Gender_Classification_by_Heights_and_Weights/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 |
5 | # C extensions
6 | *.so
7 |
8 | # Distribution / packaging
9 | .Python
10 | env/
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | *.egg-info/
23 | .installed.cfg
24 | *.egg
25 |
26 | # PyInstaller
27 | # Usually these files are written by a python script from a template
28 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
29 | *.manifest
30 | *.spec
31 |
32 | # Installer logs
33 | pip-log.txt
34 | pip-delete-this-directory.txt
35 |
36 | # Unit test / coverage reports
37 | htmlcov/
38 | .tox/
39 | .coverage
40 | .coverage.*
41 | .cache
42 | nosetests.xml
43 | coverage.xml
44 | *.cover
45 |
46 | # Translations
47 | *.mo
48 | *.pot
49 |
50 | # Django stuff:
51 | *.log
52 |
53 | # Sphinx documentation
54 | docs/_build/
55 |
56 | # PyBuilder
57 | target/
58 |
59 | #Ipython
60 | .ipynb_checkpoints/
61 | # Created by .ignore support plugin (hsz.mobi)
62 | ### OSX template
63 | .DS_Store
64 | .AppleDouble
65 | .LSOverride
66 |
67 | # Icon must end with two \r
68 | Icon
69 |
70 | # Thumbnails
71 | ._*
72 |
73 | # Files that might appear in the root of a volume
74 | .DocumentRevisions-V100
75 | .fseventsd
76 | .Spotlight-V100
77 | .TemporaryItems
78 | .Trashes
79 | .VolumeIcon.icns
80 |
81 | # Directories potentially created on remote AFP share
82 | .AppleDB
83 | .AppleDesktop
84 | Network Trash Folder
85 | Temporary Items
86 | .apdisk
87 |
88 | #Temporary data
89 | tempdata/
90 |
--------------------------------------------------------------------------------
/Logistic_Regression_Project-Gender_Classification_by_Heights_and_Weights/images/bias.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ShiningData/Springboard_Projects/d1368678312a35438e5ebf9fff5139344dd7ddc6/Logistic_Regression_Project-Gender_Classification_by_Heights_and_Weights/images/bias.png
--------------------------------------------------------------------------------
/Logistic_Regression_Project-Gender_Classification_by_Heights_and_Weights/images/complexity-error-plot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ShiningData/Springboard_Projects/d1368678312a35438e5ebf9fff5139344dd7ddc6/Logistic_Regression_Project-Gender_Classification_by_Heights_and_Weights/images/complexity-error-plot.png
--------------------------------------------------------------------------------
/Logistic_Regression_Project-Gender_Classification_by_Heights_and_Weights/images/complexity-error-reg.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ShiningData/Springboard_Projects/d1368678312a35438e5ebf9fff5139344dd7ddc6/Logistic_Regression_Project-Gender_Classification_by_Heights_and_Weights/images/complexity-error-reg.png
--------------------------------------------------------------------------------
/Logistic_Regression_Project-Gender_Classification_by_Heights_and_Weights/images/data.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ShiningData/Springboard_Projects/d1368678312a35438e5ebf9fff5139344dd7ddc6/Logistic_Regression_Project-Gender_Classification_by_Heights_and_Weights/images/data.png
--------------------------------------------------------------------------------
/Logistic_Regression_Project-Gender_Classification_by_Heights_and_Weights/images/knn1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ShiningData/Springboard_Projects/d1368678312a35438e5ebf9fff5139344dd7ddc6/Logistic_Regression_Project-Gender_Classification_by_Heights_and_Weights/images/knn1.png
--------------------------------------------------------------------------------
/Logistic_Regression_Project-Gender_Classification_by_Heights_and_Weights/images/knn2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ShiningData/Springboard_Projects/d1368678312a35438e5ebf9fff5139344dd7ddc6/Logistic_Regression_Project-Gender_Classification_by_Heights_and_Weights/images/knn2.png
--------------------------------------------------------------------------------
/Logistic_Regression_Project-Gender_Classification_by_Heights_and_Weights/images/linreg.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ShiningData/Springboard_Projects/d1368678312a35438e5ebf9fff5139344dd7ddc6/Logistic_Regression_Project-Gender_Classification_by_Heights_and_Weights/images/linreg.png
--------------------------------------------------------------------------------
/Logistic_Regression_Project-Gender_Classification_by_Heights_and_Weights/images/linsep.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ShiningData/Springboard_Projects/d1368678312a35438e5ebf9fff5139344dd7ddc6/Logistic_Regression_Project-Gender_Classification_by_Heights_and_Weights/images/linsep.png
--------------------------------------------------------------------------------
/Logistic_Regression_Project-Gender_Classification_by_Heights_and_Weights/images/onelinesplit.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ShiningData/Springboard_Projects/d1368678312a35438e5ebf9fff5139344dd7ddc6/Logistic_Regression_Project-Gender_Classification_by_Heights_and_Weights/images/onelinesplit.png
--------------------------------------------------------------------------------
/Logistic_Regression_Project-Gender_Classification_by_Heights_and_Weights/images/pcanim.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ShiningData/Springboard_Projects/d1368678312a35438e5ebf9fff5139344dd7ddc6/Logistic_Regression_Project-Gender_Classification_by_Heights_and_Weights/images/pcanim.gif
--------------------------------------------------------------------------------
/Logistic_Regression_Project-Gender_Classification_by_Heights_and_Weights/images/reshape.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ShiningData/Springboard_Projects/d1368678312a35438e5ebf9fff5139344dd7ddc6/Logistic_Regression_Project-Gender_Classification_by_Heights_and_Weights/images/reshape.jpg
--------------------------------------------------------------------------------
/Logistic_Regression_Project-Gender_Classification_by_Heights_and_Weights/images/sklearn2.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ShiningData/Springboard_Projects/d1368678312a35438e5ebf9fff5139344dd7ddc6/Logistic_Regression_Project-Gender_Classification_by_Heights_and_Weights/images/sklearn2.jpg
--------------------------------------------------------------------------------
/Logistic_Regression_Project-Gender_Classification_by_Heights_and_Weights/images/sklearntrans.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ShiningData/Springboard_Projects/d1368678312a35438e5ebf9fff5139344dd7ddc6/Logistic_Regression_Project-Gender_Classification_by_Heights_and_Weights/images/sklearntrans.jpg
--------------------------------------------------------------------------------
/Logistic_Regression_Project-Gender_Classification_by_Heights_and_Weights/images/train-cv2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ShiningData/Springboard_Projects/d1368678312a35438e5ebf9fff5139344dd7ddc6/Logistic_Regression_Project-Gender_Classification_by_Heights_and_Weights/images/train-cv2.png
--------------------------------------------------------------------------------
/Logistic_Regression_Project-Gender_Classification_by_Heights_and_Weights/images/train-cv3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ShiningData/Springboard_Projects/d1368678312a35438e5ebf9fff5139344dd7ddc6/Logistic_Regression_Project-Gender_Classification_by_Heights_and_Weights/images/train-cv3.png
--------------------------------------------------------------------------------
/Logistic_Regression_Project-Gender_Classification_by_Heights_and_Weights/images/train-test.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ShiningData/Springboard_Projects/d1368678312a35438e5ebf9fff5139344dd7ddc6/Logistic_Regression_Project-Gender_Classification_by_Heights_and_Weights/images/train-test.png
--------------------------------------------------------------------------------
/Logistic_Regression_Project-Gender_Classification_by_Heights_and_Weights/images/train-validate-test-cont.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ShiningData/Springboard_Projects/d1368678312a35438e5ebf9fff5139344dd7ddc6/Logistic_Regression_Project-Gender_Classification_by_Heights_and_Weights/images/train-validate-test-cont.png
--------------------------------------------------------------------------------
/Logistic_Regression_Project-Gender_Classification_by_Heights_and_Weights/images/train-validate-test.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ShiningData/Springboard_Projects/d1368678312a35438e5ebf9fff5139344dd7ddc6/Logistic_Regression_Project-Gender_Classification_by_Heights_and_Weights/images/train-validate-test.png
--------------------------------------------------------------------------------
/Logistic_Regression_Project-Gender_Classification_by_Heights_and_Weights/images/train-validate-test3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ShiningData/Springboard_Projects/d1368678312a35438e5ebf9fff5139344dd7ddc6/Logistic_Regression_Project-Gender_Classification_by_Heights_and_Weights/images/train-validate-test3.png
--------------------------------------------------------------------------------
/Naive_Bayes_Project-Predicting_Movie_Ratings_From_Reviews/Test1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ShiningData/Springboard_Projects/d1368678312a35438e5ebf9fff5139344dd7ddc6/Naive_Bayes_Project-Predicting_Movie_Ratings_From_Reviews/Test1.png
--------------------------------------------------------------------------------
/Naive_Bayes_Project-Predicting_Movie_Ratings_From_Reviews/callibration.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ShiningData/Springboard_Projects/d1368678312a35438e5ebf9fff5139344dd7ddc6/Naive_Bayes_Project-Predicting_Movie_Ratings_From_Reviews/callibration.png
--------------------------------------------------------------------------------
/Naive_Bayes_Project-Predicting_Movie_Ratings_From_Reviews/terms.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ShiningData/Springboard_Projects/d1368678312a35438e5ebf9fff5139344dd7ddc6/Naive_Bayes_Project-Predicting_Movie_Ratings_From_Reviews/terms.png
--------------------------------------------------------------------------------
/Naive_Bayes_Project-Predicting_Movie_Ratings_From_Reviews/terms2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ShiningData/Springboard_Projects/d1368678312a35438e5ebf9fff5139344dd7ddc6/Naive_Bayes_Project-Predicting_Movie_Ratings_From_Reviews/terms2.png
--------------------------------------------------------------------------------
/Naive_Bayes_Project-Predicting_Movie_Ratings_From_Reviews/vsm.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ShiningData/Springboard_Projects/d1368678312a35438e5ebf9fff5139344dd7ddc6/Naive_Bayes_Project-Predicting_Movie_Ratings_From_Reviews/vsm.png
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Springboard_Projects
2 |
3 |
4 | ## Introduction
5 |
6 | This repository contains the projects completed as part of Springboard's Data Science Career Track. The capstone and visualisation projects are not included here; they are available as separate repositories.
7 |
8 |
9 | ## Contents
10 |
11 | Disclaimer: If you are a Springboard DSC student, I strongly suggest you refrain from viewing the code before you have actually attempted to solve the problems yourself.
12 |
13 | 1. [Understanding Country Club Database with SQL - Manipulating data in SQL](./SQL_Project-Country_Club_Database)
14 | 2. [Analyzing World Bank Projects - Data Wrangling with JSON file](./Data_Wrangling_Project-JSON_File)
15 | 3. [API Project - Quandl - Data Wrangling](./API_Project-Quandl)
16 | 4. [What is the true Normal Human Body Temperature - Inferential Statistics](./Exploratory_Data_Analysis_Project-Normal_Human_Body_Temperature)
17 | 5. [Examining Racial Discrimination in the US Job Market - Inferential Statistics](./Exploratory_Data_Analysis_Project-Examine_Racial_Discrimination)
18 | 6. [Hospital Readmission Analysis and Recommendations - Inferential Statistics](./Exploratory_Data_Analysis_Project-Hospital_Readmissions)
19 | 7. [Predicting House Prices using Linear Regression - Supervised Machine Learning Project](./Linear_Regression_Project-Boston_Housing_Dataset)
20 | 8. [Predicting Gender using Logistic Regression - Supervised Machine Learning Project](./Logistic_Regression_Project-Gender_Classification_by_Heights_and_Weights)
21 | 9. [Movie Review Sentiment Analysis using Naive Bayes - Supervised Machine Learning Project](./Naive_Bayes_Project-Predicting_Movie_Ratings_From_Reviews)
22 | 10. [Wine Customer Segmentation using Unsupervised Learning - Unsupervised Machine Learning Project](./Clustering_Project-Customer_Segmentation)
23 | 11. [Spark Project - Databricks](./Spark_Project-Databricks)
24 | 12. [Ultimate Inc. Data Science Challenge - Time Series Project](./Take_Home_Challenge-Ultimate_Technologies_Inc)
25 | 13. [Relax Inc. Data Science Challenge](./Take_Home_Challenge-Relax_Inc)
26 |
27 |
28 | The helper below flattens nested JSON records (nested dictionaries and lists of dictionaries) into a flat `pandas` DataFrame:
29 | ```python
30 | import pandas as pd
31 | import json
32 | from typing import Dict, List, Union
33 |
34 | def expand_nested_json(data: Union[List[Dict], Dict, str], separator: str = "_") -> pd.DataFrame:
35 |     """Flatten nested dictionaries and lists of dictionaries in `data` into a single-level pandas DataFrame."""
36 | # If input is a dictionary, convert to a list containing that dictionary
37 | if isinstance(data, dict):
38 | data = [data]
39 |
40 | # If input is a string (JSON), parse it
41 | if isinstance(data, str):
42 | data = json.loads(data)
43 | if isinstance(data, dict):
44 | data = [data]
45 |
46 | # First convert to DataFrame
47 | df = pd.DataFrame(data)
48 |
49 | # Function to flatten nested columns
50 | def flatten_nested_columns(df: pd.DataFrame, separator: str = "_") -> pd.DataFrame:
51 | # Create a copy to avoid modifying the original DataFrame
52 | result_df = df.copy()
53 |
54 | # Find columns with dictionaries or lists
55 | nested_columns = [
56 | col for col in result_df.columns
57 | if any(isinstance(val, (dict, list)) for val in result_df[col].dropna())
58 | ]
59 |
60 | # No nested columns to expand
61 | if not nested_columns:
62 | return result_df
63 |
64 | # Process each nested column
65 | for col in nested_columns:
66 | # Handle dictionary columns
67 | if any(isinstance(val, dict) for val in result_df[col].dropna()):
68 | # Convert column to DataFrame
69 | expanded = pd.json_normalize(
70 | result_df[col].apply(lambda x: {} if pd.isna(x) else x)
71 | )
72 |
73 | # Rename columns with prefix
74 | expanded.columns = [f"{col}{separator}{subcol}" for subcol in expanded.columns]
75 |
76 | # Drop the original column and join with expanded columns
77 | result_df = result_df.drop(col, axis=1).join(expanded)
78 |
79 | # Handle list columns
80 | elif any(isinstance(val, list) for val in result_df[col].dropna()):
81 | # Handle lists of dictionaries
82 | if any(isinstance(item, dict) for sublist in result_df[col].dropna() for item in sublist if sublist):
83 | # Create a temporary column with the index
84 | result_df['_temp_idx'] = range(len(result_df))
85 |
86 | # Explode the list column into separate rows
87 | exploded = result_df[[col, '_temp_idx']].explode(col)
88 |
89 | # Normalize the exploded dictionaries
90 | if not exploded.empty and any(isinstance(val, dict) for val in exploded[col].dropna()):
91 | expanded = pd.json_normalize(
92 | exploded[col].apply(lambda x: {} if pd.isna(x) else x)
93 | )
94 |
95 | # Prefix column names
96 | expanded.columns = [f"{col}{separator}{subcol}" for subcol in expanded.columns]
97 |
98 | # Join with the index column
99 | expanded['_temp_idx'] = exploded['_temp_idx'].values
100 |
101 | # Group by index and convert expanded columns to lists
102 | grouped = expanded.groupby('_temp_idx').agg(list)
103 |
104 | # Join with the original DataFrame
105 | result_df = result_df.drop(col, axis=1).join(grouped, on='_temp_idx')
106 |
107 | # Clean up temporary index column
108 | result_df = result_df.drop('_temp_idx', axis=1)
109 |
110 | # Handle simple lists (strings, numbers)
111 | else:
112 | # Convert lists to strings for simple representation
113 | result_df[col] = result_df[col].apply(
114 | lambda x: json.dumps(x) if isinstance(x, list) else x
115 | )
116 |
117 | # Recursively process any new nested columns that were created
118 | return flatten_nested_columns(result_df, separator)
119 |
120 | # Apply the recursive flattening
121 | flattened_df = flatten_nested_columns(df, separator)
122 |
123 | return flattened_df
124 | ```
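
A minimal usage sketch (the nested record below is invented purely for illustration):

```python
# Hypothetical nested record, used only to demonstrate expand_nested_json above
record = {
    "id": 1,
    "owner": {"name": "Ada", "country": "UK"},                    # nested dict
    "tags": ["alpha", "beta"],                                     # simple list
    "contacts": [{"type": "email", "value": "ada@example.com"}]    # list of dicts
}

flat = expand_nested_json(record)
# Nested dicts expand to owner_name / owner_country, lists of dicts to
# contacts_type / contacts_value, and simple lists are serialized to JSON strings.
print(flat.to_string(index=False))
```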
--------------------------------------------------------------------------------
/SQL_Project-Country_Club_Database/SQL_Project-Country_Club_Database.sql:
--------------------------------------------------------------------------------
1 | /* Welcome to the SQL mini project. For this project, you will use
2 | Springboard's online SQL platform.
3 |
4 | The data you need is in the "country_club" database. This database
5 | contains 3 tables:
6 | i) the "Bookings" table,
7 | ii) the "Facilities" table, and
8 | iii) the "Members" table.
9 |
10 | Note that, if you need to, you can also download these tables locally.
11 |
12 | In the mini project, you'll be asked a series of questions. You can
13 | solve them using the platform, but for the final deliverable,
14 | paste the code for each solution into this script, and upload it
15 | to your GitHub.
16 |
17 | Before starting with the questions, feel free to take your time
18 | exploring the data and getting acquainted with the 3 tables. */
19 |
20 |
21 | /* Q1: Some of the facilities charge a fee to members, but some do not.
22 | Please list the names of the facilities that do. */
23 |
24 | SELECT
25 | name
26 | FROM
27 | facilities
28 | WHERE
29 | membercost != 0;
30 |
31 |
32 | /* Q2: How many facilities do not charge a fee to members? */
33 |
34 | SELECT
35 | COUNT(name) AS zero_membercost
36 | FROM
37 | facilities
38 | WHERE
39 | membercost = 0;
40 |
41 |
42 | /* Q3: How can you produce a list of facilities that charge a fee to members,
43 | where the fee is less than 20% of the facility's monthly maintenance cost?
44 | Return the facid, facility name, member cost, and monthly maintenance of the
45 | facilities in question. */
46 |
47 | SELECT
48 | facid, name AS facility_name, membercost, monthlymaintenance
49 | FROM
50 | facilities
51 | WHERE
52 | membercost < 0.20 * monthlymaintenance
53 | AND membercost != 0;
54 |
55 |
56 | /* Q4: How can you retrieve the details of facilities with ID 1 and 5?
57 | Write the query without using the OR operator. */
58 |
59 | SELECT
60 | *
61 | FROM
62 | facilities
63 | WHERE
64 | facid IN (1 , 5);
65 |
66 |
67 | /* Q5: How can you produce a list of facilities, with each labelled as
68 | 'cheap' or 'expensive', depending on whether their monthly maintenance cost is
69 | more than $100? Return the name and monthly maintenance of the facilities
70 | in question. */
71 |
72 | SELECT
73 | name,
74 | CASE
75 | WHEN monthlymaintenance > 100.0 THEN 'expensive'
76 | ELSE 'cheap'
77 | END AS monthlymaintenance
78 | FROM
79 | facilities;
80 |
81 |
82 | /* Q6: You'd like to get the first and last name of the last member(s)
83 | who signed up. Do not use the LIMIT clause for your solution. */
84 |
85 | SELECT
86 | firstname, surname
87 | FROM
88 | members
89 | WHERE
90 | joindate = (SELECT
91 | MAX(joindate)
92 | FROM
93 | members);
94 |
95 |
96 | /* Q7: How can you produce a list of all members who have used a tennis court?
97 | Include in your output the name of the court, and the name of the member
98 | formatted as a single column. Ensure no duplicate data, and order by
99 | the member name. */
100 |
101 | SELECT DISTINCT
102 | (CONCAT(m.firstname, ' ', m.surname)) AS member, f.name
103 | FROM
104 | facilities f
105 | JOIN
106 | Bookings b ON f.facid = b.facid
107 | JOIN
108 | Members m ON m.memid = b.memid AND m.memid != 0
109 | WHERE
110 | f.name LIKE '%Tennis Court%'
111 | ORDER BY member;
112 |
113 |
114 | /* Q8: How can you produce a list of bookings on the day of 2012-09-14 which
115 | will cost the member (or guest) more than $30? Remember that guests have
116 | different costs to members (the listed costs are per half-hour 'slot'), and
117 | the guest user's ID is always 0. Include in your output the name of the
118 | facility, the name of the member formatted as a single column, and the cost.
119 | Order by descending cost, and do not use any subqueries. */
120 |
121 | SELECT
122 | f.name AS facility_name,
123 | CONCAT(m.firstname, ' ', m.surname) AS member_ID,
124 | CASE
125 | WHEN m.memid != 0 THEN b.slots * f.membercost
126 | WHEN m.memid = 0 THEN b.slots * f.guestcost
127 | END AS cost
128 | FROM
129 | members m
130 | JOIN
131 | bookings b ON m.memid = b.memid
132 | JOIN
133 | facilities AS f ON b.facid = f.facid
134 | WHERE
135 | b.starttime >= '2012-09-14'
136 | AND b.starttime < '2012-09-15'
137 | AND ((m.memid != 0
138 | AND b.slots * f.membercost > 30)
139 | OR (m.memid = 0
140 | AND b.slots * f.guestcost > 30))
141 | ORDER BY cost DESC;
142 |
143 |
144 | /* Q9: This time, produce the same result as in Q8, but using a subquery. */
145 |
146 | SELECT
147 | sub.name AS facility_name,
148 | sub.member AS member_ID,
149 | sub.cost AS cost
150 | FROM
151 | (SELECT
152 | CONCAT(m.firstname, ' ', m.surname) AS member,
153 | f.name,
154 | CASE
155 | WHEN m.memid != 0 THEN b.slots * f.membercost
156 | WHEN m.memid = 0 THEN b.slots * f.guestcost
157 | END AS cost
158 | FROM
159 | members m
160 | JOIN bookings b ON m.memid = b.memid
161 | JOIN facilities f ON b.facid = f.facid
162 | WHERE
163 | b.starttime >= '2012-09-14'
164 | AND b.starttime < '2012-09-15'
165 | HAVING cost > 30) AS sub
166 | ORDER BY cost DESC;
167 |
168 |
169 | /* Q10: Produce a list of facilities with a total revenue less than 1000.
170 | Output the facility name and total revenue, sorted by revenue. Remember
171 | that there's a different cost for guests and members! */
172 |
173 | SELECT
174 | facility_name, revenue AS total_revenue
175 | FROM
176 | (SELECT
177 | SUM(CASE
178 | WHEN b.memid != 0 THEN b.slots * f.membercost
179 | WHEN b.memid = 0 THEN b.slots * f.guestcost
180 | END) AS revenue,
181 | f.name AS facility_name
182 | FROM
183 | bookings b
184 | JOIN facilities f ON b.facid = f.facid
185 | GROUP BY f.name
186 | HAVING revenue < 1000) AS sub
187 | ORDER BY revenue;
188 |
--------------------------------------------------------------------------------
/SQL_Project-Country_Club_Database/Schema.JPG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ShiningData/Springboard_Projects/d1368678312a35438e5ebf9fff5139344dd7ddc6/SQL_Project-Country_Club_Database/Schema.JPG
--------------------------------------------------------------------------------
/Spark_Project-Databricks/.ipynb_checkpoints/Spark-Mini_Project-checkpoint.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "## Spark Mini Project"
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "The project was completed in databricks and published. You may reach my Spark mini project via link below. "
15 | ]
16 | },
17 | {
18 | "cell_type": "markdown",
19 | "metadata": {},
20 | "source": [
21 | "https://databricks-prod-cloudfront.cloud.databricks.com/public/4027ec902e239c93eaaa8714f173bcfc/8467584438137932/4169206975357756/5443988219023238/latest.html"
22 | ]
23 | }
24 | ],
25 | "metadata": {
26 | "kernelspec": {
27 | "display_name": "Python 3",
28 | "language": "python",
29 | "name": "python3"
30 | },
31 | "language_info": {
32 | "codemirror_mode": {
33 | "name": "ipython",
34 | "version": 3
35 | },
36 | "file_extension": ".py",
37 | "mimetype": "text/x-python",
38 | "name": "python",
39 | "nbconvert_exporter": "python",
40 | "pygments_lexer": "ipython3",
41 | "version": "3.7.1"
42 | }
43 | },
44 | "nbformat": 4,
45 | "nbformat_minor": 2
46 | }
47 |
--------------------------------------------------------------------------------
/Spark_Project-Databricks/Spark-Mini_Project.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "## Spark Mini Project"
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "The project was completed in databricks and published. You may reach my Spark mini project via link below. "
15 | ]
16 | },
17 | {
18 | "cell_type": "markdown",
19 | "metadata": {},
20 | "source": [
21 | "https://databricks-prod-cloudfront.cloud.databricks.com/public/4027ec902e239c93eaaa8714f173bcfc/8467584438137932/4169206975357756/5443988219023238/latest.html"
22 | ]
23 | }
24 | ],
25 | "metadata": {
26 | "kernelspec": {
27 | "display_name": "Python 3",
28 | "language": "python",
29 | "name": "python3"
30 | },
31 | "language_info": {
32 | "codemirror_mode": {
33 | "name": "ipython",
34 | "version": 3
35 | },
36 | "file_extension": ".py",
37 | "mimetype": "text/x-python",
38 | "name": "python",
39 | "nbconvert_exporter": "python",
40 | "pygments_lexer": "ipython3",
41 | "version": "3.7.1"
42 | }
43 | },
44 | "nbformat": 4,
45 | "nbformat_minor": 2
46 | }
47 |
--------------------------------------------------------------------------------
/Take_Home_Challenge-Relax_Inc/Relax_Keynote.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ShiningData/Springboard_Projects/d1368678312a35438e5ebf9fff5139344dd7ddc6/Take_Home_Challenge-Relax_Inc/Relax_Keynote.pdf
--------------------------------------------------------------------------------
/Take_Home_Challenge-Relax_Inc/relax_data_science_challenge.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ShiningData/Springboard_Projects/d1368678312a35438e5ebf9fff5139344dd7ddc6/Take_Home_Challenge-Relax_Inc/relax_data_science_challenge.pdf
--------------------------------------------------------------------------------
/Take_Home_Challenge-Relax_Inc/takehome_users.csv:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ShiningData/Springboard_Projects/d1368678312a35438e5ebf9fff5139344dd7ddc6/Take_Home_Challenge-Relax_Inc/takehome_users.csv
--------------------------------------------------------------------------------
/Take_Home_Challenge-Ultimate_Technologies_Inc/ultimate_data_science_challenge.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ShiningData/Springboard_Projects/d1368678312a35438e5ebf9fff5139344dd7ddc6/Take_Home_Challenge-Ultimate_Technologies_Inc/ultimate_data_science_challenge.pdf
--------------------------------------------------------------------------------