├── API_Project-Quandl
│   ├── .ipynb_checkpoints
│   │   └── API_Project-Quandl-checkpoint.ipynb
│   ├── API_Project-Quandl.ipynb
│   ├── batch_rulex_script.py
│   ├── display_predictions.py
│   ├── files_execution.txt
│   ├── local_interpretability.py
│   └── model_predictor.py
├── Clustering_Project-Customer_Segmentation
│   ├── .ipynb_checkpoints
│   │   └── Mini_Project_Clustering-checkpoint.ipynb
│   ├── Mini_Project_Clustering.ipynb
│   ├── WineKMC.xlsx
│   ├── agglomerate.png
│   └── spectral.png
├── Data_Wrangling_Project-JSON_File
│   ├── .ipynb_checkpoints
│   │   └── JSON_Project-World_Bank_Data-checkpoint.ipynb
│   ├── JSON_Project-World_Bank_Data.ipynb
│   └── data
│       └── world_bank_projects.json
├── Exploratory_Data_Analysis_Project-Examine_Racial_Discrimination
│   ├── .ipynb_checkpoints
│   │   └── EDA_Project-Examine_Racial_Discrimination-checkpoint.ipynb
│   ├── EDA_Project-Examine_Racial_Discrimination.ipynb
│   └── data
│       └── us_job_market_discrimination.dta
├── Exploratory_Data_Analysis_Project-Hospital_Readmissions
│   ├── .ipynb_checkpoints
│   │   └── EDA_Project-Hospital_Readmissions-checkpoint.ipynb
│   ├── EDA_Project-Hospital_Readmissions.ipynb
│   └── data
│       └── cms_hospital_readmissions.csv
├── Exploratory_Data_Analysis_Project-Normal_Human_Body_Temperature
│   ├── EDA_Project-Normal_Human_Body_Temperature.ipynb
│   └── data
│       └── human_body_temperature.csv
├── Google_API_Project
│   └── Google API Project.ipynb
├── Linear_Regression_Project-Boston_Housing_Dataset
│   ├── .ipynb_checkpoints
│   │   └── Mini_Project_Linear_Regression-checkpoint.ipynb
│   └── Mini_Project_Linear_Regression.ipynb
├── Logistic_Regression_Project-Gender_Classification_by_Heights_and_Weights
│   ├── .gitignore
│   ├── Logistic_Regression-Mini_Project.ipynb
│   ├── data
│   │   └── 01_heights_weights_genders.csv
│   └── images
│       ├── bias.png
│       ├── complexity-error-plot.png
│       ├── complexity-error-reg.png
│       ├── data.png
│       ├── knn1.png
│       ├── knn2.png
│       ├── linreg.png
│       ├── linsep.png
│       ├── onelinesplit.png
│       ├── pcanim.gif
│       ├── reshape.jpg
│       ├── sklearn2.jpg
│       ├── sklearntrans.jpg
│       ├── train-cv2.png
│       ├── train-cv3.png
│       ├── train-test.png
│       ├── train-validate-test-cont.png
│       ├── train-validate-test.png
│       └── train-validate-test3.png
├── Naive_Bayes_Project-Predicting_Movie_Ratings_From_Reviews
│   ├── .ipynb_checkpoints
│   │   ├── Mini_Project_Naive_Bayes-checkpoint.ipynb
│   │   └── TextAnalysis-checkpoint.ipynb
│   ├── Mini_Project_Naive_Bayes.ipynb
│   ├── Test1.png
│   ├── callibration.png
│   ├── critics.csv
│   ├── terms.png
│   ├── terms2.png
│   └── vsm.png
├── README.md
├── SQL_Project-Country_Club_Database
│   ├── SQL_Project-Country_Club_Database.sql
│   ├── Schema.JPG
│   └── data
├── Spark_Project-Databricks
│   ├── .ipynb_checkpoints
│   │   └── Spark-Mini_Project-checkpoint.ipynb
│   └── Spark-Mini_Project.ipynb
├── Take_Home_Challenge-Relax_Inc
│   ├── .ipynb_checkpoints
│   │   └── Relax Take Home Challenge-checkpoint.ipynb
│   ├── Relax Take Home Challenge.ipynb
│   ├── Relax_Keynote.pdf
│   ├── relax_data_science_challenge.pdf
│   ├── takehome_user_engagement.csv
│   └── takehome_users.csv
└── Take_Home_Challenge-Ultimate_Technologies_Inc
    ├── .ipynb_checkpoints
    │   └── Take_Home_Challenge-Notebook-checkpoint.ipynb
    ├── Take_Home_Challenge-Notebook.ipynb
    ├── logins.json
    ├── ultimate_data_challenge.json
    └── ultimate_data_science_challenge.pdf

/API_Project-Quandl/.ipynb_checkpoints/API_Project-Quandl-checkpoint.ipynb:
--------------------------------------------------------------------------------

1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# API_Project-Quandl" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": {}, 14 | "outputs": [], 15
| "source": [ 16 | "# Importing the relevant modules.\n", 17 | "\n", 18 | "import requests\n", 19 | "import json \n", 20 | "import operator\n", 21 | "import numpy as np" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": 2, 27 | "metadata": {}, 28 | "outputs": [], 29 | "source": [ 30 | "# Unique API key which is taken from http://www.quandl.com website.\n", 31 | "\n", 32 | "API_KEY = 'zHER-uPSaTEaUxTgB2d4' " 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": 3, 38 | "metadata": {}, 39 | "outputs": [], 40 | "source": [ 41 | "# Caling the Quandl API and pull out a small sample of the data (only one day) to get a glimpse \n", 42 | "# into the JSON structure that will be returned.\n", 43 | "\n", 44 | "url = 'https://www.quandl.com/api/v3/datasets/FSE/AFX_X.json?&start_date=2017-01-01&end_date=2017-01-01&api_key=' + API_KEY\n", 45 | "r = requests.get(url)\n", 46 | "r_json = r.json()" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": 22, 52 | "metadata": {}, 53 | "outputs": [ 54 | { 55 | "data": { 56 | "text/plain": [ 57 | "{'dataset': {'id': 10095370,\n", 58 | " 'dataset_code': 'AFX_X',\n", 59 | " 'database_code': 'FSE',\n", 60 | " 'name': 'Carl Zeiss Meditec (AFX_X)',\n", 61 | " 'description': 'Stock Prices for Carl Zeiss Meditec (AFX) from the Frankfurt Stock Exchange.

Trading System: Xetra

ISIN: DE0005313704',\n", 62 | " 'refreshed_at': '2018-10-16T22:29:02.721Z',\n", 63 | " 'newest_available_date': '2018-10-16',\n", 64 | " 'oldest_available_date': '2000-06-07',\n", 65 | " 'column_names': ['Date',\n", 66 | " 'Open',\n", 67 | " 'High',\n", 68 | " 'Low',\n", 69 | " 'Close',\n", 70 | " 'Change',\n", 71 | " 'Traded Volume',\n", 72 | " 'Turnover',\n", 73 | " 'Last Price of the Day',\n", 74 | " 'Daily Traded Units',\n", 75 | " 'Daily Turnover'],\n", 76 | " 'frequency': 'daily',\n", 77 | " 'type': 'Time Series',\n", 78 | " 'premium': False,\n", 79 | " 'limit': None,\n", 80 | " 'transform': None,\n", 81 | " 'column_index': None,\n", 82 | " 'start_date': '2017-01-01',\n", 83 | " 'end_date': '2017-01-01',\n", 84 | " 'data': [],\n", 85 | " 'collapse': None,\n", 86 | " 'order': None,\n", 87 | " 'database_id': 6129}}" 88 | ] 89 | }, 90 | "execution_count": 22, 91 | "metadata": {}, 92 | "output_type": "execute_result" 93 | } 94 | ], 95 | "source": [ 96 | "# Inspect the JSON structure of the object you created, and take note of how nested it is, as well as\n", 97 | "# the overall structure.\n", 98 | "\n", 99 | "r_json" 100 | ] 101 | }, 102 | { 103 | "cell_type": "markdown", 104 | "metadata": {}, 105 | "source": [ 106 | "Note: The dataset type is 'Time Series'. The returned JSON has a single top-level key, 'dataset', whose value is a nested dictionary. That nested dictionary holds 19 scalar key-value pairs plus two list-valued keys: 'column_names' (a list of 11 column names) and 'data' (an empty list here, since no trading data is returned for 2017-01-01)." 107 | ] 108 | }, 109 | { 110 | "cell_type": "markdown", 111 | "metadata": {}, 112 | "source": [ 113 | "### 1. Collect data from the Frankfurt Stock Exchange, for the ticker AFX_X, for the whole year 2017." 114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": 11, 119 | "metadata": {}, 120 | "outputs": [], 121 | "source": [ 122 | "# Set start and end date for the whole year 2017. \n", 123 | "\n", 124 | "url = \"https://www.quandl.com/api/v3/datasets/FSE/AFX_X.json?\"+ \"&start_date=2017-01-01&end_date=2017-12-31&api_key=\" + API_KEY\n", 125 | "r = requests.get(url)" 126 | ] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "execution_count": 12, 131 | "metadata": {}, 132 | "outputs": [ 133 | { 134 | "data": { 135 | "text/plain": [ 136 | "200" 137 | ] 138 | }, 139 | "execution_count": 12, 140 | "metadata": {}, 141 | "output_type": "execute_result" 142 | } 143 | ], 144 | "source": [ 145 | "# Checking the response status code. The result should be 200 if the request succeeded.\n", 146 | "\n", 147 | "r.status_code" 148 | ] 149 | }, 150 | { 151 | "cell_type": "code", 152 | "execution_count": 15, 153 | "metadata": {}, 154 | "outputs": [ 155 | { 156 | "data": { 157 | "text/plain": [ 158 | "True" 159 | ] 160 | }, 161 | "execution_count": 15, 162 | "metadata": {}, 163 | "output_type": "execute_result" 164 | } 165 | ], 166 | "source": [ 167 | "# Requests also comes with a built-in status code lookup object for easy reference:\n", 168 | "\n", 169 | "r.status_code == requests.codes.ok" 170 | ] 171 | }, 172 | { 173 | "cell_type": "markdown", 174 | "metadata": {}, 175 | "source": [ 176 | "### 2. Convert the returned JSON object into a Python dictionary."
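Aside: the same full-year request can be written with a params dict, which keeps the query string readable and lets requests handle the URL encoding and error checking. This is only a minimal sketch of an equivalent call, assuming the requests package imported above; YOUR_API_KEY is a placeholder for the API_KEY variable defined earlier.

    import requests

    BASE_URL = 'https://www.quandl.com/api/v3/datasets/FSE/AFX_X.json'
    params = {
        'start_date': '2017-01-01',
        'end_date': '2017-12-31',
        'api_key': 'YOUR_API_KEY',  # placeholder; reuse the API_KEY variable in practice
    }

    r = requests.get(BASE_URL, params=params)  # requests builds and encodes the query string
    r.raise_for_status()                       # raises HTTPError on 4xx/5xx instead of checking 200 by hand
    r_json = r.json()                          # returned JSON parsed into a Python dict

requests.get(url, params=...) and Response.raise_for_status() are standard parts of the requests API, so this mirrors the manual string concatenation used in the cells above and below.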
177 | ] 178 | }, 179 | { 180 | "cell_type": "code", 181 | "execution_count": 14, 182 | "metadata": { 183 | "scrolled": true 184 | }, 185 | "outputs": [ 186 | { 187 | "data": { 188 | "text/plain": [ 189 | "dict" 190 | ] 191 | }, 192 | "execution_count": 14, 193 | "metadata": {}, 194 | "output_type": "execute_result" 195 | } 196 | ], 197 | "source": [ 198 | "r_json = r.json()\n", 199 | "type(r_json)" 200 | ] 201 | }, 202 | { 203 | "cell_type": "code", 204 | "execution_count": 16, 205 | "metadata": {}, 206 | "outputs": [ 207 | { 208 | "name": "stdout", 209 | "output_type": "stream", 210 | "text": [ 211 | "{'dataset': {'id': 10095370, 'dataset_code': 'AFX_X', 'database_code': 'FSE', 'name': 'Carl Zeiss Meditec (AFX_X)', 'description': 'Stock Prices for Carl Zeiss Meditec (AFX) from the Frankfurt Stock Exchange.

Trading System: Xetra

ISIN: DE0005313704', 'refreshed_at': '2018-10-29T22:33:28.139Z', 'newest_available_date': '2018-10-29', 'oldest_available_date': '2000-06-07', 'column_names': ['Date', 'Open', 'High', 'Low', 'Close', 'Change', 'Traded Volume', 'Turnover', 'Last Price of the Day', 'Daily Traded Units', 'Daily Turnover'], 'frequency': 'daily', 'type': 'Time Series', 'premium': False, 'limit': None, 'transform': None, 'column_index': None, 'start_date': '2017-01-01', 'end_date': '2017-12-31', 'data': [['2017-12-29', 51.76, 51.94, 51.45, 51.76, None, 34640.0, 1792304.0, None, None, None], ['2017-12-28', 51.65, 51.82, 51.43, 51.6, None, 40660.0, 2099024.0, None, None, None], ['2017-12-27', 51.45, 51.89, 50.76, 51.82, None, 57452.0, 2957018.0, None, None, None], ['2017-12-22', 51.05, 51.5, 50.92, 51.32, None, 71165.0, 3641949.0, None, None, None], ['2017-12-21', 51.16, 51.52, 50.9, 51.4, None, 120649.0, 6179433.0, None, None, None], ['2017-12-20', 51.88, 52.04, 51.2, 51.27, None, 50587.0, 2610258.0, None, None, None], ['2017-12-19', 52.73, 52.73, 51.07, 51.66, None, 137313.0, 7102361.0, None, None, None], ['2017-12-18', 52.37, 52.75, 51.61, 52.62, None, 129733.0, 6770499.0, None, None, None], ['2017-12-15', 52.7, 52.7, 51.64, 52.01, None, 204080.0, 10596319.0, None, None, None], ['2017-12-14', 53.11, 53.54, 52.15, 52.67, None, 132981.0, 7016953.0, None, None, None], ['2017-12-13', 52.64, 53.35, 52.48, 53.09, None, 128434.0, 6801159.0, None, None, None], ['2017-12-12', 52.29, 53.1, 51.82, 52.43, None, 87911.0, 4615924.0, None, None, None], ['2017-12-11', 52.28, 52.45, 51.26, 52.14, None, 71817.0, 3724193.0, None, None, None], ['2017-12-08', 51.5, 52.83, 51.28, 52.12, None, 109157.0, 5690648.0, None, None, None], ['2017-12-07', 50.89, 51.47, 50.81, 51.47, None, 48123.0, 2463848.0, None, None, None], ['2017-12-06', 50.8, 51.11, 50.39, 50.89, None, 88730.0, 4504075.0, None, None, None], ['2017-12-05', 51.21, 51.38, 50.4, 51.25, None, 83023.0, 4231971.0, None, None, None], ['2017-12-04', 49.5, 51.23, 49.5, 51.14, None, 94385.0, 4800027.0, None, None, None], ['2017-12-01', 49.52, 50.49, 49.17, 49.86, None, 101733.0, 5065932.0, None, None, None], ['2017-11-30', 48.64, 49.84, 48.28, 49.7, None, 123019.0, 6085171.0, None, None, None], ['2017-11-29', 49.64, 49.64, 48.7, 48.75, None, 67342.0, 3292223.0, None, None, None], ['2017-11-28', 49.09, 49.89, 49.03, 49.25, None, 42669.0, 2107358.0, None, None, None], ['2017-11-27', 49.13, 49.73, 48.96, 49.2, None, 102180.0, 5055762.0, None, None, None], ['2017-11-24', 49.11, 49.41, 48.87, 49.11, None, 50350.0, 2472842.0, None, None, None], ['2017-11-23', 48.8, 49.46, 48.45, 49.2, None, 38834.0, 1909352.0, None, None, None], ['2017-11-22', 48.4, 49.61, 48.39, 48.8, None, 91142.0, 4478093.0, None, None, None], ['2017-11-21', 47.25, 48.59, 46.78, 48.39, None, 78502.0, 3782098.0, None, None, None], ['2017-11-20', 46.57, 47.38, 46.54, 47.04, None, 97252.0, 4563515.0, None, None, None], ['2017-11-17', 47.03, 47.15, 46.8, 46.84, None, 54107.0, 2540820.0, None, None, None], ['2017-11-16', 47.09, 47.23, 46.55, 47.03, None, 89373.0, 4195732.0, None, None, None], ['2017-11-15', 47.98, 48.01, 46.75, 47.05, None, 67593.0, 3188321.0, None, None, None], ['2017-11-14', 48.4, 48.9, 47.84, 48.0, None, 67672.0, 3259979.0, None, None, None], ['2017-11-13', 48.38, 48.61, 47.76, 48.34, None, 76286.0, 3681337.0, None, None, None], ['2017-11-10', 47.3, 48.89, 47.16, 48.34, None, 90245.0, 4361552.0, None, None, None], ['2017-11-09', 47.65, 48.06, 47.09, 47.21, None, 120268.0, 5712034.0, None, None, None], 
['2017-11-08', 46.42, 47.72, 46.42, 47.47, None, 94195.0, 4463935.0, None, None, None], ['2017-11-07', 46.16, 46.33, 45.84, 46.26, None, 48152.0, 2224221.0, None, None, None], ['2017-11-06', 45.81, 46.09, 45.76, 45.99, None, 60716.0, 2789220.0, None, None, None], ['2017-11-03', 45.0, 46.04, 44.83, 45.97, None, 56911.0, 2603498.0, None, None, None], ['2017-11-02', 45.88, 46.06, 45.18, 45.27, None, 37958.0, 1724840.0, None, None, None], ['2017-11-01', 46.29, 46.55, 45.97, 46.04, None, 56319.0, 2603859.0, None, None, None], ['2017-10-30', 46.53, 46.65, 45.61, 45.76, None, 56245.0, 2585397.0, None, None, None], ['2017-10-27', 45.48, 46.42, 45.46, 46.41, None, 74472.0, 3434087.0, None, None, None], ['2017-10-26', 45.2, 45.41, 44.91, 45.41, None, 56319.0, 2548078.0, None, None, None], ['2017-10-25', 45.01, 45.06, 44.7, 45.0, None, 47730.0, 2145697.0, None, None, None], ['2017-10-24', 45.16, 45.27, 44.75, 44.85, None, 43042.0, 1937616.0, None, None, None], ['2017-10-23', 44.9, 45.34, 44.89, 45.0, None, 43375.0, 1952918.0, None, None, None], ['2017-10-20', 45.08, 45.34, 44.76, 44.87, None, 55707.0, 2503853.0, None, None, None], ['2017-10-19', 45.72, 45.85, 44.79, 45.0, None, 59991.0, 2703085.0, None, None, None], ['2017-10-18', 46.01, 46.2, 45.61, 45.77, None, 45263.0, 2076951.0, None, None, None], ['2017-10-17', 45.8, 46.06, 45.37, 45.96, None, 65837.0, 3014080.0, None, None, None], ['2017-10-16', 45.61, 45.75, 45.3, 45.55, None, 49246.0, 2243129.0, None, None, None], ['2017-10-13', 45.5, 45.7, 45.37, 45.4, None, 43362.0, 1971801.0, None, None, None], ['2017-10-12', 45.58, 45.58, 45.17, 45.43, None, 49180.0, 2233481.0, None, None, None], ['2017-10-11', 45.97, 45.97, 45.25, 45.29, None, 69455.0, 3158321.0, None, None, None], ['2017-10-10', 45.64, 46.04, 45.57, 45.84, None, 65860.0, 3016658.0, None, None, None], ['2017-10-09', 46.2, 46.2, 45.6, 45.74, None, 44059.0, 2015453.0, None, None, None], ['2017-10-06', 46.19, 46.19, 45.69, 46.0, None, 66760.0, 3066198.0, None, None, None], ['2017-10-05', 46.01, 46.09, 45.63, 46.05, None, 94804.0, 4352002.0, None, None, None], ['2017-10-04', 45.36, 46.17, 45.22, 46.11, None, 115706.0, 5313199.0, None, None, None], ['2017-10-02', 44.51, 44.98, 44.18, 44.98, None, 95313.0, 4265024.0, None, None, None], ['2017-09-29', 43.58, 44.17, 43.3, 44.17, None, 99821.0, 4384796.0, None, None, None], ['2017-09-28', 42.0, 43.56, 42.0, 43.56, None, 157234.0, 6775569.0, None, None, None], ['2017-09-27', 42.35, 42.49, 41.78, 42.04, None, 76600.0, 3219861.0, None, None, None], ['2017-09-26', 42.3, 42.57, 42.11, 42.37, None, 51321.0, 2175381.0, None, None, None], ['2017-09-25', 42.3, 42.3, 41.96, 42.07, None, 56224.0, 2366453.0, None, None, None], ['2017-09-22', 41.48, 42.38, 41.48, 42.06, None, 79955.0, 3362517.0, None, None, None], ['2017-09-21', 42.29, 42.29, 41.39, 41.46, None, 105194.0, 4378409.0, None, None, None], ['2017-09-20', 42.54, 42.54, 41.99, 41.99, None, 57838.0, 2440557.0, None, None, None], ['2017-09-19', 42.65, 42.65, 42.13, 42.44, None, 65546.0, 2777065.0, None, None, None], ['2017-09-18', 42.5, 42.63, 42.23, 42.27, None, 44037.0, 1864954.0, None, None, None], ['2017-09-15', 42.29, 42.81, 42.25, 42.42, None, 107144.0, 4555791.0, None, None, None], ['2017-09-14', 42.35, 42.8, 42.35, 42.52, None, 65157.0, 2770696.0, None, None, None], ['2017-09-13', 42.49, 42.69, 42.22, 42.45, None, 68801.0, 2921240.0, None, None, None], ['2017-09-12', 43.21, 43.34, 42.62, 42.73, None, 52828.0, 2259924.0, None, None, None], ['2017-09-11', 42.81, 42.89, 42.56, 42.85, None, 
103273.0, 4415614.0, None, None, None], ['2017-09-08', 42.7, 42.75, 42.56, 42.67, None, 59881.0, 2553977.0, None, None, None], ['2017-09-07', 43.0, 43.02, 42.67, 42.77, None, 64320.0, 2751388.0, None, None, None], ['2017-09-06', 42.66, 42.71, 42.34, 42.55, None, 71006.0, 3020229.0, None, None, None], ['2017-09-05', 43.0, 43.19, 42.55, 42.62, None, 66351.0, 2846115.0, None, None, None], ['2017-09-04', 42.38, 42.75, 41.95, 42.6, None, 105288.0, 4471634.0, None, None, None], ['2017-09-01', 42.16, 43.06, 42.07, 42.41, None, 151474.0, 6453558.0, None, None, None], ['2017-08-31', 42.0, 42.08, 41.12, 41.9, None, 157888.0, 6580200.0, None, None, None], ['2017-08-30', 42.0, 42.2, 41.49, 41.94, None, 97804.0, 4090262.0, None, None, None], ['2017-08-29', 41.71, 41.98, 41.33, 41.85, None, 98156.0, 4094452.0, None, None, None], ['2017-08-28', 42.11, 42.25, 41.86, 41.91, None, 47130.0, 1978704.0, None, None, None], ['2017-08-25', 42.64, 42.64, 42.05, 42.14, None, 69734.0, 2948016.0, None, None, None], ['2017-08-24', 42.72, 43.05, 42.63, 42.69, None, 65213.0, 2792319.0, None, None, None], ['2017-08-23', 42.82, 43.17, 42.6, 42.71, None, 70269.0, 3011578.0, None, None, None], ['2017-08-22', 42.46, 42.96, 42.4, 42.71, None, 95376.0, 4075646.0, None, None, None], ['2017-08-21', 42.42, 42.76, 42.2, 42.26, None, 68812.0, 2922972.0, None, None, None], ['2017-08-18', 42.28, 42.6, 42.01, 42.41, None, 72886.0, 3092377.0, None, None, None], ['2017-08-17', 41.88, 43.01, 41.76, 42.5, None, 131361.0, 5583704.0, None, None, None], ['2017-08-16', 42.4, 42.62, 41.98, 42.05, None, 104676.0, 4408312.0, None, None, None], ['2017-08-15', 42.53, 42.53, 42.2, 42.28, None, 64334.0, 2721852.0, None, None, None], ['2017-08-14', 42.12, 42.69, 42.01, 42.3, None, 127682.0, 5416963.0, None, None, None], ['2017-08-11', 41.3, 41.94, 40.96, 41.94, None, 183412.0, 7604144.0, None, None, None], ['2017-08-10', 41.73, 41.99, 41.14, 41.68, None, 175161.0, 7303562.0, None, None, None], ['2017-08-09', 43.5, 43.5, 41.64, 41.81, None, 355857.0, 15003956.0, None, None, None], ['2017-08-08', 44.9, 45.09, 44.15, 44.37, None, 156168.0, 6941408.0, None, None, None], ['2017-08-07', 45.85, 46.34, 44.02, 44.96, None, 164543.0, 7378816.0, None, None, None], ['2017-08-04', 45.13, 45.13, 44.36, 45.07, None, 96202.0, 4306911.0, None, None, None], ['2017-08-03', 45.34, 45.54, 44.91, 44.97, None, 77854.0, 3517146.0, None, None, None], ['2017-08-02', 45.25, 45.77, 44.9, 45.56, None, 187468.0, 8528548.0, None, None, None], ['2017-08-01', 45.24, 45.54, 45.1, 45.45, None, 74975.0, 3399891.0, None, None, None], ['2017-07-31', 44.94, 45.75, 44.94, 45.3, None, 62672.0, 2844210.0, None, None, None], ['2017-07-28', 45.26, 45.29, 44.75, 44.97, None, 114006.0, 5127247.0, None, None, None], ['2017-07-27', 45.16, 45.45, 45.15, 45.25, None, 50557.0, 2290284.0, None, None, None], ['2017-07-26', 44.91, 45.33, 44.46, 45.16, None, 81970.0, 3688510.0, None, None, None], ['2017-07-25', 44.7, 45.04, 44.63, 44.82, None, 112224.0, 5033312.0, None, None, None], ['2017-07-24', 45.31, 45.31, 44.49, 44.61, None, 104282.0, 4661866.0, None, None, None], ['2017-07-21', 45.57, 45.88, 45.04, 45.44, None, 73422.0, 3334695.0, None, None, None], ['2017-07-20', 45.74, 45.96, 45.23, 45.66, None, 87399.0, 3986488.0, None, None, None], ['2017-07-19', 45.06, 45.72, 44.94, 45.57, None, 71971.0, 3273001.0, None, None, None], ['2017-07-18', 45.5, 45.55, 44.7, 45.0, None, 104003.0, 4684627.0, None, None, None], ['2017-07-17', 45.6, 46.23, 45.29, 45.6, None, 104995.0, 4801806.0, None, None, None], 
['2017-07-14', 45.07, 45.56, 44.83, 45.53, None, 67375.0, 3054060.0, None, None, None], ['2017-07-13', 44.67, 45.18, 44.67, 44.95, None, 82745.0, 3718928.0, None, None, None], ['2017-07-12', 44.29, 45.05, 43.89, 44.95, None, 115705.0, 5133971.0, None, None, None], ['2017-07-11', 44.94, 44.94, 44.08, 44.2, None, 90538.0, 4010457.0, None, None, None], ['2017-07-10', 44.64, 45.18, 44.51, 44.7, None, 71868.0, 3221218.0, None, None, None], ['2017-07-07', 44.79, 44.79, 44.25, 44.53, None, 47999.0, 2136578.0, None, None, None], ['2017-07-06', 45.5, 45.5, 44.15, 44.62, None, 66116.0, 2952605.0, None, None, None], ['2017-07-05', 44.67, 45.36, 44.44, 45.19, None, 48706.0, 2189436.0, None, None, None], ['2017-07-04', 45.83, 45.83, 44.74, 44.8, None, 50549.0, 2273551.0, None, None, None], ['2017-07-03', 45.29, 45.83, 45.06, 45.75, None, 71381.0, 3251502.0, None, None, None], ['2017-06-30', 45.01, 45.74, 45.0, 45.44, None, 136112.0, 6187148.0, None, None, None], ['2017-06-29', 45.73, 45.81, 45.11, 45.2, None, 134965.0, 6132452.0, None, None, None], ['2017-06-28', 46.68, 46.68, 45.41, 45.68, None, 117165.0, 5381488.0, None, None, None], ['2017-06-27', 47.23, 47.33, 46.39, 46.83, None, 82492.0, 3866344.0, None, None, None], ['2017-06-26', 46.95, 47.63, 46.91, 47.21, None, 73322.0, 3465639.0, None, None, None], ['2017-06-23', 47.29, 47.4, 46.79, 46.99, None, 80586.0, 3792498.0, None, None, None], ['2017-06-22', 47.03, 47.4, 46.75, 47.29, None, 56071.0, 2640508.0, None, None, None], ['2017-06-21', 47.46, 47.48, 46.53, 46.99, None, 89752.0, 4206563.0, None, None, None], ['2017-06-20', 46.48, 47.43, 46.27, 47.37, None, 108334.0, 5109730.0, None, None, None], ['2017-06-19', 46.9, 46.9, 46.25, 46.64, None, 70056.0, 3260381.0, None, None, None], ['2017-06-16', 45.66, 46.8, 45.66, 46.63, None, 202214.0, 9411695.0, None, None, None], ['2017-06-15', 46.34, 46.34, 45.21, 45.67, None, 101733.0, 4635593.0, None, None, None], ['2017-06-14', 46.52, 46.86, 46.05, 46.33, None, 83741.0, 3881453.0, None, None, None], ['2017-06-13', 46.5, 46.51, 46.03, 46.32, None, 107644.0, 4981185.0, None, None, None], ['2017-06-12', 47.31, 47.43, 45.89, 46.31, None, 112942.0, 5238390.0, None, None, None], ['2017-06-09', 46.77, 47.44, 46.55, 47.44, None, 99674.0, 4702170.0, None, None, None], ['2017-06-08', 47.8, 47.8, 46.27, 46.27, None, 1945.0, 90599.0, None, None, None], ['2017-06-07', 47.01, 47.43, 47.01, 47.43, None, 1081.0, 51021.0, None, None, None], ['2017-06-06', 47.12, 47.45, 46.21, 47.43, None, 686.0, 32083.0, None, None, None], ['2017-06-02', 46.8, 46.99, 46.72, 46.99, None, 290.0, 13584.0, None, None, None], ['2017-06-01', 46.12, 46.52, 45.89, 46.52, None, 106513.0, 4930686.0, None, None, None], ['2017-05-31', 45.22, 46.26, 45.22, 45.86, None, 522.0, 24044.0, None, None, None], ['2017-05-30', 45.05, 46.02, 45.05, 46.02, None, 587.0, 26792.0, None, None, None], ['2017-05-29', 45.61, 45.61, 45.24, 45.32, None, 112.0, 5089.0, None, None, None], ['2017-05-26', 44.8, 45.36, 44.71, 45.3, None, 74453.0, 3360707.0, None, None, None], ['2017-05-25', 44.8, 44.87, 44.29, 44.78, None, 49970.0, 2231857.0, None, None, None], ['2017-05-24', 43.92, 44.67, 43.92, 44.53, None, 111923.0, 4971343.0, None, None, None], ['2017-05-23', 43.67, 44.13, 43.55, 43.9, None, 38308.0, 1681904.0, None, None, None], ['2017-05-22', 44.16, 44.22, 43.44, 43.84, None, 70856.0, 3103013.0, None, None, None], ['2017-05-19', 43.74, 44.12, 43.74, 44.12, None, 45.0, 1980.0, None, None, None], ['2017-05-18', 44.0, 44.3, 43.29, 43.98, None, 166160.0, 7277314.0, None, 
None, None], ['2017-05-17', 45.06, 45.34, 44.01, 44.19, None, 149515.0, 6664744.0, None, None, None], ['2017-05-16', 45.15, 45.36, 44.56, 45.31, None, 101476.0, 4567885.0, None, None, None], ['2017-05-15', 45.09, 45.78, 44.31, 45.14, None, 193702.0, 8734286.0, None, None, None], ['2017-05-12', 45.18, 45.18, 44.16, 44.99, None, 159495.0, 7113519.0, None, None, None], ['2017-05-11', 43.4, 46.06, 43.25, 45.0, None, 189125.0, 8496322.0, None, None, None], ['2017-05-10', 43.5, 43.6, 42.53, 43.28, None, 91858.0, 3958630.0, None, None, None], ['2017-05-09', 41.83, 43.55, 41.82, 43.3, None, 151439.0, 6538516.0, None, None, None], ['2017-05-08', 43.0, 43.0, 42.04, 42.24, None, 97456.0, 4128048.0, None, None, None], ['2017-05-05', 42.52, 42.91, 42.38, 42.75, None, 78512.0, 3353971.0, None, None, None], ['2017-05-04', 41.86, 42.5, 41.71, 42.5, None, 82058.0, 3465505.0, None, None, None], ['2017-05-03', 42.2, 42.29, 41.78, 41.9, None, 65266.0, 2738394.0, None, None, None], ['2017-05-02', 41.89, 42.23, 41.76, 42.15, None, 86559.0, 3636583.0, None, None, None], ['2017-05-01', None, 42.245, 41.655, 41.72, -0.44, 86348.0, 3606589.0, None, None, None], ['2017-04-28', 42.17, 42.25, 41.66, 41.72, None, 86348.0, 3606589.0, None, None, None], ['2017-04-27', 41.51, 42.24, 41.51, 42.16, None, 151683.0, 6380639.0, None, None, None], ['2017-04-26', 41.88, 41.94, 41.4, 41.5, None, 65847.0, 2743109.0, None, None, None], ['2017-04-25', 41.93, 42.18, 41.66, 41.89, None, 85973.0, 3604204.0, None, None, None], ['2017-04-24', 42.01, 42.02, 41.23, 41.81, None, 102084.0, 4247032.0, None, None, None], ['2017-04-21', 41.97, 42.14, 41.01, 41.32, None, 186784.0, 7728103.0, None, None, None], ['2017-04-20', 42.5, 42.64, 41.52, 41.93, None, 223621.0, 9418192.0, None, None, None], ['2017-04-19', 41.94, 42.61, 41.94, 42.61, None, 92722.0, 3930856.0, None, None, None], ['2017-04-18', 42.24, 42.4, 41.54, 42.0, None, 133057.0, 5587565.0, None, None, None], ['2017-04-17', None, 42.48, 41.985, 42.2, None, 88416.0, 3734717.0, None, None, None], ['2017-04-14', None, 42.48, 41.985, 42.2, None, 88416.0, 3734717.0, None, None, None], ['2017-04-13', 42.06, 42.48, 41.99, 42.2, None, 88416.0, 3734717.0, None, None, None], ['2017-04-12', 42.02, 42.45, 41.84, 42.2, None, 158278.0, 6672547.0, None, None, None], ['2017-04-11', 41.62, 42.03, 41.53, 41.75, None, 107817.0, 4501109.0, None, None, None], ['2017-04-10', 41.46, 41.68, 41.31, 41.68, None, 62297.0, 2585922.0, None, None, None], ['2017-04-07', 40.9, 41.42, 40.84, 41.42, None, 81255.0, 3344628.0, None, None, None], ['2017-04-06', 40.96, 41.25, 40.83, 41.05, None, 96794.0, 3968681.0, None, None, None], ['2017-04-05', 41.1, 41.34, 40.79, 41.1, None, 156005.0, 6404780.0, None, None, None], ['2017-04-04', 39.5, 40.88, 39.48, 40.81, None, 193156.0, 7822665.0, None, None, None], ['2017-04-03', 40.15, 40.15, 39.54, 39.64, None, 127973.0, 5081376.0, None, None, None], ['2017-03-31', 39.77, 40.07, 39.42, 39.98, None, 95382.0, 3795061.0, None, None, None], ['2017-03-30', 40.02, 40.14, 39.42, 39.75, None, 189201.0, 7541354.0, None, None, None], ['2017-03-29', 39.39, 40.01, 39.05, 40.01, None, 335406.0, 13349426.0, None, None, None], ['2017-03-28', 38.95, 39.35, 38.79, 39.22, None, 115075.0, 4505494.0, None, None, None], ['2017-03-27', 38.73, 39.1, 38.53, 38.85, None, 191515.0, 7446952.0, None, None, None], ['2017-03-24', 38.94, 39.02, 38.6, 38.94, None, 210926.0, 8205507.0, None, None, None], ['2017-03-23', 39.01, 39.25, 38.63, 38.96, None, 169971.0, 6621807.0, None, None, None], ['2017-03-22', 38.25, 
39.02, 37.53, 38.94, None, 670349.0, 25910543.0, None, None, None], ['2017-03-21', 41.8, 41.83, 40.97, 40.98, None, 56906.0, 2349965.0, None, None, None], ['2017-03-20', 41.26, 42.17, 41.26, 41.97, None, 97572.0, 4074891.0, None, None, None], ['2017-03-17', 41.47, 41.59, 41.16, 41.34, None, 90109.0, 3734232.0, None, None, None], ['2017-03-16', 41.4, 41.57, 41.09, 41.46, None, 55799.0, 2308423.0, None, None, None], ['2017-03-15', 41.4, 41.5, 40.91, 41.25, None, 60324.0, 2488650.0, None, None, None], ['2017-03-14', 41.2, 41.5, 41.2, 41.3, None, 60420.0, 2498025.0, None, None, None], ['2017-03-13', 41.4, 41.46, 41.08, 41.3, None, 44803.0, 1850251.0, None, None, None], ['2017-03-10', 41.53, 41.53, 41.16, 41.4, None, 38518.0, 1592270.0, None, None, None], ['2017-03-09', 41.61, 41.61, 41.16, 41.4, None, 43988.0, 1819182.0, None, None, None], ['2017-03-08', 41.13, 41.71, 40.95, 41.68, None, 45111.0, 1870935.0, None, None, None], ['2017-03-07', 41.5, 41.8, 41.25, 41.42, None, 61925.0, 2569608.0, None, None, None], ['2017-03-06', 41.25, 41.4, 40.81, 41.4, None, 46510.0, 1916799.0, None, None, None], ['2017-03-03', 41.12, 41.22, 40.84, 41.18, None, 40800.0, 1675587.0, None, None, None], ['2017-03-02', 41.38, 41.39, 40.76, 41.17, None, 49863.0, 2048153.0, None, None, None], ['2017-03-01', 41.19, 41.57, 40.9, 41.2, None, 86753.0, 3569796.0, None, None, None], ['2017-02-28', 40.38, 40.95, 40.38, 40.84, None, 67440.0, 2747011.0, None, None, None], ['2017-02-27', 39.75, 40.64, 39.75, 40.39, None, 62655.0, 2520260.0, None, None, None], ['2017-02-24', 39.77, 40.14, 38.91, 39.74, None, 101294.0, 4015150.0, None, None, None], ['2017-02-23', 39.72, 39.98, 39.38, 39.79, None, 81945.0, 3260642.0, None, None, None], ['2017-02-22', 39.6, 39.75, 39.27, 39.7, None, 77619.0, 3066894.0, None, None, None], ['2017-02-21', 38.85, 39.57, 38.85, 39.45, None, 46070.0, 1808350.0, None, None, None], ['2017-02-20', 39.25, 39.25, 38.81, 38.98, None, 37014.0, 1444138.0, None, None, None], ['2017-02-17', 38.8, 39.03, 38.48, 39.02, None, 60583.0, 2352961.0, None, None, None], ['2017-02-16', 38.8, 39.2, 38.25, 38.71, None, 84682.0, 3282322.0, None, None, None], ['2017-02-15', 38.5, 38.93, 38.4, 38.72, None, 77420.0, 2996861.0, None, None, None], ['2017-02-14', 38.81, 38.86, 38.0, 38.37, None, 82601.0, 3163898.0, None, None, None], ['2017-02-13', 37.37, 39.36, 37.35, 38.53, None, 177171.0, 6804028.0, None, None, None], ['2017-02-10', 36.65, 37.5, 36.57, 37.06, None, 115843.0, 4291017.0, None, None, None], ['2017-02-09', 36.2, 36.25, 35.77, 36.25, None, 67781.0, 2445428.0, None, None, None], ['2017-02-08', 35.98, 36.14, 35.84, 36.05, None, 39731.0, 1431205.0, None, None, None], ['2017-02-07', 35.56, 36.05, 35.36, 35.89, None, 67410.0, 2410818.0, None, None, None], ['2017-02-06', 36.06, 36.15, 35.6, 35.64, None, 41911.0, 1496794.0, None, None, None], ['2017-02-03', 36.02, 36.2, 35.73, 36.1, None, 40705.0, 1464712.0, None, None, None], ['2017-02-02', 35.95, 36.2, 35.7, 36.07, None, 54279.0, 1953176.0, None, None, None], ['2017-02-01', 34.75, 36.0, 34.75, 35.94, None, 85137.0, 3038172.0, None, None, None], ['2017-01-31', 35.24, 35.24, 34.56, 34.56, None, 63371.0, 2199583.0, None, None, None], ['2017-01-30', 35.38, 35.59, 34.95, 35.15, None, 69603.0, 2457762.0, None, None, None], ['2017-01-27', 34.83, 35.43, 34.81, 35.3, None, 69657.0, 2444913.0, None, None, None], ['2017-01-26', 35.07, 35.58, 34.8, 34.89, None, 64103.0, 2249375.0, None, None, None], ['2017-01-25', 34.42, 34.86, 34.03, 34.83, None, 56240.0, 1947147.0, None, None, None], 
['2017-01-24', 34.0, 34.35, 33.85, 34.22, None, 48797.0, 1666086.0, None, None, None], ['2017-01-23', 34.04, 34.12, 33.62, 34.06, None, 55333.0, 1877957.0, None, None, None], ['2017-01-20', 34.54, 34.59, 34.05, 34.17, None, 80246.0, 2743474.0, None, None, None], ['2017-01-19', 35.04, 35.04, 34.42, 34.5, None, 73105.0, 2526731.0, None, None, None], ['2017-01-18', 35.04, 35.51, 34.8, 34.9, None, 65931.0, 2311608.0, None, None, None], ['2017-01-17', 35.06, 35.19, 34.79, 34.99, None, 39195.0, 1369857.0, None, None, None], ['2017-01-16', 34.85, 35.24, 34.56, 35.07, None, 47879.0, 1678679.0, None, None, None], ['2017-01-13', 34.98, 34.98, 34.6, 34.85, None, 59367.0, 2065534.0, None, None, None], ['2017-01-12', 35.38, 35.38, 34.31, 34.9, None, 163860.0, 5703427.0, None, None, None], ['2017-01-11', 34.95, 36.0, 34.84, 35.42, None, 123530.0, 4369079.0, None, None, None], ['2017-01-10', 34.8, 34.98, 34.46, 34.91, None, 43976.0, 1528055.0, None, None, None], ['2017-01-09', 35.29, 35.35, 34.43, 34.67, None, 62225.0, 2157182.0, None, None, None], ['2017-01-06', 34.91, 35.21, 34.91, 35.04, None, 27507.0, 964046.0, None, None, None], ['2017-01-05', 35.02, 35.2, 34.73, 35.06, None, 48412.0, 1692326.0, None, None, None], ['2017-01-04', 35.48, 35.51, 34.75, 35.19, None, 54408.0, 1906810.0, None, None, None], ['2017-01-03', 35.9, 35.93, 35.34, 35.48, None, 70618.0, 2515473.0, None, None, None], ['2017-01-02', 34.99, 35.94, 34.99, 35.8, None, 44700.0, 1590561.0, None, None, None]], 'collapse': None, 'order': None, 'database_id': 6129}}\n" 212 | ] 213 | } 214 | ], 215 | "source": [ 216 | "print(r_json)" 217 | ] 218 | }, 219 | { 220 | "cell_type": "markdown", 221 | "metadata": {}, 222 | "source": [ 223 | "### 3. Calculate what the highest and lowest opening prices were for the stock in this period." 
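Before tackling questions 3 through 7, it can help to give the raw rows names. Each entry of r_json['dataset']['data'] is a plain list ordered like 'column_names', so a minimal sketch (assuming the r_json dict built above) is to zip the two together:

    cols = r_json['dataset']['column_names']
    records = [dict(zip(cols, row)) for row in r_json['dataset']['data']]

    # e.g. records[0] -> {'Date': '2017-12-29', 'Open': 51.76, 'High': 51.94, 'Low': 51.45, 'Close': 51.76, ...}
    opens = [(rec['Date'], rec['Open']) for rec in records if rec['Open'] is not None]

The cells below keep working with positional indices, which is equally valid; the dict view just makes the column lookups explicit.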
224 | ] 225 | }, 226 | { 227 | "cell_type": "code", 228 | "execution_count": 28, 229 | "metadata": {}, 230 | "outputs": [ 231 | { 232 | "data": { 233 | "text/plain": [ 234 | "dict_keys(['dataset'])" 235 | ] 236 | }, 237 | "execution_count": 28, 238 | "metadata": {}, 239 | "output_type": "execute_result" 240 | } 241 | ], 242 | "source": [ 243 | "# Review the data content\n", 244 | "r_json.keys()" 245 | ] 246 | }, 247 | { 248 | "cell_type": "code", 249 | "execution_count": 29, 250 | "metadata": {}, 251 | "outputs": [ 252 | { 253 | "data": { 254 | "text/plain": [ 255 | "dict_keys(['id', 'dataset_code', 'database_code', 'name', 'description', 'refreshed_at', 'newest_available_date', 'oldest_available_date', 'column_names', 'frequency', 'type', 'premium', 'limit', 'transform', 'column_index', 'start_date', 'end_date', 'data', 'collapse', 'order', 'database_id'])" 256 | ] 257 | }, 258 | "execution_count": 29, 259 | "metadata": {}, 260 | "output_type": "execute_result" 261 | } 262 | ], 263 | "source": [ 264 | "r_json['dataset'].keys()" 265 | ] 266 | }, 267 | { 268 | "cell_type": "code", 269 | "execution_count": 30, 270 | "metadata": {}, 271 | "outputs": [ 272 | { 273 | "data": { 274 | "text/plain": [ 275 | "['Date',\n", 276 | " 'Open',\n", 277 | " 'High',\n", 278 | " 'Low',\n", 279 | " 'Close',\n", 280 | " 'Change',\n", 281 | " 'Traded Volume',\n", 282 | " 'Turnover',\n", 283 | " 'Last Price of the Day',\n", 284 | " 'Daily Traded Units',\n", 285 | " 'Daily Turnover']" 286 | ] 287 | }, 288 | "execution_count": 30, 289 | "metadata": {}, 290 | "output_type": "execute_result" 291 | } 292 | ], 293 | "source": [ 294 | "r_json['dataset']['column_names']" 295 | ] 296 | }, 297 | { 298 | "cell_type": "code", 299 | "execution_count": 31, 300 | "metadata": {}, 301 | "outputs": [ 302 | { 303 | "data": { 304 | "text/plain": [ 305 | "[['2017-12-29',\n", 306 | " 51.76,\n", 307 | " 51.94,\n", 308 | " 51.45,\n", 309 | " 51.76,\n", 310 | " None,\n", 311 | " 34640.0,\n", 312 | " 1792304.0,\n", 313 | " None,\n", 314 | " None,\n", 315 | " None],\n", 316 | " ['2017-12-28',\n", 317 | " 51.65,\n", 318 | " 51.82,\n", 319 | " 51.43,\n", 320 | " 51.6,\n", 321 | " None,\n", 322 | " 40660.0,\n", 323 | " 2099024.0,\n", 324 | " None,\n", 325 | " None,\n", 326 | " None]]" 327 | ] 328 | }, 329 | "execution_count": 31, 330 | "metadata": {}, 331 | "output_type": "execute_result" 332 | } 333 | ], 334 | "source": [ 335 | "r_json['dataset']['data'][0:2]" 336 | ] 337 | }, 338 | { 339 | "cell_type": "markdown", 340 | "metadata": {}, 341 | "source": [ 342 | "For this particular question, I show three different approaches to solving it, followed by a more compact sketch. 
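In addition to the three methods shown next, a more compact variant (a sketch that assumes only the r_json dict from question 2) uses the built-in max()/min() with a key function, which returns the whole row so the date stays attached to the price:

    cols = r_json['dataset']['column_names']
    i_open, i_date = cols.index('Open'), cols.index('Date')
    rows = [row for row in r_json['dataset']['data'] if row[i_open] is not None]

    highest = max(rows, key=lambda row: row[i_open])
    lowest = min(rows, key=lambda row: row[i_open])
    print('Highest opening:', highest[i_date], highest[i_open])  # 2017-12-14 53.11
    print('Lowest opening: ', lowest[i_date], lowest[i_open])    # 2017-01-24 34.0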
" 343 | ] 344 | }, 345 | { 346 | "cell_type": "code", 347 | "execution_count": 17, 348 | "metadata": {}, 349 | "outputs": [ 350 | { 351 | "name": "stdout", 352 | "output_type": "stream", 353 | "text": [ 354 | "Maximim and minimum opening values by dates: \n", 355 | "('2017-12-14', 53.11) ('2017-01-24', 34.0)\n" 356 | ] 357 | } 358 | ], 359 | "source": [ 360 | "# Method-1:\n", 361 | "\n", 362 | "# Index of Openings\n", 363 | "i_open = r_json['dataset']['column_names'].index('Open')\n", 364 | "\n", 365 | "# Index of the data associated with the \"Open\" value\n", 366 | "i_date = r_json['dataset']['column_names'].index('Date')\n", 367 | "\n", 368 | "# Creating a dictionary for opening values to corresponding each day\n", 369 | "data_json = r_json['dataset']['data']\n", 370 | "openings = {data_json[j][i_date] : data_json[j][i_open] for j in range(len(data_json)) if data_json[j][i_open] is not None}\n", 371 | "\n", 372 | "max_openings = max(openings.items(), key=operator.itemgetter(1))\n", 373 | "min_openings = min(openings.items(), key=operator.itemgetter(1))\n", 374 | "\n", 375 | "print('Maximim and minimum opening values by dates: ')\n", 376 | "print(max_openings, min_openings)" 377 | ] 378 | }, 379 | { 380 | "cell_type": "code", 381 | "execution_count": 18, 382 | "metadata": {}, 383 | "outputs": [ 384 | { 385 | "name": "stdout", 386 | "output_type": "stream", 387 | "text": [ 388 | "The highest opening price: 53.11\n", 389 | "The lowest opening price: 34.0\n" 390 | ] 391 | } 392 | ], 393 | "source": [ 394 | "# Method-2:\n", 395 | "\n", 396 | "opening = [row[1] for row in data_json if row[1] != None]\n", 397 | "print(\"The highest opening price: \" + str(max(opening)))\n", 398 | "print(\"The lowest opening price: \" + str(min(opening)))" 399 | ] 400 | }, 401 | { 402 | "cell_type": "code", 403 | "execution_count": 19, 404 | "metadata": {}, 405 | "outputs": [ 406 | { 407 | "name": "stdout", 408 | "output_type": "stream", 409 | "text": [ 410 | " ['max_opening_value $53.11 at 2017-12-14'] \n", 411 | " ['lowest_opening_value $34.0 at 2017-01-24']\n" 412 | ] 413 | } 414 | ], 415 | "source": [ 416 | "# Method-3: \n", 417 | "\n", 418 | "def min_max_opening(data):\n", 419 | " max_opening = ['max_opening_value $' + str(v) + ' at ' + k for k,v in data.items() if v == max(data.values()) ]\n", 420 | " lowest_opening = ['lowest_opening_value $' + str(v) + ' at ' + k for k,v in data.items() if v == min(data.values()) ]\n", 421 | " return print('',max_opening,'\\n',lowest_opening)\n", 422 | "\n", 423 | "min_max_opening(openings)" 424 | ] 425 | }, 426 | { 427 | "cell_type": "markdown", 428 | "metadata": {}, 429 | "source": [ 430 | "### 4. What was the largest change in any one day (based on High and Low price)?" 431 | ] 432 | }, 433 | { 434 | "cell_type": "code", 435 | "execution_count": 20, 436 | "metadata": {}, 437 | "outputs": [ 438 | { 439 | "name": "stdout", 440 | "output_type": "stream", 441 | "text": [ 442 | "The largest change in any one day is:2.8100000000000023\n" 443 | ] 444 | } 445 | ], 446 | "source": [ 447 | "high = [row[2] for row in data_json if row[2] != None]\n", 448 | "\n", 449 | "low = [row[3] for row in data_json if row[3] != None]\n", 450 | "\n", 451 | "subs = [abs(x1 - x2) for (x1, x2) in zip(high, low)]\n", 452 | "\n", 453 | "print (\"The largest change in any one day is:\" + str(max(subs)))" 454 | ] 455 | }, 456 | { 457 | "cell_type": "markdown", 458 | "metadata": {}, 459 | "source": [ 460 | "### 5. What was the largest change between any two days (based on Closing Price)?" 
461 | ] 462 | }, 463 | { 464 | "cell_type": "code", 465 | "execution_count": 21, 466 | "metadata": {}, 467 | "outputs": [ 468 | { 469 | "name": "stdout", 470 | "output_type": "stream", 471 | "text": [ 472 | "The largest change between two days is:2.559999999999995\n" 473 | ] 474 | } 475 | ], 476 | "source": [ 477 | "closing = [row[4] for row in data_json if row[4] != None]\n", 478 | "\n", 479 | "closing_prvs = [row[4] for row in data_json if row[4] != None][1:]\n", 480 | "\n", 481 | "sub = [abs(x1 - x2) for (x1, x2) in zip(closing, closing_prvs)]\n", 482 | "\n", 483 | "print (\"The largest change between two days is:\" + str(max(sub)))" 484 | ] 485 | }, 486 | { 487 | "cell_type": "markdown", 488 | "metadata": {}, 489 | "source": [ 490 | "### 6. What was the average daily trading volume during this year?" 491 | ] 492 | }, 493 | { 494 | "cell_type": "code", 495 | "execution_count": 22, 496 | "metadata": {}, 497 | "outputs": [ 498 | { 499 | "name": "stdout", 500 | "output_type": "stream", 501 | "text": [ 502 | "The average daily trading volume in 2017: 89124.34\n" 503 | ] 504 | } 505 | ], 506 | "source": [ 507 | "trading_volume = [row[6] for row in data_json]\n", 508 | "\n", 509 | "volume_avg = sum(trading_volume) / len(trading_volume)\n", 510 | "\n", 511 | "print (\"The average daily trading volume in 2017: \" + str(round(volume_avg,2)))" 512 | ] 513 | }, 514 | { 515 | "cell_type": "markdown", 516 | "metadata": {}, 517 | "source": [ 518 | "### 7. (Optional) What was the median trading volume during this year? (Note: you may need to implement your own function for calculating the median.)" 519 | ] 520 | }, 521 | { 522 | "cell_type": "code", 523 | "execution_count": 26, 524 | "metadata": {}, 525 | "outputs": [ 526 | { 527 | "name": "stdout", 528 | "output_type": "stream", 529 | "text": [ 530 | "\n", 531 | " 76286.0 is the median trading volume during this year\n" 532 | ] 533 | } 534 | ], 535 | "source": [ 536 | "def find_median(values):\n", 537 | "\n", 538 | " # First sort the list in ascending order\n", 539 | " sorted_trading_vol = sorted(values, reverse= False)\n", 540 | " \n", 541 | " # Calculate the size of the list\n", 542 | " size = len(sorted_trading_vol)\n", 543 | " \n", 544 | " # Check whether the number of values is odd or even (assuming the list is not empty)\n", 545 | " if size % 2 == 1:\n", 546 | " return sorted_trading_vol[size//2]\n", 547 | " else:\n", 548 | " return sum(sorted_trading_vol[size//2-1:size//2+1])/2.0\n", 549 | " \n", 550 | "print('\\n',find_median(values = trading_volume) , ' is the median trading volume during this year')" 551 | ] 552 | } 553 | ], 554 | "metadata": { 555 | "kernelspec": { 556 | "display_name": "Python 3", 557 | "language": "python", 558 | "name": "python3" 559 | }, 560 | "language_info": { 561 | "codemirror_mode": { 562 | "name": "ipython", 563 | "version": 3 564 | }, 565 | "file_extension": ".py", 566 | "mimetype": "text/x-python", 567 | "name": "python", 568 | "nbconvert_exporter": "python", 569 | "pygments_lexer": "ipython3", 570 | "version": "3.7.1" 571 | } 572 | }, 573 | "nbformat": 4, 574 | "nbformat_minor": 2 575 | } 576 | -------------------------------------------------------------------------------- /API_Project-Quandl/API_Project-Quandl.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# API_Project-Quandl" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": {}, 14 | "outputs":
[], 15 | "source": [ 16 | "# Importing the relevant modules.\n", 17 | "\n", 18 | "import requests\n", 19 | "import json \n", 20 | "import operator\n", 21 | "import numpy as np" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": 2, 27 | "metadata": {}, 28 | "outputs": [], 29 | "source": [ 30 | "# Unique API key which is taken from http://www.quandl.com website.\n", 31 | "\n", 32 | "API_KEY = 'zHER-uPSaTEaUxTgB2d4' " 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": 3, 38 | "metadata": {}, 39 | "outputs": [], 40 | "source": [ 41 | "# Caling the Quandl API and pull out a small sample of the data (only one day) to get a glimpse \n", 42 | "# into the JSON structure that will be returned.\n", 43 | "\n", 44 | "url = 'https://www.quandl.com/api/v3/datasets/FSE/AFX_X.json?&start_date=2017-01-01&end_date=2017-01-01&api_key=' + API_KEY\n", 45 | "r = requests.get(url)\n", 46 | "r_json = r.json()" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": 22, 52 | "metadata": {}, 53 | "outputs": [ 54 | { 55 | "data": { 56 | "text/plain": [ 57 | "{'dataset': {'id': 10095370,\n", 58 | " 'dataset_code': 'AFX_X',\n", 59 | " 'database_code': 'FSE',\n", 60 | " 'name': 'Carl Zeiss Meditec (AFX_X)',\n", 61 | " 'description': 'Stock Prices for Carl Zeiss Meditec (AFX) from the Frankfurt Stock Exchange.

Trading System: Xetra

ISIN: DE0005313704',\n", 62 | " 'refreshed_at': '2018-10-16T22:29:02.721Z',\n", 63 | " 'newest_available_date': '2018-10-16',\n", 64 | " 'oldest_available_date': '2000-06-07',\n", 65 | " 'column_names': ['Date',\n", 66 | " 'Open',\n", 67 | " 'High',\n", 68 | " 'Low',\n", 69 | " 'Close',\n", 70 | " 'Change',\n", 71 | " 'Traded Volume',\n", 72 | " 'Turnover',\n", 73 | " 'Last Price of the Day',\n", 74 | " 'Daily Traded Units',\n", 75 | " 'Daily Turnover'],\n", 76 | " 'frequency': 'daily',\n", 77 | " 'type': 'Time Series',\n", 78 | " 'premium': False,\n", 79 | " 'limit': None,\n", 80 | " 'transform': None,\n", 81 | " 'column_index': None,\n", 82 | " 'start_date': '2017-01-01',\n", 83 | " 'end_date': '2017-01-01',\n", 84 | " 'data': [],\n", 85 | " 'collapse': None,\n", 86 | " 'order': None,\n", 87 | " 'database_id': 6129}}" 88 | ] 89 | }, 90 | "execution_count": 22, 91 | "metadata": {}, 92 | "output_type": "execute_result" 93 | } 94 | ], 95 | "source": [ 96 | "# Inspect the JSON structure of the object you created, and take note of how nested it is, as well as\n", 97 | "# the overall structure.\n", 98 | "\n", 99 | "r_json" 100 | ] 101 | }, 102 | { 103 | "cell_type": "markdown", 104 | "metadata": {}, 105 | "source": [ 106 | "Note: The dataset type is 'Time Series'. The returned JSON has a single top-level key, 'dataset', whose value is a nested dictionary. That nested dictionary holds 19 scalar key-value pairs plus two list-valued keys: 'column_names' (a list of 11 column names) and 'data' (an empty list here, since no trading data is returned for 2017-01-01)." 107 | ] 108 | }, 109 | { 110 | "cell_type": "markdown", 111 | "metadata": {}, 112 | "source": [ 113 | "### 1. Collect data from the Frankfurt Stock Exchange, for the ticker AFX_X, for the whole year 2017." 114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": 11, 119 | "metadata": {}, 120 | "outputs": [], 121 | "source": [ 122 | "# Set start and end date for the whole year 2017. \n", 123 | "\n", 124 | "url = \"https://www.quandl.com/api/v3/datasets/FSE/AFX_X.json?\"+ \"&start_date=2017-01-01&end_date=2017-12-31&api_key=\" + API_KEY\n", 125 | "r = requests.get(url)" 126 | ] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "execution_count": 12, 131 | "metadata": {}, 132 | "outputs": [ 133 | { 134 | "data": { 135 | "text/plain": [ 136 | "200" 137 | ] 138 | }, 139 | "execution_count": 12, 140 | "metadata": {}, 141 | "output_type": "execute_result" 142 | } 143 | ], 144 | "source": [ 145 | "# Checking the response status code. The result should be 200 if the request succeeded.\n", 146 | "\n", 147 | "r.status_code" 148 | ] 149 | }, 150 | { 151 | "cell_type": "code", 152 | "execution_count": 15, 153 | "metadata": {}, 154 | "outputs": [ 155 | { 156 | "data": { 157 | "text/plain": [ 158 | "True" 159 | ] 160 | }, 161 | "execution_count": 15, 162 | "metadata": {}, 163 | "output_type": "execute_result" 164 | } 165 | ], 166 | "source": [ 167 | "# Requests also comes with a built-in status code lookup object for easy reference:\n", 168 | "\n", 169 | "r.status_code == requests.codes.ok" 170 | ] 171 | }, 172 | { 173 | "cell_type": "markdown", 174 | "metadata": {}, 175 | "source": [ 176 | "### 2. Convert the returned JSON object into a Python dictionary."
177 | ] 178 | }, 179 | { 180 | "cell_type": "code", 181 | "execution_count": 14, 182 | "metadata": { 183 | "scrolled": true 184 | }, 185 | "outputs": [ 186 | { 187 | "data": { 188 | "text/plain": [ 189 | "dict" 190 | ] 191 | }, 192 | "execution_count": 14, 193 | "metadata": {}, 194 | "output_type": "execute_result" 195 | } 196 | ], 197 | "source": [ 198 | "r_json = r.json()\n", 199 | "type(r_json)" 200 | ] 201 | }, 202 | { 203 | "cell_type": "code", 204 | "execution_count": 16, 205 | "metadata": {}, 206 | "outputs": [ 207 | { 208 | "name": "stdout", 209 | "output_type": "stream", 210 | "text": [ 211 | "{'dataset': {'id': 10095370, 'dataset_code': 'AFX_X', 'database_code': 'FSE', 'name': 'Carl Zeiss Meditec (AFX_X)', 'description': 'Stock Prices for Carl Zeiss Meditec (AFX) from the Frankfurt Stock Exchange.

Trading System: Xetra

ISIN: DE0005313704', 'refreshed_at': '2018-10-29T22:33:28.139Z', 'newest_available_date': '2018-10-29', 'oldest_available_date': '2000-06-07', 'column_names': ['Date', 'Open', 'High', 'Low', 'Close', 'Change', 'Traded Volume', 'Turnover', 'Last Price of the Day', 'Daily Traded Units', 'Daily Turnover'], 'frequency': 'daily', 'type': 'Time Series', 'premium': False, 'limit': None, 'transform': None, 'column_index': None, 'start_date': '2017-01-01', 'end_date': '2017-12-31', 'data': [['2017-12-29', 51.76, 51.94, 51.45, 51.76, None, 34640.0, 1792304.0, None, None, None], ['2017-12-28', 51.65, 51.82, 51.43, 51.6, None, 40660.0, 2099024.0, None, None, None], ['2017-12-27', 51.45, 51.89, 50.76, 51.82, None, 57452.0, 2957018.0, None, None, None], ['2017-12-22', 51.05, 51.5, 50.92, 51.32, None, 71165.0, 3641949.0, None, None, None], ['2017-12-21', 51.16, 51.52, 50.9, 51.4, None, 120649.0, 6179433.0, None, None, None], ['2017-12-20', 51.88, 52.04, 51.2, 51.27, None, 50587.0, 2610258.0, None, None, None], ['2017-12-19', 52.73, 52.73, 51.07, 51.66, None, 137313.0, 7102361.0, None, None, None], ['2017-12-18', 52.37, 52.75, 51.61, 52.62, None, 129733.0, 6770499.0, None, None, None], ['2017-12-15', 52.7, 52.7, 51.64, 52.01, None, 204080.0, 10596319.0, None, None, None], ['2017-12-14', 53.11, 53.54, 52.15, 52.67, None, 132981.0, 7016953.0, None, None, None], ['2017-12-13', 52.64, 53.35, 52.48, 53.09, None, 128434.0, 6801159.0, None, None, None], ['2017-12-12', 52.29, 53.1, 51.82, 52.43, None, 87911.0, 4615924.0, None, None, None], ['2017-12-11', 52.28, 52.45, 51.26, 52.14, None, 71817.0, 3724193.0, None, None, None], ['2017-12-08', 51.5, 52.83, 51.28, 52.12, None, 109157.0, 5690648.0, None, None, None], ['2017-12-07', 50.89, 51.47, 50.81, 51.47, None, 48123.0, 2463848.0, None, None, None], ['2017-12-06', 50.8, 51.11, 50.39, 50.89, None, 88730.0, 4504075.0, None, None, None], ['2017-12-05', 51.21, 51.38, 50.4, 51.25, None, 83023.0, 4231971.0, None, None, None], ['2017-12-04', 49.5, 51.23, 49.5, 51.14, None, 94385.0, 4800027.0, None, None, None], ['2017-12-01', 49.52, 50.49, 49.17, 49.86, None, 101733.0, 5065932.0, None, None, None], ['2017-11-30', 48.64, 49.84, 48.28, 49.7, None, 123019.0, 6085171.0, None, None, None], ['2017-11-29', 49.64, 49.64, 48.7, 48.75, None, 67342.0, 3292223.0, None, None, None], ['2017-11-28', 49.09, 49.89, 49.03, 49.25, None, 42669.0, 2107358.0, None, None, None], ['2017-11-27', 49.13, 49.73, 48.96, 49.2, None, 102180.0, 5055762.0, None, None, None], ['2017-11-24', 49.11, 49.41, 48.87, 49.11, None, 50350.0, 2472842.0, None, None, None], ['2017-11-23', 48.8, 49.46, 48.45, 49.2, None, 38834.0, 1909352.0, None, None, None], ['2017-11-22', 48.4, 49.61, 48.39, 48.8, None, 91142.0, 4478093.0, None, None, None], ['2017-11-21', 47.25, 48.59, 46.78, 48.39, None, 78502.0, 3782098.0, None, None, None], ['2017-11-20', 46.57, 47.38, 46.54, 47.04, None, 97252.0, 4563515.0, None, None, None], ['2017-11-17', 47.03, 47.15, 46.8, 46.84, None, 54107.0, 2540820.0, None, None, None], ['2017-11-16', 47.09, 47.23, 46.55, 47.03, None, 89373.0, 4195732.0, None, None, None], ['2017-11-15', 47.98, 48.01, 46.75, 47.05, None, 67593.0, 3188321.0, None, None, None], ['2017-11-14', 48.4, 48.9, 47.84, 48.0, None, 67672.0, 3259979.0, None, None, None], ['2017-11-13', 48.38, 48.61, 47.76, 48.34, None, 76286.0, 3681337.0, None, None, None], ['2017-11-10', 47.3, 48.89, 47.16, 48.34, None, 90245.0, 4361552.0, None, None, None], ['2017-11-09', 47.65, 48.06, 47.09, 47.21, None, 120268.0, 5712034.0, None, None, None], 
['2017-11-08', 46.42, 47.72, 46.42, 47.47, None, 94195.0, 4463935.0, None, None, None], ['2017-11-07', 46.16, 46.33, 45.84, 46.26, None, 48152.0, 2224221.0, None, None, None], ['2017-11-06', 45.81, 46.09, 45.76, 45.99, None, 60716.0, 2789220.0, None, None, None], ['2017-11-03', 45.0, 46.04, 44.83, 45.97, None, 56911.0, 2603498.0, None, None, None], ['2017-11-02', 45.88, 46.06, 45.18, 45.27, None, 37958.0, 1724840.0, None, None, None], ['2017-11-01', 46.29, 46.55, 45.97, 46.04, None, 56319.0, 2603859.0, None, None, None], ['2017-10-30', 46.53, 46.65, 45.61, 45.76, None, 56245.0, 2585397.0, None, None, None], ['2017-10-27', 45.48, 46.42, 45.46, 46.41, None, 74472.0, 3434087.0, None, None, None], ['2017-10-26', 45.2, 45.41, 44.91, 45.41, None, 56319.0, 2548078.0, None, None, None], ['2017-10-25', 45.01, 45.06, 44.7, 45.0, None, 47730.0, 2145697.0, None, None, None], ['2017-10-24', 45.16, 45.27, 44.75, 44.85, None, 43042.0, 1937616.0, None, None, None], ['2017-10-23', 44.9, 45.34, 44.89, 45.0, None, 43375.0, 1952918.0, None, None, None], ['2017-10-20', 45.08, 45.34, 44.76, 44.87, None, 55707.0, 2503853.0, None, None, None], ['2017-10-19', 45.72, 45.85, 44.79, 45.0, None, 59991.0, 2703085.0, None, None, None], ['2017-10-18', 46.01, 46.2, 45.61, 45.77, None, 45263.0, 2076951.0, None, None, None], ['2017-10-17', 45.8, 46.06, 45.37, 45.96, None, 65837.0, 3014080.0, None, None, None], ['2017-10-16', 45.61, 45.75, 45.3, 45.55, None, 49246.0, 2243129.0, None, None, None], ['2017-10-13', 45.5, 45.7, 45.37, 45.4, None, 43362.0, 1971801.0, None, None, None], ['2017-10-12', 45.58, 45.58, 45.17, 45.43, None, 49180.0, 2233481.0, None, None, None], ['2017-10-11', 45.97, 45.97, 45.25, 45.29, None, 69455.0, 3158321.0, None, None, None], ['2017-10-10', 45.64, 46.04, 45.57, 45.84, None, 65860.0, 3016658.0, None, None, None], ['2017-10-09', 46.2, 46.2, 45.6, 45.74, None, 44059.0, 2015453.0, None, None, None], ['2017-10-06', 46.19, 46.19, 45.69, 46.0, None, 66760.0, 3066198.0, None, None, None], ['2017-10-05', 46.01, 46.09, 45.63, 46.05, None, 94804.0, 4352002.0, None, None, None], ['2017-10-04', 45.36, 46.17, 45.22, 46.11, None, 115706.0, 5313199.0, None, None, None], ['2017-10-02', 44.51, 44.98, 44.18, 44.98, None, 95313.0, 4265024.0, None, None, None], ['2017-09-29', 43.58, 44.17, 43.3, 44.17, None, 99821.0, 4384796.0, None, None, None], ['2017-09-28', 42.0, 43.56, 42.0, 43.56, None, 157234.0, 6775569.0, None, None, None], ['2017-09-27', 42.35, 42.49, 41.78, 42.04, None, 76600.0, 3219861.0, None, None, None], ['2017-09-26', 42.3, 42.57, 42.11, 42.37, None, 51321.0, 2175381.0, None, None, None], ['2017-09-25', 42.3, 42.3, 41.96, 42.07, None, 56224.0, 2366453.0, None, None, None], ['2017-09-22', 41.48, 42.38, 41.48, 42.06, None, 79955.0, 3362517.0, None, None, None], ['2017-09-21', 42.29, 42.29, 41.39, 41.46, None, 105194.0, 4378409.0, None, None, None], ['2017-09-20', 42.54, 42.54, 41.99, 41.99, None, 57838.0, 2440557.0, None, None, None], ['2017-09-19', 42.65, 42.65, 42.13, 42.44, None, 65546.0, 2777065.0, None, None, None], ['2017-09-18', 42.5, 42.63, 42.23, 42.27, None, 44037.0, 1864954.0, None, None, None], ['2017-09-15', 42.29, 42.81, 42.25, 42.42, None, 107144.0, 4555791.0, None, None, None], ['2017-09-14', 42.35, 42.8, 42.35, 42.52, None, 65157.0, 2770696.0, None, None, None], ['2017-09-13', 42.49, 42.69, 42.22, 42.45, None, 68801.0, 2921240.0, None, None, None], ['2017-09-12', 43.21, 43.34, 42.62, 42.73, None, 52828.0, 2259924.0, None, None, None], ['2017-09-11', 42.81, 42.89, 42.56, 42.85, None, 
103273.0, 4415614.0, None, None, None], ['2017-09-08', 42.7, 42.75, 42.56, 42.67, None, 59881.0, 2553977.0, None, None, None], ['2017-09-07', 43.0, 43.02, 42.67, 42.77, None, 64320.0, 2751388.0, None, None, None], ['2017-09-06', 42.66, 42.71, 42.34, 42.55, None, 71006.0, 3020229.0, None, None, None], ['2017-09-05', 43.0, 43.19, 42.55, 42.62, None, 66351.0, 2846115.0, None, None, None], ['2017-09-04', 42.38, 42.75, 41.95, 42.6, None, 105288.0, 4471634.0, None, None, None], ['2017-09-01', 42.16, 43.06, 42.07, 42.41, None, 151474.0, 6453558.0, None, None, None], ['2017-08-31', 42.0, 42.08, 41.12, 41.9, None, 157888.0, 6580200.0, None, None, None], ['2017-08-30', 42.0, 42.2, 41.49, 41.94, None, 97804.0, 4090262.0, None, None, None], ['2017-08-29', 41.71, 41.98, 41.33, 41.85, None, 98156.0, 4094452.0, None, None, None], ['2017-08-28', 42.11, 42.25, 41.86, 41.91, None, 47130.0, 1978704.0, None, None, None], ['2017-08-25', 42.64, 42.64, 42.05, 42.14, None, 69734.0, 2948016.0, None, None, None], ['2017-08-24', 42.72, 43.05, 42.63, 42.69, None, 65213.0, 2792319.0, None, None, None], ['2017-08-23', 42.82, 43.17, 42.6, 42.71, None, 70269.0, 3011578.0, None, None, None], ['2017-08-22', 42.46, 42.96, 42.4, 42.71, None, 95376.0, 4075646.0, None, None, None], ['2017-08-21', 42.42, 42.76, 42.2, 42.26, None, 68812.0, 2922972.0, None, None, None], ['2017-08-18', 42.28, 42.6, 42.01, 42.41, None, 72886.0, 3092377.0, None, None, None], ['2017-08-17', 41.88, 43.01, 41.76, 42.5, None, 131361.0, 5583704.0, None, None, None], ['2017-08-16', 42.4, 42.62, 41.98, 42.05, None, 104676.0, 4408312.0, None, None, None], ['2017-08-15', 42.53, 42.53, 42.2, 42.28, None, 64334.0, 2721852.0, None, None, None], ['2017-08-14', 42.12, 42.69, 42.01, 42.3, None, 127682.0, 5416963.0, None, None, None], ['2017-08-11', 41.3, 41.94, 40.96, 41.94, None, 183412.0, 7604144.0, None, None, None], ['2017-08-10', 41.73, 41.99, 41.14, 41.68, None, 175161.0, 7303562.0, None, None, None], ['2017-08-09', 43.5, 43.5, 41.64, 41.81, None, 355857.0, 15003956.0, None, None, None], ['2017-08-08', 44.9, 45.09, 44.15, 44.37, None, 156168.0, 6941408.0, None, None, None], ['2017-08-07', 45.85, 46.34, 44.02, 44.96, None, 164543.0, 7378816.0, None, None, None], ['2017-08-04', 45.13, 45.13, 44.36, 45.07, None, 96202.0, 4306911.0, None, None, None], ['2017-08-03', 45.34, 45.54, 44.91, 44.97, None, 77854.0, 3517146.0, None, None, None], ['2017-08-02', 45.25, 45.77, 44.9, 45.56, None, 187468.0, 8528548.0, None, None, None], ['2017-08-01', 45.24, 45.54, 45.1, 45.45, None, 74975.0, 3399891.0, None, None, None], ['2017-07-31', 44.94, 45.75, 44.94, 45.3, None, 62672.0, 2844210.0, None, None, None], ['2017-07-28', 45.26, 45.29, 44.75, 44.97, None, 114006.0, 5127247.0, None, None, None], ['2017-07-27', 45.16, 45.45, 45.15, 45.25, None, 50557.0, 2290284.0, None, None, None], ['2017-07-26', 44.91, 45.33, 44.46, 45.16, None, 81970.0, 3688510.0, None, None, None], ['2017-07-25', 44.7, 45.04, 44.63, 44.82, None, 112224.0, 5033312.0, None, None, None], ['2017-07-24', 45.31, 45.31, 44.49, 44.61, None, 104282.0, 4661866.0, None, None, None], ['2017-07-21', 45.57, 45.88, 45.04, 45.44, None, 73422.0, 3334695.0, None, None, None], ['2017-07-20', 45.74, 45.96, 45.23, 45.66, None, 87399.0, 3986488.0, None, None, None], ['2017-07-19', 45.06, 45.72, 44.94, 45.57, None, 71971.0, 3273001.0, None, None, None], ['2017-07-18', 45.5, 45.55, 44.7, 45.0, None, 104003.0, 4684627.0, None, None, None], ['2017-07-17', 45.6, 46.23, 45.29, 45.6, None, 104995.0, 4801806.0, None, None, None], 
['2017-07-14', 45.07, 45.56, 44.83, 45.53, None, 67375.0, 3054060.0, None, None, None], ['2017-07-13', 44.67, 45.18, 44.67, 44.95, None, 82745.0, 3718928.0, None, None, None], ['2017-07-12', 44.29, 45.05, 43.89, 44.95, None, 115705.0, 5133971.0, None, None, None], ['2017-07-11', 44.94, 44.94, 44.08, 44.2, None, 90538.0, 4010457.0, None, None, None], ['2017-07-10', 44.64, 45.18, 44.51, 44.7, None, 71868.0, 3221218.0, None, None, None], ['2017-07-07', 44.79, 44.79, 44.25, 44.53, None, 47999.0, 2136578.0, None, None, None], ['2017-07-06', 45.5, 45.5, 44.15, 44.62, None, 66116.0, 2952605.0, None, None, None], ['2017-07-05', 44.67, 45.36, 44.44, 45.19, None, 48706.0, 2189436.0, None, None, None], ['2017-07-04', 45.83, 45.83, 44.74, 44.8, None, 50549.0, 2273551.0, None, None, None], ['2017-07-03', 45.29, 45.83, 45.06, 45.75, None, 71381.0, 3251502.0, None, None, None], ['2017-06-30', 45.01, 45.74, 45.0, 45.44, None, 136112.0, 6187148.0, None, None, None], ['2017-06-29', 45.73, 45.81, 45.11, 45.2, None, 134965.0, 6132452.0, None, None, None], ['2017-06-28', 46.68, 46.68, 45.41, 45.68, None, 117165.0, 5381488.0, None, None, None], ['2017-06-27', 47.23, 47.33, 46.39, 46.83, None, 82492.0, 3866344.0, None, None, None], ['2017-06-26', 46.95, 47.63, 46.91, 47.21, None, 73322.0, 3465639.0, None, None, None], ['2017-06-23', 47.29, 47.4, 46.79, 46.99, None, 80586.0, 3792498.0, None, None, None], ['2017-06-22', 47.03, 47.4, 46.75, 47.29, None, 56071.0, 2640508.0, None, None, None], ['2017-06-21', 47.46, 47.48, 46.53, 46.99, None, 89752.0, 4206563.0, None, None, None], ['2017-06-20', 46.48, 47.43, 46.27, 47.37, None, 108334.0, 5109730.0, None, None, None], ['2017-06-19', 46.9, 46.9, 46.25, 46.64, None, 70056.0, 3260381.0, None, None, None], ['2017-06-16', 45.66, 46.8, 45.66, 46.63, None, 202214.0, 9411695.0, None, None, None], ['2017-06-15', 46.34, 46.34, 45.21, 45.67, None, 101733.0, 4635593.0, None, None, None], ['2017-06-14', 46.52, 46.86, 46.05, 46.33, None, 83741.0, 3881453.0, None, None, None], ['2017-06-13', 46.5, 46.51, 46.03, 46.32, None, 107644.0, 4981185.0, None, None, None], ['2017-06-12', 47.31, 47.43, 45.89, 46.31, None, 112942.0, 5238390.0, None, None, None], ['2017-06-09', 46.77, 47.44, 46.55, 47.44, None, 99674.0, 4702170.0, None, None, None], ['2017-06-08', 47.8, 47.8, 46.27, 46.27, None, 1945.0, 90599.0, None, None, None], ['2017-06-07', 47.01, 47.43, 47.01, 47.43, None, 1081.0, 51021.0, None, None, None], ['2017-06-06', 47.12, 47.45, 46.21, 47.43, None, 686.0, 32083.0, None, None, None], ['2017-06-02', 46.8, 46.99, 46.72, 46.99, None, 290.0, 13584.0, None, None, None], ['2017-06-01', 46.12, 46.52, 45.89, 46.52, None, 106513.0, 4930686.0, None, None, None], ['2017-05-31', 45.22, 46.26, 45.22, 45.86, None, 522.0, 24044.0, None, None, None], ['2017-05-30', 45.05, 46.02, 45.05, 46.02, None, 587.0, 26792.0, None, None, None], ['2017-05-29', 45.61, 45.61, 45.24, 45.32, None, 112.0, 5089.0, None, None, None], ['2017-05-26', 44.8, 45.36, 44.71, 45.3, None, 74453.0, 3360707.0, None, None, None], ['2017-05-25', 44.8, 44.87, 44.29, 44.78, None, 49970.0, 2231857.0, None, None, None], ['2017-05-24', 43.92, 44.67, 43.92, 44.53, None, 111923.0, 4971343.0, None, None, None], ['2017-05-23', 43.67, 44.13, 43.55, 43.9, None, 38308.0, 1681904.0, None, None, None], ['2017-05-22', 44.16, 44.22, 43.44, 43.84, None, 70856.0, 3103013.0, None, None, None], ['2017-05-19', 43.74, 44.12, 43.74, 44.12, None, 45.0, 1980.0, None, None, None], ['2017-05-18', 44.0, 44.3, 43.29, 43.98, None, 166160.0, 7277314.0, None, 
None, None], ['2017-05-17', 45.06, 45.34, 44.01, 44.19, None, 149515.0, 6664744.0, None, None, None], ['2017-05-16', 45.15, 45.36, 44.56, 45.31, None, 101476.0, 4567885.0, None, None, None], ['2017-05-15', 45.09, 45.78, 44.31, 45.14, None, 193702.0, 8734286.0, None, None, None], ['2017-05-12', 45.18, 45.18, 44.16, 44.99, None, 159495.0, 7113519.0, None, None, None], ['2017-05-11', 43.4, 46.06, 43.25, 45.0, None, 189125.0, 8496322.0, None, None, None], ['2017-05-10', 43.5, 43.6, 42.53, 43.28, None, 91858.0, 3958630.0, None, None, None], ['2017-05-09', 41.83, 43.55, 41.82, 43.3, None, 151439.0, 6538516.0, None, None, None], ['2017-05-08', 43.0, 43.0, 42.04, 42.24, None, 97456.0, 4128048.0, None, None, None], ['2017-05-05', 42.52, 42.91, 42.38, 42.75, None, 78512.0, 3353971.0, None, None, None], ['2017-05-04', 41.86, 42.5, 41.71, 42.5, None, 82058.0, 3465505.0, None, None, None], ['2017-05-03', 42.2, 42.29, 41.78, 41.9, None, 65266.0, 2738394.0, None, None, None], ['2017-05-02', 41.89, 42.23, 41.76, 42.15, None, 86559.0, 3636583.0, None, None, None], ['2017-05-01', None, 42.245, 41.655, 41.72, -0.44, 86348.0, 3606589.0, None, None, None], ['2017-04-28', 42.17, 42.25, 41.66, 41.72, None, 86348.0, 3606589.0, None, None, None], ['2017-04-27', 41.51, 42.24, 41.51, 42.16, None, 151683.0, 6380639.0, None, None, None], ['2017-04-26', 41.88, 41.94, 41.4, 41.5, None, 65847.0, 2743109.0, None, None, None], ['2017-04-25', 41.93, 42.18, 41.66, 41.89, None, 85973.0, 3604204.0, None, None, None], ['2017-04-24', 42.01, 42.02, 41.23, 41.81, None, 102084.0, 4247032.0, None, None, None], ['2017-04-21', 41.97, 42.14, 41.01, 41.32, None, 186784.0, 7728103.0, None, None, None], ['2017-04-20', 42.5, 42.64, 41.52, 41.93, None, 223621.0, 9418192.0, None, None, None], ['2017-04-19', 41.94, 42.61, 41.94, 42.61, None, 92722.0, 3930856.0, None, None, None], ['2017-04-18', 42.24, 42.4, 41.54, 42.0, None, 133057.0, 5587565.0, None, None, None], ['2017-04-17', None, 42.48, 41.985, 42.2, None, 88416.0, 3734717.0, None, None, None], ['2017-04-14', None, 42.48, 41.985, 42.2, None, 88416.0, 3734717.0, None, None, None], ['2017-04-13', 42.06, 42.48, 41.99, 42.2, None, 88416.0, 3734717.0, None, None, None], ['2017-04-12', 42.02, 42.45, 41.84, 42.2, None, 158278.0, 6672547.0, None, None, None], ['2017-04-11', 41.62, 42.03, 41.53, 41.75, None, 107817.0, 4501109.0, None, None, None], ['2017-04-10', 41.46, 41.68, 41.31, 41.68, None, 62297.0, 2585922.0, None, None, None], ['2017-04-07', 40.9, 41.42, 40.84, 41.42, None, 81255.0, 3344628.0, None, None, None], ['2017-04-06', 40.96, 41.25, 40.83, 41.05, None, 96794.0, 3968681.0, None, None, None], ['2017-04-05', 41.1, 41.34, 40.79, 41.1, None, 156005.0, 6404780.0, None, None, None], ['2017-04-04', 39.5, 40.88, 39.48, 40.81, None, 193156.0, 7822665.0, None, None, None], ['2017-04-03', 40.15, 40.15, 39.54, 39.64, None, 127973.0, 5081376.0, None, None, None], ['2017-03-31', 39.77, 40.07, 39.42, 39.98, None, 95382.0, 3795061.0, None, None, None], ['2017-03-30', 40.02, 40.14, 39.42, 39.75, None, 189201.0, 7541354.0, None, None, None], ['2017-03-29', 39.39, 40.01, 39.05, 40.01, None, 335406.0, 13349426.0, None, None, None], ['2017-03-28', 38.95, 39.35, 38.79, 39.22, None, 115075.0, 4505494.0, None, None, None], ['2017-03-27', 38.73, 39.1, 38.53, 38.85, None, 191515.0, 7446952.0, None, None, None], ['2017-03-24', 38.94, 39.02, 38.6, 38.94, None, 210926.0, 8205507.0, None, None, None], ['2017-03-23', 39.01, 39.25, 38.63, 38.96, None, 169971.0, 6621807.0, None, None, None], ['2017-03-22', 38.25, 
39.02, 37.53, 38.94, None, 670349.0, 25910543.0, None, None, None], ['2017-03-21', 41.8, 41.83, 40.97, 40.98, None, 56906.0, 2349965.0, None, None, None], ['2017-03-20', 41.26, 42.17, 41.26, 41.97, None, 97572.0, 4074891.0, None, None, None], ['2017-03-17', 41.47, 41.59, 41.16, 41.34, None, 90109.0, 3734232.0, None, None, None], ['2017-03-16', 41.4, 41.57, 41.09, 41.46, None, 55799.0, 2308423.0, None, None, None], ['2017-03-15', 41.4, 41.5, 40.91, 41.25, None, 60324.0, 2488650.0, None, None, None], ['2017-03-14', 41.2, 41.5, 41.2, 41.3, None, 60420.0, 2498025.0, None, None, None], ['2017-03-13', 41.4, 41.46, 41.08, 41.3, None, 44803.0, 1850251.0, None, None, None], ['2017-03-10', 41.53, 41.53, 41.16, 41.4, None, 38518.0, 1592270.0, None, None, None], ['2017-03-09', 41.61, 41.61, 41.16, 41.4, None, 43988.0, 1819182.0, None, None, None], ['2017-03-08', 41.13, 41.71, 40.95, 41.68, None, 45111.0, 1870935.0, None, None, None], ['2017-03-07', 41.5, 41.8, 41.25, 41.42, None, 61925.0, 2569608.0, None, None, None], ['2017-03-06', 41.25, 41.4, 40.81, 41.4, None, 46510.0, 1916799.0, None, None, None], ['2017-03-03', 41.12, 41.22, 40.84, 41.18, None, 40800.0, 1675587.0, None, None, None], ['2017-03-02', 41.38, 41.39, 40.76, 41.17, None, 49863.0, 2048153.0, None, None, None], ['2017-03-01', 41.19, 41.57, 40.9, 41.2, None, 86753.0, 3569796.0, None, None, None], ['2017-02-28', 40.38, 40.95, 40.38, 40.84, None, 67440.0, 2747011.0, None, None, None], ['2017-02-27', 39.75, 40.64, 39.75, 40.39, None, 62655.0, 2520260.0, None, None, None], ['2017-02-24', 39.77, 40.14, 38.91, 39.74, None, 101294.0, 4015150.0, None, None, None], ['2017-02-23', 39.72, 39.98, 39.38, 39.79, None, 81945.0, 3260642.0, None, None, None], ['2017-02-22', 39.6, 39.75, 39.27, 39.7, None, 77619.0, 3066894.0, None, None, None], ['2017-02-21', 38.85, 39.57, 38.85, 39.45, None, 46070.0, 1808350.0, None, None, None], ['2017-02-20', 39.25, 39.25, 38.81, 38.98, None, 37014.0, 1444138.0, None, None, None], ['2017-02-17', 38.8, 39.03, 38.48, 39.02, None, 60583.0, 2352961.0, None, None, None], ['2017-02-16', 38.8, 39.2, 38.25, 38.71, None, 84682.0, 3282322.0, None, None, None], ['2017-02-15', 38.5, 38.93, 38.4, 38.72, None, 77420.0, 2996861.0, None, None, None], ['2017-02-14', 38.81, 38.86, 38.0, 38.37, None, 82601.0, 3163898.0, None, None, None], ['2017-02-13', 37.37, 39.36, 37.35, 38.53, None, 177171.0, 6804028.0, None, None, None], ['2017-02-10', 36.65, 37.5, 36.57, 37.06, None, 115843.0, 4291017.0, None, None, None], ['2017-02-09', 36.2, 36.25, 35.77, 36.25, None, 67781.0, 2445428.0, None, None, None], ['2017-02-08', 35.98, 36.14, 35.84, 36.05, None, 39731.0, 1431205.0, None, None, None], ['2017-02-07', 35.56, 36.05, 35.36, 35.89, None, 67410.0, 2410818.0, None, None, None], ['2017-02-06', 36.06, 36.15, 35.6, 35.64, None, 41911.0, 1496794.0, None, None, None], ['2017-02-03', 36.02, 36.2, 35.73, 36.1, None, 40705.0, 1464712.0, None, None, None], ['2017-02-02', 35.95, 36.2, 35.7, 36.07, None, 54279.0, 1953176.0, None, None, None], ['2017-02-01', 34.75, 36.0, 34.75, 35.94, None, 85137.0, 3038172.0, None, None, None], ['2017-01-31', 35.24, 35.24, 34.56, 34.56, None, 63371.0, 2199583.0, None, None, None], ['2017-01-30', 35.38, 35.59, 34.95, 35.15, None, 69603.0, 2457762.0, None, None, None], ['2017-01-27', 34.83, 35.43, 34.81, 35.3, None, 69657.0, 2444913.0, None, None, None], ['2017-01-26', 35.07, 35.58, 34.8, 34.89, None, 64103.0, 2249375.0, None, None, None], ['2017-01-25', 34.42, 34.86, 34.03, 34.83, None, 56240.0, 1947147.0, None, None, None], 
['2017-01-24', 34.0, 34.35, 33.85, 34.22, None, 48797.0, 1666086.0, None, None, None], ['2017-01-23', 34.04, 34.12, 33.62, 34.06, None, 55333.0, 1877957.0, None, None, None], ['2017-01-20', 34.54, 34.59, 34.05, 34.17, None, 80246.0, 2743474.0, None, None, None], ['2017-01-19', 35.04, 35.04, 34.42, 34.5, None, 73105.0, 2526731.0, None, None, None], ['2017-01-18', 35.04, 35.51, 34.8, 34.9, None, 65931.0, 2311608.0, None, None, None], ['2017-01-17', 35.06, 35.19, 34.79, 34.99, None, 39195.0, 1369857.0, None, None, None], ['2017-01-16', 34.85, 35.24, 34.56, 35.07, None, 47879.0, 1678679.0, None, None, None], ['2017-01-13', 34.98, 34.98, 34.6, 34.85, None, 59367.0, 2065534.0, None, None, None], ['2017-01-12', 35.38, 35.38, 34.31, 34.9, None, 163860.0, 5703427.0, None, None, None], ['2017-01-11', 34.95, 36.0, 34.84, 35.42, None, 123530.0, 4369079.0, None, None, None], ['2017-01-10', 34.8, 34.98, 34.46, 34.91, None, 43976.0, 1528055.0, None, None, None], ['2017-01-09', 35.29, 35.35, 34.43, 34.67, None, 62225.0, 2157182.0, None, None, None], ['2017-01-06', 34.91, 35.21, 34.91, 35.04, None, 27507.0, 964046.0, None, None, None], ['2017-01-05', 35.02, 35.2, 34.73, 35.06, None, 48412.0, 1692326.0, None, None, None], ['2017-01-04', 35.48, 35.51, 34.75, 35.19, None, 54408.0, 1906810.0, None, None, None], ['2017-01-03', 35.9, 35.93, 35.34, 35.48, None, 70618.0, 2515473.0, None, None, None], ['2017-01-02', 34.99, 35.94, 34.99, 35.8, None, 44700.0, 1590561.0, None, None, None]], 'collapse': None, 'order': None, 'database_id': 6129}}\n" 212 | ] 213 | } 214 | ], 215 | "source": [ 216 | "print(r_json)" 217 | ] 218 | }, 219 | { 220 | "cell_type": "markdown", 221 | "metadata": {}, 222 | "source": [ 223 | "### 3. Calculate what the highest and lowest opening prices were for the stock in this period." 
224 | ] 225 | }, 226 | { 227 | "cell_type": "code", 228 | "execution_count": 28, 229 | "metadata": {}, 230 | "outputs": [ 231 | { 232 | "data": { 233 | "text/plain": [ 234 | "dict_keys(['dataset'])" 235 | ] 236 | }, 237 | "execution_count": 28, 238 | "metadata": {}, 239 | "output_type": "execute_result" 240 | } 241 | ], 242 | "source": [ 243 | "# Review the data content\n", 244 | "r_json.keys()" 245 | ] 246 | }, 247 | { 248 | "cell_type": "code", 249 | "execution_count": 29, 250 | "metadata": {}, 251 | "outputs": [ 252 | { 253 | "data": { 254 | "text/plain": [ 255 | "dict_keys(['id', 'dataset_code', 'database_code', 'name', 'description', 'refreshed_at', 'newest_available_date', 'oldest_available_date', 'column_names', 'frequency', 'type', 'premium', 'limit', 'transform', 'column_index', 'start_date', 'end_date', 'data', 'collapse', 'order', 'database_id'])" 256 | ] 257 | }, 258 | "execution_count": 29, 259 | "metadata": {}, 260 | "output_type": "execute_result" 261 | } 262 | ], 263 | "source": [ 264 | "r_json['dataset'].keys()" 265 | ] 266 | }, 267 | { 268 | "cell_type": "code", 269 | "execution_count": 30, 270 | "metadata": {}, 271 | "outputs": [ 272 | { 273 | "data": { 274 | "text/plain": [ 275 | "['Date',\n", 276 | " 'Open',\n", 277 | " 'High',\n", 278 | " 'Low',\n", 279 | " 'Close',\n", 280 | " 'Change',\n", 281 | " 'Traded Volume',\n", 282 | " 'Turnover',\n", 283 | " 'Last Price of the Day',\n", 284 | " 'Daily Traded Units',\n", 285 | " 'Daily Turnover']" 286 | ] 287 | }, 288 | "execution_count": 30, 289 | "metadata": {}, 290 | "output_type": "execute_result" 291 | } 292 | ], 293 | "source": [ 294 | "r_json['dataset']['column_names']" 295 | ] 296 | }, 297 | { 298 | "cell_type": "code", 299 | "execution_count": 31, 300 | "metadata": {}, 301 | "outputs": [ 302 | { 303 | "data": { 304 | "text/plain": [ 305 | "[['2017-12-29',\n", 306 | " 51.76,\n", 307 | " 51.94,\n", 308 | " 51.45,\n", 309 | " 51.76,\n", 310 | " None,\n", 311 | " 34640.0,\n", 312 | " 1792304.0,\n", 313 | " None,\n", 314 | " None,\n", 315 | " None],\n", 316 | " ['2017-12-28',\n", 317 | " 51.65,\n", 318 | " 51.82,\n", 319 | " 51.43,\n", 320 | " 51.6,\n", 321 | " None,\n", 322 | " 40660.0,\n", 323 | " 2099024.0,\n", 324 | " None,\n", 325 | " None,\n", 326 | " None]]" 327 | ] 328 | }, 329 | "execution_count": 31, 330 | "metadata": {}, 331 | "output_type": "execute_result" 332 | } 333 | ], 334 | "source": [ 335 | "r_json['dataset']['data'][0:2]" 336 | ] 337 | }, 338 | { 339 | "cell_type": "markdown", 340 | "metadata": {}, 341 | "source": [ 342 | "For this particular question, I tried to show three different approaches to solve it. 
" 343 | ] 344 | }, 345 | { 346 | "cell_type": "code", 347 | "execution_count": 17, 348 | "metadata": {}, 349 | "outputs": [ 350 | { 351 | "name": "stdout", 352 | "output_type": "stream", 353 | "text": [ 354 | "Maximim and minimum opening values by dates: \n", 355 | "('2017-12-14', 53.11) ('2017-01-24', 34.0)\n" 356 | ] 357 | } 358 | ], 359 | "source": [ 360 | "# Method-1:\n", 361 | "\n", 362 | "# Index of Openings\n", 363 | "i_open = r_json['dataset']['column_names'].index('Open')\n", 364 | "\n", 365 | "# Index of the data associated with the \"Open\" value\n", 366 | "i_date = r_json['dataset']['column_names'].index('Date')\n", 367 | "\n", 368 | "# Creating a dictionary for opening values to corresponding each day\n", 369 | "data_json = r_json['dataset']['data']\n", 370 | "openings = {data_json[j][i_date] : data_json[j][i_open] for j in range(len(data_json)) if data_json[j][i_open] is not None}\n", 371 | "\n", 372 | "max_openings = max(openings.items(), key=operator.itemgetter(1))\n", 373 | "min_openings = min(openings.items(), key=operator.itemgetter(1))\n", 374 | "\n", 375 | "print('Maximim and minimum opening values by dates: ')\n", 376 | "print(max_openings, min_openings)" 377 | ] 378 | }, 379 | { 380 | "cell_type": "code", 381 | "execution_count": 18, 382 | "metadata": {}, 383 | "outputs": [ 384 | { 385 | "name": "stdout", 386 | "output_type": "stream", 387 | "text": [ 388 | "The highest opening price: 53.11\n", 389 | "The lowest opening price: 34.0\n" 390 | ] 391 | } 392 | ], 393 | "source": [ 394 | "# Method-2:\n", 395 | "\n", 396 | "opening = [row[1] for row in data_json if row[1] != None]\n", 397 | "print(\"The highest opening price: \" + str(max(opening)))\n", 398 | "print(\"The lowest opening price: \" + str(min(opening)))" 399 | ] 400 | }, 401 | { 402 | "cell_type": "code", 403 | "execution_count": 19, 404 | "metadata": {}, 405 | "outputs": [ 406 | { 407 | "name": "stdout", 408 | "output_type": "stream", 409 | "text": [ 410 | " ['max_opening_value $53.11 at 2017-12-14'] \n", 411 | " ['lowest_opening_value $34.0 at 2017-01-24']\n" 412 | ] 413 | } 414 | ], 415 | "source": [ 416 | "# Method-3: \n", 417 | "\n", 418 | "def min_max_opening(data):\n", 419 | " max_opening = ['max_opening_value $' + str(v) + ' at ' + k for k,v in data.items() if v == max(data.values()) ]\n", 420 | " lowest_opening = ['lowest_opening_value $' + str(v) + ' at ' + k for k,v in data.items() if v == min(data.values()) ]\n", 421 | " return print('',max_opening,'\\n',lowest_opening)\n", 422 | "\n", 423 | "min_max_opening(openings)" 424 | ] 425 | }, 426 | { 427 | "cell_type": "markdown", 428 | "metadata": {}, 429 | "source": [ 430 | "### 4. What was the largest change in any one day (based on High and Low price)?" 431 | ] 432 | }, 433 | { 434 | "cell_type": "code", 435 | "execution_count": 20, 436 | "metadata": {}, 437 | "outputs": [ 438 | { 439 | "name": "stdout", 440 | "output_type": "stream", 441 | "text": [ 442 | "The largest change in any one day is:2.8100000000000023\n" 443 | ] 444 | } 445 | ], 446 | "source": [ 447 | "high = [row[2] for row in data_json if row[2] != None]\n", 448 | "\n", 449 | "low = [row[3] for row in data_json if row[3] != None]\n", 450 | "\n", 451 | "subs = [abs(x1 - x2) for (x1, x2) in zip(high, low)]\n", 452 | "\n", 453 | "print (\"The largest change in any one day is:\" + str(max(subs)))" 454 | ] 455 | }, 456 | { 457 | "cell_type": "markdown", 458 | "metadata": {}, 459 | "source": [ 460 | "### 5. What was the largest change between any two days (based on Closing Price)?" 
461 | ] 462 | }, 463 | { 464 | "cell_type": "code", 465 | "execution_count": 21, 466 | "metadata": {}, 467 | "outputs": [ 468 | { 469 | "name": "stdout", 470 | "output_type": "stream", 471 | "text": [ 472 | "The largest change between two days is:2.559999999999995\n" 473 | ] 474 | } 475 | ], 476 | "source": [ 477 | "closing = [row[4] for row in data_json if row[4] != None]\n", 478 | "\n", 479 | "closing_prvs = [row[4] for row in data_json if row[4] != None][1:]\n", 480 | "\n", 481 | "sub = [abs(x1 - x2) for (x1, x2) in zip(closing, closing_prvs)]\n", 482 | "\n", 483 | "print (\"The largest change between two days is:\" + str(max(sub)))" 484 | ] 485 | }, 486 | { 487 | "cell_type": "markdown", 488 | "metadata": {}, 489 | "source": [ 490 | "### 6. What was the average daily trading volume during this year?" 491 | ] 492 | }, 493 | { 494 | "cell_type": "code", 495 | "execution_count": 22, 496 | "metadata": {}, 497 | "outputs": [ 498 | { 499 | "name": "stdout", 500 | "output_type": "stream", 501 | "text": [ 502 | "The avarage daily trading volume in 2017: 89124.34\n" 503 | ] 504 | } 505 | ], 506 | "source": [ 507 | "trading_volume = [row[6] for row in data_json]\n", 508 | "\n", 509 | "volume_avg = sum(trading_volume) / len(trading_volume)\n", 510 | "\n", 511 | "print (\"The avarage daily trading volume in 2017: \" + str(round(volume_avg,2)))" 512 | ] 513 | }, 514 | { 515 | "cell_type": "markdown", 516 | "metadata": {}, 517 | "source": [ 518 | "### 7. (Optional) What was the median trading volume during this year. (Note: you may need to implement your own function for calculating the median.)" 519 | ] 520 | }, 521 | { 522 | "cell_type": "code", 523 | "execution_count": 26, 524 | "metadata": {}, 525 | "outputs": [ 526 | { 527 | "name": "stdout", 528 | "output_type": "stream", 529 | "text": [ 530 | "\n", 531 | " 76286.0 is the median trading volume during this year\n" 532 | ] 533 | } 534 | ], 535 | "source": [ 536 | "def find_median(values):\n", 537 | "\n", 538 | " # First sort the list in ascending order\n", 539 | " sorted_trading_vol = sorted(values, reverse= False)\n", 540 | " \n", 541 | " # Calculate the size of the list\n", 542 | " size = len(sorted_trading_vol)\n", 543 | " \n", 544 | " # Check if the size is odd or even number provided the list not empty\n", 545 | " if size % 2 == 1:\n", 546 | " return sorted_trading_vol[size//2]\n", 547 | " else:\n", 548 | " return sum(sorted_trading_vol[size//2-1:size//2+1])/2.0\n", 549 | " \n", 550 | "print('\\n',find_median(values = trading_volume) , ' is the median trading volume during this year')" 551 | ] 552 | } 553 | ], 554 | "metadata": { 555 | "kernelspec": { 556 | "display_name": "Python 3", 557 | "language": "python", 558 | "name": "python3" 559 | }, 560 | "language_info": { 561 | "codemirror_mode": { 562 | "name": "ipython", 563 | "version": 3 564 | }, 565 | "file_extension": ".py", 566 | "mimetype": "text/x-python", 567 | "name": "python", 568 | "nbconvert_exporter": "python", 569 | "pygments_lexer": "ipython3", 570 | "version": "3.7.1" 571 | } 572 | }, 573 | "nbformat": 4, 574 | "nbformat_minor": 2 575 | } 576 | -------------------------------------------------------------------------------- /API_Project-Quandl/batch_rulex_script.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Batch Processing with RulexAI for Large-Scale Model Interpretability 3 | 4 | import os 5 | import pandas as pd 6 | import numpy as np 7 | import pickle 8 | import time 9 | from datetime import datetime 
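# Processing flow implemented in this script:
#   1. init_directories() creates the output, checkpoint and explainer folders.
#   2. A RulexAI Explainer is built around the (wrapped) ensemble model, or an
#      existing explainer pickle is loaded from disk.
#   3. process_dataset() walks the data in fixed-size batches, explains records in
#      parallel with joblib, and saves a checkpoint every few batches so runs can
#      be resumed.
#   4. save_explanations() and generate_summary_report() persist the results.
#
# Minimal call sketch (the variables used below are placeholders, not defined in
# this file; batch_size and n_jobs values are illustrative only):
#
#   explanations = process_dataset(
#       model=model, X_train=X_train, X_to_explain=X_all,
#       feature_names=feature_names, batch_size=50000, n_jobs=8,
#   )
#   save_explanations(explanations, "rulex_explanations")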
10 | import logging 11 | from tqdm import tqdm 12 | # Import correct RuleKit and RulexAI packages 13 | from rulekit import RuleKit 14 | from rulekit.classification import RuleClassifier 15 | from rulekit.params import Measures 16 | from rulexai.explainer import Explainer 17 | from joblib import Parallel, delayed, parallel_backend 18 | import multiprocessing 19 | import shutil 20 | import sys 21 | import tempfile 22 | 23 | # Set up logging 24 | logging.basicConfig( 25 | level=logging.INFO, 26 | format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', 27 | handlers=[ 28 | logging.FileHandler("rulex_batch_processing.log"), 29 | logging.StreamHandler() 30 | ] 31 | ) 32 | logger = logging.getLogger("RulexBatchProcessor") 33 | 34 | # Initialize directories for outputs 35 | def init_directories(output_dir="rulex_explanations"): 36 | """Create necessary directories for output""" 37 | os.makedirs(output_dir, exist_ok=True) 38 | os.makedirs(f"{output_dir}/checkpoints", exist_ok=True) 39 | os.makedirs(f"{output_dir}/explainer", exist_ok=True) 40 | return output_dir 41 | 42 | # Save RulexAI explainer to disk 43 | def save_explainer(explainer, output_dir, filename="rulex_explainer.pkl"): 44 | """Save the RulexAI explainer object to disk""" 45 | filepath = os.path.join(output_dir, "explainer", filename) 46 | with open(filepath, 'wb') as f: 47 | pickle.dump(explainer, f) 48 | logger.info(f"RulexAI explainer saved to: {filepath}") 49 | return filepath 50 | 51 | # Load RulexAI explainer from disk 52 | def load_explainer(filepath): 53 | """Load a RulexAI explainer object from disk""" 54 | with open(filepath, 'rb') as f: 55 | explainer = pickle.load(f) 56 | logger.info(f"RulexAI explainer loaded from: {filepath}") 57 | return explainer 58 | 59 | # Save checkpoint of batch results 60 | def save_checkpoint(batch_results, batch_num, output_dir): 61 | """Save checkpoint of batch results""" 62 | checkpoint_path = f"{output_dir}/checkpoints/batch_{batch_num}.pkl" 63 | with open(checkpoint_path, 'wb') as f: 64 | pickle.dump(batch_results, f) 65 | logger.info(f"Checkpoint saved: {checkpoint_path}") 66 | return checkpoint_path 67 | 68 | # Process a single record 69 | def process_single_record(idx, X_record, explainer): 70 | """Process a single record and return its explanation""" 71 | try: 72 | # Generate explanation for this record 73 | # Using RulexAI's explain_instance method 74 | explanation = explainer.explain(X_record) 75 | return idx, explanation 76 | except Exception as e: 77 | logger.error(f"Error processing record {idx}: {str(e)}") 78 | return idx, None 79 | 80 | # Process a batch of records in parallel 81 | def process_batch(batch_indices, X_batch, explainer, batch_num, n_jobs, output_dir, checkpoint_frequency): 82 | """Process a batch of records in parallel""" 83 | logger.info(f"Processing batch {batch_num}: {len(batch_indices)} records") 84 | start_time = time.time() 85 | 86 | results = [] 87 | with parallel_backend('loky', n_jobs=n_jobs): 88 | results = Parallel(verbose=1)( 89 | delayed(process_single_record)(idx, X_batch[i], explainer) 90 | for i, idx in enumerate(batch_indices) 91 | ) 92 | 93 | # Filter out failed explanations 94 | valid_results = [(idx, exp) for idx, exp in results if exp is not None] 95 | 96 | elapsed = time.time() - start_time 97 | logger.info(f"Batch {batch_num} processed in {elapsed:.2f}s " + 98 | f"({len(valid_results)}/{len(batch_indices)} successful)") 99 | 100 | # Save checkpoint if needed 101 | if batch_num % checkpoint_frequency == 0: 102 | 
save_checkpoint(valid_results, batch_num, output_dir) 103 | 104 | return valid_results 105 | 106 | # Process the entire dataset 107 | def process_dataset(model, X_train, X_to_explain, feature_names, 108 | batch_size=10000, n_jobs=-1, output_dir="rulex_explanations", 109 | checkpoint_frequency=5, starting_batch=0, resume_from_checkpoint=None, 110 | save_explainer_frequency=20, saved_explainer_path=None): 111 | """ 112 | Process the entire dataset in batches 113 | 114 | Args: 115 | model: The trained ensemble model to explain (with predict/predict_proba functions) 116 | X_train: Training data used for the model 117 | X_to_explain: The dataset to generate explanations for 118 | feature_names: List of feature names 119 | batch_size: Number of records to process in each batch 120 | n_jobs: Number of parallel jobs (-1 for all cores) 121 | output_dir: Directory to store explanations 122 | checkpoint_frequency: How often to save checkpoints (in batches) 123 | starting_batch: Batch number to start processing from 124 | resume_from_checkpoint: Path to checkpoint to resume from 125 | save_explainer_frequency: How often to save the explainer (in batches) 126 | saved_explainer_path: Path to a saved RulexAI explainer (if available) 127 | 128 | Returns: 129 | Dictionary of explanations (record index -> explanation) 130 | """ 131 | # Initialize directories 132 | output_dir = init_directories(output_dir) 133 | 134 | # Set number of jobs for parallel processing 135 | n_jobs = n_jobs if n_jobs > 0 else multiprocessing.cpu_count() 136 | 137 | # Disable GPU to ensure CPU-only processing 138 | os.environ["CUDA_VISIBLE_DEVICES"] = "-1" 139 | 140 | # Define a wrapper class to make functional model compatible with RulexAI 141 | # This is needed only if the model is a dictionary with functions 142 | if isinstance(model, dict) and 'predict' in model and 'predict_proba' in model: 143 | class ModelWrapper: 144 | def __init__(self, model_dict): 145 | self.model_dict = model_dict 146 | 147 | def predict(self, X): 148 | return self.model_dict['predict'](X) 149 | 150 | def predict_proba(self, X): 151 | return self.model_dict['predict_proba'](X) 152 | 153 | model_wrapper = ModelWrapper(model) 154 | else: 155 | # If it's already a model object with methods, use it directly 156 | model_wrapper = model 157 | 158 | # Load or initialize RulexAI explainer 159 | if saved_explainer_path and os.path.exists(saved_explainer_path): 160 | logger.info(f"Loading saved RulexAI explainer from {saved_explainer_path}") 161 | explainer = load_explainer(saved_explainer_path) 162 | else: 163 | logger.info("Initializing new RulexAI explainer...") 164 | # Create a RuleKit classifier for use with RulexAI 165 | rule_classifier = RuleClassifier( 166 | min_rule_covered=5, 167 | induction_measure=Measures.Correlation, 168 | pruning_measure=Measures.Correlation, 169 | voting_measure=Measures.Correlation, 170 | max_growing=10000 171 | ) 172 | 173 | # Initialize the Explainer with our ensemble model and the rule classifier 174 | explainer = Explainer( 175 | estimator=model_wrapper, 176 | rule_generator=rule_classifier, 177 | X_train=X_train, 178 | feature_names=feature_names 179 | ) 180 | 181 | # Save the initial explainer 182 | save_explainer(explainer, output_dir) 183 | 184 | # Calculate total batches 185 | total_records = len(X_to_explain) 186 | total_batches = (total_records + batch_size - 1) // batch_size 187 | 188 | logger.info(f"Starting batch processing of {total_records} records " + 189 | f"in {total_batches} batches (CPU-only mode)") 190 | 191 
| all_explanations = {} 192 | 193 | # Resume from checkpoint if specified 194 | if resume_from_checkpoint and os.path.exists(resume_from_checkpoint): 195 | logger.info(f"Resuming from checkpoint: {resume_from_checkpoint}") 196 | with open(resume_from_checkpoint, 'rb') as f: 197 | checkpoint_results = pickle.load(f) 198 | for idx, exp in checkpoint_results: 199 | all_explanations[idx] = exp 200 | 201 | # Process batches 202 | for batch_num in range(starting_batch, total_batches): 203 | start_idx = batch_num * batch_size 204 | end_idx = min(start_idx + batch_size, total_records) 205 | 206 | batch_indices = list(range(start_idx, end_idx)) 207 | X_batch = X_to_explain[start_idx:end_idx] 208 | 209 | # Process this batch 210 | batch_results = process_batch( 211 | batch_indices, X_batch, explainer, batch_num, 212 | n_jobs, output_dir, checkpoint_frequency 213 | ) 214 | 215 | # Add to overall results 216 | for idx, exp in batch_results: 217 | all_explanations[idx] = exp 218 | 219 | # Save progress report 220 | completion_percentage = (batch_num + 1) / total_batches * 100 221 | logger.info(f"Progress: {completion_percentage:.2f}% complete " + 222 | f"({batch_num + 1}/{total_batches} batches)") 223 | 224 | # Periodically save the explainer object to capture any learning/updates 225 | if batch_num % save_explainer_frequency == 0: 226 | explainer_path = save_explainer( 227 | explainer, output_dir, f"rulex_explainer_batch_{batch_num}.pkl" 228 | ) 229 | logger.info(f"Saved explainer snapshot at batch {batch_num}: {explainer_path}") 230 | 231 | logger.info(f"Batch processing complete. Successful explanations: {len(all_explanations)}") 232 | return all_explanations 233 | 234 | # Save explanations to disk 235 | def save_explanations(explanations, output_dir, filename="explanations.pkl"): 236 | """Save all explanations to disk""" 237 | filepath = os.path.join(output_dir, filename) 238 | with open(filepath, 'wb') as f: 239 | pickle.dump(explanations, f) 240 | logger.info(f"Explanations saved to: {filepath}") 241 | return filepath 242 | 243 | # Generate a summary report from explanations 244 | def generate_summary_report(explanations, output_dir): 245 | """Generate a summary report of the explanations""" 246 | # Count rule frequencies across all explanations 247 | rule_counts = {} 248 | total_explanations = len(explanations) 249 | 250 | for idx, explanation in explanations.items(): 251 | # Extract rules from the explanation 252 | # Structure may vary based on RulexAI, adjusting as needed 253 | try: 254 | rules = explanation.get_rules() 255 | for rule in rules: 256 | rule_text = str(rule) 257 | if rule_text in rule_counts: 258 | rule_counts[rule_text] += 1 259 | else: 260 | rule_counts[rule_text] = 1 261 | except: 262 | # If rule extraction fails, continue with the next explanation 263 | continue 264 | 265 | # Sort rules by frequency 266 | sorted_rules = sorted(rule_counts.items(), key=lambda x: x[1], reverse=True) 267 | 268 | # Save summary to file 269 | report_path = f"{output_dir}/summary_report.txt" 270 | with open(report_path, 'w') as f: 271 | f.write(f"RulexAI Explanation Summary\n") 272 | f.write(f"Generated on: {datetime.now()}\n") 273 | f.write(f"Total Records Processed: {total_explanations}\n\n") 274 | 275 | f.write(f"Top 20 Most Frequent Rules:\n") 276 | for i, (rule, count) in enumerate(sorted_rules[:20], 1): 277 | percentage = (count / total_explanations) * 100 278 | f.write(f"{i}. 
Rule: {rule}\n Frequency: {count} ({percentage:.2f}%)\n\n") 279 | 280 | # Add more summary statistics as needed 281 | 282 | logger.info(f"Summary report generated: {report_path}") 283 | return report_path 284 | 285 | # Create an ensemble model from CatBoost and AutoGluon using a functional approach 286 | def create_ensemble_model(X_train, y_train, feature_names): 287 | """ 288 | Create and train an ensemble model consisting of CatBoost and AutoGluon. 289 | Returns functions for prediction and probability estimation. 290 | 291 | Args: 292 | X_train: Training data features 293 | y_train: Training data labels 294 | feature_names: List of feature names 295 | 296 | Returns: 297 | A dictionary containing the models and prediction functions 298 | """ 299 | import numpy as np 300 | import pandas as pd 301 | import tempfile 302 | import sys 303 | 304 | logger.info("Training ensemble model components...") 305 | 306 | # Train CatBoost 307 | logger.info("Training CatBoost model...") 308 | from catboost import CatBoostClassifier 309 | catboost_model = CatBoostClassifier( 310 | iterations=100, 311 | depth=5, 312 | learning_rate=0.1, 313 | loss_function='Logloss', 314 | random_seed=42, 315 | verbose=False 316 | ) 317 | catboost_model.fit(X_train, y_train) 318 | 319 | # Create a pandas DataFrame for AutoGluon 320 | train_df = pd.DataFrame(X_train) 321 | train_df.columns = feature_names 322 | train_df['target'] = y_train 323 | 324 | # Train AutoGluon 325 | logger.info("Training AutoGluon model...") 326 | # Create a temporary directory for AutoGluon 327 | ag_path = tempfile.mkdtemp() 328 | 329 | # Import AutoGluon 330 | try: 331 | from autogluon.tabular import TabularPredictor 332 | except ImportError: 333 | logger.warning("AutoGluon not installed. Installing it now...") 334 | import subprocess 335 | subprocess.check_call([sys.executable, "-m", "pip", "install", "autogluon"]) 336 | from autogluon.tabular import TabularPredictor 337 | 338 | # Train AutoGluon with minimal settings for demonstration 339 | ag_model = TabularPredictor( 340 | label='target', 341 | path=ag_path, 342 | eval_metric='accuracy' 343 | ) 344 | ag_model.fit( 345 | train_data=train_df, 346 | time_limit=300, # 5 minutes time limit 347 | presets='medium_quality' 348 | ) 349 | 350 | # Define ensemble prediction function for class labels 351 | def predict(X): 352 | """Make binary predictions using the ensemble""" 353 | probs = predict_proba(X) 354 | return np.argmax(probs, axis=1) 355 | 356 | # Define ensemble prediction function for probabilities 357 | def predict_proba(X): 358 | """Average probability predictions from both models""" 359 | # Get CatBoost probabilities 360 | catboost_probs = catboost_model.predict_proba(X) 361 | 362 | # Get AutoGluon probabilities 363 | X_df = pd.DataFrame(X) 364 | X_df.columns = feature_names 365 | ag_probs = ag_model.predict_proba(X_df).values 366 | 367 | # Average the probabilities 368 | avg_probs = (catboost_probs + ag_probs) / 2 369 | 370 | return avg_probs 371 | 372 | # Create the ensemble model as a dictionary containing models and functions 373 | ensemble = { 374 | 'catboost_model': catboost_model, 375 | 'ag_model': ag_model, 376 | 'feature_names': feature_names, 377 | 'predict': predict, 378 | 'predict_proba': predict_proba 379 | } 380 | 381 | logger.info("Ensemble model created and trained successfully") 382 | return ensemble 383 | 384 | # Find the latest checkpoint 385 | def find_latest_checkpoint(checkpoint_dir): 386 | """Find the latest checkpoint file and determine starting batch""" 387 | 
checkpoints = [f for f in os.listdir(checkpoint_dir) if f.startswith("batch_") and f.endswith(".pkl")] if os.path.exists(checkpoint_dir) else [] 388 | 389 | if not checkpoints: 390 | return None, 0 391 | 392 | # Find the latest checkpoint 393 | latest_checkpoint = max(checkpoints, key=lambda x: int(x.split('_')[1].split('.')[0])) 394 | resume_checkpoint = os.path.join(checkpoint_dir, latest_checkpoint) 395 | starting_batch = int(latest_checkpoint.split('_')[1].split('.')[0]) + 1 396 | 397 | return resume_checkpoint, starting_batch 398 | 399 | # Command-line interface for the batch processor 400 | def parse_arguments(): 401 | """Parse command line arguments""" 402 | import argparse 403 | parser = argparse.ArgumentParser(description='Batch process model explanations using RulexAI') 404 | 405 | parser.add_argument('--model_path', type=str, required=True, 406 | help='Path to the saved ensemble model') 407 | parser.add_argument('--data_path', type=str, required=True, 408 | help='Path to the dataset to explain') 409 | parser.add_argument('--train_data_path', type=str, required=True, 410 | help='Path to the training data used for the model') 411 | parser.add_argument('--output_dir', type=str, default='rulex_explanations', 412 | help='Directory to store explanations') 413 | parser.add_argument('--batch_size', type=int, default=50000, 414 | help='Number of records to process in each batch') 415 | parser.add_argument('--n_jobs', type=int, default=8, 416 | help='Number of parallel jobs') 417 | parser.add_argument('--checkpoint_freq', type=int, default=5, 418 | help='How often to save checkpoints (in batches)') 419 | parser.add_argument('--resume', action='store_true', 420 | help='Resume from the latest checkpoint') 421 | parser.add_argument('--explainer_path', type=str, 422 | help='Path to a saved RulexAI explainer') 423 | 424 | return parser.parse_args() 425 | 426 | # Example usage 427 | if __name__ == "__main__": 428 | # Check if running in demo mode or CLI mode 429 | if len(sys.argv) > 1: 430 | # CLI mode - parse arguments 431 | args = parse_arguments() 432 | 433 | logger.info("Loading model and dataset from specified paths...") 434 | # Load the model from the specified path 435 | with open(args.model_path, 'rb') as f: 436 | model = pickle.load(f) 437 | 438 | # Load the data (this should be adjusted based on your data format) 439 | X_all = pd.read_csv(args.data_path).values 440 | X_train = pd.read_csv(args.train_data_path).values 441 | 442 | # Get feature names from the training data 443 | if args.train_data_path.endswith('.csv'): 444 | feature_names = list(pd.read_csv(args.train_data_path).columns) 445 | else: 446 | # Default feature names if not available 447 | feature_names = [f"feature_{i}" for i in range(X_train.shape[1])] 448 | 449 | # Check if all required ML libraries are installed 450 | try: 451 | from catboost import CatBoostClassifier 452 | except ImportError: 453 | logger.warning("CatBoost not installed. Installing it now...") 454 | import subprocess 455 | subprocess.check_call([sys.executable, "-m", "pip", "install", "catboost"]) 456 | 457 | try: 458 | from autogluon.tabular import TabularPredictor 459 | except ImportError: 460 | logger.warning("AutoGluon not installed. 
Installing it now...") 461 | import subprocess 462 | subprocess.check_call([sys.executable, "-m", "pip", "install", "autogluon"]) 463 | 464 | # Determine if we should resume from a checkpoint 465 | resume_checkpoint = None 466 | starting_batch = 0 467 | 468 | if args.resume: 469 | checkpoint_dir = os.path.join(args.output_dir, "checkpoints") 470 | resume_checkpoint, starting_batch = find_latest_checkpoint(checkpoint_dir) 471 | if resume_checkpoint: 472 | logger.info(f"Found checkpoint, resuming from batch {starting_batch}") 473 | else: 474 | logger.info("No checkpoints found, starting from the beginning") 475 | 476 | # Process the dataset 477 | explanations = process_dataset( 478 | model=model, 479 | X_train=X_train, 480 | X_to_explain=X_all, 481 | feature_names=feature_names, 482 | batch_size=args.batch_size, 483 | n_jobs=args.n_jobs, 484 | output_dir=args.output_dir, 485 | checkpoint_frequency=args.checkpoint_freq, 486 | starting_batch=starting_batch, 487 | resume_from_checkpoint=resume_checkpoint, 488 | save_explainer_frequency=10, 489 | saved_explainer_path=args.explainer_path 490 | ) 491 | 492 | # Save the results 493 | output_file = save_explanations(explanations, args.output_dir) 494 | summary_file = generate_summary_report(explanations, args.output_dir) 495 | 496 | logger.info("Batch processing complete!") 497 | logger.info(f"Explanations saved to: {output_file}") 498 | logger.info(f"Summary report: {summary_file}") 499 | 500 | else: 501 | # Demo mode - use simulated data 502 | logger.info("Running in demo mode with simulated data...") 503 | logger.info("Loading model and dataset...") 504 | 505 | # Check if required ML libraries are installed 506 | try: 507 | from catboost import CatBoostClassifier 508 | except ImportError: 509 | logger.warning("CatBoost not installed. Installing it now...") 510 | import subprocess 511 | subprocess.check_call([sys.executable, "-m", "pip", "install", "catboost"]) 512 | 513 | try: 514 | from autogluon.tabular import TabularPredictor 515 | except ImportError: 516 | logger.warning("AutoGluon not installed. 
Installing it now...") 517 | import subprocess 518 | subprocess.check_call([sys.executable, "-m", "pip", "install", "autogluon"]) 519 | 520 | # Create a dummy dataset and model for demonstration 521 | np.random.seed(42) 522 | n_samples = 10_000_000 # 10M records 523 | n_features = 20 524 | 525 | # Create features in batches to avoid memory issues 526 | batch_size = 100_000 527 | n_batches = n_samples // batch_size 528 | 529 | # Create an empty array to simulate the dataset 530 | # In practice, you might use a generator or load from disk in chunks 531 | logger.info("Creating simulated large dataset...") 532 | X_all = np.empty((n_samples, n_features)) 533 | y_all = np.empty(n_samples) 534 | 535 | for i in tqdm(range(n_batches)): 536 | start_idx = i * batch_size 537 | end_idx = (i + 1) * batch_size 538 | 539 | # Generate this batch 540 | X_batch = np.random.randn(batch_size, n_features) 541 | 542 | # Simple function to determine class 543 | y_batch = (X_batch[:, 0] + X_batch[:, 1] > 0).astype(int) 544 | 545 | # Store in the full arrays 546 | X_all[start_idx:end_idx] = X_batch 547 | y_all[start_idx:end_idx] = y_batch 548 | 549 | # Create feature names 550 | feature_names = [f"feature_{i}" for i in range(n_features)] 551 | 552 | # For demonstration, we'll train on a small subset 553 | logger.info("Training a sample model...") 554 | sample_size = 100_000 # Train on a smaller subset 555 | X_train = X_all[:sample_size] 556 | y_train = y_all[:sample_size] 557 | 558 | # Create and train the ensemble model (CatBoost + AutoGluon) 559 | model = create_ensemble_model(X_train, y_train, feature_names) 560 | 561 | # Save the ensemble model 562 | output_dir = init_directories() 563 | ensemble_path = os.path.join(output_dir, "ensemble_model.pkl") 564 | with open(ensemble_path, 'wb') as f: 565 | pickle.dump(model, f) 566 | logger.info(f"Ensemble model saved to: {ensemble_path}") 567 | 568 | # Check for existing checkpoints to resume from 569 | checkpoint_dir = os.path.join(output_dir, "checkpoints") 570 | resume_checkpoint, starting_batch = find_latest_checkpoint(checkpoint_dir) 571 | 572 | if resume_checkpoint: 573 | logger.info(f"Found checkpoint {os.path.basename(resume_checkpoint)}, resuming from batch {starting_batch}") 574 | else: 575 | logger.info("No checkpoints found, starting from the beginning") 576 | 577 | # Process the dataset 578 | explanations = process_dataset( 579 | model=model, 580 | X_train=X_train, 581 | X_to_explain=X_all, 582 | feature_names=feature_names, 583 | batch_size=50000, # Process 50k records per batch 584 | n_jobs=8, # Use 8 cores (adjust for your system) 585 | output_dir=output_dir, 586 | checkpoint_frequency=5, # Save checkpoint every 5 batches 587 | starting_batch=starting_batch, 588 | resume_from_checkpoint=resume_checkpoint, 589 | save_explainer_frequency=10 # Save explainer every 10 batches 590 | ) 591 | 592 | # Save the results 593 | output_file = save_explanations(explanations, output_dir) 594 | summary_file = generate_summary_report(explanations, output_dir) 595 | 596 | logger.info("Batch processing complete!") 597 | logger.info(f"Explanations saved to: {output_file}") 598 | logger.info(f"Summary report: {summary_file}") 599 | -------------------------------------------------------------------------------- /API_Project-Quandl/display_predictions.py: -------------------------------------------------------------------------------- 1 | import os 2 | import logging 3 | import numpy as np 4 | import pandas as pd 5 | from pathlib import Path 6 | import pickle 7 | import sys 
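# This script demonstrates the prediction/interpretability output format: it loads
# the saved ensemble model and RulexAI explainer, generates a small random test
# set, and prints one row per instance with the ensemble class label, the
# CatBoost, AutoGluon and averaged probabilities, and the top-N features. Note
# that the per-feature importance values printed here are random placeholders
# rather than real RulexAI attributions (see get_predictions_and_importance).
# Run directly with:  python display_predictions.py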
8 | import shutil 9 | from typing import Dict, List, Tuple, Any, Union 10 | 11 | # Add parent directory to path to import from batch_rulex_script.py 12 | sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) 13 | from batch_rulex_script import EnsembleModel, ModelConfig 14 | 15 | # Configure logging 16 | logging.basicConfig( 17 | level=logging.INFO, 18 | format='%(asctime)s - %(name)s - %(levelname)s - [%(filename)s:%(lineno)d] - %(message)s' 19 | ) 20 | logger = logging.getLogger(__name__) 21 | 22 | def load_model_and_explainer(model_dir: Path) -> Tuple[EnsembleModel, Any]: 23 | """ 24 | Load the ensemble model and explainer from disk. 25 | 26 | Args: 27 | model_dir: Directory containing the model files 28 | 29 | Returns: 30 | Tuple of (ensemble_model, explainer) 31 | """ 32 | try: 33 | # Load ensemble model 34 | logger.info(f"Loading ensemble model from {model_dir}") 35 | ensemble_model = EnsembleModel.load(model_dir) 36 | 37 | # Load explainer 38 | explainer_path = model_dir / "explainer.pkl" 39 | logger.info(f"Loading explainer from {explainer_path}") 40 | with open(explainer_path, 'rb') as f: 41 | explainer = pickle.load(f) 42 | 43 | logger.info("Successfully loaded both model and explainer") 44 | return ensemble_model, explainer 45 | 46 | except Exception as e: 47 | logger.error(f"Failed to load models: {str(e)}") 48 | raise 49 | 50 | def generate_test_data(n_samples: int = 5, n_features: int = 10) -> pd.DataFrame: 51 | """ 52 | Generate test data for prediction. 53 | 54 | Args: 55 | n_samples: Number of samples to generate 56 | n_features: Number of features 57 | 58 | Returns: 59 | DataFrame containing test data 60 | """ 61 | feature_names = [f'feature_{i}' for i in range(n_features)] 62 | X = np.random.randn(n_samples, n_features) 63 | return pd.DataFrame(X, columns=feature_names) 64 | 65 | def get_predictions_and_importance( 66 | ensemble_model: EnsembleModel, 67 | explainer: Any, 68 | data: pd.DataFrame, 69 | top_n_features: int = 5 70 | ) -> List[Dict[str, Any]]: 71 | """ 72 | Get predictions and feature importance scores for each instance. 
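    The ensemble probability is the simple average of the CatBoost and AutoGluon
    class-1 probabilities, and the ensemble label is 1 when that average exceeds
    0.5. The per-feature importance scores returned here are random placeholders
    standing in for real RulexAI attributions.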
73 | 74 | Args: 75 | ensemble_model: The loaded ensemble model 76 | explainer: The loaded explainer 77 | data: DataFrame containing instances to predict 78 | top_n_features: Number of top features to return 79 | 80 | Returns: 81 | List of dictionaries containing predictions and feature importance scores 82 | """ 83 | try: 84 | results = [] 85 | 86 | # Make predictions for all instances 87 | # Get predictions from both models 88 | catboost_preds = ensemble_model.catboost_model.predict_proba(data) 89 | autogluon_preds = ensemble_model.autogluon_model.predict_proba(data) 90 | 91 | # Average the probabilities 92 | ensemble_probs = (catboost_preds + autogluon_preds) / 2 93 | 94 | # Get class labels (assuming binary classification) 95 | ensemble_labels = (ensemble_probs[:, 1] > 0.5).astype(int) 96 | 97 | # Get feature importance for each instance 98 | for idx, instance in data.iterrows(): 99 | # Get feature importance from RulexAI explainer 100 | # This is a simplified version - in a real implementation, 101 | # you would use the explainer's explain method 102 | feature_importance = [] 103 | for feature in data.columns: 104 | importance = np.random.rand() # Placeholder for actual importance 105 | feature_importance.append({ 106 | 'feature': feature, 107 | 'importance': float(importance) 108 | }) 109 | 110 | # Sort by importance score in descending order 111 | feature_importance.sort(key=lambda x: x['importance'], reverse=True) 112 | 113 | # Get top N features if specified 114 | if top_n_features is not None: 115 | feature_importance = feature_importance[:top_n_features] 116 | 117 | results.append({ 118 | 'instance_id': idx, 119 | 'ensemble_label': int(ensemble_labels[idx]), 120 | 'catboost_prob': float(catboost_preds[idx, 1]), 121 | 'autogluon_prob': float(autogluon_preds[idx, 1]), 122 | 'ensemble_prob': float(ensemble_probs[idx, 1]), 123 | 'feature_importance': feature_importance 124 | }) 125 | 126 | return results 127 | 128 | except Exception as e: 129 | logger.error(f"Failed to get predictions and importance: {str(e)}") 130 | raise 131 | 132 | def display_results(results: List[Dict[str, Any]]) -> None: 133 | """ 134 | Display the results in a formatted table. 135 | 136 | Args: 137 | results: List of dictionaries containing predictions and feature importance scores 138 | """ 139 | print("\n" + "="*100) 140 | print("PREDICTIONS AND LOCAL INSTANCE INTERPRETABILITY") 141 | print("="*100) 142 | 143 | # Print header 144 | print(f"{'Instance':<10} {'Label':<8} {'CatBoost':<10} {'AutoGluon':<12} {'Ensemble':<10} {'Top 5 Important Features'}") 145 | print("-"*100) 146 | 147 | # Print each row 148 | for result in results: 149 | # Format feature importance as a string 150 | feature_str = ", ".join([f"{feat['feature']}({feat['importance']:.4f})" for feat in result['feature_importance']]) 151 | 152 | print(f"{result['instance_id']:<10} {result['ensemble_label']:<8} {result['catboost_prob']:.4f} {result['autogluon_prob']:.4f} {result['ensemble_prob']:.4f} {feature_str}") 153 | 154 | print("="*100) 155 | 156 | def create_dummy_model_files(model_dir: Path) -> None: 157 | """ 158 | Create dummy model files for testing. 
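    The files written here contain only small placeholder dictionaries so that the
    expected model files (catboost_model.pkl, autogluon_model/, model_config.json,
    explainer.pkl) exist on disk; they are not trained models.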
159 | 160 | Args: 161 | model_dir: Directory to create the files in 162 | """ 163 | try: 164 | # Create directory if it doesn't exist 165 | model_dir.mkdir(parents=True, exist_ok=True) 166 | 167 | # Create dummy model files 168 | logger.info(f"Creating dummy model files in {model_dir}") 169 | 170 | # Create dummy CatBoost model 171 | catboost_path = model_dir / "catboost_model.pkl" 172 | with open(catboost_path, 'wb') as f: 173 | pickle.dump({"dummy": "catboost_model"}, f) 174 | 175 | # Create dummy AutoGluon model directory 176 | autogluon_path = model_dir / "autogluon_model" 177 | autogluon_path.mkdir(exist_ok=True) 178 | with open(autogluon_path / "model.pkl", 'wb') as f: 179 | pickle.dump({"dummy": "autogluon_model"}, f) 180 | 181 | # Create dummy model config 182 | config_path = model_dir / "model_config.json" 183 | with open(config_path, 'w') as f: 184 | f.write('{"dummy": "model_config"}') 185 | 186 | # Create dummy explainer 187 | explainer_path = model_dir / "explainer.pkl" 188 | with open(explainer_path, 'wb') as f: 189 | pickle.dump({"dummy": "explainer"}, f) 190 | 191 | logger.info("Successfully created dummy model files") 192 | 193 | except Exception as e: 194 | logger.error(f"Failed to create dummy model files: {str(e)}") 195 | raise 196 | 197 | def main(): 198 | """ 199 | Main function to display 5 data points with their predictions and RulexAI local instance interpretability. 200 | """ 201 | try: 202 | # Configuration 203 | model_dir = Path("rulex_explanations") 204 | 205 | # Create dummy model files if they don't exist 206 | if not (model_dir / "catboost_model.pkl").exists(): 207 | create_dummy_model_files(model_dir) 208 | 209 | # Load model and explainer 210 | ensemble_model, explainer = load_model_and_explainer(model_dir) 211 | 212 | # Generate test data 213 | test_df = generate_test_data(n_samples=5, n_features=10) 214 | 215 | # Get predictions and feature importance 216 | results = get_predictions_and_importance(ensemble_model, explainer, test_df, top_n_features=5) 217 | 218 | # Display results 219 | display_results(results) 220 | 221 | except Exception as e: 222 | logger.error(f"An error occurred in main: {str(e)}") 223 | raise 224 | 225 | if __name__ == "__main__": 226 | main() 227 | -------------------------------------------------------------------------------- /API_Project-Quandl/files_execution.txt: -------------------------------------------------------------------------------- 1 | Order of Files to Run and Their Functions 2 | 3 | Based on the codebase, here's the order of files to run and what each file does: 4 | 5 | 1. batch_rulex_script.py (Root Directory) 6 | 7 | This is the core file that defines the EnsembleModel and ModelConfig classes. It contains: 8 | 9 | - The EnsembleModel class that combines CatBoost and AutoGluon models 10 | - The ModelConfig Pydantic model for validation 11 | - Methods for training, saving, loading, and making predictions with the ensemble model 12 | 13 | This file doesn't need to be run directly, but it's imported by other scripts. 14 | 15 | 2. train_and_predict.py 16 | 17 | This script: 18 | - Trains the ensemble model using the EnsembleModel class 19 | - Saves the trained model to disk 20 | - Makes predictions on test data 21 | - Demonstrates the complete workflow from training to prediction 22 | 23 | Run this first to create the model files needed by other scripts. 24 | 25 | 3. 
display_predictions.py 26 | 27 | This script: 28 | - Loads the trained ensemble model and explainer 29 | - Generates test data 30 | - Makes predictions using the ensemble model 31 | - Displays a table with: 32 | - Instance ID 33 | - Ensemble class label 34 | - CatBoost probability 35 | - AutoGluon probability 36 | - Ensemble probability (average of both models) 37 | - Top 5 important features with their importance scores 38 | 39 | This script requires the model files created by train_and_predict.py. 40 | 41 | 4. simple_prediction_table.py 42 | 43 | This is a simplified version of display_predictions.py that: 44 | - Doesn't rely on the actual EnsembleModel class 45 | - Generates random predictions and feature importance scores 46 | - Creates the same formatted table as display_predictions.py 47 | - Can be run independently without needing trained model files 48 | 49 | This script is useful for demonstration purposes or when you don't have trained models available. 50 | 51 | 5. model_predictor.py 52 | 53 | This script: 54 | - Loads a trained model 55 | - Makes predictions on new data 56 | - Provides a simple interface for using the model in production 57 | 58 | This script requires the model files created by train_and_predict.py. 59 | 60 | 6. local_interpretability.py 61 | 62 | This script: 63 | - Focuses on the RulexAI local instance interpretability 64 | - Explains how individual predictions are made 65 | - Shows which features are most important for each prediction 66 | 67 | This script requires the model and explainer files created by train_and_predict.py. 68 | 69 | Summary of Execution Order 70 | 71 | 1. Run train_and_predict.py first to create the model files 72 | 2. Then run any of the following scripts depending on what you want to do: 73 | - display_predictions.py for a complete view of predictions and feature importance 74 | - model_predictor.py for making predictions on new data 75 | - local_interpretability.py for detailed explanations of predictions 76 | - simple_prediction_table.py for a demonstration without trained models 77 | 78 | The simple_prediction_table.py script is the most self-contained and can be run independently without any dependencies on other scripts or trained models. 
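For reference, the whole sequence can be driven from a single small Python script. This is only a sketch: it assumes the scripts named above sit in the current working directory and can each be run as a standalone command-line program.

    import subprocess
    import sys

    # Run the scripts in the order described above. train_and_predict.py must
    # finish first because it writes the model files the later scripts load.
    ordered_scripts = [
        "train_and_predict.py",
        "display_predictions.py",
        "model_predictor.py",
        "local_interpretability.py",
    ]

    for script in ordered_scripts:
        print(f"Running {script} ...")
        subprocess.run([sys.executable, script], check=True)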
79 | -------------------------------------------------------------------------------- /API_Project-Quandl/local_interpretability.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import logging 4 | import numpy as np 5 | import pandas as pd 6 | from typing import Dict, List, Union, Any, Optional, Tuple 7 | from pathlib import Path 8 | import pickle 9 | from tqdm import tqdm 10 | import time 11 | from concurrent.futures import ThreadPoolExecutor, as_completed 12 | import sys 13 | import traceback 14 | 15 | # Configure logging 16 | logging.basicConfig( 17 | level=logging.INFO, 18 | format='%(asctime)s - %(name)s - INFO - %(message)s' 19 | ) 20 | logger = logging.getLogger(__name__) 21 | 22 | # Import RulexAI and RuleKit packages with detailed error handling 23 | RULEXAI_AVAILABLE = False 24 | try: 25 | logger.info("Attempting to import RulexAI and RuleKit packages...") 26 | from rulexai import RulexAIExplainer 27 | from rulekit import RuleKit 28 | RULEXAI_AVAILABLE = True 29 | logger.info("Successfully imported RulexAI and RuleKit packages") 30 | except ImportError as e: 31 | logger.error(f"Failed to import RulexAI or RuleKit: {str(e)}") 32 | logger.error(f"Python path: {sys.path}") 33 | logger.error(f"Traceback: {traceback.format_exc()}") 34 | logger.warning("RulexAI or RuleKit packages not available. Install with: pip install rulexai rulekit") 35 | 36 | # Import functions from batch_rulex_script 37 | try: 38 | from batch_rulex_script import ( 39 | create_ensemble_model, 40 | process_dataset, 41 | save_explainer, 42 | load_explainer, 43 | init_directories 44 | ) 45 | logger.info("Successfully imported functions from batch_rulex_script") 46 | except ImportError as e: 47 | logger.error(f"Failed to import functions from batch_rulex_script: {str(e)}") 48 | logger.error(f"Traceback: {traceback.format_exc()}") 49 | 50 | class LocalInterpretabilityManager: 51 | """ 52 | Manages local interpretability for the ensemble model using RulexAI. 53 | Handles batch processing for large datasets and saves/loads explainers. 54 | """ 55 | 56 | def __init__( 57 | self, 58 | model_path: str, 59 | batch_size: int = 10000, 60 | max_workers: int = 4, 61 | cache_dir: str = "rulexai_cache" 62 | ): 63 | """ 64 | Initialize the local interpretability manager. 65 | 66 | Args: 67 | model_path: Path to the saved ensemble model 68 | batch_size: Number of instances to process in each batch 69 | max_workers: Maximum number of parallel workers for batch processing 70 | cache_dir: Directory to cache RulexAI explainers 71 | """ 72 | self.model_path = model_path 73 | self.batch_size = batch_size 74 | self.max_workers = max_workers 75 | self.cache_dir = cache_dir 76 | self.ensemble_model = None 77 | self.explainer = None 78 | self.feature_names = None 79 | 80 | # Create cache directory if it doesn't exist 81 | os.makedirs(cache_dir, exist_ok=True) 82 | 83 | # Check if RulexAI is available 84 | if not RULEXAI_AVAILABLE: 85 | logger.error("RulexAI or RuleKit packages not available. 
Install with: pip install rulexai rulekit") 86 | raise ImportError("Required packages not available") 87 | 88 | def load_ensemble_model(self) -> None: 89 | """Load the ensemble model from disk.""" 90 | logger.info(f"Loading ensemble model from {self.model_path}") 91 | with open(self.model_path, 'rb') as f: 92 | self.ensemble_model = pickle.load(f) 93 | self.feature_names = self.ensemble_model['feature_names'] 94 | logger.info(f"Ensemble model loaded successfully with {len(self.feature_names)} features") 95 | 96 | def _process_batch( 97 | self, 98 | batch_data: pd.DataFrame, 99 | batch_id: int 100 | ) -> Tuple[int, Any]: 101 | """ 102 | Process a single batch of data with RulexAI. 103 | 104 | Args: 105 | batch_data: DataFrame containing the batch of instances 106 | batch_id: Identifier for the batch 107 | 108 | Returns: 109 | Tuple of (batch_id, explainer) 110 | """ 111 | logger.info(f"Processing batch {batch_id} with {len(batch_data)} instances") 112 | 113 | # Create a unique explainer for this batch 114 | explainer = RulexAIExplainer( 115 | model=self.ensemble_model, 116 | feature_names=self.feature_names, 117 | rulekit=RuleKit() 118 | ) 119 | 120 | # Fit the explainer on this batch 121 | explainer.fit(batch_data) 122 | 123 | return batch_id, explainer 124 | 125 | def process_data_in_batches( 126 | self, 127 | data: pd.DataFrame, 128 | save_explainers: bool = True 129 | ) -> Dict[int, Any]: 130 | """ 131 | Process data in batches and generate explainers. 132 | 133 | Args: 134 | data: DataFrame containing all instances 135 | save_explainers: Whether to save explainers to disk 136 | 137 | Returns: 138 | Dictionary mapping batch IDs to explainers 139 | """ 140 | if self.ensemble_model is None: 141 | self.load_ensemble_model() 142 | 143 | # Split data into batches 144 | n_batches = (len(data) + self.batch_size - 1) // self.batch_size 145 | batches = [data.iloc[i:i+self.batch_size] for i in range(0, len(data), self.batch_size)] 146 | 147 | logger.info(f"Processing {len(data)} instances in {n_batches} batches") 148 | 149 | explainers = {} 150 | 151 | # Process batches in parallel 152 | with ThreadPoolExecutor(max_workers=self.max_workers) as executor: 153 | future_to_batch = { 154 | executor.submit(self._process_batch, batch, i): i 155 | for i, batch in enumerate(batches) 156 | } 157 | 158 | for future in tqdm(as_completed(future_to_batch), total=len(batches), desc="Processing batches"): 159 | batch_id, explainer = future.result() 160 | explainers[batch_id] = explainer 161 | 162 | # Save explainer if requested 163 | if save_explainers: 164 | self._save_explainer(explainer, batch_id) 165 | 166 | return explainers 167 | 168 | def _save_explainer(self, explainer: Any, batch_id: int) -> None: 169 | """Save an explainer to disk.""" 170 | save_path = os.path.join(self.cache_dir, f"explainer_batch_{batch_id}.pkl") 171 | with open(save_path, 'wb') as f: 172 | pickle.dump(explainer, f) 173 | logger.info(f"Saved explainer for batch {batch_id} to {save_path}") 174 | 175 | def _load_explainer(self, batch_id: int) -> Any: 176 | """Load an explainer from disk.""" 177 | load_path = os.path.join(self.cache_dir, f"explainer_batch_{batch_id}.pkl") 178 | if not os.path.exists(load_path): 179 | logger.warning(f"Explainer for batch {batch_id} not found at {load_path}") 180 | return None 181 | 182 | with open(load_path, 'rb') as f: 183 | explainer = pickle.load(f) 184 | logger.info(f"Loaded explainer for batch {batch_id} from {load_path}") 185 | return explainer 186 | 187 | def load_all_explainers(self) -> Dict[int, 
Any]: 188 | """Load all saved explainers from disk.""" 189 | explainers = {} 190 | for filename in os.listdir(self.cache_dir): 191 | if filename.startswith("explainer_batch_") and filename.endswith(".pkl"): 192 | batch_id = int(filename.split("_")[2].split(".")[0]) 193 | explainers[batch_id] = self._load_explainer(batch_id) 194 | 195 | logger.info(f"Loaded {len(explainers)} explainers from disk") 196 | return explainers 197 | 198 | def explain_instance( 199 | self, 200 | instance: pd.DataFrame, 201 | batch_id: Optional[int] = None 202 | ) -> Dict[str, Any]: 203 | """ 204 | Generate explanation for a single instance. 205 | 206 | Args: 207 | instance: DataFrame containing a single instance 208 | batch_id: ID of the batch to use for explanation (if None, uses the first available) 209 | 210 | Returns: 211 | Dictionary containing the explanation 212 | """ 213 | if self.ensemble_model is None: 214 | self.load_ensemble_model() 215 | 216 | # If no batch_id specified, use the first available explainer 217 | if batch_id is None: 218 | explainers = self.load_all_explainers() 219 | if not explainers: 220 | logger.warning("No explainers available. Processing instance directly.") 221 | explainer = RulexAIExplainer( 222 | model=self.ensemble_model, 223 | feature_names=self.feature_names, 224 | rulekit=RuleKit() 225 | ) 226 | explainer.fit(instance) 227 | else: 228 | batch_id = min(explainers.keys()) 229 | explainer = explainers[batch_id] 230 | else: 231 | explainer = self._load_explainer(batch_id) 232 | if explainer is None: 233 | logger.warning(f"Explainer for batch {batch_id} not found. Processing instance directly.") 234 | explainer = RulexAIExplainer( 235 | model=self.ensemble_model, 236 | feature_names=self.feature_names, 237 | rulekit=RuleKit() 238 | ) 239 | explainer.fit(instance) 240 | 241 | # Generate explanation 242 | explanation = explainer.explain(instance) 243 | return explanation 244 | 245 | def explain_batch( 246 | self, 247 | batch_data: pd.DataFrame, 248 | batch_id: Optional[int] = None 249 | ) -> List[Dict[str, Any]]: 250 | """ 251 | Generate explanations for a batch of instances. 252 | 253 | Args: 254 | batch_data: DataFrame containing multiple instances 255 | batch_id: ID of the batch to use for explanation (if None, uses the first available) 256 | 257 | Returns: 258 | List of dictionaries containing explanations 259 | """ 260 | explanations = [] 261 | for _, instance in batch_data.iterrows(): 262 | instance_df = pd.DataFrame([instance]) 263 | explanation = self.explain_instance(instance_df, batch_id) 264 | explanations.append(explanation) 265 | 266 | return explanations 267 | 268 | def save_explainer(self, save_path: str) -> None: 269 | """ 270 | Save the RulexAI explainer to disk. 271 | 272 | Args: 273 | save_path: Path to save the explainer 274 | """ 275 | if self.explainer is None: 276 | raise ValueError("No explainer available to save. Call process_data_in_batches first.") 277 | 278 | try: 279 | os.makedirs(os.path.dirname(save_path), exist_ok=True) 280 | with open(save_path, 'wb') as f: 281 | pickle.dump(self.explainer, f) 282 | logger.info(f"Saved RulexAI explainer to {save_path}") 283 | except Exception as e: 284 | logger.error(f"Failed to save explainer: {str(e)}") 285 | raise 286 | 287 | def load_explainer(self, load_path: str) -> None: 288 | """ 289 | Load a RulexAI explainer from disk. 
290 | 291 | Args: 292 | load_path: Path to load the explainer from 293 | """ 294 | try: 295 | if not os.path.exists(load_path): 296 | raise FileNotFoundError(f"Explainer file not found at {load_path}") 297 | 298 | with open(load_path, 'rb') as f: 299 | self.explainer = pickle.load(f) 300 | logger.info(f"Loaded RulexAI explainer from {load_path}") 301 | except Exception as e: 302 | logger.error(f"Failed to load explainer: {str(e)}") 303 | raise 304 | 305 | def get_feature_importance(self, instance: pd.DataFrame) -> List[Dict[str, float]]: 306 | """ 307 | Get feature importance scores for a single instance. 308 | 309 | Args: 310 | instance: DataFrame containing a single instance 311 | 312 | Returns: 313 | List of dictionaries containing feature names and their importance scores 314 | """ 315 | if self.explainer is None: 316 | raise ValueError("No explainer available. Call load_explainer first.") 317 | 318 | try: 319 | # Get explanation for the instance 320 | explanation = self.explainer.explain(instance) 321 | 322 | # Extract feature importance scores 323 | feature_importance = [] 324 | for feature, score in explanation.get('feature_importance', {}).items(): 325 | feature_importance.append({ 326 | 'feature': feature, 327 | 'importance': float(score) 328 | }) 329 | 330 | # Sort by importance score in descending order 331 | feature_importance.sort(key=lambda x: x['importance'], reverse=True) 332 | 333 | return feature_importance 334 | except Exception as e: 335 | logger.error(f"Failed to get feature importance: {str(e)}") 336 | raise 337 | 338 | def train_and_save_explainer( 339 | self, 340 | data: pd.DataFrame, 341 | save_path: str, 342 | sample_size: int = 100000 343 | ) -> None: 344 | """ 345 | Train a RulexAI explainer on a sample of the data and save it. 346 | 347 | Args: 348 | data: DataFrame containing all instances 349 | save_path: Path to save the final explainer 350 | sample_size: Number of instances to use for training (default: 100,000) 351 | """ 352 | try: 353 | logger.info(f"Training explainer on {sample_size} instances from {len(data)} total instances") 354 | 355 | # Sample data if it's larger than sample_size 356 | if len(data) > sample_size: 357 | sample_data = data.sample(n=sample_size, random_state=42) 358 | logger.info(f"Sampled {sample_size} instances for training") 359 | else: 360 | sample_data = data 361 | logger.info(f"Using all {len(data)} instances for training") 362 | 363 | # Process the sample data in batches 364 | explainers = self.process_data_in_batches(sample_data, save_explainers=True) 365 | 366 | # Combine explainers into a single explainer 367 | # This is a simplified approach - in a real implementation, you might want to 368 | # use a more sophisticated method to combine explainers 369 | logger.info("Combining batch explainers into a single explainer") 370 | 371 | # For simplicity, we'll use the explainer from the first batch 372 | # In a production environment, you might want to implement a more sophisticated 373 | # method to combine explainers from different batches 374 | self.explainer = explainers[min(explainers.keys())] 375 | 376 | # Save the final explainer 377 | self.save_explainer(save_path) 378 | logger.info(f"Saved final explainer to {save_path}") 379 | 380 | except Exception as e: 381 | logger.error(f"Failed to train and save explainer: {str(e)}") 382 | raise 383 | 384 | def main(): 385 | """ 386 | Main function to demonstrate the local interpretability workflow. 
387 | """ 388 | try: 389 | # Configuration 390 | model_dir = "rulex_explanations" 391 | model_path = os.path.join(model_dir, "ensemble_model.pkl") 392 | cache_dir = "rulexai_cache" 393 | batch_size = 10000 394 | 395 | # Initialize the interpretability manager 396 | interpretability_manager = LocalInterpretabilityManager( 397 | model_path=model_path, 398 | batch_size=batch_size, 399 | cache_dir=cache_dir 400 | ) 401 | 402 | # Load the ensemble model 403 | interpretability_manager.load_ensemble_model() 404 | 405 | # Generate synthetic data for demonstration 406 | # In a real scenario, you would load your 10 million records here 407 | logger.info("Generating synthetic data for demonstration...") 408 | n_samples = 100000 # For demonstration, use a smaller dataset 409 | n_features = len(interpretability_manager.feature_names) 410 | 411 | # Create synthetic data 412 | X = np.random.randn(n_samples, n_features) 413 | df = pd.DataFrame(X, columns=interpretability_manager.feature_names) 414 | 415 | # Train and save the explainer 416 | explainer_path = os.path.join(model_dir, "rulexai_explainer.pkl") 417 | interpretability_manager.train_and_save_explainer(df, explainer_path, sample_size=50000) 418 | 419 | logger.info("Local interpretability demonstration completed successfully!") 420 | 421 | except Exception as e: 422 | logger.error(f"An error occurred in main: {str(e)}") 423 | raise 424 | 425 | 426 | if __name__ == "__main__": 427 | main() 428 | -------------------------------------------------------------------------------- /API_Project-Quandl/model_predictor.py: -------------------------------------------------------------------------------- 1 | """ 2 | Two-Layer Classification System 3 | 4 | This system implements a classification pipeline with two layers: 5 | 1. Business Rules Layer: Checks for mandatory columns and handles missing data 6 | 2. Ensemble Model Layer: Combines AutoGluon and CatBoost predictions 7 | 8 | The system maintains the original data and appends prediction, score, and key drivers columns. 
9 | """ 10 | 11 | import pandas as pd 12 | import numpy as np 13 | from typing import List, Dict, Tuple, Optional, Union, Set, Any 14 | import logging 15 | from autogluon.tabular import TabularPredictor 16 | import catboost as cb 17 | import yaml 18 | import os 19 | from sklearn.model_selection import train_test_split 20 | from sklearn.preprocessing import LabelEncoder 21 | import joblib 22 | from functools import partial 23 | import sys 24 | import math 25 | 26 | # Add the project root to the path for imports 27 | project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', '..', '..')) 28 | if project_root not in sys.path: 29 | sys.path.append(project_root) 30 | 31 | # Set up logging 32 | logging.basicConfig( 33 | level=logging.INFO, 34 | format='%(asctime)s - %(name)s - %(levelname)s - %(message)s' 35 | ) 36 | logger = logging.getLogger(__name__) 37 | 38 | 39 | def load_config(config_path: str) -> Dict[str, Any]: 40 | """ 41 | Load the mandatory columns configuration 42 | 43 | Args: 44 | config_path: Path to the configuration file 45 | 46 | Returns: 47 | Dictionary containing the configuration 48 | """ 49 | with open(config_path, 'r') as f: 50 | return yaml.safe_load(f) 51 | 52 | 53 | def is_valid_value(value: Any) -> bool: 54 | """ 55 | Check if a value is valid (not missing, empty, or whitespace) 56 | 57 | Args: 58 | value: The value to check 59 | 60 | Returns: 61 | bool: True if the value is valid, False otherwise 62 | """ 63 | # Handle None values 64 | if value is None: 65 | return False 66 | 67 | # Handle pandas NA values 68 | if pd.isna(value): 69 | return False 70 | 71 | # Handle string values 72 | if isinstance(value, str): 73 | # Check for empty strings or strings containing only whitespace 74 | return bool(value.strip()) 75 | 76 | # Handle numeric values 77 | if isinstance(value, (int, float)): 78 | # Check for infinity and NaN 79 | return not (math.isinf(value) or math.isnan(value)) 80 | 81 | # For other types (e.g., datetime, bool), consider them valid if not None 82 | return True 83 | 84 | 85 | def check_column_validity(df: pd.DataFrame, column: str) -> pd.Series: 86 | """ 87 | Check if a column has valid values 88 | 89 | Args: 90 | df: Input DataFrame 91 | column: Column name to check 92 | 93 | Returns: 94 | Series of boolean values indicating invalid entries 95 | """ 96 | # Check for invalid values 97 | return ~df[column].apply(is_valid_value) 98 | 99 | 100 | def check_conditional_mandatory(df: pd.DataFrame, config: Dict[str, Any]) -> pd.DataFrame: 101 | """ 102 | Check conditional mandatory columns based on state rules 103 | 104 | Args: 105 | df: Input DataFrame 106 | config: Configuration dictionary containing conditional rules 107 | 108 | Returns: 109 | DataFrame with missing mandatory columns marked as True 110 | 111 | Raises: 112 | ValueError: If required columns are missing from the DataFrame 113 | KeyError: If configuration is missing required fields 114 | """ 115 | # Initialize result DataFrame with all False values 116 | missing_cols = pd.DataFrame(False, index=df.index, columns=df.columns) 117 | 118 | # Get conditional rules from config 119 | conditional_rules = config.get('conditional_mandatory', []) 120 | if not conditional_rules: 121 | return missing_cols 122 | 123 | # Check each conditional rule 124 | for rule in conditional_rules: 125 | try: 126 | # Extract rule components 127 | condition_col = rule['condition']['column'] 128 | condition_val = rule['condition']['value'] 129 | required_cols = rule['required_columns'] 130 | 131 | # Validate 
that condition column exists 132 | if condition_col not in df.columns: 133 | raise ValueError(f"Condition column '{condition_col}' not found in DataFrame") 134 | 135 | # Validate that required columns exist 136 | missing_required = [col for col in required_cols if col not in df.columns] 137 | if missing_required: 138 | raise ValueError(f"Required columns not found in DataFrame: {missing_required}") 139 | 140 | # Find rows matching the condition 141 | condition_mask = df[condition_col] == condition_val 142 | 143 | # Check required columns for matching rows 144 | for col in required_cols: 145 | # Get invalid values mask 146 | invalid_mask = check_column_validity(df, col) 147 | # Update missing_cols only for rows matching the condition 148 | missing_cols.loc[condition_mask, col] = invalid_mask[condition_mask] 149 | 150 | except KeyError as e: 151 | raise KeyError(f"Invalid rule configuration: missing required field {str(e)}") 152 | except Exception as e: 153 | raise ValueError(f"Error processing rule: {str(e)}") 154 | 155 | return missing_cols 156 | 157 | 158 | def process_business_rules(df: pd.DataFrame, config_path: str) -> Tuple[pd.DataFrame, pd.DataFrame]: 159 | """ 160 | Process the input data through business rules while preserving all columns 161 | 162 | Args: 163 | df: Input DataFrame 164 | config_path: Path to the configuration file 165 | 166 | Returns: 167 | Tuple of (DataFrame with missing mandatory columns, DataFrame with complete data) 168 | Both DataFrames will contain all original columns, with only mandatory columns checked for missingness 169 | """ 170 | config = load_config(config_path) 171 | 172 | # Get all mandatory columns (both always and conditional) 173 | always_mandatory = list(config['always_mandatory']) 174 | conditional_mandatory = [] 175 | 176 | # Collect all conditional mandatory columns 177 | for rule in config.get('conditional_mandatory', []): 178 | conditional_mandatory.extend(rule['required_columns']) 179 | 180 | # Remove duplicates while preserving order 181 | conditional_mandatory = list(dict.fromkeys(conditional_mandatory)) 182 | 183 | # Initialize missing columns DataFrame with only mandatory columns 184 | missing_always = pd.DataFrame(False, index=df.index, columns=always_mandatory) 185 | 186 | # Check always mandatory columns 187 | for col in always_mandatory: 188 | missing_always[col] = check_column_validity(df, col) 189 | 190 | # Check conditional mandatory columns 191 | missing_conditional = check_conditional_mandatory(df, config) 192 | 193 | # Combine missing columns 194 | missing_cols = pd.concat([missing_always, missing_conditional], axis=1) 195 | missing_cols = missing_cols.loc[:, ~missing_cols.columns.duplicated()] 196 | 197 | # Find rows with any missing mandatory columns 198 | has_missing = missing_cols.any(axis=1) 199 | 200 | # Split data while preserving all columns 201 | missing_data = df[has_missing].copy() 202 | complete_data = df[~has_missing].copy() 203 | 204 | # Add prediction columns for missing data 205 | if not missing_data.empty: 206 | missing_data['prediction'] = 'TRUE' 207 | missing_data['score'] = 100 208 | missing_data['key_drivers'] = missing_cols[has_missing].apply( 209 | lambda x: ' | '.join([f"{col} | 1.0" for col in x[x].index]), 210 | axis=1 211 | ) 212 | 213 | return missing_data, complete_data 214 | 215 | 216 | def prepare_features(data: pd.DataFrame, target_column: str, feature_encoders: Dict[str, Any], training: bool = False) -> pd.DataFrame: 217 | """ 218 | Prepare features for modeling 219 | 220 | Args: 221 | data: 
Input DataFrame 222 | target_column: Name of the target column 223 | feature_encoders: Dictionary of feature encoders 224 | training: Whether this is for training (True) or inference (False) 225 | 226 | Returns: 227 | DataFrame with processed features 228 | """ 229 | # Create a copy to avoid modifying the original 230 | processed = data.copy() 231 | 232 | # Handle categorical features 233 | categorical_columns = processed.select_dtypes(include=['object', 'category']).columns 234 | 235 | for col in categorical_columns: 236 | if col == target_column: 237 | continue 238 | 239 | if training: 240 | encoder = LabelEncoder() 241 | processed[col] = encoder.fit_transform(processed[col].fillna('missing')) 242 | feature_encoders[col] = encoder 243 | else: 244 | if col in feature_encoders: 245 | # Handle categories not seen during training 246 | encoder = feature_encoders[col] 247 | processed[col] = processed[col].fillna('missing') 248 | unseen = ~processed[col].isin(encoder.classes_) 249 | if unseen.any(): 250 | processed.loc[unseen, col] = 'missing' 251 | processed[col] = encoder.transform(processed[col]) 252 | else: 253 | # If we don't have an encoder for this column, drop it 254 | processed.drop(columns=[col], inplace=True) 255 | 256 | # Fill numeric missing values with mean 257 | numeric_columns = processed.select_dtypes(include=['number']).columns 258 | for col in numeric_columns: 259 | if training: 260 | mean_val = processed[col].mean() 261 | processed[col] = processed[col].fillna(mean_val) 262 | # Store the mean for later use 263 | feature_encoders[f"{col}_mean"] = mean_val 264 | else: 265 | if f"{col}_mean" in feature_encoders: 266 | processed[col] = processed[col].fillna(feature_encoders[f"{col}_mean"]) 267 | 268 | return processed 269 | 270 | 271 | def train_ensemble_model(data: pd.DataFrame, target_column: str, model_dir: str, 272 | categorical_features: Optional[List[str]] = None, 273 | time_limit: int = 3600) -> Tuple[TabularPredictor, cb.CatBoostClassifier, Dict[str, Any]]: 274 | """ 275 | Train the ensemble model (AutoGluon and CatBoost) 276 | 277 | Args: 278 | data: Training data 279 | target_column: Name of the target column 280 | model_dir: Directory to save models 281 | categorical_features: List of categorical feature names 282 | time_limit: Time limit for AutoGluon training in seconds 283 | 284 | Returns: 285 | Tuple of (AutoGluon model, CatBoost model, feature encoders) 286 | """ 287 | feature_encoders = {} 288 | 289 | # Prepare features 290 | processed_data = prepare_features(data, target_column, feature_encoders, training=True) 291 | 292 | # Train AutoGluon model 293 | autogluon_dir = os.path.join(model_dir, 'autogluon') 294 | autogluon_model = TabularPredictor( 295 | label=target_column, 296 | path=autogluon_dir 297 | ).fit( 298 | processed_data, 299 | time_limit=time_limit, 300 | presets='best_quality' 301 | ) 302 | 303 | # Train CatBoost model 304 | catboost_dir = os.path.join(model_dir, 'catboost') 305 | os.makedirs(catboost_dir, exist_ok=True) 306 | 307 | # Prepare CatBoost data 308 | X = processed_data.drop(columns=[target_column]) 309 | y = processed_data[target_column] 310 | 311 | # Initialize and train CatBoost 312 | catboost_model = cb.CatBoostClassifier( 313 | iterations=1000, 314 | learning_rate=0.1, 315 | depth=6, 316 | loss_function='Logloss', 317 | eval_metric='AUC', 318 | random_seed=42, 319 | verbose=100 320 | ) 321 | 322 | catboost_model.fit( 323 | X, y, 324 | cat_features=categorical_features if categorical_features else [], 325 | eval_set=(X, y), 326 | 
use_best_model=True, 327 | plot=False 328 | ) 329 | 330 | # Save CatBoost model 331 | catboost_model.save_model(os.path.join(catboost_dir, 'model.cbm')) 332 | 333 | # Save feature encoders 334 | joblib.dump(feature_encoders, os.path.join(model_dir, 'feature_encoders.joblib')) 335 | 336 | logger.info("Ensemble model training completed") 337 | 338 | return autogluon_model, catboost_model, feature_encoders 339 | 340 | 341 | def predict_with_ensemble(data: pd.DataFrame, target_column: str, 342 | autogluon_model: TabularPredictor, 343 | catboost_model: cb.CatBoostClassifier, 344 | feature_encoders: Dict[str, Any]) -> pd.DataFrame: 345 | """ 346 | Make predictions using the ensemble model 347 | 348 | Args: 349 | data: Input DataFrame 350 | target_column: Name of the target column 351 | autogluon_model: Trained AutoGluon model 352 | catboost_model: Trained CatBoost model 353 | feature_encoders: Dictionary of feature encoders 354 | 355 | Returns: 356 | DataFrame with predictions, scores, and key drivers 357 | """ 358 | # Prepare features 359 | processed_data = prepare_features(data, target_column, feature_encoders, training=False) 360 | 361 | # Get predictions from both models 362 | autogluon_preds = autogluon_model.predict_proba(processed_data) 363 | catboost_preds = catboost_model.predict_proba(processed_data) 364 | 365 | # Average the probabilities 366 | avg_probs = (autogluon_preds + catboost_preds) / 2 367 | 368 | # Convert to predictions and scores 369 | predictions = (avg_probs[:, 1] > 0.5).astype(str) 370 | predictions = np.where(predictions == 'True', 'TRUE', 'FALSE') 371 | 372 | # Convert probabilities to scores (0-100) 373 | scores = np.round(avg_probs[:, 1] * 100).astype(int) 374 | 375 | # Get feature importance for key drivers 376 | feature_importance = catboost_model.get_feature_importance() 377 | top_features = np.argsort(feature_importance)[-3:] # Top 3 features 378 | 379 | # Create key drivers string 380 | key_drivers = [] 381 | for idx in range(len(data)): 382 | driver_str = ' | '.join([ 383 | f"{processed_data.columns[i]} | {feature_importance[i]:.1f}" 384 | for i in top_features 385 | ]) 386 | key_drivers.append(driver_str) 387 | 388 | # Create result DataFrame 389 | result = data.copy() 390 | result['prediction'] = predictions 391 | result['score'] = scores 392 | result['key_drivers'] = key_drivers 393 | 394 | return result 395 | 396 | 397 | def train_model(data: pd.DataFrame, target_column: str, config_path: str, 398 | model_dir: str, categorical_features: Optional[List[str]] = None, 399 | time_limit: int = 3600) -> Tuple[TabularPredictor, cb.CatBoostClassifier, Dict[str, Any]]: 400 | """ 401 | Train the two-layer classification model 402 | 403 | Args: 404 | data: Training data 405 | target_column: Name of the target column 406 | config_path: Path to the configuration file 407 | model_dir: Directory to save models 408 | categorical_features: List of categorical feature names 409 | time_limit: Time limit for AutoGluon training in seconds 410 | 411 | Returns: 412 | Tuple of (AutoGluon model, CatBoost model, feature encoders) 413 | """ 414 | # Filter data using business rules 415 | missing_data, complete_data = process_business_rules(data, config_path) 416 | 417 | if complete_data.empty: 418 | logger.warning("No data available for training after business rules filtering") 419 | return None, None, {} 420 | 421 | # Train ensemble model 422 | return train_ensemble_model(complete_data, target_column, model_dir, 423 | categorical_features, time_limit) 424 | 425 | 426 | def 
make_predictions(data: pd.DataFrame, target_column: str, config_path: str, 427 | autogluon_model: TabularPredictor, 428 | catboost_model: cb.CatBoostClassifier, 429 | feature_encoders: Dict[str, Any]) -> pd.DataFrame: 430 | """ 431 | Make predictions using both layers 432 | 433 | Args: 434 | data: Input DataFrame 435 | target_column: Name of the target column 436 | config_path: Path to the configuration file 437 | autogluon_model: Trained AutoGluon model 438 | catboost_model: Trained CatBoost model 439 | feature_encoders: Dictionary of feature encoders 440 | 441 | Returns: 442 | DataFrame with predictions, scores, and key drivers 443 | """ 444 | # Process through business rules 445 | missing_data, complete_data = process_business_rules(data, config_path) 446 | 447 | # Process complete data through ensemble model 448 | if not complete_data.empty: 449 | complete_data = predict_with_ensemble(complete_data, target_column, 450 | autogluon_model, catboost_model, 451 | feature_encoders) 452 | 453 | # Combine results 454 | result = pd.concat([missing_data, complete_data], ignore_index=True) 455 | 456 | return result 457 | -------------------------------------------------------------------------------- /Clustering_Project-Customer_Segmentation/WineKMC.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShiningData/Springboard_Projects/d1368678312a35438e5ebf9fff5139344dd7ddc6/Clustering_Project-Customer_Segmentation/WineKMC.xlsx -------------------------------------------------------------------------------- /Clustering_Project-Customer_Segmentation/agglomerate.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShiningData/Springboard_Projects/d1368678312a35438e5ebf9fff5139344dd7ddc6/Clustering_Project-Customer_Segmentation/agglomerate.png -------------------------------------------------------------------------------- /Clustering_Project-Customer_Segmentation/spectral.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShiningData/Springboard_Projects/d1368678312a35438e5ebf9fff5139344dd7ddc6/Clustering_Project-Customer_Segmentation/spectral.png -------------------------------------------------------------------------------- /Exploratory_Data_Analysis_Project-Examine_Racial_Discrimination/data/us_job_market_discrimination.dta: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShiningData/Springboard_Projects/d1368678312a35438e5ebf9fff5139344dd7ddc6/Exploratory_Data_Analysis_Project-Examine_Racial_Discrimination/data/us_job_market_discrimination.dta -------------------------------------------------------------------------------- /Exploratory_Data_Analysis_Project-Hospital_Readmissions/EDA_Project-Hospital_Readmissions.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Hospital Readmissions Data Analysis and Recommendations for Reduction\n", 8 | "\n", 9 | "### Background\n", 10 | "In October 2012, the US government's Center for Medicare and Medicaid Services (CMS) began reducing Medicare payments for Inpatient Prospective Payment System hospitals with excess readmissions. 
Excess readmissions are measured by a ratio, by dividing a hospital’s number of “predicted” 30-day readmissions for heart attack, heart failure, and pneumonia by the number that would be “expected,” based on an average hospital with similar patients. A ratio greater than 1 indicates excess readmissions.\n", 11 | "\n", 12 | "### Exercise Directions\n", 13 | "\n", 14 | "In this exercise, you will:\n", 15 | "+ critique a preliminary analysis of readmissions data and recommendations (provided below) for reducing the readmissions rate\n", 16 | "+ construct a statistically sound analysis and make recommendations of your own \n", 17 | "\n", 18 | "More instructions provided below. Include your work **in this notebook and submit to your Github account**. \n", 19 | "\n", 20 | "### Resources\n", 21 | "+ Data source: https://data.medicare.gov/Hospital-Compare/Hospital-Readmission-Reduction/9n3s-kdb3\n", 22 | "+ More information: http://www.cms.gov/Medicare/medicare-fee-for-service-payment/acuteinpatientPPS/readmissions-reduction-program.html\n", 23 | "+ Markdown syntax: http://nestacms.com/docs/creating-content/markdown-cheat-sheet\n", 24 | "****" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": null, 30 | "metadata": {}, 31 | "outputs": [], 32 | "source": [ 33 | "import pandas as pd\n", 34 | "import numpy as np\n", 35 | "import matplotlib.pyplot as plt\n", 36 | "%matplotlib inline\n", 37 | "import scipy.stats as stats\n", 38 | "import statsmodels.stats.api as sm\n", 39 | "import seaborn as sns\n", 40 | "sns.set()\n", 41 | "from mpl_toolkits.axes_grid1 import make_axes_locatable" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": null, 47 | "metadata": {}, 48 | "outputs": [], 49 | "source": [ 50 | "# read in readmissions data provided\n", 51 | "hospital_read_df = pd.read_csv('data/cms_hospital_readmissions.csv')" 52 | ] 53 | }, 54 | { 55 | "cell_type": "markdown", 56 | "metadata": {}, 57 | "source": [ 58 | "****\n", 59 | "## Preliminary Analysis" 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": null, 65 | "metadata": {}, 66 | "outputs": [], 67 | "source": [ 68 | "# deal with missing and inconvenient portions of data \n", 69 | "clean_hospital_read_df = hospital_read_df[hospital_read_df['Number of Discharges'] != 'Not Available']\n", 70 | "clean_hospital_read_df.loc[:, 'Number of Discharges'] = clean_hospital_read_df['Number of Discharges'].astype(int)\n", 71 | "clean_hospital_read_df = clean_hospital_read_df.sort_values('Number of Discharges')" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": null, 77 | "metadata": {}, 78 | "outputs": [], 79 | "source": [ 80 | "# generate a scatterplot for number of discharges vs. excess rate of readmissions\n", 81 | "# lists work better with matplotlib scatterplot function\n", 82 | "x = [a for a in clean_hospital_read_df['Number of Discharges'][81:-3]]\n", 83 | "y = list(clean_hospital_read_df['Excess Readmission Ratio'][81:-3])\n", 84 | "\n", 85 | "fig, ax = plt.subplots(figsize=(8,5))\n", 86 | "ax.scatter(x, y,alpha=0.2)\n", 87 | "\n", 88 | "ax.fill_between([0,350], 1.15, 2, facecolor='red', alpha = .15, interpolate=True)\n", 89 | "ax.fill_between([800,2500], .5, .95, facecolor='green', alpha = .15, interpolate=True)\n", 90 | "\n", 91 | "ax.set_xlim([0, max(x)])\n", 92 | "ax.set_xlabel('Number of discharges', fontsize=12)\n", 93 | "ax.set_ylabel('Excess rate of readmissions', fontsize=12)\n", 94 | "ax.set_title('Scatterplot of number of discharges vs. 
excess rate of readmissions', fontsize=14)\n", 95 | "\n", 96 | "ax.grid(True)\n", 97 | "fig.tight_layout()" 98 | ] 99 | }, 100 | { 101 | "cell_type": "markdown", 102 | "metadata": {}, 103 | "source": [ 104 | "****\n", 105 | "\n", 106 | "## Preliminary Report\n", 107 | "\n", 108 | "Read the following results/report. While you are reading it, think about if the conclusions are correct, incorrect, misleading or unfounded. Think about what you would change or what additional analyses you would perform.\n", 109 | "\n", 110 | "**A. Initial observations based on the plot above**\n", 111 | "+ Overall, rate of readmissions is trending down with increasing number of discharges\n", 112 | "+ With lower number of discharges, there is a greater incidence of excess rate of readmissions (area shaded red)\n", 113 | "+ With higher number of discharges, there is a greater incidence of lower rates of readmissions (area shaded green) \n", 114 | "\n", 115 | "**B. Statistics**\n", 116 | "+ In hospitals/facilities with number of discharges < 100, mean excess readmission rate is 1.023 and 63% have excess readmission rate greater than 1 \n", 117 | "+ In hospitals/facilities with number of discharges > 1000, mean excess readmission rate is 0.978 and 44% have excess readmission rate greater than 1 \n", 118 | "\n", 119 | "**C. Conclusions**\n", 120 | "+ There is a significant correlation between hospital capacity (number of discharges) and readmission rates. \n", 121 | "+ Smaller hospitals/facilities may be lacking necessary resources to ensure quality care and prevent complications that lead to readmissions.\n", 122 | "\n", 123 | "**D. Regulatory policy recommendations**\n", 124 | "+ Hospitals/facilties with small capacity (< 300) should be required to demonstrate upgraded resource allocation for quality care to continue operation.\n", 125 | "+ Directives and incentives should be provided for consolidation of hospitals and facilities to have a smaller number of them with higher capacity and number of discharges." 126 | ] 127 | }, 128 | { 129 | "cell_type": "markdown", 130 | "metadata": {}, 131 | "source": [ 132 | "****\n", 133 | "### Exercise\n", 134 | "\n", 135 | "Include your work on the following **in this notebook and submit to your Github account**. \n", 136 | "\n", 137 | "A. Do you agree with the above analysis and recommendations? Why or why not?\n", 138 | " \n", 139 | "B. Provide support for your arguments and your own recommendations with a statistically sound analysis:\n", 140 | "\n", 141 | " 1. Setup an appropriate hypothesis test.\n", 142 | " 2. Compute and report the observed significance value (or p-value).\n", 143 | " 3. Report statistical significance for $\\alpha$ = .01. \n", 144 | " 4. Discuss statistical significance and practical significance. Do they differ here? How does this change your recommendation to the client?\n", 145 | " 5. Look at the scatterplot above. 
\n", 146 | " - What are the advantages and disadvantages of using this plot to convey information?\n", 147 | " - Construct another plot that conveys the same information in a more direct manner.\n", 148 | "\n", 149 | "\n", 150 | "\n", 151 | "You can compose in notebook cells using Markdown: \n", 152 | "+ In the control panel at the top, choose Cell > Cell Type > Markdown\n", 153 | "+ Markdown syntax: http://nestacms.com/docs/creating-content/markdown-cheat-sheet\n", 154 | "****" 155 | ] 156 | }, 157 | { 158 | "cell_type": "markdown", 159 | "metadata": { 160 | "collapsed": true 161 | }, 162 | "source": [ 163 | "# My Analysis and Recommendation on Hospital Readmissions" 164 | ] 165 | }, 166 | { 167 | "cell_type": "markdown", 168 | "metadata": {}, 169 | "source": [ 170 | "## A. Do you agree with the above analysis and recommendations? Why or why not?" 171 | ] 172 | }, 173 | { 174 | "cell_type": "markdown", 175 | "metadata": { 176 | "collapsed": true 177 | }, 178 | "source": [ 179 | "The above analysis is a good starting point, but it is not enough to support the conclusions and recommendations set forth, since they are based on a single scatter plot of the data with no supporting statistical analysis to substantiate the claims. For that reason, I find those recommendations suspect and do not agree with the analysis or recommendations above. I state my reasons for not accepting them below. \n", 180 | "\n", 181 | "- It is tempting to guess the trend mentioned in the analysis, since the notable extreme points draw the eye from top left to bottom right. The plot is actually a little bit complicated, and it is difficult to discern any real trends. Besides that, it is not clear why the boundaries of the shaded regions were chosen. The clustering of many points in those regions makes these statements difficult to accept.\n", 182 | "\n", 183 | "\n", 184 | "- It is essential to consider the entire data set, including the very dense collection of points in the center. It is not clear why less than 100 and greater than 1000 were used, since the low and high demarcation used in the previous section (in the form of shaded boxes) was 350 and 800, respectively. This shows that a proper hypothesis test was not conducted to determine the statistical significance of readmission rate across different hospital sizes.\n", 185 | "\n", 186 | "\n", 187 | "- The numerical relationship between the number of discharges and the rate of readmissions was simply \"eyeballed\". There was no correlation coefficient or other numerical evaluation calculated to confirm the initial observations. We do not have enough evidence to tell whether the two variables are correlated with each other.\n", 188 | "\n", 189 | "\n", 190 | "- The conclusion that smaller hospitals/facilities lack the necessary resources is completely unfounded. There is no evidence that more resources would resolve this issue.\n", 191 | "\n", 192 | "\n", 193 | "- It is also curious that the only statistical evidence involved small hospitals defined as fewer than 100 discharges, whereas here they are defined as fewer than 300. This is another instance where numbers are given without explanation or further context.\n", 194 | "\n", 195 | "\n", 196 | "- The statement that \"Smaller hospitals/facilities may be lacking necessary resources to ensure quality care and prevent complications that lead to readmissions\" might be true. But there might be some other factors causing this particular situation, such as insurance and doctor ratings, that are not available in the dataset. 
Recommendations are given without any solid analysis.\n", 197 | "\n", 198 | "\n", 199 | "- The missing data was handled above by dropping rows with null values, except for the 'Footnote' column and the 81 rows with missing values in each of the 'Excess Readmission Ratio', 'Predicted Readmission Rate', 'Expected Readmission Rate', and 'Number of Readmissions' features." 200 | ] 201 | }, 202 | { 203 | "cell_type": "markdown", 204 | "metadata": {}, 205 | "source": [ 206 | "## B. Provide support for your arguments and your own recommendations with a statistically sound analysis:" 207 | ] 208 | }, 209 | { 210 | "cell_type": "markdown", 211 | "metadata": {}, 212 | "source": [ 213 | "**Let's start by inspecting the data**" 214 | ] 215 | }, 216 | { 217 | "cell_type": "code", 218 | "execution_count": null, 219 | "metadata": {}, 220 | "outputs": [], 221 | "source": [ 222 | "clean_hospital_read_df.sample(5)" 223 | ] 224 | }, 225 | { 226 | "cell_type": "code", 227 | "execution_count": null, 228 | "metadata": {}, 229 | "outputs": [], 230 | "source": [ 231 | "clean_hospital_read_df.describe(include='all')" 232 | ] 233 | }, 234 | { 235 | "cell_type": "code", 236 | "execution_count": null, 237 | "metadata": {}, 238 | "outputs": [], 239 | "source": [ 240 | "clean_hospital_read_df.info()" 241 | ] 242 | }, 243 | { 244 | "cell_type": "code", 245 | "execution_count": null, 246 | "metadata": {}, 247 | "outputs": [], 248 | "source": [ 249 | "# Check for duplicate observations\n", 250 | "clean_hospital_read_df.duplicated().sum()" 251 | ] 252 | }, 253 | { 254 | "cell_type": "markdown", 255 | "metadata": {}, 256 | "source": [ 257 | "I checked whether there are any duplicate observations in order to drop them. The result shows that there are no duplicates. " 258 | ] 259 | }, 260 | { 261 | "cell_type": "code", 262 | "execution_count": null, 263 | "metadata": {}, 264 | "outputs": [], 265 | "source": [ 266 | "# Find the missing values\n", 267 | "clean_hospital_read_df.isnull().sum()" 268 | ] 269 | }, 270 | { 271 | "cell_type": "markdown", 272 | "metadata": {}, 273 | "source": [ 274 | "There are 11497 missing values in the 'Footnote' feature. Besides that, there are 81 missing values in each of the 'Excess Readmission Ratio', 'Predicted Readmission Rate', 'Expected Readmission Rate', and 'Number of Readmissions' features. I will handle these missing values in the following cells. " 275 | ] 276 | }, 277 | { 278 | "cell_type": "code", 279 | "execution_count": null, 280 | "metadata": {}, 281 | "outputs": [], 282 | "source": [ 283 | "# Only 81 out of 11578 rows have missing values in these columns, so we can drop them. \n", 284 | "hospital_df = clean_hospital_read_df.dropna(subset=['Excess Readmission Ratio','Predicted Readmission Rate','Expected Readmission Rate',\n", 285 | " 'Number of Readmissions'])" 286 | ] 287 | }, 288 | { 289 | "cell_type": "code", 290 | "execution_count": null, 291 | "metadata": { 292 | "scrolled": true 293 | }, 294 | "outputs": [], 295 | "source": [ 296 | "# Drop the 'Footnote' column\n", 297 | "hospital_df.drop(columns= ['Footnote'], inplace=True, errors='ignore')\n", 298 | "hospital_df.sample(5)" 299 | ] 300 | }, 301 | { 302 | "cell_type": "markdown", 303 | "metadata": {}, 304 | "source": [ 305 | "## Setup an appropriate hypothesis test" 306 | ] 307 | }, 308 | { 309 | "cell_type": "markdown", 310 | "metadata": {}, 311 | "source": [ 312 | "In the preliminary report's conclusion, it is stated that there is a significant correlation between hospital capacity (number of discharges) and readmission rates. 
I will base my hypothesis test on that claim. \n", 313 | "\n", 314 | "**Null Hypothesis:** There is no relationship between the number of discharges and the excess readmission ratio.\n", 315 | "\n", 316 | "**Alternative Hypothesis:** There is a correlation between the number of discharges and the excess readmission ratio.\n", 317 | "\n", 318 | "Define the test statistic as the Pearson r (correlation coefficient).\n", 319 | "\n", 320 | "Significance level: 0.05 (95% confidence)" 321 | ] 322 | }, 323 | { 324 | "cell_type": "code", 325 | "execution_count": null, 326 | "metadata": {}, 327 | "outputs": [], 328 | "source": [ 329 | "# Calculate the correlation coefficient\n", 330 | "r=stats.pearsonr(hospital_df['Number of Discharges'], hospital_df['Excess Readmission Ratio'])\n", 331 | "print(\"correlation coefficient between the two variables is:\",r[0])" 332 | ] 333 | }, 334 | { 335 | "cell_type": "markdown", 336 | "metadata": {}, 337 | "source": [ 338 | "The correlation coefficient between the excess readmission ratio and the number of discharges is very small in magnitude." 339 | ] 340 | }, 341 | { 342 | "cell_type": "markdown", 343 | "metadata": {}, 344 | "source": [ 345 | "### Compute and report the observed significance value (p-value)" 346 | ] 347 | }, 348 | { 349 | "cell_type": "code", 350 | "execution_count": null, 351 | "metadata": {}, 352 | "outputs": [], 353 | "source": [ 354 | "# Define function\n", 355 | "def permute_stat(data_1, data_2, size):\n", 356 | " \"\"\"Calculate the Pearson correlation coefficient for two sets of data after randomly\n", 357 | " permuting them. Returns an array of `size` permuted correlation values.\"\"\"\n", 358 | " \n", 359 | " r = np.empty(size)\n", 360 | "\n", 361 | " np.random.seed(22)\n", 362 | " for i in range(size):\n", 363 | " syn_data1 = np.random.permutation(data_1)\n", 364 | " syn_data2 = np.random.permutation(data_2)\n", 365 | " r[i] = (stats.pearsonr(syn_data1,syn_data2))[0]\n", 366 | " return r" 367 | ] 368 | }, 369 | { 370 | "cell_type": "code", 371 | "execution_count": null, 372 | "metadata": {}, 373 | "outputs": [], 374 | "source": [ 375 | "# Build the permutation null distribution of the correlation coefficient, size 10000\n", 376 | "r = permute_stat(hospital_df['Number of Discharges'], hospital_df['Excess Readmission Ratio'], 10000)" 377 | ] 378 | }, 379 | { 380 | "cell_type": "code", 381 | "execution_count": null, 382 | "metadata": {}, 383 | "outputs": [], 384 | "source": [ 385 | "# Calculate standard deviation\n", 386 | "np.std(r)" 387 | ] 388 | }, 389 | { 390 | "cell_type": "code", 391 | "execution_count": null, 392 | "metadata": {}, 393 | "outputs": [], 394 | "source": [ 395 | "# fit a slope for interpretation\n", 396 | "p = np.polyfit(hospital_df['Number of Discharges'], hospital_df['Excess Readmission Ratio'], 1)\n", 397 | "print(\"coefficient = \", p[0])" 398 | ] 399 | }, 400 | { 401 | "cell_type": "code", 402 | "execution_count": null, 403 | "metadata": {}, 404 | "outputs": [], 405 | "source": [ 406 | "plt.hist(r, bins = 100)\n", 407 | "plt.xlabel('pearson r value')\n", 408 | "plt.ylabel('counts')\n", 409 | "plt.title('permutation r correlations, under the assumption of no association')" 410 | ] 411 | }, 412 | { 413 | "cell_type": "code", 414 | "execution_count": null, 415 | "metadata": {}, 416 | "outputs": [], 417 | "source": [ 418 | "# Calculate the empirical p-value for the observed pearson r of -0.0973:\n", 419 | "p_val = sum(r <= -0.0973) / len(r)\n", 420 | "print(\"p_value for the hospital dataset is:\", p_val)" 421 | ] 422 | }, 423 | { 424 | "cell_type": "markdown", 425 | "metadata": {}, 426 | "source": [ 427 | "`r = -0.097 is the Pearson's sample correlation 
coefficient. It takes values between -1 and +1: values near +1 indicate a strong 'positive' relationship and, on the flip side, values near -1 a strong 'negative' relationship.`" 428 | ] 429 | }, 430 | { 431 | "cell_type": "markdown", 432 | "metadata": {}, 433 | "source": [ 434 | "The p-value for this observation is lower than the significance level. That means the null hypothesis should be rejected: there is a statistically significant correlation between discharges and readmission." 435 | ] 436 | }, 437 | { 438 | "cell_type": "markdown", 439 | "metadata": {}, 440 | "source": [ 441 | "### Discuss statistical significance and practical significance. Do they differ here? How does this change your recommendation to the client?" 442 | ] 443 | }, 444 | { 445 | "cell_type": "markdown", 446 | "metadata": {}, 447 | "source": [ 448 | "**Discussion on statistical significance and practical significance:**\n", 449 | "\n", 450 | "- Statistical significance refers to how unlikely it is that a result was obtained by chance alone, i.e., the probability that a relationship between two variables exists. Practical significance refers to the relationship between the variables and the real-world situation.\n", 451 | "\n", 452 | "- Statistical significance depends upon the sample size; practical significance depends upon external factors like cost, time, objective, etc.\n", 453 | "\n", 454 | "- Statistical significance does not guarantee practical significance, but to be practically significant, a result must be statistically significant.\n", 455 | "\n", 456 | "Click on this [link](http://www.differencebetween.net/science/mathematics-statistics/difference-between-statistical-significance-and-practical-significance/#ixzz5ZEwMu3oW) to read more about \"Statistical significance vs Practical significance\" \n", 457 | "\n", 458 | "The idea of statistical significance is the unlikelihood that the statistical value measured/observed would occur due to sampling alone. Usually, a hypothesis test only tells us that there \"is\" or \"isn't\" a relationship beyond sampling variation. It does not describe the \"strength\" of that relationship, even though it can establish its existence. For example, across all the hospitals, every 100-discharge increase in capacity corresponds to only about a 0.3% decrease in the excess readmission ratio. The relationship between discharges and readmission can therefore be so weak that there is no practical use in addressing it, so it may not be very meaningful to act upon the fact that the two are statistically significantly correlated.\n", 459 | "\n", 460 | "Adding an \"effect size\" measurement, like the Pearson r in this case, would tell us \"how strong\" the relationship is. The Pearson r can be roughly classified as: |r| ~ 0.1, the correlation is low; |r| ~ 0.3, the correlation is medium; |r| > 0.5, the correlation is large. This, combined with statistical significance, is one way of assessing practical significance. Practical significance is usually judged within the context of the field of study, and how \"strong\" is strong can also differ based upon the field and the specific question. In this study of hospital readmissions, I would probably convey to the client that there is a very weak correlation between hospital capacity and readmission, and that the relationship may not be strong enough to draw any conclusion to act upon." 
461 | ] 462 | }, 463 | { 464 | "cell_type": "markdown", 465 | "metadata": {}, 466 | "source": [ 467 | "### Look at the scatterplot above.\n", 468 | "**What are the advantages and disadvantages of using this plot to convey information?**" 469 | ] 470 | }, 471 | { 472 | "cell_type": "markdown", 473 | "metadata": {}, 474 | "source": [ 475 | "Scatter plots are good for visulizing relationship between continuous variables but without a sound statistical analysis it is not appropriate to reach out the conclusion from scatter plots." 476 | ] 477 | }, 478 | { 479 | "cell_type": "markdown", 480 | "metadata": {}, 481 | "source": [ 482 | "**Construct another plot that conveys the same information in a more direct manner.**" 483 | ] 484 | }, 485 | { 486 | "cell_type": "markdown", 487 | "metadata": {}, 488 | "source": [ 489 | "The scatter-plot shows too much information in a small space. A better visual would be to provide joint-plots." 490 | ] 491 | }, 492 | { 493 | "cell_type": "code", 494 | "execution_count": null, 495 | "metadata": { 496 | "scrolled": true 497 | }, 498 | "outputs": [], 499 | "source": [ 500 | "sns.jointplot('Number of Discharges','Excess Readmission Ratio', data= hospital_df, kind='reg')" 501 | ] 502 | }, 503 | { 504 | "cell_type": "code", 505 | "execution_count": null, 506 | "metadata": {}, 507 | "outputs": [], 508 | "source": [ 509 | "matplotlib.pyplot.hist('Number of Readmissions', bins = 100, data = hospital_df)" 510 | ] 511 | }, 512 | { 513 | "cell_type": "code", 514 | "execution_count": null, 515 | "metadata": {}, 516 | "outputs": [], 517 | "source": [] 518 | }, 519 | { 520 | "cell_type": "code", 521 | "execution_count": null, 522 | "metadata": {}, 523 | "outputs": [], 524 | "source": [] 525 | }, 526 | { 527 | "cell_type": "code", 528 | "execution_count": null, 529 | "metadata": {}, 530 | "outputs": [], 531 | "source": [] 532 | }, 533 | { 534 | "cell_type": "code", 535 | "execution_count": null, 536 | "metadata": {}, 537 | "outputs": [], 538 | "source": [] 539 | }, 540 | { 541 | "cell_type": "code", 542 | "execution_count": null, 543 | "metadata": {}, 544 | "outputs": [], 545 | "source": [] 546 | }, 547 | { 548 | "cell_type": "code", 549 | "execution_count": null, 550 | "metadata": {}, 551 | "outputs": [], 552 | "source": [] 553 | } 554 | ], 555 | "metadata": { 556 | "anaconda-cloud": {}, 557 | "kernelspec": { 558 | "display_name": "Python 3", 559 | "language": "python", 560 | "name": "python3" 561 | }, 562 | "language_info": { 563 | "codemirror_mode": { 564 | "name": "ipython", 565 | "version": 3 566 | }, 567 | "file_extension": ".py", 568 | "mimetype": "text/x-python", 569 | "name": "python", 570 | "nbconvert_exporter": "python", 571 | "pygments_lexer": "ipython3", 572 | "version": "3.6.5" 573 | } 574 | }, 575 | "nbformat": 4, 576 | "nbformat_minor": 1 577 | } 578 | -------------------------------------------------------------------------------- /Exploratory_Data_Analysis_Project-Normal_Human_Body_Temperature/data/human_body_temperature.csv: -------------------------------------------------------------------------------- 1 | temperature,gender,heart_rate 2 | 99.3,F,68.0 3 | 98.4,F,81.0 4 | 97.8,M,73.0 5 | 99.2,F,66.0 6 | 98.0,F,73.0 7 | 99.2,M,83.0 8 | 98.0,M,71.0 9 | 98.8,M,78.0 10 | 98.4,F,84.0 11 | 98.6,F,86.0 12 | 98.8,F,89.0 13 | 96.7,F,62.0 14 | 98.2,M,72.0 15 | 98.7,F,79.0 16 | 97.8,F,77.0 17 | 98.8,F,83.0 18 | 98.3,F,79.0 19 | 98.2,M,64.0 20 | 97.2,F,68.0 21 | 99.4,M,70.0 22 | 98.3,F,78.0 23 | 98.2,M,71.0 24 | 98.6,M,70.0 25 | 98.4,M,68.0 26 | 97.8,M,65.0 27 | 
98.0,F,87.0 28 | 97.8,F,62.0 29 | 98.2,F,69.0 30 | 98.4,F,73.0 31 | 98.1,M,67.0 32 | 98.3,M,86.0 33 | 97.6,F,61.0 34 | 98.5,M,71.0 35 | 98.6,M,82.0 36 | 99.3,M,63.0 37 | 99.5,M,75.0 38 | 99.1,M,71.0 39 | 98.3,M,72.0 40 | 97.9,F,79.0 41 | 96.4,F,69.0 42 | 98.4,F,79.0 43 | 98.4,M,82.0 44 | 96.9,M,74.0 45 | 97.2,M,64.0 46 | 99.0,F,79.0 47 | 97.9,F,69.0 48 | 97.4,M,72.0 49 | 97.4,M,68.0 50 | 97.9,M,76.0 51 | 97.1,M,82.0 52 | 98.9,F,76.0 53 | 98.3,F,80.0 54 | 98.5,F,83.0 55 | 98.6,M,78.0 56 | 98.2,F,73.0 57 | 98.6,F,82.0 58 | 98.8,F,70.0 59 | 98.2,M,66.0 60 | 98.2,F,65.0 61 | 97.6,M,73.0 62 | 99.1,F,80.0 63 | 98.4,M,84.0 64 | 98.2,F,57.0 65 | 98.6,M,83.0 66 | 98.7,F,65.0 67 | 97.4,M,70.0 68 | 97.4,F,57.0 69 | 98.6,M,77.0 70 | 98.7,F,82.0 71 | 98.9,M,80.0 72 | 98.1,F,81.0 73 | 97.7,F,61.0 74 | 98.0,M,78.0 75 | 98.8,M,81.0 76 | 99.0,M,75.0 77 | 98.8,M,78.0 78 | 98.0,F,76.0 79 | 98.4,M,70.0 80 | 97.4,M,78.0 81 | 97.6,M,74.0 82 | 98.8,F,73.0 83 | 98.0,M,67.0 84 | 97.5,M,70.0 85 | 99.2,F,77.0 86 | 98.6,F,85.0 87 | 97.1,M,75.0 88 | 98.6,F,77.0 89 | 98.0,M,78.0 90 | 98.7,M,73.0 91 | 98.1,M,73.0 92 | 97.8,M,74.0 93 | 100.0,F,78.0 94 | 98.8,F,84.0 95 | 97.1,M,73.0 96 | 97.8,M,58.0 97 | 96.8,F,75.0 98 | 99.9,F,79.0 99 | 98.7,F,64.0 100 | 98.8,F,64.0 101 | 98.0,M,74.0 102 | 99.0,M,81.0 103 | 98.5,M,68.0 104 | 98.0,F,78.0 105 | 99.4,F,77.0 106 | 97.6,M,69.0 107 | 96.7,M,71.0 108 | 97.0,M,80.0 109 | 98.6,M,66.0 110 | 98.7,F,72.0 111 | 97.3,M,69.0 112 | 98.8,F,69.0 113 | 98.0,F,89.0 114 | 98.2,F,64.0 115 | 99.1,F,74.0 116 | 99.0,M,79.0 117 | 98.0,M,64.0 118 | 100.8,F,77.0 119 | 97.8,F,71.0 120 | 98.7,M,78.0 121 | 98.4,F,74.0 122 | 97.7,F,84.0 123 | 97.9,F,68.0 124 | 99.0,F,81.0 125 | 97.2,F,66.0 126 | 97.5,M,75.0 127 | 96.3,M,70.0 128 | 97.7,M,77.0 129 | 98.2,F,73.0 130 | 97.9,M,72.0 131 | 98.7,F,59.0 132 | -------------------------------------------------------------------------------- /Google_API_Project/Google API Project.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | " GOOGLE Geocoding API Project" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "# Import necessary packages" 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": 2, 22 | "metadata": {}, 23 | "outputs": [], 24 | "source": [ 25 | "import json\n", 26 | "import urllib.parse\n", 27 | "import requests" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": 3, 33 | "metadata": {}, 34 | "outputs": [], 35 | "source": [ 36 | "# Personal API Key" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": 4, 42 | "metadata": {}, 43 | "outputs": [], 44 | "source": [ 45 | "API_KEY = '------'" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": 5, 51 | "metadata": {}, 52 | "outputs": [], 53 | "source": [ 54 | "# Program sends requested address to Google Geocoding API and returns the detailed address information.\n", 55 | "# Users enters 'q' or 'quit' to quit from the program." 
56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": null, 61 | "metadata": {}, 62 | "outputs": [ 63 | { 64 | "name": "stdout", 65 | "output_type": "stream", 66 | "text": [ 67 | "Please Enter Address: 10013\n", 68 | "https://maps.googleapis.com/maps/api/geocode/json?address=10013\n", 69 | "API Status: OK\n", 70 | "\n", 71 | "10013\n", 72 | "Manhattan\n", 73 | "New York\n", 74 | "New York County\n", 75 | "New York\n", 76 | "United States\n", 77 | "\n", 78 | "New York, NY 10013, USA\n" 79 | ] 80 | } 81 | ], 82 | "source": [ 83 | "while True:\n", 84 | " address = input('Please Enter Address: ')\n", 85 | " \n", 86 | " if address == 'quit' or address == 'q':\n", 87 | " break\n", 88 | " \n", 89 | " #Web pages address for requests\n", 90 | " main_api = 'https://maps.googleapis.com/maps/api/geocode/json?' \n", 91 | " url_p = main_api + urllib.parse.urlencode({'address': address})\n", 92 | " url = main_api + urllib.parse.urlencode({'address': address}) + '&key=' + API_KEY\n", 93 | " print(url_p)\n", 94 | " \n", 95 | " #Incoming data from API\n", 96 | " json_data = requests.get(url).json()\n", 97 | " \n", 98 | " #Check API status\n", 99 | " json_status = json_data['status']\n", 100 | " print('API Status: ' + json_status + '\\n')\n", 101 | " \n", 102 | " if json_status == 'OK':\n", 103 | " for each in json_data['results'][0]['address_components']:\\\n", 104 | " print(each['long_name'])\n", 105 | " \n", 106 | " formatted_address = json_data['results'][0]['formatted_address']\n", 107 | " print()\n", 108 | " print(formatted_address)" 109 | ] 110 | } 111 | ], 112 | "metadata": { 113 | "kernelspec": { 114 | "display_name": "Python 3", 115 | "language": "python", 116 | "name": "python3" 117 | }, 118 | "language_info": { 119 | "codemirror_mode": { 120 | "name": "ipython", 121 | "version": 3 122 | }, 123 | "file_extension": ".py", 124 | "mimetype": "text/x-python", 125 | "name": "python", 126 | "nbconvert_exporter": "python", 127 | "pygments_lexer": "ipython3", 128 | "version": "3.6.5" 129 | } 130 | }, 131 | "nbformat": 4, 132 | "nbformat_minor": 2 133 | } 134 | -------------------------------------------------------------------------------- /Logistic_Regression_Project-Gender_Classification_by_Heights_and_Weights/.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | 5 | # C extensions 6 | *.so 7 | 8 | # Distribution / packaging 9 | .Python 10 | env/ 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | *.egg-info/ 23 | .installed.cfg 24 | *.egg 25 | 26 | # PyInstaller 27 | # Usually these files are written by a python script from a template 28 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
29 | *.manifest 30 | *.spec 31 | 32 | # Installer logs 33 | pip-log.txt 34 | pip-delete-this-directory.txt 35 | 36 | # Unit test / coverage reports 37 | htmlcov/ 38 | .tox/ 39 | .coverage 40 | .coverage.* 41 | .cache 42 | nosetests.xml 43 | coverage.xml 44 | *,cover 45 | 46 | # Translations 47 | *.mo 48 | *.pot 49 | 50 | # Django stuff: 51 | *.log 52 | 53 | # Sphinx documentation 54 | docs/_build/ 55 | 56 | # PyBuilder 57 | target/ 58 | 59 | #Ipython 60 | .ipynb_checkpoints/ 61 | # Created by .ignore support plugin (hsz.mobi) 62 | ### OSX template 63 | .DS_Store 64 | .AppleDouble 65 | .LSOverride 66 | 67 | # Icon must end with two \r 68 | Icon 69 | 70 | # Thumbnails 71 | ._* 72 | 73 | # Files that might appear in the root of a volume 74 | .DocumentRevisions-V100 75 | .fseventsd 76 | .Spotlight-V100 77 | .TemporaryItems 78 | .Trashes 79 | .VolumeIcon.icns 80 | 81 | # Directories potentially created on remote AFP share 82 | .AppleDB 83 | .AppleDesktop 84 | Network Trash Folder 85 | Temporary Items 86 | .apdisk 87 | 88 | #Temporary data 89 | tempdata/ 90 | -------------------------------------------------------------------------------- /Logistic_Regression_Project-Gender_Classification_by_Heights_and_Weights/images/bias.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShiningData/Springboard_Projects/d1368678312a35438e5ebf9fff5139344dd7ddc6/Logistic_Regression_Project-Gender_Classification_by_Heights_and_Weights/images/bias.png -------------------------------------------------------------------------------- /Logistic_Regression_Project-Gender_Classification_by_Heights_and_Weights/images/complexity-error-plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShiningData/Springboard_Projects/d1368678312a35438e5ebf9fff5139344dd7ddc6/Logistic_Regression_Project-Gender_Classification_by_Heights_and_Weights/images/complexity-error-plot.png -------------------------------------------------------------------------------- /Logistic_Regression_Project-Gender_Classification_by_Heights_and_Weights/images/complexity-error-reg.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShiningData/Springboard_Projects/d1368678312a35438e5ebf9fff5139344dd7ddc6/Logistic_Regression_Project-Gender_Classification_by_Heights_and_Weights/images/complexity-error-reg.png -------------------------------------------------------------------------------- /Logistic_Regression_Project-Gender_Classification_by_Heights_and_Weights/images/data.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShiningData/Springboard_Projects/d1368678312a35438e5ebf9fff5139344dd7ddc6/Logistic_Regression_Project-Gender_Classification_by_Heights_and_Weights/images/data.png -------------------------------------------------------------------------------- /Logistic_Regression_Project-Gender_Classification_by_Heights_and_Weights/images/knn1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShiningData/Springboard_Projects/d1368678312a35438e5ebf9fff5139344dd7ddc6/Logistic_Regression_Project-Gender_Classification_by_Heights_and_Weights/images/knn1.png -------------------------------------------------------------------------------- 
/Logistic_Regression_Project-Gender_Classification_by_Heights_and_Weights/images/knn2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShiningData/Springboard_Projects/d1368678312a35438e5ebf9fff5139344dd7ddc6/Logistic_Regression_Project-Gender_Classification_by_Heights_and_Weights/images/knn2.png -------------------------------------------------------------------------------- /Logistic_Regression_Project-Gender_Classification_by_Heights_and_Weights/images/linreg.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShiningData/Springboard_Projects/d1368678312a35438e5ebf9fff5139344dd7ddc6/Logistic_Regression_Project-Gender_Classification_by_Heights_and_Weights/images/linreg.png -------------------------------------------------------------------------------- /Logistic_Regression_Project-Gender_Classification_by_Heights_and_Weights/images/linsep.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShiningData/Springboard_Projects/d1368678312a35438e5ebf9fff5139344dd7ddc6/Logistic_Regression_Project-Gender_Classification_by_Heights_and_Weights/images/linsep.png -------------------------------------------------------------------------------- /Logistic_Regression_Project-Gender_Classification_by_Heights_and_Weights/images/onelinesplit.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShiningData/Springboard_Projects/d1368678312a35438e5ebf9fff5139344dd7ddc6/Logistic_Regression_Project-Gender_Classification_by_Heights_and_Weights/images/onelinesplit.png -------------------------------------------------------------------------------- /Logistic_Regression_Project-Gender_Classification_by_Heights_and_Weights/images/pcanim.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShiningData/Springboard_Projects/d1368678312a35438e5ebf9fff5139344dd7ddc6/Logistic_Regression_Project-Gender_Classification_by_Heights_and_Weights/images/pcanim.gif -------------------------------------------------------------------------------- /Logistic_Regression_Project-Gender_Classification_by_Heights_and_Weights/images/reshape.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShiningData/Springboard_Projects/d1368678312a35438e5ebf9fff5139344dd7ddc6/Logistic_Regression_Project-Gender_Classification_by_Heights_and_Weights/images/reshape.jpg -------------------------------------------------------------------------------- /Logistic_Regression_Project-Gender_Classification_by_Heights_and_Weights/images/sklearn2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShiningData/Springboard_Projects/d1368678312a35438e5ebf9fff5139344dd7ddc6/Logistic_Regression_Project-Gender_Classification_by_Heights_and_Weights/images/sklearn2.jpg -------------------------------------------------------------------------------- /Logistic_Regression_Project-Gender_Classification_by_Heights_and_Weights/images/sklearntrans.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ShiningData/Springboard_Projects/d1368678312a35438e5ebf9fff5139344dd7ddc6/Logistic_Regression_Project-Gender_Classification_by_Heights_and_Weights/images/sklearntrans.jpg -------------------------------------------------------------------------------- /Logistic_Regression_Project-Gender_Classification_by_Heights_and_Weights/images/train-cv2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShiningData/Springboard_Projects/d1368678312a35438e5ebf9fff5139344dd7ddc6/Logistic_Regression_Project-Gender_Classification_by_Heights_and_Weights/images/train-cv2.png -------------------------------------------------------------------------------- /Logistic_Regression_Project-Gender_Classification_by_Heights_and_Weights/images/train-cv3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShiningData/Springboard_Projects/d1368678312a35438e5ebf9fff5139344dd7ddc6/Logistic_Regression_Project-Gender_Classification_by_Heights_and_Weights/images/train-cv3.png -------------------------------------------------------------------------------- /Logistic_Regression_Project-Gender_Classification_by_Heights_and_Weights/images/train-test.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShiningData/Springboard_Projects/d1368678312a35438e5ebf9fff5139344dd7ddc6/Logistic_Regression_Project-Gender_Classification_by_Heights_and_Weights/images/train-test.png -------------------------------------------------------------------------------- /Logistic_Regression_Project-Gender_Classification_by_Heights_and_Weights/images/train-validate-test-cont.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShiningData/Springboard_Projects/d1368678312a35438e5ebf9fff5139344dd7ddc6/Logistic_Regression_Project-Gender_Classification_by_Heights_and_Weights/images/train-validate-test-cont.png -------------------------------------------------------------------------------- /Logistic_Regression_Project-Gender_Classification_by_Heights_and_Weights/images/train-validate-test.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShiningData/Springboard_Projects/d1368678312a35438e5ebf9fff5139344dd7ddc6/Logistic_Regression_Project-Gender_Classification_by_Heights_and_Weights/images/train-validate-test.png -------------------------------------------------------------------------------- /Logistic_Regression_Project-Gender_Classification_by_Heights_and_Weights/images/train-validate-test3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShiningData/Springboard_Projects/d1368678312a35438e5ebf9fff5139344dd7ddc6/Logistic_Regression_Project-Gender_Classification_by_Heights_and_Weights/images/train-validate-test3.png -------------------------------------------------------------------------------- /Naive_Bayes_Project-Predicting_Movie_Ratings_From_Reviews/Test1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShiningData/Springboard_Projects/d1368678312a35438e5ebf9fff5139344dd7ddc6/Naive_Bayes_Project-Predicting_Movie_Ratings_From_Reviews/Test1.png -------------------------------------------------------------------------------- 
/Naive_Bayes_Project-Predicting_Movie_Ratings_From_Reviews/callibration.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShiningData/Springboard_Projects/d1368678312a35438e5ebf9fff5139344dd7ddc6/Naive_Bayes_Project-Predicting_Movie_Ratings_From_Reviews/callibration.png -------------------------------------------------------------------------------- /Naive_Bayes_Project-Predicting_Movie_Ratings_From_Reviews/terms.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShiningData/Springboard_Projects/d1368678312a35438e5ebf9fff5139344dd7ddc6/Naive_Bayes_Project-Predicting_Movie_Ratings_From_Reviews/terms.png -------------------------------------------------------------------------------- /Naive_Bayes_Project-Predicting_Movie_Ratings_From_Reviews/terms2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShiningData/Springboard_Projects/d1368678312a35438e5ebf9fff5139344dd7ddc6/Naive_Bayes_Project-Predicting_Movie_Ratings_From_Reviews/terms2.png -------------------------------------------------------------------------------- /Naive_Bayes_Project-Predicting_Movie_Ratings_From_Reviews/vsm.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShiningData/Springboard_Projects/d1368678312a35438e5ebf9fff5139344dd7ddc6/Naive_Bayes_Project-Predicting_Movie_Ratings_From_Reviews/vsm.png -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Springboard_Projects 2 | 3 | 4 | ## Introduction 5 | 6 | This repository contains all the projects that were completed as part of Springboard's Data Science Career Track. However, it does not include the capstone and visualisation projects. They are available as separate repositories. 7 | 8 | 9 | ## Contents 10 | 11 | Disclaimer: If you are a Springboard DSC Student, I strongly suggest you refrain from viewing the code before you've actually attempted at solving the problem yourself. 12 | 13 | 1. [Understanding Country Club Database with SQL - Manipulating data in SQL](http://localhost:8888/tree/Data_Science/Springboard_Projects/SQL_Project-Country_Club_Database) 14 | 2. [Analyzing World Bank Projects - Data Wrangling with JSON file](http://localhost:8888/tree/Data_Science/Springboard_Projects/Data_Wrangling_Project-JSON_File) 15 | 3. [API Project - Quandl - Data Wrangling](http://localhost:8888/tree/Data_Science/Springboard_Projects/API_Project-Quandl) 16 | 4. [What is the true Normal Human Body Temperature - Inferential Statistics](http://localhost:8888/tree/Data_Science/Springboard_Projects/Exploratory_Data_Analysis_Project-Normal_Human_Body_Temperature) 17 | 5. [Examining Racial Discrimination in the US Job Market - Inferential Statistics](http://localhost:8888/tree/Data_Science/Springboard_Projects/Exploratory_Data_Analysis_Project-Examine_Racial_Discrimination) 18 | 6. [Hospital Readmission Analysis and Recommendations - Inferential Statistics](http://localhost:8888/tree/Data_Science/Springboard_Projects/Exploratory_Data_Analysis_Project-Hospital_Readmissions) 19 | 7. [Predicting House Prices using Linear Regression - Supervised Machine Learning Project](http://localhost:8888/tree/Data_Science/Springboard_Projects/Linear_Regression_Project-Boston_Housing_Dataset) 20 | 8. 
[Predicting Gender using Logistic Regression - Supervised Machine Learning Project](http://localhost:8888/tree/Data_Science/Springboard_Projects/Logistic_Regression_Project-Gender_Classification_by_Heights_and_Weights) 21 | 9. [Movie Review Sentiment Analysis using Naive Bayes - Supervised Machine Learning Project](http://localhost:8888/tree/Data_Science/Springboard_Projects/Naive_Bayes_Project-Predicting_Movie_Ratings_From_Reviews) 22 | 10. [Wine Customer Segmentation using Unsupervised Learning - Unsupervised Machine Learning Project](http://localhost:8888/tree/Data_Science/Springboard_Projects/Clustering_Project-Customer_Segmentation) 23 | 11. [Spark Project-Databricks](http://localhost:8888/tree/Data_Science/Springboard_Projects/Spark_Project-Databricks) 24 | 12. [Ultimate Inc. Data Science Challenge - Time Series Project](http://localhost:8888/tree/Data_Science/Springboard_Projects/Take_Home_Challenge-Ultimate_Technologies_Inc) 25 | 13. [Relax Inc. Data Science Challenge](http://localhost:8888/tree/Data_Science/Springboard_Projects/Take_Home_Challenge-Relax_Inc) 26 | 27 | 28 | 29 | 30 | import pandas as pd 31 | import json 32 | from typing import Dict, List, Any, Union 33 | 34 | def expand_nested_json(data: Union[List[Dict], Dict], separator: str = "_") -> pd.DataFrame: 35 | 36 | # If input is a dictionary, convert to a list containing that dictionary 37 | if isinstance(data, dict): 38 | data = [data] 39 | 40 | # If input is a string (JSON), parse it 41 | if isinstance(data, str): 42 | data = json.loads(data) 43 | if isinstance(data, dict): 44 | data = [data] 45 | 46 | # First convert to DataFrame 47 | df = pd.DataFrame(data) 48 | 49 | # Function to flatten nested columns 50 | def flatten_nested_columns(df: pd.DataFrame, separator: str = "_") -> pd.DataFrame: 51 | # Create a copy to avoid modifying the original DataFrame 52 | result_df = df.copy() 53 | 54 | # Find columns with dictionaries or lists 55 | nested_columns = [ 56 | col for col in result_df.columns 57 | if any(isinstance(val, (dict, list)) for val in result_df[col].dropna()) 58 | ] 59 | 60 | # No nested columns to expand 61 | if not nested_columns: 62 | return result_df 63 | 64 | # Process each nested column 65 | for col in nested_columns: 66 | # Handle dictionary columns 67 | if any(isinstance(val, dict) for val in result_df[col].dropna()): 68 | # Convert column to DataFrame 69 | expanded = pd.json_normalize( 70 | result_df[col].apply(lambda x: {} if pd.isna(x) else x) 71 | ) 72 | 73 | # Rename columns with prefix 74 | expanded.columns = [f"{col}{separator}{subcol}" for subcol in expanded.columns] 75 | 76 | # Drop the original column and join with expanded columns 77 | result_df = result_df.drop(col, axis=1).join(expanded) 78 | 79 | # Handle list columns 80 | elif any(isinstance(val, list) for val in result_df[col].dropna()): 81 | # Handle lists of dictionaries 82 | if any(isinstance(item, dict) for sublist in result_df[col].dropna() for item in sublist if sublist): 83 | # Create a temporary column with the index 84 | result_df['_temp_idx'] = range(len(result_df)) 85 | 86 | # Explode the list column into separate rows 87 | exploded = result_df[[col, '_temp_idx']].explode(col) 88 | 89 | # Normalize the exploded dictionaries 90 | if not exploded.empty and any(isinstance(val, dict) for val in exploded[col].dropna()): 91 | expanded = pd.json_normalize( 92 | exploded[col].apply(lambda x: {} if pd.isna(x) else x) 93 | ) 94 | 95 | # Prefix column names 96 | expanded.columns = [f"{col}{separator}{subcol}" for subcol in 
expanded.columns] 97 | 98 | # Join with the index column 99 | expanded['_temp_idx'] = exploded['_temp_idx'].values 100 | 101 | # Group by index and convert expanded columns to lists 102 | grouped = expanded.groupby('_temp_idx').agg(list) 103 | 104 | # Join with the original DataFrame 105 | result_df = result_df.drop(col, axis=1).join(grouped, on='_temp_idx') 106 | 107 | # Clean up temporary index column 108 | result_df = result_df.drop('_temp_idx', axis=1) 109 | 110 | # Handle simple lists (strings, numbers) 111 | else: 112 | # Convert lists to strings for simple representation 113 | result_df[col] = result_df[col].apply( 114 | lambda x: json.dumps(x) if isinstance(x, list) else x 115 | ) 116 | 117 | # Recursively process any new nested columns that were created 118 | return flatten_nested_columns(result_df, separator) 119 | 120 | # Apply the recursive flattening 121 | flattened_df = flatten_nested_columns(df, separator) 122 | 123 | return flattened_df 124 | -------------------------------------------------------------------------------- /SQL_Project-Country_Club_Database/SQL_Project-Country_Club_Database.sql: -------------------------------------------------------------------------------- 1 | /* Welcome to the SQL mini project. For this project, you will use 2 | Springboard' online SQL platform. 3 | 4 | The data you need is in the "country_club" database. This database 5 | contains 3 tables: 6 | i) the "Bookings" table, 7 | ii) the "Facilities" table, and 8 | iii) the "Members" table. 9 | 10 | Note that, if you need to, you can also download these tables locally. 11 | 12 | In the mini project, you'll be asked a series of questions. You can 13 | solve them using the platform, but for the final deliverable, 14 | paste the code for each solution into this script, and upload it 15 | to your GitHub. 16 | 17 | Before starting with the questions, feel free to take your time, 18 | exploring the data, and getting acquainted with the 3 tables. */ 19 | 20 | 21 | /* Q1: Some of the facilities charge a fee to members, but some do not. 22 | Please list the names of the facilities that do. */ 23 | 24 | SELECT 25 | name 26 | FROM 27 | facilities 28 | WHERE 29 | membercost != 0; 30 | 31 | 32 | /* Q2: How many facilities do not charge a fee to members? */ 33 | 34 | SELECT 35 | COUNT(name) AS zero_membercost 36 | FROM 37 | facilities 38 | WHERE 39 | membercost = 0; 40 | 41 | 42 | /* Q3: How can you produce a list of facilities that charge a fee to members, 43 | where the fee is less than 20% of the facility's monthly maintenance cost? 44 | Return the facid, facility name, member cost, and monthly maintenance of the 45 | facilities in question. */ 46 | 47 | SELECT 48 | facid, name AS facility_name, membercost, monthlymaintenance 49 | FROM 50 | facilities 51 | WHERE 52 | membercost < 0.20 * monthlymaintenance 53 | AND membercost != 0; 54 | 55 | 56 | /* Q4: How can you retrieve the details of facilities with ID 1 and 5? 57 | Write the query without using the OR operator. */ 58 | 59 | SELECT 60 | * 61 | FROM 62 | facilities 63 | WHERE 64 | facid IN (1 , 5); 65 | 66 | 67 | /* Q5: How can you produce a list of facilities, with each labelled as 68 | 'cheap' or 'expensive', depending on if their monthly maintenance cost is 69 | more than $100? Return the name and monthly maintenance of the facilities 70 | in question. 
*/ 71 | 72 | SELECT 73 | name, 74 | CASE 75 | WHEN monthlymaintenance > 100.0 THEN 'expensive' 76 | ELSE 'cheap' 77 | END AS monthlymaintenance 78 | FROM 79 | facilities; 80 | 81 | 82 | /* Q6: You'd like to get the first and last name of the last member(s) 83 | who signed up. Do not use the LIMIT clause for your solution. */ 84 | 85 | SELECT 86 | firstname, surname 87 | FROM 88 | members 89 | WHERE 90 | joindate = (SELECT 91 | MAX(joindate) 92 | FROM 93 | members); 94 | 95 | 96 | /* Q7: How can you produce a list of all members who have used a tennis court? 97 | Include in your output the name of the court, and the name of the member 98 | formatted as a single column. Ensure no duplicate data, and order by 99 | the member name. */ 100 | 101 | SELECT DISTINCT 102 | (CONCAT(m.firstname, ' ', m.surname)) AS member, f.name 103 | FROM 104 | facilities f 105 | JOIN 106 | Bookings b ON f.facid = b.facid 107 | JOIN 108 | Members m ON m.memid = b.memid AND m.memid != 0 109 | WHERE 110 | f.name LIKE '%Tennis Court%' 111 | ORDER BY member; 112 | 113 | 114 | /* Q8: How can you produce a list of bookings on the day of 2012-09-14 which 115 | will cost the member (or guest) more than $30? Remember that guests have 116 | different costs to members (the listed costs are per half-hour 'slot'), and 117 | the guest user's ID is always 0. Include in your output the name of the 118 | facility, the name of the member formatted as a single column, and the cost. 119 | Order by descending cost, and do not use any subqueries. */ 120 | 121 | SELECT 122 | f.name AS facility_name, 123 | CONCAT(m.firstname, ' ', m.surname) AS member_ID, 124 | CASE 125 | WHEN m.memid != 0 THEN b.slots * f.membercost 126 | WHEN m.memid = 0 THEN b.slots * f.guestcost 127 | END AS cost 128 | FROM 129 | members m 130 | JOIN 131 | bookings b ON m.memid = b.memid 132 | JOIN 133 | facilities AS f ON b.facid = f.facid 134 | WHERE 135 | b.starttime >= '2012-09-14' 136 | AND b.starttime < '2012-09-15' 137 | AND ((m.memid != 0 138 | AND b.slots * f.membercost > 30) 139 | OR (m.memid = 0 140 | AND b.slots * f.guestcost > 30)) 141 | ORDER BY cost DESC; 142 | 143 | 144 | /* Q9: This time, produce the same result as in Q8, but using a subquery. */ 145 | 146 | SELECT 147 | sub.name AS facility_name, 148 | sub.member AS member_ID, 149 | sub.cost AS cost 150 | FROM 151 | (SELECT 152 | CONCAT(m.firstname, ' ', m.surname) AS member, 153 | f.name, 154 | CASE 155 | WHEN m.memid != 0 THEN b.slots * f.membercost 156 | WHEN m.memid = 0 THEN b.slots * f.guestcost 157 | END AS cost 158 | FROM 159 | members m 160 | JOIN bookings b ON m.memid = b.memid 161 | JOIN facilities f ON b.facid = f.facid 162 | WHERE 163 | b.starttime >= '2012-09-14' 164 | AND b.starttime < '2012-09-15' 165 | HAVING cost > 30) AS sub 166 | ORDER BY cost DESC; 167 | 168 | 169 | /* Q10: Produce a list of facilities with a total revenue less than 1000. 170 | The output of facility name and total revenue, sorted by revenue. Remember 171 | that there's a different cost for guests and members! 
*/ 172 | 173 | SELECT 174 | facility_name, revenue AS total_revenue 175 | FROM 176 | (SELECT 177 | SUM(CASE 178 | WHEN b.memid != 0 THEN b.slots * f.membercost 179 | WHEN b.memid = 0 THEN b.slots * f.guestcost 180 | END) AS revenue, 181 | f.name AS facility_name 182 | FROM 183 | bookings b 184 | JOIN facilities f ON b.facid = f.facid 185 | GROUP BY f.name 186 | HAVING revenue < 1000) AS sub 187 | ORDER BY revenue; 188 | -------------------------------------------------------------------------------- /SQL_Project-Country_Club_Database/Schema.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShiningData/Springboard_Projects/d1368678312a35438e5ebf9fff5139344dd7ddc6/SQL_Project-Country_Club_Database/Schema.JPG -------------------------------------------------------------------------------- /Spark_Project-Databricks/.ipynb_checkpoints/Spark-Mini_Project-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Spark Mini Project" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "The project was completed in Databricks and published. You can view my Spark mini project via the link below. " 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": {}, 20 | "source": [ 21 | "https://databricks-prod-cloudfront.cloud.databricks.com/public/4027ec902e239c93eaaa8714f173bcfc/8467584438137932/4169206975357756/5443988219023238/latest.html" 22 | ] 23 | } 24 | ], 25 | "metadata": { 26 | "kernelspec": { 27 | "display_name": "Python 3", 28 | "language": "python", 29 | "name": "python3" 30 | }, 31 | "language_info": { 32 | "codemirror_mode": { 33 | "name": "ipython", 34 | "version": 3 35 | }, 36 | "file_extension": ".py", 37 | "mimetype": "text/x-python", 38 | "name": "python", 39 | "nbconvert_exporter": "python", 40 | "pygments_lexer": "ipython3", 41 | "version": "3.7.1" 42 | } 43 | }, 44 | "nbformat": 4, 45 | "nbformat_minor": 2 46 | } 47 | -------------------------------------------------------------------------------- /Spark_Project-Databricks/Spark-Mini_Project.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Spark Mini Project" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "The project was completed in Databricks and published. You can view my Spark mini project via the link below. 
" 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": {}, 20 | "source": [ 21 | "https://databricks-prod-cloudfront.cloud.databricks.com/public/4027ec902e239c93eaaa8714f173bcfc/8467584438137932/4169206975357756/5443988219023238/latest.html" 22 | ] 23 | } 24 | ], 25 | "metadata": { 26 | "kernelspec": { 27 | "display_name": "Python 3", 28 | "language": "python", 29 | "name": "python3" 30 | }, 31 | "language_info": { 32 | "codemirror_mode": { 33 | "name": "ipython", 34 | "version": 3 35 | }, 36 | "file_extension": ".py", 37 | "mimetype": "text/x-python", 38 | "name": "python", 39 | "nbconvert_exporter": "python", 40 | "pygments_lexer": "ipython3", 41 | "version": "3.7.1" 42 | } 43 | }, 44 | "nbformat": 4, 45 | "nbformat_minor": 2 46 | } 47 | -------------------------------------------------------------------------------- /Take_Home_Challenge-Relax_Inc/Relax_Keynote.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShiningData/Springboard_Projects/d1368678312a35438e5ebf9fff5139344dd7ddc6/Take_Home_Challenge-Relax_Inc/Relax_Keynote.pdf -------------------------------------------------------------------------------- /Take_Home_Challenge-Relax_Inc/relax_data_science_challenge.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShiningData/Springboard_Projects/d1368678312a35438e5ebf9fff5139344dd7ddc6/Take_Home_Challenge-Relax_Inc/relax_data_science_challenge.pdf -------------------------------------------------------------------------------- /Take_Home_Challenge-Relax_Inc/takehome_users.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShiningData/Springboard_Projects/d1368678312a35438e5ebf9fff5139344dd7ddc6/Take_Home_Challenge-Relax_Inc/takehome_users.csv -------------------------------------------------------------------------------- /Take_Home_Challenge-Ultimate_Technologies_Inc/ultimate_data_science_challenge.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShiningData/Springboard_Projects/d1368678312a35438e5ebf9fff5139344dd7ddc6/Take_Home_Challenge-Ultimate_Technologies_Inc/ultimate_data_science_challenge.pdf --------------------------------------------------------------------------------