├── Amazon Web Scraper Project.ipynb ├── Automate API Extraction + Appending Data + Extra -- Project.ipynb ├── COVID Portfolio Project - Data Exploration.sql ├── CovidDeaths.xlsx ├── CovidVaccinations.xlsx ├── Data Cleaning Portfolio Project Queries.sql ├── Global YouTube Statistics.csv ├── Movie Portfolio Project.ipynb ├── Nashville Housing Data for Data Cleaning (reuploaded).xlsx ├── Nashville Housing Data for Data Cleaning.xlsx ├── README.md ├── Tableau Joins File.xlsx ├── Tableau Portfolio Project SQL Queries.sql ├── app.py ├── app_starting_code.py └── us-housing-app.zip /Amazon Web Scraper Project.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "f236cbb9", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "# import libraries \n", 11 | "\n", 12 | "from bs4 import BeautifulSoup\n", 13 | "import requests\n", 14 | "import time\n", 15 | "import datetime\n", 16 | "\n", 17 | "import smtplib\n" 18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": 16, 23 | "id": "9b531b61", 24 | "metadata": {}, 25 | "outputs": [ 26 | { 27 | "name": "stdout", 28 | "output_type": "stream", 29 | "text": [ 30 | "\n", 31 | " Funny Got Data MIS Data Systems Business Analyst T-Shirt\n", 32 | " \n", 33 | "\n", 34 | " $16.99\n", 35 | " \n" 36 | ] 37 | } 38 | ], 39 | "source": [ 40 | "# Connect to Website and pull in data\n", 41 | "\n", 42 | "URL = 'https://www.amazon.com/Funny-Data-Systems-Business-Analyst/dp/B07FNW9FGJ/ref=sr_1_3?dchild=1&keywords=data%2Banalyst%2Btshirt&qid=1626655184&sr=8-3&customId=B0752XJYNL&th=1'\n", 43 | "\n", 44 | "headers = {\"User-Agent\": \"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36\", \"Accept-Encoding\":\"gzip, deflate\", \"Accept\":\"text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8\", \"DNT\":\"1\",\"Connection\":\"close\", \"Upgrade-Insecure-Requests\":\"1\"}\n", 45 | "\n", 46 | "page = requests.get(URL, headers=headers)\n", 47 | "\n", 48 | "soup1 = BeautifulSoup(page.content, \"html.parser\")\n", 49 | "\n", 50 | "soup2 = BeautifulSoup(soup1.prettify(), \"html.parser\")\n", 51 | "\n", 52 | "title = soup2.find(id='productTitle').get_text()\n", 53 | "\n", 54 | "price = soup2.find(id='priceblock_ourprice').get_text()\n", 55 | "\n", 56 | "\n", 57 | "print(title)\n", 58 | "print(price)\n", 59 | "\n", 60 | "\n" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": 17, 66 | "id": "b6f7d66e", 67 | "metadata": {}, 68 | "outputs": [ 69 | { 70 | "name": "stdout", 71 | "output_type": "stream", 72 | "text": [ 73 | "Funny Got Data MIS Data Systems Business Analyst T-Shirt\n", 74 | "16.99\n" 75 | ] 76 | } 77 | ], 78 | "source": [ 79 | "# Clean up the data a little bit\n", 80 | "\n", 81 | "price = price.strip()[1:]\n", 82 | "title = title.strip()\n", 83 | "\n", 84 | "print(title)\n", 85 | "print(price)" 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": 21, 91 | "id": "4f021c23", 92 | "metadata": {}, 93 | "outputs": [ 94 | { 95 | "name": "stdout", 96 | "output_type": "stream", 97 | "text": [ 98 | "2021-08-21\n" 99 | ] 100 | } 101 | ], 102 | "source": [ 103 | "# Create a Timestamp for your output to track when data was collected\n", 104 | "\n", 105 | "import datetime\n", 106 | "\n", 107 | "today = datetime.date.today()\n", 108 | "\n", 109 | "print(today)\n" 110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | 
"execution_count": 22, 115 | "id": "14d703ca", 116 | "metadata": {}, 117 | "outputs": [], 118 | "source": [ 119 | "# Create CSV and write headers and data into the file\n", 120 | "\n", 121 | "import csv \n", 122 | "\n", 123 | "header = ['Title', 'Price', 'Date']\n", 124 | "data = [title, price, today]\n", 125 | "\n", 126 | "\n", 127 | "with open('AmazonWebScraperDataset.csv', 'w', newline='', encoding='UTF8') as f:\n", 128 | " writer = csv.writer(f)\n", 129 | " writer.writerow(header)\n", 130 | " writer.writerow(data)\n", 131 | " \n" 132 | ] 133 | }, 134 | { 135 | "cell_type": "code", 136 | "execution_count": null, 137 | "id": "d07eeb86", 138 | "metadata": {}, 139 | "outputs": [], 140 | "source": [ 141 | "import pandas as pd\n", 142 | "\n", 143 | "df = pd.read_csv(r'C:\\Users\\alexf\\AmazonWebScraperDataset.csv')\n", 144 | "\n", 145 | "print(df)\n", 146 | "\n" 147 | ] 148 | }, 149 | { 150 | "cell_type": "code", 151 | "execution_count": 29, 152 | "id": "6b05c1eb", 153 | "metadata": {}, 154 | "outputs": [], 155 | "source": [ 156 | "#Now we are appending data to the csv\n", 157 | "\n", 158 | "with open('AmazonWebScraperDataset.csv', 'a+', newline='', encoding='UTF8') as f:\n", 159 | " writer = csv.writer(f)\n", 160 | " writer.writerow(data)" 161 | ] 162 | }, 163 | { 164 | "cell_type": "code", 165 | "execution_count": 31, 166 | "id": "8e95b9e0", 167 | "metadata": {}, 168 | "outputs": [], 169 | "source": [ 170 | "#Combine all of the above code into one function\n", 171 | "\n", 172 | "\n", 173 | "def check_price():\n", 174 | " URL = 'https://www.amazon.com/Funny-Data-Systems-Business-Analyst/dp/B07FNW9FGJ/ref=sr_1_3?dchild=1&keywords=data%2Banalyst%2Btshirt&qid=1626655184&sr=8-3&customId=B0752XJYNL&th=1'\n", 175 | "\n", 176 | " headers = {\"User-Agent\": \"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36\", \"Accept-Encoding\":\"gzip, deflate\", \"Accept\":\"text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8\", \"DNT\":\"1\",\"Connection\":\"close\", \"Upgrade-Insecure-Requests\":\"1\"}\n", 177 | "\n", 178 | " page = requests.get(URL, headers=headers)\n", 179 | "\n", 180 | " soup1 = BeautifulSoup(page.content, \"html.parser\")\n", 181 | "\n", 182 | " soup2 = BeautifulSoup(soup1.prettify(), \"html.parser\")\n", 183 | "\n", 184 | " title = soup2.find(id='productTitle').get_text()\n", 185 | "\n", 186 | " price = soup2.find(id='priceblock_ourprice').get_text()\n", 187 | "\n", 188 | " price = price.strip()[1:]\n", 189 | " title = title.strip()\n", 190 | "\n", 191 | " import datetime\n", 192 | "\n", 193 | " today = datetime.date.today()\n", 194 | " \n", 195 | " import csv \n", 196 | "\n", 197 | " header = ['Title', 'Price', 'Date']\n", 198 | " data = [title, price, today]\n", 199 | "\n", 200 | " with open('AmazonWebScraperDataset.csv', 'a+', newline='', encoding='UTF8') as f:\n", 201 | " writer = csv.writer(f)\n", 202 | " writer.writerow(data)\n", 203 | " \n", 204 | " " 205 | ] 206 | }, 207 | { 208 | "cell_type": "code", 209 | "execution_count": null, 210 | "id": "c72f2c4e", 211 | "metadata": {}, 212 | "outputs": [], 213 | "source": [ 214 | "# Runs check_price after a set time and inputs data into your CSV\n", 215 | "\n", 216 | "while(True):\n", 217 | " check_price()\n", 218 | " time.sleep(86400)" 219 | ] 220 | }, 221 | { 222 | "cell_type": "code", 223 | "execution_count": null, 224 | "id": "00af7126", 225 | "metadata": {}, 226 | "outputs": [], 227 | "source": [ 228 | "import pandas as pd\n", 229 | "\n", 230 | "df = 
pd.read_csv(r'C:\\Users\\alexf\\AmazonWebScraperDataset.csv')\n", 231 | "\n", 232 | "print(df)" 233 | ] 234 | }, 235 | { 236 | "cell_type": "code", 237 | "execution_count": null, 238 | "id": "d14fce5f", 239 | "metadata": {}, 240 | "outputs": [], 241 | "source": [ 242 | "# If uou want to try sending yourself an email (just for fun) when a price hits below a certain level you can try it\n", 243 | "# out with this script\n", 244 | "\n", 245 | "def send_mail():\n", 246 | " server = smtplib.SMTP_SSL('smtp.gmail.com',465)\n", 247 | " server.ehlo()\n", 248 | " #server.starttls()\n", 249 | " server.ehlo()\n", 250 | " server.login('AlexTheAnalyst95@gmail.com','xxxxxxxxxxxxxx')\n", 251 | " \n", 252 | " subject = \"The Shirt you want is below $15! Now is your chance to buy!\"\n", 253 | " body = \"Alex, This is the moment we have been waiting for. Now is your chance to pick up the shirt of your dreams. Don't mess it up! Link here: https://www.amazon.com/Funny-Data-Systems-Business-Analyst/dp/B07FNW9FGJ/ref=sr_1_3?dchild=1&keywords=data+analyst+tshirt&qid=1626655184&sr=8-3\"\n", 254 | " \n", 255 | " msg = f\"Subject: {subject}\\n\\n{body}\"\n", 256 | " \n", 257 | " server.sendmail(\n", 258 | " 'AlexTheAnalyst95@gmail.com',\n", 259 | " msg\n", 260 | " \n", 261 | " )" 262 | ] 263 | } 264 | ], 265 | "metadata": { 266 | "kernelspec": { 267 | "display_name": "Python 3", 268 | "language": "python", 269 | "name": "python3" 270 | }, 271 | "language_info": { 272 | "codemirror_mode": { 273 | "name": "ipython", 274 | "version": 3 275 | }, 276 | "file_extension": ".py", 277 | "mimetype": "text/x-python", 278 | "name": "python", 279 | "nbconvert_exporter": "python", 280 | "pygments_lexer": "ipython3", 281 | "version": "3.8.8" 282 | } 283 | }, 284 | "nbformat": 4, 285 | "nbformat_minor": 5 286 | } 287 | -------------------------------------------------------------------------------- /Automate API Extraction + Appending Data + Extra -- Project.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 6, 6 | "id": "70cbe983", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "from requests import Request, Session\n", 11 | "from requests.exceptions import ConnectionError, Timeout, TooManyRedirects\n", 12 | "import json\n", 13 | "\n", 14 | "url = 'https://pro-api.coinmarketcap.com/v1/cryptocurrency/listings/latest' \n", 15 | "#Original Sandbox Environment: 'https://sandbox-api.coinmarketcap.com/v1/cryptocurrency/listings/latest'\n", 16 | "parameters = {\n", 17 | " 'start':'1',\n", 18 | " 'limit':'15',\n", 19 | " 'convert':'USD'\n", 20 | "}\n", 21 | "headers = {\n", 22 | " 'Accepts': 'application/json',\n", 23 | " 'X-CMC_PRO_API_KEY': '0ad53085-1cb2-4eb8-ad9e-3ffbd7e56509',\n", 24 | "}\n", 25 | "\n", 26 | "session = Session()\n", 27 | "session.headers.update(headers)\n", 28 | "\n", 29 | "try:\n", 30 | " response = session.get(url, params=parameters)\n", 31 | " data = json.loads(response.text)\n", 32 | " #print(data)\n", 33 | "except (ConnectionError, Timeout, TooManyRedirects) as e:\n", 34 | " print(e)\n", 35 | "\n", 36 | "#NOTE:\n", 37 | "# I had to go in and put \"jupyter notebook --NotebookApp.iopub_data_rate_limit=1e10\"\n", 38 | "# Into the Anaconda Prompt to change this to allow to pull data\n", 39 | "\n", 40 | "# If that didn't work try using the local host URL as shown in the video" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": 63, 46 | "id": "31bdff98", 47 | "metadata": {}, 
48 | "outputs": [], 49 | "source": [ 50 | "type(data)" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": 18, 56 | "id": "4cbf82ee", 57 | "metadata": {}, 58 | "outputs": [], 59 | "source": [ 60 | "import pandas as pd\n", 61 | "\n", 62 | "\n", 63 | "#This allows you to see all the columns, not just like 15\n", 64 | "pd.set_option('display.max_columns', None)\n", 65 | "pd.set_option('display.max_rows', None)" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": 64, 71 | "id": "48c3b340", 72 | "metadata": {}, 73 | "outputs": [], 74 | "source": [ 75 | "#This normalizes the data and makes it all pretty in a dataframe\n", 76 | "\n", 77 | "df = pd.json_normalize(data['data'])\n", 78 | "df['timestamp'] = pd.to_datetime('now')\n", 79 | "df" 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": 60, 85 | "id": "d792e388", 86 | "metadata": {}, 87 | "outputs": [], 88 | "source": [ 89 | "\n", 90 | "def api_runner():\n", 91 | " global df\n", 92 | " url = 'https://pro-api.coinmarketcap.com/v1/cryptocurrency/listings/latest' \n", 93 | " #Original Sandbox Environment: 'https://sandbox-api.coinmarketcap.com/v1/cryptocurrency/listings/latest'\n", 94 | " parameters = {\n", 95 | " 'start':'1',\n", 96 | " 'limit':'15',\n", 97 | " 'convert':'USD'\n", 98 | " }\n", 99 | " headers = {\n", 100 | " 'Accepts': 'application/json',\n", 101 | " 'X-CMC_PRO_API_KEY': '0ad53085-1cb2-4eb8-ad9e-3ffbd7e56509',\n", 102 | " }\n", 103 | "\n", 104 | " session = Session()\n", 105 | " session.headers.update(headers)\n", 106 | "\n", 107 | " try:\n", 108 | " response = session.get(url, params=parameters)\n", 109 | " data = json.loads(response.text)\n", 110 | " #print(data)\n", 111 | " except (ConnectionError, Timeout, TooManyRedirects) as e:\n", 112 | " print(e)\n", 113 | "\n", 114 | "#NOTE:\n", 115 | "# I had to go in and put \"jupyter notebook --NotebookApp.iopub_data_rate_limit=1e10\"\n", 116 | "# Into the Anaconda Prompt to change this to allow to pull data\n", 117 | " \n", 118 | " # Use this if you just want to keep it in a dataframe\n", 119 | " df2 = pd.json_normalize(data['data'])\n", 120 | " df2['Timestamp'] = pd.to_datetime('now')\n", 121 | " df = df.append(df2)\n", 122 | "\n", 123 | "\n", 124 | " # Use this if you want to create a csv and append data to it\n", 125 | " #df = pd.json_normalize(data['data'])\n", 126 | " #df['timestamp'] = pd.to_datetime('now')\n", 127 | " #df\n", 128 | "\n", 129 | " #if not os.path.isfile(r'C:\\Users\\alexf\\OneDrive\\Documents\\Python Scripts\\API.csv'):\n", 130 | " #df.to_csv(r'C:\\Users\\alexf\\OneDrive\\Documents\\Python Scripts\\API.csv', header='column_names')\n", 131 | " #else:\n", 132 | " #df.to_csv(r'C:\\Users\\alexf\\OneDrive\\Documents\\Python Scripts\\API.csv', mode='a', header=False)\n", 133 | " \n", 134 | " #Then to read in the file: df = pd.read_csv(r'C:\\Users\\alexf\\OneDrive\\Documents\\Python Scripts\\API.csv')\n", 135 | "\n", 136 | "# If that didn't work try using the local host URL as shown in the video" 137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "execution_count": 67, 142 | "id": "9e272cea", 143 | "metadata": {}, 144 | "outputs": [], 145 | "source": [ 146 | "import os \n", 147 | "from time import time\n", 148 | "from time import sleep\n", 149 | "\n", 150 | "for i in range(333):\n", 151 | " api_runner()\n", 152 | " print('API Runner completed')\n", 153 | " sleep(60) #sleep for 1 minute\n", 154 | "exit()" 155 | ] 156 | }, 157 | { 158 | "cell_type": "code", 159 | "execution_count": 68, 160 | "id": 
"bf9a55d9", 161 | "metadata": {}, 162 | "outputs": [], 163 | "source": [ 164 | "df72 = pd.read_csv(r'C:\\Users\\alexf\\OneDrive\\Documents\\Python Scripts\\API.csv')\n", 165 | "df72" 166 | ] 167 | }, 168 | { 169 | "cell_type": "code", 170 | "execution_count": 69, 171 | "id": "8902053b", 172 | "metadata": {}, 173 | "outputs": [], 174 | "source": [ 175 | "df" 176 | ] 177 | }, 178 | { 179 | "cell_type": "code", 180 | "execution_count": 22, 181 | "id": "b7c56101", 182 | "metadata": {}, 183 | "outputs": [], 184 | "source": [ 185 | "# One thing I noticed was the scientific notation. I like it, but I want to be able to see the numbers in this case\n", 186 | "\n", 187 | "pd.set_option('display.float_format', lambda x: '%.5f' % x)" 188 | ] 189 | }, 190 | { 191 | "cell_type": "code", 192 | "execution_count": 70, 193 | "id": "56b5a577", 194 | "metadata": {}, 195 | "outputs": [], 196 | "source": [ 197 | "df" 198 | ] 199 | }, 200 | { 201 | "cell_type": "code", 202 | "execution_count": 71, 203 | "id": "e4227c53", 204 | "metadata": {}, 205 | "outputs": [], 206 | "source": [ 207 | "# Now let's look at the coin trends over time\n", 208 | "\n", 209 | "df3 = df.groupby('name', sort=False)[['quote.USD.percent_change_1h','quote.USD.percent_change_24h','quote.USD.percent_change_7d','quote.USD.percent_change_30d','quote.USD.percent_change_60d','quote.USD.percent_change_90d']].mean()\n", 210 | "df3" 211 | ] 212 | }, 213 | { 214 | "cell_type": "code", 215 | "execution_count": 72, 216 | "id": "f3e2d1db", 217 | "metadata": {}, 218 | "outputs": [], 219 | "source": [ 220 | "df4 = df3.stack()\n", 221 | "df4" 222 | ] 223 | }, 224 | { 225 | "cell_type": "code", 226 | "execution_count": 73, 227 | "id": "2ef8ee34", 228 | "metadata": {}, 229 | "outputs": [], 230 | "source": [ 231 | "type(df4)" 232 | ] 233 | }, 234 | { 235 | "cell_type": "code", 236 | "execution_count": 74, 237 | "id": "4b7b94bf", 238 | "metadata": {}, 239 | "outputs": [], 240 | "source": [ 241 | "df5 = df4.to_frame(name='values')\n", 242 | "df5" 243 | ] 244 | }, 245 | { 246 | "cell_type": "code", 247 | "execution_count": 75, 248 | "id": "b8125368", 249 | "metadata": {}, 250 | "outputs": [], 251 | "source": [ 252 | "df5.count()" 253 | ] 254 | }, 255 | { 256 | "cell_type": "code", 257 | "execution_count": 76, 258 | "id": "fc6ade71", 259 | "metadata": {}, 260 | "outputs": [], 261 | "source": [ 262 | "#Because of how it's structured above we need to set an index. I don't want to pass a column as an index for this dataframe\n", 263 | "#So I'm going to create a range and pass that as the dataframe. 
You can make this more dynamic, but I'm just going to hard code it\n", 264 | "\n", 265 | "\n", 266 | "index = pd.Index(range(90))\n", 267 | "\n", 268 | "# Set the above DataFrame index object as the index\n", 269 | "# using set_index() function\n", 270 | "df6 = df5.set_index(index)\n", 271 | "df6\n", 272 | "\n", 273 | "# If it only has the index and values try doing reset_index like \"df5.reset_index()\"" 274 | ] 275 | }, 276 | { 277 | "cell_type": "code", 278 | "execution_count": 77, 279 | "id": "7d13cd4d", 280 | "metadata": {}, 281 | "outputs": [], 282 | "source": [ 283 | "# Change the column name\n", 284 | "\n", 285 | "df7 = df6.rename(columns={'level_1': 'percent_change'})\n", 286 | "df7" 287 | ] 288 | }, 289 | { 290 | "cell_type": "code", 291 | "execution_count": 78, 292 | "id": "a72480a3", 293 | "metadata": {}, 294 | "outputs": [], 295 | "source": [ 296 | "df7['percent_change'] = df7['percent_change'].replace(['quote.USD.percent_change_1h','quote.USD.percent_change_24h','quote.USD.percent_change_7d','quote.USD.percent_change_30d','quote.USD.percent_change_60d','quote.USD.percent_change_90d'],['1h','24h','7d','30d','60d','90d'])\n", 297 | "df7" 298 | ] 299 | }, 300 | { 301 | "cell_type": "code", 302 | "execution_count": 47, 303 | "id": "16a3121f", 304 | "metadata": {}, 305 | "outputs": [], 306 | "source": [ 307 | "import seaborn as sns\n", 308 | "import matplotlib.pyplot as plt" 309 | ] 310 | }, 311 | { 312 | "cell_type": "code", 313 | "execution_count": 79, 314 | "id": "c287a308", 315 | "metadata": {}, 316 | "outputs": [], 317 | "source": [ 318 | "sns.catplot(x='percent_change', y='values', hue='name', data=df7, kind='point')" 319 | ] 320 | }, 321 | { 322 | "cell_type": "code", 323 | "execution_count": 80, 324 | "id": "2915d494", 325 | "metadata": {}, 326 | "outputs": [], 327 | "source": [ 328 | "# Now to do something much simpler\n", 329 | "# we are going to create a dataframe with the columns we want\n", 330 | "\n", 331 | "df10 = df[['name','quote.USD.price','timestamp']]\n", 332 | "df10 = df10.query(\"name == 'Bitcoin'\")\n", 333 | "df10" 334 | ] 335 | }, 336 | { 337 | "cell_type": "code", 338 | "execution_count": 81, 339 | "id": "ae8459af", 340 | "metadata": {}, 341 | "outputs": [], 342 | "source": [ 343 | "sns.set_theme(style=\"darkgrid\")\n", 344 | "\n", 345 | "sns.lineplot(x='timestamp', y='quote.USD.price', data = df10)" 346 | ] 347 | }, 348 | { 349 | "cell_type": "code", 350 | "execution_count": null, 351 | "id": "db10f9de", 352 | "metadata": {}, 353 | "outputs": [], 354 | "source": [] 355 | } 356 | ], 357 | "metadata": { 358 | "kernelspec": { 359 | "display_name": "Python 3", 360 | "language": "python", 361 | "name": "python3" 362 | }, 363 | "language_info": { 364 | "codemirror_mode": { 365 | "name": "ipython", 366 | "version": 3 367 | }, 368 | "file_extension": ".py", 369 | "mimetype": "text/x-python", 370 | "name": "python", 371 | "nbconvert_exporter": "python", 372 | "pygments_lexer": "ipython3", 373 | "version": "3.8.8" 374 | } 375 | }, 376 | "nbformat": 4, 377 | "nbformat_minor": 5 378 | } 379 | -------------------------------------------------------------------------------- /COVID Portfolio Project - Data Exploration.sql: -------------------------------------------------------------------------------- 1 | /* 2 | Covid 19 Data Exploration 3 | 4 | Skills used: Joins, CTEs, Temp Tables, Window Functions, Aggregate Functions, Creating Views, Converting Data Types 5 | 6 | */ 7 | 8 | Select * 9 | From PortfolioProject..CovidDeaths 10 | Where continent is not null 11 | order by 3,4 12 | 13 | 14 | -- Select 
Data that we are going to be starting with 15 | 16 | Select Location, date, total_cases, new_cases, total_deaths, population 17 | From PortfolioProject..CovidDeaths 18 | Where continent is not null 19 | order by 1,2 20 | 21 | 22 | -- Total Cases vs Total Deaths 23 | -- Shows likelihood of dying if you contract covid in your country 24 | 25 | Select Location, date, total_cases,total_deaths, (total_deaths/total_cases)*100 as DeathPercentage 26 | From PortfolioProject..CovidDeaths 27 | Where location like '%states%' 28 | and continent is not null 29 | order by 1,2 30 | 31 | 32 | -- Total Cases vs Population 33 | -- Shows what percentage of the population was infected with Covid 34 | 35 | Select Location, date, Population, total_cases, (total_cases/population)*100 as PercentPopulationInfected 36 | From PortfolioProject..CovidDeaths 37 | --Where location like '%states%' 38 | order by 1,2 39 | 40 | 41 | -- Countries with Highest Infection Rate compared to Population 42 | 43 | Select Location, Population, MAX(total_cases) as HighestInfectionCount, Max((total_cases/population))*100 as PercentPopulationInfected 44 | From PortfolioProject..CovidDeaths 45 | --Where location like '%states%' 46 | Group by Location, Population 47 | order by PercentPopulationInfected desc 48 | 49 | 50 | -- Countries with Highest Death Count per Population 51 | 52 | Select Location, MAX(cast(Total_deaths as int)) as TotalDeathCount 53 | From PortfolioProject..CovidDeaths 54 | --Where location like '%states%' 55 | Where continent is not null 56 | Group by Location 57 | order by TotalDeathCount desc 58 | 59 | 60 | 61 | -- BREAKING THINGS DOWN BY CONTINENT 62 | 63 | -- Showing continents with the highest death count per population 64 | 65 | Select continent, MAX(cast(Total_deaths as int)) as TotalDeathCount 66 | From PortfolioProject..CovidDeaths 67 | --Where location like '%states%' 68 | Where continent is not null 69 | Group by continent 70 | order by TotalDeathCount desc 71 | 72 | 73 | 74 | -- GLOBAL NUMBERS 75 | 76 | Select SUM(new_cases) as total_cases, SUM(cast(new_deaths as int)) as total_deaths, SUM(cast(new_deaths as int))/SUM(New_Cases)*100 as DeathPercentage 77 | From PortfolioProject..CovidDeaths 78 | --Where location like '%states%' 79 | where continent is not null 80 | --Group By date 81 | order by 1,2 82 | 83 | 84 | 85 | -- Total Population vs Vaccinations 86 | -- Shows Percentage of Population that has received at least one Covid Vaccine 87 | 88 | Select dea.continent, dea.location, dea.date, dea.population, vac.new_vaccinations 89 | , SUM(CONVERT(int,vac.new_vaccinations)) OVER (Partition by dea.Location Order by dea.location, dea.Date) as RollingPeopleVaccinated 90 | --, (RollingPeopleVaccinated/population)*100 91 | From PortfolioProject..CovidDeaths dea 92 | Join PortfolioProject..CovidVaccinations vac 93 | On dea.location = vac.location 94 | and dea.date = vac.date 95 | where dea.continent is not null 96 | order by 2,3 97 | 98 | 99 | -- Using CTE to perform Calculation on Partition By in previous query 100 | 101 | With PopvsVac (Continent, Location, Date, Population, New_Vaccinations, RollingPeopleVaccinated) 102 | as 103 | ( 104 | Select dea.continent, dea.location, dea.date, dea.population, vac.new_vaccinations 105 | , SUM(CONVERT(int,vac.new_vaccinations)) OVER (Partition by dea.Location Order by dea.location, dea.Date) as RollingPeopleVaccinated 106 | --, (RollingPeopleVaccinated/population)*100 107 | From PortfolioProject..CovidDeaths dea 108 | Join PortfolioProject..CovidVaccinations vac 109 | On 
dea.location = vac.location 110 | and dea.date = vac.date 111 | where dea.continent is not null 112 | --order by 2,3 113 | ) 114 | Select *, (RollingPeopleVaccinated/Population)*100 115 | From PopvsVac 116 | 117 | 118 | 119 | -- Using Temp Table to perform Calculation on Partition By in previous query 120 | 121 | DROP Table if exists #PercentPopulationVaccinated 122 | Create Table #PercentPopulationVaccinated 123 | ( 124 | Continent nvarchar(255), 125 | Location nvarchar(255), 126 | Date datetime, 127 | Population numeric, 128 | New_vaccinations numeric, 129 | RollingPeopleVaccinated numeric 130 | ) 131 | 132 | Insert into #PercentPopulationVaccinated 133 | Select dea.continent, dea.location, dea.date, dea.population, vac.new_vaccinations 134 | , SUM(CONVERT(int,vac.new_vaccinations)) OVER (Partition by dea.Location Order by dea.location, dea.Date) as RollingPeopleVaccinated 135 | --, (RollingPeopleVaccinated/population)*100 136 | From PortfolioProject..CovidDeaths dea 137 | Join PortfolioProject..CovidVaccinations vac 138 | On dea.location = vac.location 139 | and dea.date = vac.date 140 | --where dea.continent is not null 141 | --order by 2,3 142 | 143 | Select *, (RollingPeopleVaccinated/Population)*100 144 | From #PercentPopulationVaccinated 145 | 146 | 147 | 148 | 149 | -- Creating View to store data for later visualizations 150 | 151 | Create View PercentPopulationVaccinated as 152 | Select dea.continent, dea.location, dea.date, dea.population, vac.new_vaccinations 153 | , SUM(CONVERT(int,vac.new_vaccinations)) OVER (Partition by dea.Location Order by dea.location, dea.Date) as RollingPeopleVaccinated 154 | --, (RollingPeopleVaccinated/population)*100 155 | From PortfolioProject..CovidDeaths dea 156 | Join PortfolioProject..CovidVaccinations vac 157 | On dea.location = vac.location 158 | and dea.date = vac.date 159 | where dea.continent is not null 160 | 161 | 162 | -------------------------------------------------------------------------------- /CovidDeaths.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlexTheAnalyst/PortfolioProjects/39c541bae76eb109652e8d834b0fa2aa3f15fd8b/CovidDeaths.xlsx -------------------------------------------------------------------------------- /CovidVaccinations.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlexTheAnalyst/PortfolioProjects/39c541bae76eb109652e8d834b0fa2aa3f15fd8b/CovidVaccinations.xlsx -------------------------------------------------------------------------------- /Data Cleaning Portfolio Project Queries.sql: -------------------------------------------------------------------------------- 1 | /* 2 | 3 | Cleaning Data in SQL Queries 4 | 5 | */ 6 | 7 | 8 | Select * 9 | From PortfolioProject.dbo.NashvilleHousing 10 | 11 | -------------------------------------------------------------------------------------------------------------------------- 12 | 13 | -- Standardize Date Format 14 | 15 | 16 | Select saleDateConverted, CONVERT(Date,SaleDate) 17 | From PortfolioProject.dbo.NashvilleHousing 18 | 19 | 20 | Update NashvilleHousing 21 | SET SaleDate = CONVERT(Date,SaleDate) 22 | 23 | -- If it doesn't Update properly 24 | 25 | ALTER TABLE NashvilleHousing 26 | Add SaleDateConverted Date; 27 | 28 | Update NashvilleHousing 29 | SET SaleDateConverted = CONVERT(Date,SaleDate) 30 | 31 | 32 | 
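-- Added note, as a sketch: the plain UPDATE above can appear to do nothing because
-- SaleDate's column type is still datetime, so SQL Server casts the converted value
-- right back on write. Adding a separate Date column, as done above, avoids that;
-- altering the column type itself is the other option, assuming nothing else
-- depends on SaleDate:

--ALTER TABLE NashvilleHousing
--ALTER COLUMN SaleDate Date;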
-------------------------------------------------------------------------------------------------------------------------- 33 | 34 | -- Populate Property Address data 35 | 36 | Select * 37 | From PortfolioProject.dbo.NashvilleHousing 38 | --Where PropertyAddress is null 39 | order by ParcelID 40 | 41 | 42 | 43 | Select a.ParcelID, a.PropertyAddress, b.ParcelID, b.PropertyAddress, ISNULL(a.PropertyAddress,b.PropertyAddress) 44 | From PortfolioProject.dbo.NashvilleHousing a 45 | JOIN PortfolioProject.dbo.NashvilleHousing b 46 | on a.ParcelID = b.ParcelID 47 | AND a.[UniqueID ] <> b.[UniqueID ] 48 | Where a.PropertyAddress is null 49 | 50 | 51 | Update a 52 | SET PropertyAddress = ISNULL(a.PropertyAddress,b.PropertyAddress) 53 | From PortfolioProject.dbo.NashvilleHousing a 54 | JOIN PortfolioProject.dbo.NashvilleHousing b 55 | on a.ParcelID = b.ParcelID 56 | AND a.[UniqueID ] <> b.[UniqueID ] 57 | Where a.PropertyAddress is null 58 | 59 | 60 | 61 | 62 | -------------------------------------------------------------------------------------------------------------------------- 63 | 64 | -- Breaking out Address into Individual Columns (Address, City, State) 65 | 66 | 67 | Select PropertyAddress 68 | From PortfolioProject.dbo.NashvilleHousing 69 | --Where PropertyAddress is null 70 | --order by ParcelID 71 | 72 | SELECT 73 | SUBSTRING(PropertyAddress, 1, CHARINDEX(',', PropertyAddress) -1 ) as Address 74 | , SUBSTRING(PropertyAddress, CHARINDEX(',', PropertyAddress) + 1 , LEN(PropertyAddress)) as City 75 | 76 | From PortfolioProject.dbo.NashvilleHousing 77 | 78 | 79 | ALTER TABLE NashvilleHousing 80 | Add PropertySplitAddress Nvarchar(255); 81 | 82 | Update NashvilleHousing 83 | SET PropertySplitAddress = SUBSTRING(PropertyAddress, 1, CHARINDEX(',', PropertyAddress) -1 ) 84 | 85 | 86 | ALTER TABLE NashvilleHousing 87 | Add PropertySplitCity Nvarchar(255); 88 | 89 | Update NashvilleHousing 90 | SET PropertySplitCity = SUBSTRING(PropertyAddress, CHARINDEX(',', PropertyAddress) + 1 , LEN(PropertyAddress)) 91 | 92 | 93 | 94 | 95 | Select * 96 | From PortfolioProject.dbo.NashvilleHousing 97 | 98 | 99 | 100 | 101 | 102 | Select OwnerAddress 103 | From PortfolioProject.dbo.NashvilleHousing 104 | 105 | 106 | Select 107 | PARSENAME(REPLACE(OwnerAddress, ',', '.') , 3) 108 | ,PARSENAME(REPLACE(OwnerAddress, ',', '.') , 2) 109 | ,PARSENAME(REPLACE(OwnerAddress, ',', '.') , 1) 110 | From PortfolioProject.dbo.NashvilleHousing 111 | 112 | 113 | 114 | ALTER TABLE NashvilleHousing 115 | Add OwnerSplitAddress Nvarchar(255); 116 | 117 | Update NashvilleHousing 118 | SET OwnerSplitAddress = PARSENAME(REPLACE(OwnerAddress, ',', '.') , 3) 119 | 120 | 121 | ALTER TABLE NashvilleHousing 122 | Add OwnerSplitCity Nvarchar(255); 123 | 124 | Update NashvilleHousing 125 | SET OwnerSplitCity = PARSENAME(REPLACE(OwnerAddress, ',', '.') , 2) 126 | 127 | 128 | 129 | ALTER TABLE NashvilleHousing 130 | Add OwnerSplitState Nvarchar(255); 131 | 132 | Update NashvilleHousing 133 | SET OwnerSplitState = PARSENAME(REPLACE(OwnerAddress, ',', '.') , 1) 134 | 135 | 136 | 137 | Select * 138 | From PortfolioProject.dbo.NashvilleHousing 139 | 140 | 141 | 142 | 143 | -------------------------------------------------------------------------------------------------------------------------- 144 | 145 | 146 | -- Change Y and N to Yes and No in "Sold as Vacant" field 147 | 148 | 149 | Select Distinct(SoldAsVacant), Count(SoldAsVacant) 150 | From PortfolioProject.dbo.NashvilleHousing 151 | Group by SoldAsVacant 152 | order by 2 153 | 154 | 155 | 
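-- The distinct count above typically surfaces a mix of 'Y', 'N', 'Yes', and 'No';
-- the CASE expression below maps the one-letter values onto the full words,
-- first as a preview Select and then as the in-place Update.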
156 | 157 | Select SoldAsVacant 158 | , CASE When SoldAsVacant = 'Y' THEN 'Yes' 159 | When SoldAsVacant = 'N' THEN 'No' 160 | ELSE SoldAsVacant 161 | END 162 | From PortfolioProject.dbo.NashvilleHousing 163 | 164 | 165 | Update NashvilleHousing 166 | SET SoldAsVacant = CASE When SoldAsVacant = 'Y' THEN 'Yes' 167 | When SoldAsVacant = 'N' THEN 'No' 168 | ELSE SoldAsVacant 169 | END 170 | 171 | 172 | 173 | 174 | 175 | 176 | ----------------------------------------------------------------------------------------------------------------------------------------------------------- 177 | 178 | -- Remove Duplicates (the CTE below flags duplicate rows with row_num > 1; running a Delete against the CTE instead of the final Select would remove them) 179 | 180 | WITH RowNumCTE AS( 181 | Select *, 182 | ROW_NUMBER() OVER ( 183 | PARTITION BY ParcelID, 184 | PropertyAddress, 185 | SalePrice, 186 | SaleDate, 187 | LegalReference 188 | ORDER BY 189 | UniqueID 190 | ) row_num 191 | 192 | From PortfolioProject.dbo.NashvilleHousing 193 | --order by ParcelID 194 | ) 195 | Select * 196 | From RowNumCTE 197 | Where row_num > 1 198 | Order by PropertyAddress 199 | 200 | 201 | 202 | Select * 203 | From PortfolioProject.dbo.NashvilleHousing 204 | 205 | 206 | 207 | 208 | --------------------------------------------------------------------------------------------------------- 209 | 210 | -- Delete Unused Columns 211 | 212 | 213 | 214 | Select * 215 | From PortfolioProject.dbo.NashvilleHousing 216 | 217 | 218 | ALTER TABLE PortfolioProject.dbo.NashvilleHousing 219 | DROP COLUMN OwnerAddress, TaxDistrict, PropertyAddress, SaleDate 220 | 221 | 222 | 223 | 224 | 225 | 226 | 227 | 228 | 229 | 230 | 231 | 232 | 233 | 234 | 235 | ----------------------------------------------------------------------------------------------- 236 | ----------------------------------------------------------------------------------------------- 237 | 238 | --- Importing Data using OPENROWSET and BULK INSERT 239 | 240 | -- More advanced and looks cooler, but you have to configure the server appropriately to do it correctly 241 | -- Wanted to provide this in case you wanted to try it 242 | 243 | 244 | --sp_configure 'show advanced options', 1; 245 | --RECONFIGURE; 246 | --GO 247 | --sp_configure 'Ad Hoc Distributed Queries', 1; 248 | --RECONFIGURE; 249 | --GO 250 | 251 | 252 | --USE PortfolioProject 253 | 254 | --GO 255 | 256 | --EXEC master.dbo.sp_MSset_oledb_prop N'Microsoft.ACE.OLEDB.12.0', N'AllowInProcess', 1 257 | 258 | --GO 259 | 260 | --EXEC master.dbo.sp_MSset_oledb_prop N'Microsoft.ACE.OLEDB.12.0', N'DynamicParameters', 1 261 | 262 | --GO 263 | 264 | 265 | ---- Using BULK INSERT 266 | 267 | --USE PortfolioProject; 268 | --GO 269 | --BULK INSERT nashvilleHousing FROM 'C:\Temp\SQL Server Management Studio\Nashville Housing Data for Data Cleaning Project.csv' 270 | -- WITH ( 271 | -- FIELDTERMINATOR = ',', 272 | -- ROWTERMINATOR = '\n' 273 | --); 274 | --GO 275 | 276 | 277 | ---- Using OPENROWSET 278 | --USE PortfolioProject; 279 | --GO 280 | --SELECT * INTO nashvilleHousing 281 | --FROM OPENROWSET('Microsoft.ACE.OLEDB.12.0', 282 | -- 'Excel 12.0; Database=C:\Users\alexf\OneDrive\Documents\SQL Server Management Studio\Nashville Housing Data for Data Cleaning Project.csv', [Sheet1$]); 283 | --GO 284 | 285 | 286 | 287 | 288 | 289 | 290 | 291 | 292 | 293 | 294 | 295 | 296 | 297 | 298 | 299 | 300 | 301 | 302 | -------------------------------------------------------------------------------- /Nashville Housing Data for Data Cleaning (reuploaded).xlsx: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/AlexTheAnalyst/PortfolioProjects/39c541bae76eb109652e8d834b0fa2aa3f15fd8b/Nashville Housing Data for Data Cleaning (reuploaded).xlsx -------------------------------------------------------------------------------- /Nashville Housing Data for Data Cleaning.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlexTheAnalyst/PortfolioProjects/39c541bae76eb109652e8d834b0fa2aa3f15fd8b/Nashville Housing Data for Data Cleaning.xlsx -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Data Analyst Portfolio Project Repository 2 | 3 | This Repository will hold all of the code and queries from the Portfolio Projects we create. 4 | 5 | Please feel free to take these and run with them. Make them your own and find your own insights 6 | 7 | I really do hope this is helpful and helps you land that dream job! :D 8 | -------------------------------------------------------------------------------- /Tableau Joins File.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlexTheAnalyst/PortfolioProjects/39c541bae76eb109652e8d834b0fa2aa3f15fd8b/Tableau Joins File.xlsx -------------------------------------------------------------------------------- /Tableau Portfolio Project SQL Queries.sql: -------------------------------------------------------------------------------- 1 | /* 2 | 3 | Queries used for Tableau Project 4 | 5 | */ 6 | 7 | 8 | 9 | -- 1. 10 | 11 | Select SUM(new_cases) as total_cases, SUM(cast(new_deaths as int)) as total_deaths, SUM(cast(new_deaths as int))/SUM(New_Cases)*100 as DeathPercentage 12 | From PortfolioProject..CovidDeaths 13 | --Where location like '%states%' 14 | where continent is not null 15 | --Group By date 16 | order by 1,2 17 | 18 | -- Just a double check based on the data provided 19 | -- numbers are extremely close so we will keep them - the second includes the "International" Location 20 | 21 | 22 | --Select SUM(new_cases) as total_cases, SUM(cast(new_deaths as int)) as total_deaths, SUM(cast(new_deaths as int))/SUM(New_Cases)*100 as DeathPercentage 23 | --From PortfolioProject..CovidDeaths 24 | ----Where location like '%states%' 25 | --where location = 'World' 26 | ----Group By date 27 | --order by 1,2 28 | 29 | 30 | -- 2. 31 | 32 | -- We take these out as they are not included in the above queries and we want to stay consistent 33 | -- European Union is part of Europe 34 | 35 | Select location, SUM(cast(new_deaths as int)) as TotalDeathCount 36 | From PortfolioProject..CovidDeaths 37 | --Where location like '%states%' 38 | Where continent is null 39 | and location not in ('World', 'European Union', 'International') 40 | Group by location 41 | order by TotalDeathCount desc 42 | 43 | 44 | -- 3. 45 | 46 | Select Location, Population, MAX(total_cases) as HighestInfectionCount, Max((total_cases/population))*100 as PercentPopulationInfected 47 | From PortfolioProject..CovidDeaths 48 | --Where location like '%states%' 49 | Group by Location, Population 50 | order by PercentPopulationInfected desc 51 | 52 | 53 | -- 4. 
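-- (Same as query 3 above, but grouped by date as well, so the infection rate can be tracked over time)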
54 | 55 | 56 | Select Location, Population,date, MAX(total_cases) as HighestInfectionCount, Max((total_cases/population))*100 as PercentPopulationInfected 57 | From PortfolioProject..CovidDeaths 58 | --Where location like '%states%' 59 | Group by Location, Population, date 60 | order by PercentPopulationInfected desc 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | -- Queries I originally had, but excluded some because they made the video too long 74 | -- Here only in case you want to check them out 75 | 76 | 77 | -- 1. 78 | 79 | Select dea.continent, dea.location, dea.date, dea.population 80 | , MAX(vac.total_vaccinations) as RollingPeopleVaccinated 81 | --, (RollingPeopleVaccinated/population)*100 82 | From PortfolioProject..CovidDeaths dea 83 | Join PortfolioProject..CovidVaccinations vac 84 | On dea.location = vac.location 85 | and dea.date = vac.date 86 | where dea.continent is not null 87 | group by dea.continent, dea.location, dea.date, dea.population 88 | order by 1,2,3 89 | 90 | 91 | 92 | 93 | -- 2. 94 | Select SUM(new_cases) as total_cases, SUM(cast(new_deaths as int)) as total_deaths, SUM(cast(new_deaths as int))/SUM(New_Cases)*100 as DeathPercentage 95 | From PortfolioProject..CovidDeaths 96 | --Where location like '%states%' 97 | where continent is not null 98 | --Group By date 99 | order by 1,2 100 | 101 | 102 | -- Just a double check based on the data provided 103 | -- numbers are extremely close so we will keep them - the second includes the "International" Location 104 | 105 | 106 | --Select SUM(new_cases) as total_cases, SUM(cast(new_deaths as int)) as total_deaths, SUM(cast(new_deaths as int))/SUM(New_Cases)*100 as DeathPercentage 107 | --From PortfolioProject..CovidDeaths 108 | ----Where location like '%states%' 109 | --where location = 'World' 110 | ----Group By date 111 | --order by 1,2 112 | 113 | 114 | -- 3. 115 | 116 | -- We take these out as they are not included in the above queries and we want to stay consistent 117 | -- European Union is part of Europe 118 | 119 | Select location, SUM(cast(new_deaths as int)) as TotalDeathCount 120 | From PortfolioProject..CovidDeaths 121 | --Where location like '%states%' 122 | Where continent is null 123 | and location not in ('World', 'European Union', 'International') 124 | Group by location 125 | order by TotalDeathCount desc 126 | 127 | 128 | 129 | -- 4. 130 | 131 | Select Location, Population, MAX(total_cases) as HighestInfectionCount, Max((total_cases/population))*100 as PercentPopulationInfected 132 | From PortfolioProject..CovidDeaths 133 | --Where location like '%states%' 134 | Group by Location, Population 135 | order by PercentPopulationInfected desc 136 | 137 | 138 | 139 | -- 5. 140 | 141 | --Select Location, date, total_cases,total_deaths, (total_deaths/total_cases)*100 as DeathPercentage 142 | --From PortfolioProject..CovidDeaths 143 | ----Where location like '%states%' 144 | --where continent is not null 145 | --order by 1,2 146 | 147 | -- took the above query and added population 148 | Select Location, date, population, total_cases, total_deaths 149 | From PortfolioProject..CovidDeaths 150 | --Where location like '%states%' 151 | where continent is not null 152 | order by 1,2 153 | 154 | 155 | -- 6. 
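-- (The rolling vaccination CTE from the exploration script, reused here with the calculation surfaced as PercentPeopleVaccinated)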
156 | 157 | 158 | With PopvsVac (Continent, Location, Date, Population, New_Vaccinations, RollingPeopleVaccinated) 159 | as 160 | ( 161 | Select dea.continent, dea.location, dea.date, dea.population, vac.new_vaccinations 162 | , SUM(CONVERT(int,vac.new_vaccinations)) OVER (Partition by dea.Location Order by dea.location, dea.Date) as RollingPeopleVaccinated 163 | --, (RollingPeopleVaccinated/population)*100 164 | From PortfolioProject..CovidDeaths dea 165 | Join PortfolioProject..CovidVaccinations vac 166 | On dea.location = vac.location 167 | and dea.date = vac.date 168 | where dea.continent is not null 169 | --order by 2,3 170 | ) 171 | Select *, (RollingPeopleVaccinated/Population)*100 as PercentPeopleVaccinated 172 | From PopvsVac 173 | 174 | 175 | -- 7. 176 | 177 | Select Location, Population,date, MAX(total_cases) as HighestInfectionCount, Max((total_cases/population))*100 as PercentPopulationInfected 178 | From PortfolioProject..CovidDeaths 179 | --Where location like '%states%' 180 | Group by Location, Population, date 181 | order by PercentPopulationInfected desc 182 | 183 | 184 | 185 | 186 | -------------------------------------------------------------------------------- /app.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | from pathlib import Path 3 | 4 | import pandas as pd 5 | import plotly.express as px 6 | from faicons import icon_svg 7 | from shinywidgets import render_plotly 8 | from state_choices import STATE_CHOICES 9 | 10 | from shiny import reactive 11 | from shiny.express import input, render, ui 12 | 13 | # --------------------------------------------------------------------- 14 | # Reading in Files 15 | # --------------------------------------------------------------------- 16 | new_listings_df = pd.read_csv(Path(__file__).parent / "Metro_new_listings_uc_sfrcondo_sm_month.csv") 17 | median_listing_price_df = pd.read_csv(Path(__file__).parent / "Metro_mlp_uc_sfrcondo_sm_month.csv") 18 | for_sale_inventory_df = pd.read_csv(Path(__file__).parent / "Metro_invt_fs_uc_sfrcondo_sm_month.csv") 19 | 20 | 21 | # --------------------------------------------------------------------- 22 | # Helper functions - converting to DateTime 23 | # --------------------------------------------------------------------- 24 | def string_to_date(date_str): 25 | return datetime.strptime(date_str, "%Y-%m-%d").date() 26 | 27 | 28 | def filter_by_date(df: pd.DataFrame,date_range: tuple): 29 | rng = sorted(date_range) 30 | dates = pd.to_datetime(df["Date"], format="%Y-%m-%d").dt.date 31 | return df[(dates >= rng[0]) & (dates <= rng[1])] 32 | 33 | 34 | # --------------------------------------------------------------------- 35 | # Visualizations 36 | # --------------------------------------------------------------------- 37 | 38 | #for_sale_inventory_df2 = for_sale_inventory_df["StateName"].fillna("United States") 39 | #for_sale_inventory_df2 = for_sale_inventory_df["StateName"].drop_duplicates() 40 | #for_sale_inventory_df2 = for_sale_inventory_df2.sort_values().tolist() 41 | 42 | 43 | ui.page_opts(title= "US Housing App") 44 | 45 | 46 | 47 | 48 | 49 | with ui.sidebar(): 50 | ui.input_select("state","Filter by State", choices=STATE_CHOICES), 51 | ui.input_slider("date_range","Filter by Date Range", 52 | min = string_to_date("2018-3-31"), 53 | max = string_to_date("2024-4-30"), 54 | value = [string_to_date(x) for x in ["2018-3-31","2024-4-30"]]) 55 | 56 | with ui.layout_column_wrap(): 57 | with ui.value_box(showcase = 
icon_svg("dollar-sign")): 58 | "Current Median List Price" 59 | 60 | @render.ui 61 | def price(): 62 | date_columns = median_listing_price_df.columns[6:] 63 | states = median_listing_price_df.groupby("StateName").mean(numeric_only=True) 64 | dates = states[date_columns].reset_index() 65 | states = dates.melt(id_vars=["StateName"], var_name="Date", value_name="Value") 66 | country = median_listing_price_df[median_listing_price_df["RegionType"] == "country"] 67 | country_dates = country[date_columns].reset_index() 68 | country_dates["StateName"] = "United States" 69 | country = country_dates.melt( 70 | id_vars=["StateName"], var_name="Date", value_name="Value" 71 | ) 72 | 73 | res = pd.concat([states, country]) 74 | 75 | res = res[res["Date"] != "index"] 76 | 77 | df = res[res["StateName"] == input.state()] 78 | 79 | last_value = df.iloc[-1,-1] 80 | return f"${last_value:,.0f}" 81 | 82 | with ui.value_box(showcase = icon_svg("house")): 83 | "Home Inventory % Change" 84 | @render.ui 85 | def change(): 86 | date_columns = median_listing_price_df.columns[6:] 87 | states = median_listing_price_df.groupby("StateName").mean(numeric_only=True) 88 | dates = states[date_columns].reset_index() 89 | states = dates.melt(id_vars=["StateName"], var_name="Date", value_name="Value") 90 | country = median_listing_price_df[median_listing_price_df["RegionType"] == "country"] 91 | country_dates = country[date_columns].reset_index() 92 | country_dates["StateName"] = "United States" 93 | country = country_dates.melt( 94 | id_vars=["StateName"], var_name="Date", value_name="Value" 95 | ) 96 | 97 | res = pd.concat([states, country]) 98 | 99 | res = res[res["Date"] != "index"] 100 | 101 | df = res[res["StateName"] == input.state()] 102 | 103 | last_value = df.iloc[-1,-1] 104 | second_last_value = df.iloc[-2,-1] 105 | 106 | percentage_change = ((last_value - second_last_value)/second_last_value *100) 107 | sign = "+" if percentage_change > 0 else "-" 108 | return f"{sign}{percentage_change:.2f}%" 109 | 110 | # Plotly visualization of Median Home Price Per State 111 | 112 | with ui.navset_card_underline(title = "Median List Price"): 113 | 114 | with ui.nav_panel("Plot", icon = icon_svg("chart-line")): 115 | 116 | @render_plotly 117 | def list_price_plot(): 118 | # Grouping by State Name and specifying the Date Columns 119 | price_grouped = median_listing_price_df.groupby('StateName').mean(numeric_only=True) 120 | date_columns = median_listing_price_df.columns[6:] 121 | price_grouped_dates = price_grouped[date_columns].reset_index() 122 | price_df_for_viz = price_grouped_dates.melt(id_vars=["StateName"], var_name="Date", value_name="Value") 123 | 124 | price_df_for_viz = filter_by_date(price_df_for_viz, input.date_range()) 125 | 126 | if input.state() == "United States": 127 | df = price_df_for_viz 128 | else: 129 | df = price_df_for_viz[price_df_for_viz["StateName"] == input.state()] 130 | 131 | 132 | # Creating Visualization using Ployly 133 | fig = px.line(df, x="Date", y="Value", color="StateName") 134 | fig.update_xaxes(title_text="") 135 | fig.update_yaxes(title_text="") 136 | return fig 137 | with ui.nav_panel("Table", icon = icon_svg("table")): 138 | @render.data_frame 139 | def list_price_data(): 140 | if input.state() == "United States": 141 | df = median_listing_price_df 142 | else: 143 | df = median_listing_price_df[median_listing_price_df["StateName"] == input.state()] 144 | return render.DataGrid(df) 145 | 146 | # Plotly visualization of Homes For Sale Per State 147 | 148 | with 
ui.navset_card_underline(title = "Home Inventory"): 149 | 150 | with ui.nav_panel("Plot", icon = icon_svg("chart-line")): 151 | @render_plotly 152 | def for_sale_plot(): 153 | # Grouping by State Name and specifying the Date Columns 154 | for_sale_grouped = for_sale_inventory_df.groupby('StateName').sum(numeric_only=True) 155 | date_columns = for_sale_inventory_df.columns[6:] 156 | for_sale_grouped_grouped_dates = for_sale_grouped[date_columns].reset_index() 157 | for_sale_df_for_viz = for_sale_grouped_grouped_dates.melt(id_vars=["StateName"], var_name="Date", value_name="Value") 158 | 159 | 160 | for_sale_df_for_viz = filter_by_date(for_sale_df_for_viz, input.date_range()) 161 | 162 | if input.state() == "United States": 163 | df = for_sale_df_for_viz 164 | else: 165 | df = for_sale_df_for_viz[for_sale_df_for_viz["StateName"] == input.state()] 166 | 167 | # Creating Visualization using Plotly 168 | fig = px.line(df, x="Date", y="Value", color="StateName") 169 | fig.update_xaxes(title_text="") 170 | fig.update_yaxes(title_text="") 171 | return fig 172 | with ui.nav_panel("Table", icon = icon_svg("table")): 173 | @render.data_frame 174 | def for_sale_data(): 175 | if input.state() == "United States": 176 | df = for_sale_inventory_df 177 | else: 178 | df = for_sale_inventory_df[for_sale_inventory_df["StateName"] == input.state()] 179 | return render.DataGrid(df) 180 | 181 | # Plotly visualization of Listings Per State 182 | 183 | with ui.navset_card_underline(title = "New Listings"): 184 | 185 | with ui.nav_panel("Plot", icon = icon_svg("chart-line")): 186 | 187 | @render_plotly 188 | def listings_plot(): 189 | # Grouping by State Name and specifying the Date Columns 190 | new_listings_grouped = new_listings_df.groupby('StateName').sum(numeric_only=True) 191 | date_columns = new_listings_df.columns[6:] 192 | new_listings_grouped_dates = new_listings_grouped[date_columns].reset_index() 193 | new_listings_df_for_viz = new_listings_grouped_dates.melt(id_vars=["StateName"], var_name="Date", value_name="Value") 194 | 195 | new_listings_df_for_viz = filter_by_date(new_listings_df_for_viz, input.date_range()) 196 | 197 | if input.state() == "United States": 198 | df = new_listings_df_for_viz 199 | else: 200 | df = new_listings_df_for_viz[new_listings_df_for_viz["StateName"] == input.state()] 201 | 202 | 203 | # Creating Visualization using Plotly 204 | fig = px.line(df, x="Date", y="Value", color="StateName") 205 | fig.update_xaxes(title_text="") 206 | fig.update_yaxes(title_text="") 207 | return fig 208 | 209 | with ui.nav_panel("Table", icon = icon_svg("table")): 210 | @render.data_frame 211 | def listings_data(): 212 | if input.state() == "United States": 213 | df = new_listings_df 214 | else: 215 | df = new_listings_df[new_listings_df["StateName"] == input.state()] 216 | return render.DataGrid(df) 217 | -------------------------------------------------------------------------------- /app_starting_code.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | from pathlib import Path 3 | 4 | import pandas as pd 5 | import plotly.express as px 6 | from faicons import icon_svg 7 | from shinywidgets import render_plotly 8 | from state_choices import STATE_CHOICES 9 | 10 | from shiny import reactive 11 | from shiny.express import input, render, ui 12 | 13 | # --------------------------------------------------------------------- 14 | # Reading in Files 15 | # --------------------------------------------------------------------- 16 | 
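# Note: the three CSVs below follow Zillow Research's metro-level monthly naming
# (new listings, median list price, and for-sale inventory) and are assumed to sit
# in the same folder as this script.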
new_listings_df = pd.read_csv(Path(__file__).parent / "Metro_new_listings_uc_sfrcondo_sm_month.csv") 17 | median_listing_price_df = pd.read_csv(Path(__file__).parent / "Metro_mlp_uc_sfrcondo_sm_month.csv") 18 | for_sale_inventory_df = pd.read_csv(Path(__file__).parent / "Metro_invt_fs_uc_sfrcondo_sm_month.csv") 19 | 20 | 21 | # --------------------------------------------------------------------- 22 | # Helper functions - converting to DateTime 23 | # --------------------------------------------------------------------- 24 | def string_to_date(date_str): 25 | return datetime.strptime(date_str, "%Y-%m-%d").date() 26 | 27 | 28 | def filter_by_date(df: pd.DataFrame, date_range: tuple): 29 | rng = sorted(date_range) 30 | dates = pd.to_datetime(df["Date"], format="%Y-%m-%d").dt.date 31 | return df[(dates >= rng[0]) & (dates <= rng[1])] 32 | 33 | 34 | # --------------------------------------------------------------------- 35 | # Visualizations 36 | # --------------------------------------------------------------------- 37 | 38 | # Plotly visualization of Median Home Price Per State 39 | def list_price_plot(): 40 | # Grouping by State Name and specifying the Date Columns 41 | price_grouped = median_listing_price_df.groupby('StateName').mean(numeric_only=True) 42 | date_columns = median_listing_price_df.columns[6:] 43 | price_grouped_dates = price_grouped[date_columns].reset_index() 44 | price_df_for_viz = price_grouped_dates.melt(id_vars=["StateName"], var_name="Date", value_name="Value") 45 | # Creating Visualization using Plotly 46 | fig = px.line(price_df_for_viz, x="Date", y="Value", color="StateName") 47 | fig.update_xaxes(title_text="") 48 | fig.update_yaxes(title_text="") 49 | return fig 50 | 51 | 52 | # Plotly visualization of Homes For Sale Per State 53 | def for_sale_plot(): 54 | # Grouping by State Name and specifying the Date Columns 55 | df2_grouped = for_sale_inventory_df.groupby('StateName').sum(numeric_only=True) 56 | date_columns = for_sale_inventory_df.columns[6:] 57 | df2_grouped_dates = df2_grouped[date_columns].reset_index() 58 | df2_melted = df2_grouped_dates.melt(id_vars=["StateName"], var_name="Date", value_name="Value") 59 | # Creating Visualization using Plotly 60 | df = df2_melted # for_sale_filtered() is not defined in this starter file; plot the melted frame directly 61 | fig = px.line(df, x="Date", y="Value", color="StateName") 62 | fig.update_xaxes(title_text="") 63 | fig.update_yaxes(title_text="") 64 | return fig 65 | 66 | # Plotly visualization of Listings Per State 67 | def listings_plot(): 68 | # Grouping by State Name and specifying the Date Columns 69 | df3_grouped = new_listings_df.groupby('StateName').sum(numeric_only=True) 70 | date_columns = new_listings_df.columns[6:] 71 | df3_grouped_dates = df3_grouped[date_columns].reset_index() 72 | df3_melted = df3_grouped_dates.melt(id_vars=["StateName"], var_name="Date", value_name="Value") 73 | # Creating Visualization using Plotly 74 | df = df3_melted # listings_filtered() is not defined in this starter file; plot the melted frame directly 75 | fig = px.line(df, x="Date", y="Value", color="StateName") 76 | fig.update_xaxes(title_text="") 77 | fig.update_yaxes(title_text="") 78 | return fig 79 | 80 | 81 | 82 | -------------------------------------------------------------------------------- /us-housing-app.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlexTheAnalyst/PortfolioProjects/39c541bae76eb109652e8d834b0fa2aa3f15fd8b/us-housing-app.zip --------------------------------------------------------------------------------