├── Figures
├── readme
├── Fig1.vsd
├── Fig3.jpg
├── Fig4.jpg
├── Fig5.png
├── Fig7.png
├── Fig8.jpg
├── LSTM.jpg
├── Fig 6.png
├── Fig 9.png
├── Fig 3-4.docx
├── Figure 1.vsdx
└── Picture 1.jpg
├── Statistical result
├── Readme
├── Graphs.spv
├── Output01.spv
├── Results.docx
└── Correlation and Regression.spv
├── Implementation
├── Dataset
│ ├── readme
│ └── data_cases1.csv
├── readme
├── 4_29_2020Cov_ID_Dashboard_.ipynb
├── 5_3_2020_Covid_Prophet_Final.ipynb
└── 5_13_2020_Covid_LSTM.ipynb
└── README.md
/Figures/readme:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/Statistical result/Readme:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/Implementation/Dataset/readme:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Machine-Learning-Driven-Approach-for-2019-nCoV-Warning-System
--------------------------------------------------------------------------------
/Figures/Fig1.vsd:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dr-mushtaq/Machine-Learning-Driven-Approach-for-2019-nCoV-Warning-System/master/Figures/Fig1.vsd
--------------------------------------------------------------------------------
/Figures/Fig3.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dr-mushtaq/Machine-Learning-Driven-Approach-for-2019-nCoV-Warning-System/master/Figures/Fig3.jpg
--------------------------------------------------------------------------------
/Figures/Fig4.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dr-mushtaq/Machine-Learning-Driven-Approach-for-2019-nCoV-Warning-System/master/Figures/Fig4.jpg
--------------------------------------------------------------------------------
/Figures/Fig5.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dr-mushtaq/Machine-Learning-Driven-Approach-for-2019-nCoV-Warning-System/master/Figures/Fig5.png
--------------------------------------------------------------------------------
/Figures/Fig7.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dr-mushtaq/Machine-Learning-Driven-Approach-for-2019-nCoV-Warning-System/master/Figures/Fig7.png
--------------------------------------------------------------------------------
/Figures/Fig8.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dr-mushtaq/Machine-Learning-Driven-Approach-for-2019-nCoV-Warning-System/master/Figures/Fig8.jpg
--------------------------------------------------------------------------------
/Figures/LSTM.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dr-mushtaq/Machine-Learning-Driven-Approach-for-2019-nCoV-Warning-System/master/Figures/LSTM.jpg
--------------------------------------------------------------------------------
/Figures/Fig 6.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dr-mushtaq/Machine-Learning-Driven-Approach-for-2019-nCoV-Warning-System/master/Figures/Fig 6.png
--------------------------------------------------------------------------------
/Figures/Fig 9.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dr-mushtaq/Machine-Learning-Driven-Approach-for-2019-nCoV-Warning-System/master/Figures/Fig 9.png
--------------------------------------------------------------------------------
/Figures/Fig 3-4.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dr-mushtaq/Machine-Learning-Driven-Approach-for-2019-nCoV-Warning-System/master/Figures/Fig 3-4.docx
--------------------------------------------------------------------------------
/Figures/Figure 1.vsdx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dr-mushtaq/Machine-Learning-Driven-Approach-for-2019-nCoV-Warning-System/master/Figures/Figure 1.vsdx
--------------------------------------------------------------------------------
/Figures/Picture 1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dr-mushtaq/Machine-Learning-Driven-Approach-for-2019-nCoV-Warning-System/master/Figures/Picture 1.jpg
--------------------------------------------------------------------------------
/Statistical result/Graphs.spv:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dr-mushtaq/Machine-Learning-Driven-Approach-for-2019-nCoV-Warning-System/master/Statistical result/Graphs.spv
--------------------------------------------------------------------------------
/Statistical result/Output01.spv:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dr-mushtaq/Machine-Learning-Driven-Approach-for-2019-nCoV-Warning-System/master/Statistical result/Output01.spv
--------------------------------------------------------------------------------
/Statistical result/Results.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dr-mushtaq/Machine-Learning-Driven-Approach-for-2019-nCoV-Warning-System/master/Statistical result/Results.docx
--------------------------------------------------------------------------------
/Implementation/Dataset/data_cases1.csv:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dr-mushtaq/Machine-Learning-Driven-Approach-for-2019-nCoV-Warning-System/master/Implementation/Dataset/data_cases1.csv
--------------------------------------------------------------------------------
/Statistical result/Correlation and Regression.spv:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dr-mushtaq/Machine-Learning-Driven-Approach-for-2019-nCoV-Warning-System/master/Statistical result/Correlation and Regression.spv
--------------------------------------------------------------------------------
/Implementation/readme:
--------------------------------------------------------------------------------
1 | https://www.youtube.com/watch?v=zlFKgMnaDXk&feature=youtu.be&fbclid=IwAR3vSH3rUeeQRvhxSzADMlqK4CY06GaSbJSaeq-mAYfHXZb9E0M8Dlor4wU
2 | https://www.youtube.com/watch?v=BKceuStrbPo
3 | Python Project – COVID-19 Spread Analysis with Flask
4 | https://data-flair.training/blogs/covid-19-spread-analysis-python/
5 | VISUALIZING & PREDICTING CORONA CASES – LATEST AI PROJECT
6 | https://aihubprojects.com/visualizing-predicting-corona-cases/?fbclid=IwAR2Nhi1mhrTQHuUNQIfDgruUD8I2Dar55HvBpf1bSCI5BQ0G3fICotI28WE
7 | https://aihubprojects.com/visualizing-predicting-corona-cases/
8 | Detect Covid-19 with phone records
9 | https://www.youtube.com/watch?v=J_OmBva8_RA&feature=youtu.be
10 | 5 Covid-19 Projects with Python and Machine Learning
11 | https://thecleverprogrammer.com/2020/11/21/covid-19-projects-with-python-and-machine-learning/?fbclid=IwAR1OQsn_T10YloV0fMcQDBJCa4gauxmKGwq19w4-KRhDgFhexhNUSP97MKM
12 | Attempting to model COVID-related deaths
13 | https://towardsdatascience.com/attempting-to-model-covid-related-deaths-3333045feffa
14 | Machine Learning Project on Covid-19 Cases Prediction with Python
15 | https://thecleverprogrammer.com/2020/11/29/covid-19-cases-prediction-with-python/
16 | https://github.com/SeanPLeary/time-series-h2o-automl-example?fbclid=IwAR2pfRMurZRp_x4Y7ULt1xyQ8Y9wo6YngtAtKQQylhPIleta8Jj-JaOA0lg
17 | 10 Machine Learning Projects on Time Series Forecasting
18 | https://amankharwal.medium.com/10-machine-learning-projects-on-time-series-forecasting-ee0368420ccd
19 | Covid or just a Cough? AI for detecting COVID-19 from Cough Sounds
20 | https://www.kdnuggets.com/2020/12/covid-cough-ai-detecting-sounds.html?fbclid=IwAR3rvlRVAver2FwqyNRO2P6RKQXpl3TfKN5hyLhBCfqsVfVUIsuFDl_Phc8
21 | Develop A Covid-19 Live Web App With Python Django
22 | https://www.udemy.com/course/develop-a-covid-19-live-web-app-with-python-django/learn/lecture/21573382#overview
23 | AI, Machine Learning Tools Help Predict COVID-19 Outcomes
24 | https://healthitanalytics.com/news/ai-machine-learning-tools-help-predict-covid-19-outcomes?fbclid=IwAR1agRS3KCjp67LQkT_Xf6Yo7QB2OInHMmKWBzGznHhKINFeAQSylgStXVs
25 | Time Series Analysis Real World Projects in Python
26 | https://www.udemy.com/course/time-series-analysis-real-world-projects-in-python/learn/lecture/24586578#overview
27 |
28 |
--------------------------------------------------------------------------------
/Implementation/4_29_2020Cov_ID_Dashboard_.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "nbformat": 4,
3 | "nbformat_minor": 0,
4 | "metadata": {
5 | "colab": {
6 | "name": "4-29-2020Cov-ID Dashboard .ipynb",
7 | "provenance": [],
8 | "collapsed_sections": []
9 | },
10 | "kernelspec": {
11 | "name": "python3",
12 | "display_name": "Python 3"
13 | }
14 | },
15 | "cells": [
16 | {
17 | "cell_type": "markdown",
18 | "metadata": {
19 | "id": "view-in-github",
20 | "colab_type": "text"
21 | },
22 | "source": [
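23 | "<a href=\"https://colab.research.google.com/github/dr-mushtaq/Machine-Learning-Driven-Approach-for-2019-nCoV-Warning-System/blob/master/Implementation/4_29_2020Cov_ID_Dashboard_.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"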
24 | ]
25 | },
26 | {
27 | "cell_type": "markdown",
28 | "metadata": {
29 | "id": "TMaTt-5yvUDc",
30 | "colab_type": "text"
31 | },
32 | "source": [
33 | "## Table of Contents\n",
34 | "\n",
35 | "<br>\n",
36 | "\n",
37 | "<br>\n",
38 | "1. Importing Needed Library \n",
39 | "2. Colab Upgradation \n",
40 | "3. Upload the Dataset \n",
41 | "4. Read Dataset \n",
42 | "5. Data preprocessing \n",
43 | "5.1 Dataset Editing \n",
44 | "5.2 Data Description and analysis \n",
45 | "5.3 Missing values \n",
46 | "5.4 Data Sorting \n",
47 | "5.5 Data Encoding into Categorical \n",
48 | "5.6 Data Normalization \n",
49 | "6. Data Plotting \n",
50 | " \n",
51 | "\n",
52 | "<br># New Section"
53 | ]
54 | },
55 | {
56 | "cell_type": "markdown",
57 | "metadata": {
58 | "id": "xbTL0mLS5XLo",
59 | "colab_type": "text"
60 | },
61 | "source": [
62 | "\n",
63 | "1. **Importing Needed Library** \n",
64 | "\n",
65 | "This program uses an artificial recurrent neural network called long short-term memory (LSTM)\n",
66 | " to predict next week's death cases for different states\n",
67 | " \n",
68 | "\n",
69 | "\n",
70 | "\n",
71 | "\n",
72 | "\n"
73 | ]
74 | },
75 | {
76 | "cell_type": "code",
77 | "metadata": {
78 | "id": "YSX92Kj0G665",
79 | "colab_type": "code",
80 | "colab": {}
81 | },
82 | "source": [
83 | "import matplotlib.pyplot as plt #6\n",
84 | "plt.style.use('fivethirtyeight') #7\n",
85 | "import math #1\n",
86 | "import pandas\n",
87 | "import pandas_datareader as web #2\n",
88 | "from sklearn.preprocessing import MinMaxScaler #3\n",
89 | "from keras.models import Sequential #4 \n",
90 | "from keras.layers import Dense, LSTM #5\n",
91 | "import pandas as pd # selected\n",
92 | "import pylab as pl\n",
93 | "import numpy as np\n",
94 | "import pandas as pd\n",
95 | "import os\n",
96 | "import numpy\n",
97 | "import scipy\n",
98 | "from sklearn.preprocessing import MinMaxScaler\n",
99 | "from sklearn.preprocessing import LabelEncoder\n",
100 | "from sklearn.tree import DecisionTreeClassifier # Import Decision Tree Classifier\n",
101 | "from sklearn.model_selection import train_test_split # Import train_test_split function\n",
102 | "from sklearn import metrics #Import scikit-learn metrics module for accuracy calculation\n",
103 | "from sklearn import preprocessing\n",
104 | "from sklearn.tree import export_graphviz\n",
105 | "from sklearn.externals.six import StringIO \n",
106 | "from IPython.display import Image \n",
107 | "import pydotplus\n",
108 | "from sklearn.ensemble import RandomForestClassifier\n",
109 | "from sklearn.metrics import confusion_matrix \n",
110 | "from sklearn.metrics import accuracy_score \n",
111 | "from sklearn.metrics import classification_report\n",
112 | "from sklearn.linear_model import LogisticRegression\n",
113 | "from sklearn.preprocessing import StandardScaler\n",
114 | "from pandas.plotting import scatter_matrix\n",
115 | "import seaborn as sns"
116 | ],
117 | "execution_count": 0,
118 | "outputs": []
119 | },
120 | {
121 | "cell_type": "code",
122 | "metadata": {
123 | "id": "HRdnMRskS8N1",
124 | "colab_type": "code",
125 | "colab": {}
126 | },
127 | "source": [
128 | "#[3]\n",
129 | "Enrol_window = 100"
130 | ],
131 | "execution_count": 0,
132 | "outputs": []
133 | },
134 | {
135 | "cell_type": "markdown",
136 | "metadata": {
137 | "id": "Rzl0FkXmK1AY",
138 | "colab_type": "text"
139 | },
140 | "source": [
141 | " 2.**Colab Upgradation** "
142 | ]
143 | },
144 | {
145 | "cell_type": "code",
146 | "metadata": {
147 | "id": "JGsk7x5Tna9a",
148 | "colab_type": "code",
149 | "colab": {}
150 | },
151 | "source": [
152 | "# Install the pinned TensorFlow version 1.14.0 inside Colab\n",
153 | "!pip install tensorflow==1.14.0"
154 | ],
155 | "execution_count": 0,
156 | "outputs": []
157 | },
158 | {
159 | "cell_type": "code",
160 | "metadata": {
161 | "id": "_Z53WTKdAj4t",
162 | "colab_type": "code",
163 | "colab": {}
164 | },
165 | "source": [
166 | "# fix random seed for reproducibility\n",
167 | "numpy.random.seed(7)"
168 | ],
169 | "execution_count": 0,
170 | "outputs": []
171 | },
172 | {
173 | "cell_type": "markdown",
174 | "metadata": {
175 | "id": "mSo57tJe6AaC",
176 | "colab_type": "text"
177 | },
178 | "source": [
179 | "3.**Upload the Dataset**\n",
180 | "\n",
181 | " This module is used to upload the dataset from the hard disk "
182 | ]
183 | },
184 | {
185 | "cell_type": "code",
186 | "metadata": {
187 | "id": "8o3x_F3IWbht",
188 | "colab_type": "code",
189 | "colab": {}
190 | },
191 | "source": [
192 | "# This code uploads the dataset from the local PC to Colab\n",
193 | "from google.colab import files # Please First run this cod in chrom \n",
194 | "def getLocalFiles():\n",
195 | " _files = files.upload() # upload StudentNextSessionf.csv datase\n",
196 | " if len(_files) >0: # Then run above libray \n",
197 | " for k,v in _files.items():\n",
198 | " open(k,'wb').write(v)\n",
199 | "getLocalFiles()"
200 | ],
201 | "execution_count": 0,
202 | "outputs": []
203 | },
204 | {
205 | "cell_type": "markdown",
206 | "metadata": {
207 | "id": "MP2UU-zE62DM",
208 | "colab_type": "text"
209 | },
210 | "source": [
211 | "\n",
212 | "4. **Read Dataset**\n"
213 | ]
214 | },
215 | {
216 | "cell_type": "code",
217 | "metadata": {
218 | "id": "zoh_zm51WnO1",
219 | "colab_type": "code",
220 | "colab": {}
221 | },
222 | "source": [
223 | "##https://www.pluralsight.com/guides/handling-categorical-data-in-machine-learning-models\n",
224 | "#https://machinelearningmastery.com/power-transform-time-series-forecast-data-python/\n",
225 | "#df = pd.read_excel (r'Path where the Excel file is stored\\File name.xlsx')\n",
226 | "#https://www.shanelynn.ie/select-pandas-dataframe-rows-and-columns-using-iloc-loc-and-ix/\n",
227 | "df = pd.read_csv(r\"data_cases.csv\", header=0,index_col=0) # [1]\n",
228 | "X = df.iloc[:,0:3]\n",
229 | "y = df.iloc[:,3]\n",
230 | "#df= web.DataReader('AAPL', data_source='yahoo', start='2012', end='2019-12-17') [1]"
231 | ],
232 | "execution_count": 0,
233 | "outputs": []
234 | },
235 | {
236 | "cell_type": "code",
237 | "metadata": {
238 | "id": "vJRX-_KKmAgY",
239 | "colab_type": "code",
240 | "colab": {}
241 | },
242 | "source": [
243 | "# [2]\n",
244 | "df = pandas.read_csv('Xdata.csv',usecols=['Date','State','Recovered','Deaths'])\n",
245 | "#dataframe.drop(dataframe.index[[16452,16453]])\n",
246 | "X= df.values\n",
247 | "len(X)"
248 | ],
249 | "execution_count": 0,
250 | "outputs": []
251 | },
252 | {
253 | "cell_type": "code",
254 | "metadata": {
255 | "id": "xRz5xQGHRWrB",
256 | "colab_type": "code",
257 | "colab": {}
258 | },
259 | "source": [
260 | "df = pd.read_csv('Xdata.csv', index_col='Date', parse_dates=['Date'])\n",
261 | "df.head() # [3]"
262 | ],
263 | "execution_count": 0,
264 | "outputs": []
265 | },
266 | {
267 | "cell_type": "code",
268 | "metadata": {
269 | "id": "B8QEfc2Evza1",
270 | "colab_type": "code",
271 | "colab": {}
272 | },
273 | "source": [
274 | "# !wget https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_19-covid-Confirmed.csv\n",
275 | "!gdown --id 1AsfdLrGESCQnRW5rbMz56A1KBc3Fe5aV # [10] read dataset from google drive\n"
276 | ],
277 | "execution_count": 0,
278 | "outputs": []
279 | },
280 | {
281 | "cell_type": "code",
282 | "metadata": {
283 | "id": "t6rDOZpClbZB",
284 | "colab_type": "code",
285 | "colab": {}
286 | },
287 | "source": [
288 | "#Removing unwanted Columns from the Data Frame [16]\n",
289 | "data = data.drop('Date',axis=1) \n",
290 | "data = data.drop('Adj Close',axis = 1)\n",
291 | "print('\\n\\nData after removing Date and Adj Close : ')\n",
292 | "print(data.head())"
293 | ],
294 | "execution_count": 0,
295 | "outputs": []
296 | },
297 | {
298 | "cell_type": "markdown",
299 | "metadata": {
300 | "id": "Pl4_Vcie85p5",
301 | "colab_type": "text"
302 | },
303 | "source": [
304 | "\n",
305 | "\n",
306 | "5.**Data preprocessing**\n",
307 | "\n",
308 | "\n"
309 | ]
310 | },
311 | {
312 | "cell_type": "markdown",
313 | "metadata": {
314 | "id": "pT3ekeKYCYfr",
315 | "colab_type": "text"
316 | },
317 | "source": [
318 | " **5.1 Dataset Editing**"
319 | ]
320 | },
321 | {
322 | "cell_type": "code",
323 | "metadata": {
324 | "id": "swOcYkYrCRSj",
325 | "colab_type": "code",
326 | "colab": {}
327 | },
328 | "source": [
329 | " d3=dataframe2.iloc[:-1] # [4]"
330 | ],
331 | "execution_count": 0,
332 | "outputs": []
333 | },
334 | {
335 | "cell_type": "code",
336 | "metadata": {
337 | "id": "cj3AyzLZF3R2",
338 | "colab_type": "code",
339 | "colab": {}
340 | },
341 | "source": [
342 | "# Convert the dataframe to a numpy array\n",
343 | "dataset=data.values"
344 | ],
345 | "execution_count": 0,
346 | "outputs": []
347 | },
348 | {
349 | "cell_type": "code",
350 | "metadata": {
351 | "id": "KB9b4xdOrH59",
352 | "colab_type": "code",
353 | "colab": {}
354 | },
355 | "source": [
356 | "df = df.iloc[:, 4:] # drop the first four columns, keep the remaining ones [10]"
357 | ],
358 | "execution_count": 0,
359 | "outputs": []
360 | },
361 | {
362 | "cell_type": "code",
363 | "metadata": {
364 | "id": "9FnbVCtWRTzP",
365 | "colab_type": "code",
366 | "colab": {}
367 | },
368 | "source": [
369 | "#removing all rows with zero values on their confirmed field[13]\n",
370 | "df = df[df[\"Confirmed\"] > 0]\n",
371 | "df.describe()"
372 | ],
373 | "execution_count": 0,
374 | "outputs": []
375 | },
376 | {
377 | "cell_type": "code",
378 | "metadata": {
379 | "id": "CB-4-4CHxa_o",
380 | "colab_type": "code",
381 | "colab": {}
382 | },
383 | "source": [
384 | "#Split data into predictors and target [14]\n",
385 | "concrete_data_columns = concrete_data.columns\n",
386 | "predictors = concrete_data[concrete_data_columns[concrete_data_columns != 'Strength']] # all columns except Strength\n",
387 | "target = concrete_data['Strength'] # Strength column"
388 | ],
389 | "execution_count": 0,
390 | "outputs": []
391 | },
392 | {
393 | "cell_type": "markdown",
394 | "metadata": {
395 | "id": "SaGc_loUBjRH",
396 | "colab_type": "text"
397 | },
398 | "source": [
399 | "5.2 **Data Description and analysis**"
400 | ]
401 | },
402 | {
403 | "cell_type": "code",
404 | "metadata": {
405 | "id": "YjBH1EXroPmh",
406 | "colab_type": "code",
407 | "colab": {}
408 | },
409 | "source": [
410 | "df.describe() #[2]"
411 | ],
412 | "execution_count": 0,
413 | "outputs": []
414 | },
415 | {
416 | "cell_type": "code",
417 | "metadata": {
418 | "id": "WBEKOpF-pjQ3",
419 | "colab_type": "code",
420 | "colab": {}
421 | },
422 | "source": [
423 | "df.columns #[2] show colums in dataset "
424 | ],
425 | "execution_count": 0,
426 | "outputs": []
427 | },
428 | {
429 | "cell_type": "code",
430 | "metadata": {
431 | "id": "Gqyio6sxpx2C",
432 | "colab_type": "code",
433 | "colab": {}
434 | },
435 | "source": [
436 | "df.astype(str)# [2] Convert dataset into string "
437 | ],
438 | "execution_count": 0,
439 | "outputs": []
440 | },
441 | {
442 | "cell_type": "code",
443 | "metadata": {
444 | "id": "zF6e2mtMErz5",
445 | "colab_type": "code",
446 | "colab": {}
447 | },
448 | "source": [
449 | "df.shape # Dimension of datasets"
450 | ],
451 | "execution_count": 0,
452 | "outputs": []
453 | },
454 | {
455 | "cell_type": "code",
456 | "metadata": {
457 | "id": "dpko9Z7cE31t",
458 | "colab_type": "code",
459 | "colab": {}
460 | },
461 | "source": [
462 | "df.head(10) # we will see first 10 row of the dataset"
463 | ],
464 | "execution_count": 0,
465 | "outputs": []
466 | },
467 | {
468 | "cell_type": "code",
469 | "metadata": {
470 | "id": "OuRp5yD64AGa",
471 | "colab_type": "code",
472 | "colab": {}
473 | },
474 | "source": [
475 | "daily_cases.shape # Test [10]"
476 | ],
477 | "execution_count": 0,
478 | "outputs": []
479 | },
480 | {
481 | "cell_type": "code",
482 | "metadata": {
483 | "id": "EixOwc_oQVnk",
484 | "colab_type": "code",
485 | "colab": {}
486 | },
487 | "source": [
488 | "#Top 10 Countries by Confirmed Cases, Deaths & Recoveries [13]\n",
489 | "grouped_df = df.groupby(\"County\").max()"
490 | ],
491 | "execution_count": 0,
492 | "outputs": []
493 | },
494 | {
495 | "cell_type": "code",
496 | "metadata": {
497 | "id": "oxpg7mg9X-OR",
498 | "colab_type": "code",
499 | "colab": {}
500 | },
501 | "source": [
502 | "# Confirmed cases[13]\n",
503 | "print(grouped_df.sort_values(\"Confirmed\", ascending=False)[\"Confirmed\"][:10])"
504 | ],
505 | "execution_count": 0,
506 | "outputs": []
507 | },
508 | {
509 | "cell_type": "code",
510 | "metadata": {
511 | "id": "nLT33ParYdAV",
512 | "colab_type": "code",
513 | "colab": {}
514 | },
515 | "source": [
516 | "# Deaths [13]\n",
517 | "print(grouped_df.sort_values(\"Deaths\", ascending=False)[\"Deaths\"][:10])"
518 | ],
519 | "execution_count": 0,
520 | "outputs": []
521 | },
522 | {
523 | "cell_type": "code",
524 | "metadata": {
525 | "id": "L7-2_F6qYvZO",
526 | "colab_type": "code",
527 | "colab": {}
528 | },
529 | "source": [
530 | "# Recoveries\n",
531 | "print(grouped_df.sort_values(\"Recovered\", ascending=False)[\"Recovered\"][:10])"
532 | ],
533 | "execution_count": 0,
534 | "outputs": []
535 | },
536 | {
537 | "cell_type": "markdown",
538 | "metadata": {
539 | "colab_type": "text",
540 | "id": "udLxKyKMuWc6"
541 | },
542 | "source": [
543 | "**5.3 Missing Values**"
544 | ]
545 | },
546 | {
547 | "cell_type": "code",
548 | "metadata": {
549 | "id": "uDAulKRvqsCv",
550 | "colab_type": "code",
551 | "colab": {}
552 | },
553 | "source": [
554 | "### finding all columns that have nan:[2]\n",
555 | "droping_list_all=[]\n",
556 | "for j in range(1,4):\n",
557 | " if not df.iloc[:, j].notnull().all():\n",
558 | " droping_list_all.append(j) \n",
559 | " #print(df.iloc[:,j].unique())\n",
560 | "droping_list_all"
561 | ],
562 | "execution_count": 0,
563 | "outputs": []
564 | },
565 | {
566 | "cell_type": "code",
567 | "metadata": {
568 | "id": "wX6WfCfZq-wy",
569 | "colab_type": "code",
570 | "colab": {}
571 | },
572 | "source": [
573 | "# filling nan with mean in any columns [2]\n",
574 | "for j in range(2,4): \n",
575 | " df.iloc[:,j]=df.iloc[:,j].fillna(df.iloc[:,j].mean()) "
576 | ],
577 | "execution_count": 0,
578 | "outputs": []
579 | },
580 | {
581 | "cell_type": "code",
582 | "metadata": {
583 | "id": "8OxXAMDvrPZV",
584 | "colab_type": "code",
585 | "colab": {}
586 | },
587 | "source": [
588 | "# another sanity check to make sure that there are no more NaN values\n",
589 | "df.isnull().sum()\n"
590 | ],
591 | "execution_count": 0,
592 | "outputs": []
593 | },
594 | {
595 | "cell_type": "code",
596 | "metadata": {
597 | "id": "wGPHMsWg8yU-",
598 | "colab_type": "code",
599 | "colab": {}
600 | },
601 | "source": [
602 | "#Count the Null Columns [14]\n",
603 | "train = pd.read_csv(\"train.csv\")\n",
604 | "null_columns=train.columns[train.isnull().any()]\n",
605 | "train[null_columns].isnull().sum()"
606 | ],
607 | "execution_count": 0,
608 | "outputs": []
609 | },
610 | {
611 | "cell_type": "code",
612 | "metadata": {
613 | "id": "PDELWczZ9CvV",
614 | "colab_type": "code",
615 | "colab": {}
616 | },
617 | "source": [
618 | "#Single Column Is Null [14]\n",
619 | "print(train[train[\"Electrical\"].isnull()][null_columns])"
620 | ],
621 | "execution_count": 0,
622 | "outputs": []
623 | },
624 | {
625 | "cell_type": "code",
626 | "metadata": {
627 | "id": "r25aSrQ_9Lyd",
628 | "colab_type": "code",
629 | "colab": {}
630 | },
631 | "source": [
632 | "#All Null Columns[14]\n",
633 | "print(train[train.isnull().any(axis=1)][null_columns].head())"
634 | ],
635 | "execution_count": 0,
636 | "outputs": []
637 | },
638 | {
639 | "cell_type": "markdown",
640 | "metadata": {
641 | "id": "PyPgneFu_v7J",
642 | "colab_type": "text"
643 | },
644 | "source": [
645 | "**5.4 Data Sorting**"
646 | ]
647 | },
648 | {
649 | "cell_type": "code",
650 | "metadata": {
651 | "id": "2EYw6AtRAXPt",
652 | "colab_type": "code",
653 | "colab": {}
654 | },
655 | "source": [
656 | "df=df.sort_values('Date') #[4]"
657 | ],
658 | "execution_count": 0,
659 | "outputs": []
660 | },
661 | {
662 | "cell_type": "code",
663 | "metadata": {
664 | "id": "fONM5PbRxJPP",
665 | "colab_type": "code",
666 | "colab": {}
667 | },
668 | "source": [
669 | "# testing [10]\n",
670 | "daily_cases = df.sum(axis=0)\n",
671 | "daily_cases.index = pd.to_datetime(daily_cases.index)\n",
672 | "daily_cases.head()"
673 | ],
674 | "execution_count": 0,
675 | "outputs": []
676 | },
677 | {
678 | "cell_type": "markdown",
679 | "metadata": {
680 | "id": "bTghoeL4C_Cn",
681 | "colab_type": "text"
682 | },
683 | "source": [
684 | " *5.5* **Data Encoding into Categorical**"
685 | ]
686 | },
687 | {
688 | "cell_type": "code",
689 | "metadata": {
690 | "id": "2OWLnFO3fgoE",
691 | "colab_type": "code",
692 | "colab": {}
693 | },
694 | "source": [
695 | "#=============================================================>Encoding categorical data\n",
696 | "#https://discuss.analyticsvidhya.com/t/error-could-not-convert-string-to-float-while-running-randomforest-model-in-python/4855/2\n",
697 | "#https://medium.com/@contactsunny/label-encoder-vs-one-hot-encoder-in-machine-learning-3fc273365621\n",
698 | "##https://www.pluralsight.com/guides/handling-categorical-data-in-machine-learning-models\n",
699 | "feature_cols = ['Student ID', 'final_result','num_of_prev_attempts','Age','highest_education','Region','Gender','disability','date_submitted','studied_credits','score','sum_click']\n",
700 | "from sklearn.preprocessing import LabelEncoder\n",
701 | "labelencoder_X = LabelEncoder()\n",
702 | "X[:,0] = labelencoder_X.fit_transform(X[:,0])\n",
703 | "X[:,1] = labelencoder_X.fit_transform(X[:,1])\n",
704 | "X[:,2] = labelencoder_X.fit_transform(X[:,2])\n",
705 | "X[:,3] = labelencoder_X.fit_transform(X[:,3])\n",
706 | "X[:,4] = labelencoder_X.fit_transform(X[:,4])\n",
707 | "X[:,5] = labelencoder_X.fit_transform(X[:,5])\n",
708 | "X[:,6] = labelencoder_X.fit_transform(X[:,6])\n",
709 | "X[:,7] = labelencoder_X.fit_transform(X[:,7])\n",
710 | "X[:,8] = labelencoder_X.fit_transform(X[:,8])\n",
711 | "X[:,9] = labelencoder_X.fit_transform(X[:,9])\n",
712 | "X[:,10] = labelencoder_X.fit_transform(X[:,10])\n",
713 | "X[:,11] = labelencoder_X.fit_transform(X[:,11])"
714 | ],
715 | "execution_count": 0,
716 | "outputs": []
717 | },
718 | {
719 | "cell_type": "code",
720 | "metadata": {
721 | "id": "GUWjXWnMEQK7",
722 | "colab_type": "code",
723 | "colab": {}
724 | },
725 | "source": [
726 | "# for classification we need to divide our target variable into categories\n",
727 | "from keras.utils import to_categorical\n",
728 | "y_train = to_categorical(y_train)"
729 | ],
730 | "execution_count": 0,
731 | "outputs": []
732 | },
733 | {
734 | "cell_type": "code",
735 | "metadata": {
736 | "id": "cmR5mrIhEQIH",
737 | "colab_type": "code",
738 | "colab": {}
739 | },
740 | "source": [
741 | ""
742 | ],
743 | "execution_count": 0,
744 | "outputs": []
745 | },
746 | {
747 | "cell_type": "markdown",
748 | "metadata": {
749 | "id": "5F8biWQz4mq5",
750 | "colab_type": "text"
751 | },
752 | "source": [
753 | "*5.6* **Data Normalization**"
754 | ]
755 | },
756 | {
757 | "cell_type": "code",
758 | "metadata": {
759 | "id": "a-k0DTjiz2Qq",
760 | "colab_type": "code",
761 | "colab": {}
762 | },
763 | "source": [
764 | "# Normalize the data by substracting the mean and dividing by the standard deviation [14]\n",
765 | "# Data normaliz for regression model\n",
766 | "concrete_data_columns = concrete_data.columns\n",
767 | "predictors = concrete_data[concrete_data_columns[concrete_data_columns != 'Strength']] # all columns except Strength\n",
768 | "target = concrete_data['Strength'] # Strength column"
769 | ],
770 | "execution_count": 0,
771 | "outputs": []
772 | },
773 | {
774 | "cell_type": "markdown",
775 | "metadata": {
776 | "id": "ooJN9wu2tYa4",
777 | "colab_type": "text"
778 | },
779 | "source": [
780 | "**5.0 Statistical Analysis**"
781 | ]
782 | },
783 | {
784 | "cell_type": "code",
785 | "metadata": {
786 | "id": "X2eXsqXYpm8b",
787 | "colab_type": "code",
788 | "colab": {}
789 | },
790 | "source": [
791 | "# Correlations among columns [2]\n",
792 | "plt.matshow(df.corr(method='spearman'),vmax=1,vmin=-1,cmap='PRGn')\n",
793 | "plt.title('without resampling', size=15)\n",
794 | "plt.colorbar()\n",
795 | "plt.show()"
796 | ],
797 | "execution_count": 0,
798 | "outputs": []
799 | },
800 | {
801 | "cell_type": "code",
802 | "metadata": {
803 | "id": "oZwUcDjSEIih",
804 | "colab_type": "code",
805 | "colab": {}
806 | },
807 | "source": [
808 | "# normalize the dataset\n",
809 | "scaler = MinMaxScaler(feature_range=(0, 1))\n",
810 | "dataset = scaler.fit_transform(dataset)"
811 | ],
812 | "execution_count": 0,
813 | "outputs": []
814 | },
815 | {
816 | "cell_type": "markdown",
817 | "metadata": {
818 | "id": "3dmLdRYDB84U",
819 | "colab_type": "text"
820 | },
821 | "source": [
822 | "6. **Data Plotting**\n"
823 | ]
824 | },
825 | {
826 | "cell_type": "code",
827 | "metadata": {
828 | "id": "eFzAWsNfBWKD",
829 | "colab_type": "code",
830 | "colab": {}
831 | },
832 | "source": [
833 | "# ploting [4]\n",
834 | "plt.figure(figsize = (18,9))\n",
835 | "plt.plot(range(df.shape[0]),(df['Recovered']+df['Deaths'])/2.0)\n",
836 | "plt.xticks(range(0,df.shape[0],500),df['Date'].loc[::500],rotation=45)\n",
837 | "plt.xlabel('Date',fontsize=18)\n",
838 | "plt.ylabel('Death and Recoverd',fontsize=18)\n",
839 | "plt.show()\n"
840 | ],
841 | "execution_count": 0,
842 | "outputs": []
843 | },
844 | {
845 | "cell_type": "code",
846 | "metadata": {
847 | "id": "0MOK018gFf-Y",
848 | "colab_type": "code",
849 | "colab": {}
850 | },
851 | "source": [
852 | "df.hist(column='Deaths', bins=50) # PLOT histogram from dataset \n",
853 | "plt.show ()"
854 | ],
855 | "execution_count": 0,
856 | "outputs": []
857 | },
858 | {
859 | "cell_type": "markdown",
860 | "metadata": {
861 | "id": "KFu9c2SfLDRN",
862 | "colab_type": "text"
863 | },
864 | "source": [
865 | ""
866 | ]
867 | },
868 | {
869 | "cell_type": "code",
870 | "metadata": {
871 | "colab_type": "code",
872 | "id": "NWno337-z89f",
873 | "colab": {}
874 | },
875 | "source": [
876 | "# plot diagram (1): overlay one KDE curve per column of the global df; isLog=True plots log1p of the values instead.\n",
877 | "def plot_features_distribution(features, title,isLog=False): \n",
878 | "    plt.figure(figsize=(12,6))\n",
879 | "    plt.title(title)\n",
880 | "    for feature in features:\n",
881 | "        if(isLog):\n",
882 | "            sns.distplot(np.log1p(df[feature]),kde=True,hist=False, bins=120, label=feature)\n",
883 | "        else:\n",
884 | "            sns.distplot(df[feature],kde=True,hist=False, bins=120, label=feature)\n",
885 | "    plt.xlabel('')  # the curves share one axis, so no single column name fits as the x label\n",
886 | "    plt.legend()\n",
887 | "    plt.show()"
888 | ],
889 | "execution_count": 0,
890 | "outputs": []
891 | },
892 | {
893 | "cell_type": "code",
894 | "metadata": {
895 | "colab_type": "code",
896 | "id": "67xm9epV0JJC",
897 | "colab": {}
898 | },
899 | "source": [
900 | "# plot diagram (2): bar chart of the 30 most frequent values of df[feature], each bar annotated with its share of all rows.\n",
901 | "def plot_count(feature, title,size=1,df=df):\n",
902 | "    f, ax = plt.subplots(1,1, figsize=(4*size,4))\n",
903 | "    total = float(len(df))\n",
904 | "    g = sns.countplot(x=df[feature], order = df[feature].value_counts().index[:30], palette='Set3', ax=ax)  # pass the column as x=: newer seaborn no longer treats a positional Series as x\n",
905 | "    g.set_title(\"Number and percentage of {}\".format(title))\n",
906 | "    if(size > 2):\n",
907 | "        plt.xticks(rotation=90, size=8)  # wide charts have many categories, so turn the tick labels\n",
908 | "    for p in ax.patches:\n",
909 | "        height = p.get_height()\n",
910 | "        ax.text(p.get_x()+p.get_width()/2.,\n",
911 | "                height + 3,\n",
912 | "                '{:1.2f}%'.format(100*height/total),\n",
913 | "                ha=\"center\") \n",
914 | "    plt.show() "
915 | ],
916 | "execution_count": 0,
917 | "outputs": []
918 | },
919 | {
920 | "cell_type": "code",
921 | "metadata": {
922 | "colab_type": "code",
923 | "id": "WtUMzmIl0YXh",
924 | "colab": {}
925 | },
926 | "source": [
927 | "# plot diagram (3)\n",
928 | "plot_count('State','Confirmed')"
929 | ],
930 | "execution_count": 0,
931 | "outputs": []
932 | },
933 | {
934 | "cell_type": "code",
935 | "metadata": {
936 | "id": "PvIrPaXRUi2i",
937 | "colab_type": "code",
938 | "colab": {}
939 | },
940 | "source": [
941 | "# Bar Chart [7,8]: plot only the first 57 rows so the bars stay readable.\n",
942 | "df_bar = df[:57]  # separate name: reassigning df here would truncate it for every later cell\n",
943 | "df_bar.plot('State',['Recovered','Deaths'],kind = 'bar') # kind may be 'bar', 'line', 'box', ...\n"
944 | ],
945 | "execution_count": 0,
946 | "outputs": []
947 | },
948 | {
949 | "cell_type": "code",
950 | "metadata": {
951 | "id": "7QSi_2eVjC_I",
952 | "colab_type": "code",
953 | "colab": {}
954 | },
955 | "source": [
956 | "# [9]\n",
957 | "from matplotlib import pyplot\n",
958 | "pyplot.figure(1)\n",
959 | "# line plot\n",
960 | "pyplot.subplot(211)\n",
961 | "pyplot.plot(Date)\n",
962 | "# histogram\n",
963 | "pyplot.subplot(212)\n",
964 | "pyplot.hist(Deate)\n",
965 | "pyplot.show()"
966 | ],
967 | "execution_count": 0,
968 | "outputs": []
969 | },
970 | {
971 | "cell_type": "code",
972 | "metadata": {
973 | "id": "EhqXHGV0Lgcq",
974 | "colab_type": "code",
975 | "colab": {}
976 | },
977 | "source": [
978 | "#https://www.datacamp.com/courses/preprocessing-for-machine-learning-in-python\n",
979 | "#https://www.geeksforgeeks.org/data-preprocessing-machine-learning-python/\n",
980 | "#https://towardsdatascience.com/introduction-to-data-preprocessing-in-machine-learning-a9fa83a5dc9d\n",
981 | "#https://machinelearningmastery.com/prepare-data-machine-learning-python-scikit-learn/\n",
982 | "#https://medium.com/data-py-blog/data-preprocessing-for-python-2ab52cbc0edd\n",
983 | "#https://medium.com/datadriveninvestor/data-preprocessing-for-machine-learning-188e9eef1d2c"
984 | ],
985 | "execution_count": 0,
986 | "outputs": []
987 | },
988 | {
989 | "cell_type": "markdown",
990 | "metadata": {
991 | "id": "H7r5nhwzKdFT",
992 | "colab_type": "text"
993 | },
994 | "source": [
995 | "7. **Feature Importance**\n",
996 | " "
997 | ]
998 | },
999 | {
1000 | "cell_type": "code",
1001 | "metadata": {
1002 | "id": "ngVFSyIiKz8u",
1003 | "colab_type": "code",
1004 | "colab": {}
1005 | },
1006 | "source": [
1007 | "#How to Calculate Feature Importance With Python\n",
1008 | "#https://machinelearningmastery.com/calculate-feature-importance-with-python/?fbclid=IwAR0uvlTGLX2qDJ8bu78fo8HQxq2msGX6mbu9QIQzw9YdUPKeZrOB8Mf4_44\n",
1009 | "#https://machinelearningmastery.com/calculate-feature-importance-with-python/\n",
1010 | "from sklearn.datasets import make_regression\n",
1011 | "from sklearn.ensemble import RandomForestRegressor\n",
1012 | "from matplotlib import pyplot\n",
1013 | "# x1=atop(X)  # NOTE(review): `atop` is not defined anywhere visible and x1 is never used; disabled so the cell can run - confirm\n",
1014 | "model = RandomForestRegressor()\n",
1015 | "# fit the model (X and y must already exist in the notebook session)\n",
1016 | "model.fit(X, y)"
1017 | ],
1018 | "execution_count": 0,
1019 | "outputs": []
1020 | },
1021 | {
1022 | "cell_type": "markdown",
1023 | "metadata": {
1024 | "id": "YQDUyjWsDbZo",
1025 | "colab_type": "text"
1026 | },
1027 | "source": [
1028 | "8. **Data Splitting**"
1029 | ]
1030 | },
1031 | {
1032 | "cell_type": "code",
1033 | "metadata": {
1034 | "id": "prGtyX2iaBLU",
1035 | "colab_type": "code",
1036 | "colab": {}
1037 | },
1038 | "source": [
1039 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1) # 70% training and 30% test\n"
1040 | ],
1041 | "execution_count": 0,
1042 | "outputs": []
1043 | },
1044 | {
1045 | "cell_type": "code",
1046 | "metadata": {
1047 | "id": "wdHiejJ2R5b7",
1048 | "colab_type": "code",
1049 | "colab": {}
1050 | },
1051 | "source": [
1052 | "# [3]\n",
1053 | "feature_train, label_train, feature_test, label_test = load_data(df, 'Deaths', Enrol_window, True)\n"
1054 | ],
1055 | "execution_count": 0,
1056 | "outputs": []
1057 | },
1058 | {
1059 | "cell_type": "code",
1060 | "metadata": {
1061 | "id": "_L_zdec1SPoH",
1062 | "colab_type": "code",
1063 | "colab": {}
1064 | },
1065 | "source": [
1066 | "def load_data(datasetname, column, seq_len, normalise_window):\n",
1067 | "    # Prepare RNN/LSTM/GRU data: cut `column` into sliding windows of seq_len inputs + 1 target and return [x_train, y_train, x_test, y_test].\n",
1068 | "    data = datasetname.loc[:,column].values  # raw array: windows are then indexed by position, never by DataFrame index label\n",
1069 | "\n",
1070 | "    sequence_length = seq_len + 1\n",
1071 | "    result = []\n",
1072 | "    for index in range(len(data) - sequence_length):\n",
1073 | "        result.append(data[index: index + sequence_length])\n",
1074 | "\n",
1075 | "    if normalise_window:\n",
1076 | "        #result = sc.fit_transform(result)\n",
1077 | "        result = normalise_windows(result)\n",
1078 | "\n",
1079 | "    result = np.array(result)\n",
1080 | "    #Last 10% is used for validation test, first 90% for training\n",
1081 | "    row = round(0.9 * result.shape[0])\n",
1082 | "    train = result[:int(row), :]\n",
1083 | "    np.random.shuffle(train)  # shuffles the training windows only; the test windows keep their time order\n",
1084 | "    x_train = train[:, :-1]\n",
1085 | "    y_train = train[:, -1]\n",
1086 | "    x_test = result[int(row):, :-1]\n",
1087 | "    y_test = result[int(row):, -1]\n",
1088 | "\n",
1089 | "    x_train = np.reshape(x_train, (x_train.shape[0], x_train.shape[1], 1))\n",
1090 | "    x_test = np.reshape(x_test, (x_test.shape[0], x_test.shape[1], 1)) \n",
1091 | "\n",
1092 | "    return [x_train, y_train, x_test, y_test]\n"
1093 | ],
1094 | "execution_count": 0,
1095 | "outputs": []
1096 | },
1097 | {
1098 | "cell_type": "code",
1099 | "metadata": {
1100 | "id": "5kJQYO1ZSYuG",
1101 | "colab_type": "code",
1102 | "colab": {}
1103 | },
1104 | "source": [
1105 | "# [3]\n",
1106 | "def normalise_windows(window_data):\n",
1107 | "    # Rescale every window to its relative change from the window's first value (p / first - 1); a first value of 0 raises ZeroDivisionError.\n",
1108 | "    normalised_data = []\n",
1109 | "    for window in window_data:\n",
1110 | "        values = [float(p) for p in window]  # plain list: values[0] is positional even when window is a pandas slice whose labels do not start at 0\n",
1111 | "        normalised_data.append([(p / values[0]) - 1 for p in values])\n",
1112 | "    return normalised_data"
1113 | ],
1114 | "execution_count": 0,
1115 | "outputs": []
1116 | },
1117 | {
1118 | "cell_type": "code",
1119 | "metadata": {
1120 | "id": "U_tdcIaGq32o",
1121 | "colab_type": "code",
1122 | "colab": {}
1123 | },
1124 | "source": [
1125 | "# create the training dataset\n",
1126 | "# create the scaled training dataset\n",
1127 | "train_data=scaled_data[0:training_data_len,:]\n",
1128 | "# split the data into x_train and y_train dataset\n",
1129 | "x_train = []\n",
1130 | "y_train = []\n",
1131 | "\n",
1132 | "for i in range (50, len(train_data)):\n",
1133 | " x_train.append(train_data[i-50:i,0])\n",
1134 | " y_train.append(train_data[i,0])\n",
1135 | " if i<= 51:\n",
1136 | " print(x_train)\n",
1137 | " print(y_train)\n",
1138 | " print()"
1139 | ],
1140 | "execution_count": 0,
1141 | "outputs": []
1142 | },
1143 | {
1144 | "cell_type": "code",
1145 | "metadata": {
1146 | "id": "v7l1O6TJaxr3",
1147 | "colab_type": "code",
1148 | "colab": {}
1149 | },
1150 | "source": [
1151 | "# Convert the x_train and y_train to numpy arrays\n",
1152 | "x_train, y_train = np.array(x_train), np.array(y_train)\n",
1153 | "x_train"
1154 | ],
1155 | "execution_count": 0,
1156 | "outputs": []
1157 | },
1158 | {
1159 | "cell_type": "code",
1160 | "metadata": {
1161 | "id": "On-pcXAqcJGG",
1162 | "colab_type": "code",
1163 | "colab": {}
1164 | },
1165 | "source": [
1166 | "# Reshap the data\n",
1167 | "x_train=np.reshape(x_train, (x_train.shape[0], x_train.shape[1],1))\n",
1168 | "x_train.shape"
1169 | ],
1170 | "execution_count": 0,
1171 | "outputs": []
1172 | },
1173 | {
1174 | "cell_type": "code",
1175 | "metadata": {
1176 | "id": "G2WbDQA3maFZ",
1177 | "colab_type": "code",
1178 | "colab": {}
1179 | },
1180 | "source": [
1181 | "#Split into train and test data [16]\n",
1182 | "data_X = data.loc[:,data.columns != 'Close' ]\n",
1183 | "data_Y = data['Close']\n",
1184 | "train_X, test_X, train_y,test_y = train_test_split(data_X,data_Y,test_size=0.25)\n",
1185 | "print('\\n\\nTraining Set')\n",
1186 | "print(train_X.head())\n",
1187 | "print(train_y.head())"
1188 | ],
1189 | "execution_count": 0,
1190 | "outputs": []
1191 | },
1192 | {
1193 | "cell_type": "markdown",
1194 | "metadata": {
1195 | "id": "SN9GM4UrD_W3",
1196 | "colab_type": "text"
1197 | },
1198 | "source": [
1199 | "9. **Building Models**"
1200 | ]
1201 | },
1202 | {
1203 | "cell_type": "markdown",
1204 | "metadata": {
1205 | "id": "MhZR8yOwETba",
1206 | "colab_type": "text"
1207 | },
1208 | "source": [
1209 | "9.1 **Decision Tree Models**"
1210 | ]
1211 | },
1212 | {
1213 | "cell_type": "code",
1214 | "metadata": {
1215 | "id": "tzSi_BST2TJh",
1216 | "colab_type": "code",
1217 | "colab": {}
1218 | },
1219 | "source": [
1220 | "#https://www.datacamp.com/community/tutorials/decision-tree-classification-python\n",
1221 | "# Create Decision Tree classifer object\n",
1222 | "clf = DecisionTreeClassifier(criterion=\"entropy\", max_depth = 4)\n",
1223 | "# Train Decision Tree Classifer\n",
1224 | "clf = clf.fit(X_train,y_train)\n",
1225 | "#Predict the response for test dataset\n",
1226 | "y_pred = clf.predict(X_test)\n",
1227 | "print(y_pred)\n"
1228 | ],
1229 | "execution_count": 0,
1230 | "outputs": []
1231 | },
1232 | {
1233 | "cell_type": "code",
1234 | "metadata": {
1235 | "id": "6T6ktFVYdd_A",
1236 | "colab_type": "code",
1237 | "colab": {}
1238 | },
1239 | "source": [
1240 | "# build the LSTM model\n",
1241 | "model=Sequential()\n",
1242 | "model.add(LSTM(40, return_sequences=True,input_shape=(x_train.shape[1],1)))\n",
1243 | "model.add(LSTM(40, return_sequences=False))\n",
1244 | "model.add(Dense(25))\n",
1245 | "model.add(Dense(1))"
1246 | ],
1247 | "execution_count": 0,
1248 | "outputs": []
1249 | },
1250 | {
1251 | "cell_type": "code",
1252 | "metadata": {
1253 | "id": "V8B53I7gnv3F",
1254 | "colab_type": "code",
1255 | "colab": {}
1256 | },
1257 | "source": [
1258 | "# compile the model\n",
1259 | "model.compile(optimizer='adam',loss='mean_squared_error')"
1260 | ],
1261 | "execution_count": 0,
1262 | "outputs": []
1263 | },
1264 | {
1265 | "cell_type": "code",
1266 | "metadata": {
1267 | "id": "asZwD5jhoXk9",
1268 | "colab_type": "code",
1269 | "colab": {}
1270 | },
1271 | "source": [
1272 | "# train the model\n",
1273 | "model.fit(x_train,y_train,batch_size=1,epochs=1)"
1274 | ],
1275 | "execution_count": 0,
1276 | "outputs": []
1277 | },
1278 | {
1279 | "cell_type": "code",
1280 | "metadata": {
1281 | "id": "qECXeVx1vy9h",
1282 | "colab_type": "code",
1283 | "colab": {}
1284 | },
1285 | "source": [
1286 | "test_data=scaled_data[training_data_len - 50:,:]  # start 50 rows early so the first 50-step window predicts row training_data_len, matching y_test\n",
1287 | "x_test= []\n",
1288 | "y_test = dataset[training_data_len:,:]\n",
1289 | "for i in range(50, len(test_data)):\n",
1290 | "    x_test.append(test_data[i -50:i,0])\n"
1291 | ],
1292 | "execution_count": 0,
1293 | "outputs": []
1294 | },
1295 | {
1296 | "cell_type": "code",
1297 | "metadata": {
1298 | "id": "jfuKDYyExdz5",
1299 | "colab_type": "code",
1300 | "colab": {}
1301 | },
1302 | "source": [
1303 | "x_test=np.array(x_test)"
1304 | ],
1305 | "execution_count": 0,
1306 | "outputs": []
1307 | },
1308 | {
1309 | "cell_type": "code",
1310 | "metadata": {
1311 | "id": "SM_5z8cExsSB",
1312 | "colab_type": "code",
1313 | "colab": {}
1314 | },
1315 | "source": [
1316 | "x_test=np.reshape(x_test,(x_test.shape[0], x_test.shape[1],1))"
1317 | ],
1318 | "execution_count": 0,
1319 | "outputs": []
1320 | },
1321 | {
1322 | "cell_type": "code",
1323 | "metadata": {
1324 | "id": "jk7qbLkYyjhq",
1325 | "colab_type": "code",
1326 | "colab": {}
1327 | },
1328 | "source": [
1329 | "predictions=model.predict(x_test)\n",
1330 | "predictions=scaler.inverse_transform(predictions)"
1331 | ],
1332 | "execution_count": 0,
1333 | "outputs": []
1334 | },
1335 | {
1336 | "cell_type": "code",
1337 | "metadata": {
1338 | "id": "VgZ1BGWyzVaj",
1339 | "colab_type": "code",
1340 | "colab": {}
1341 | },
1342 | "source": [
1343 | "rmse=np.sqrt(np.mean((predictions - y_test)**2))  # RMSE: square each residual, average, then take the root"
1344 | ],
1345 | "execution_count": 0,
1346 | "outputs": []
1347 | },
1348 | {
1349 | "cell_type": "code",
1350 | "metadata": {
1351 | "id": "28nSDcQAaPsr",
1352 | "colab_type": "code",
1353 | "colab": {}
1354 | },
1355 | "source": [
1356 | "predictions"
1357 | ],
1358 | "execution_count": 0,
1359 | "outputs": []
1360 | },
1361 | {
1362 | "cell_type": "markdown",
1363 | "metadata": {
1364 | "id": "DaIXIV8vPPzg",
1365 | "colab_type": "text"
1366 | },
1367 | "source": [
1368 | "9.2 **KNN Classifier** "
1369 | ]
1370 | },
1371 | {
1372 | "cell_type": "code",
1373 | "metadata": {
1374 | "id": "DQlL-JQYWe3e",
1375 | "colab_type": "code",
1376 | "colab": {}
1377 | },
1378 | "source": [
1379 | "from sklearn.neighbors import KNeighborsClassifier\n",
1380 | "k = 1\n",
1381 | "#Train Model and Predict \n",
1382 | "neigh = KNeighborsClassifier(n_neighbors = k).fit(X_train,y_train)\n",
1383 | "y_pred = neigh.predict(X_test)"
1384 | ],
1385 | "execution_count": 0,
1386 | "outputs": []
1387 | },
1388 | {
1389 | "cell_type": "code",
1390 | "metadata": {
1391 | "id": "sv3ZMg7qXBJ3",
1392 | "colab_type": "code",
1393 | "colab": {}
1394 | },
1395 | "source": [
1396 | "from sklearn import metrics\n",
1397 | "print(\"Train set Accuracy: \", metrics.accuracy_score(y_train, neigh.predict(X_train)))\n",
1398 | "print(\"Test set Accuracy: \", metrics.accuracy_score(y_test, y_pred))  # y_pred holds this model's test predictions; yhat is not assigned until the K-sweep cell\n"
1399 | ],
1400 | "execution_count": 0,
1401 | "outputs": []
1402 | },
1403 | {
1404 | "cell_type": "code",
1405 | "metadata": {
1406 | "id": "P14FdEkQXF1g",
1407 | "colab_type": "code",
1408 | "colab": {}
1409 | },
1410 | "source": [
1411 | "# We can calculate the accuracy of KNN for different Ks.\n",
1412 | "Ks = 10\n",
1413 | "mean_acc = np.zeros((Ks-1))\n",
1414 | "std_acc = np.zeros((Ks-1))\n",
1415 | "ConfustionMx = [];\n",
1416 | "for n in range(1,Ks):\n",
1417 | " \n",
1418 | " #Train Model and Predict \n",
1419 | " neigh = KNeighborsClassifier(n_neighbors = n).fit(X_train,y_train)\n",
1420 | " yhat=neigh.predict(X_test)\n",
1421 | " mean_acc[n-1] = metrics.accuracy_score(y_test, yhat)\n",
1422 | "\n",
1423 | " \n",
1424 | " std_acc[n-1]=np.std(yhat==y_test)/np.sqrt(yhat.shape[0])\n",
1425 | "\n",
1426 | "mean_acc\n"
1427 | ],
1428 | "execution_count": 0,
1429 | "outputs": []
1430 | },
1431 | {
1432 | "cell_type": "code",
1433 | "metadata": {
1434 | "id": "HGZJSorVXRzp",
1435 | "colab_type": "code",
1436 | "colab": {}
1437 | },
1438 | "source": [
1439 | "# Plot model accuracy for different numbers of neighbors, with a shaded band of one standard error around the curve\n",
1440 | "plt.plot(range(1,Ks),mean_acc,'g')\n",
1441 | "plt.fill_between(range(1,Ks),mean_acc - 1 * std_acc,mean_acc + 1 * std_acc, alpha=0.10)\n",
1442 | "plt.legend(('Accuracy ', '+/- 1xstd'))  # label matches the 1 * std_acc band drawn above\n",
1443 | "plt.ylabel('Accuracy ')\n",
1444 | "plt.xlabel('Number of Nabors (K)')\n",
1445 | "plt.tight_layout()\n",
1446 | "plt.show()"
1447 | ],
1448 | "execution_count": 0,
1449 | "outputs": []
1450 | },
1451 | {
1452 | "cell_type": "code",
1453 | "metadata": {
1454 | "id": "7qY94l-DXBIN",
1455 | "colab_type": "code",
1456 | "colab": {}
1457 | },
1458 | "source": [
1459 | "print( \"The best accuracy was with\", mean_acc.max(), \"with k=\", mean_acc.argmax()+1) "
1460 | ],
1461 | "execution_count": 0,
1462 | "outputs": []
1463 | },
1464 | {
1465 | "cell_type": "markdown",
1466 | "metadata": {
1467 | "id": "PMnJfVQOQTob",
1468 | "colab_type": "text"
1469 | },
1470 | "source": [
1471 | "9.3 **Logistic Regression**"
1472 | ]
1473 | },
1474 | {
1475 | "cell_type": "code",
1476 | "metadata": {
1477 | "id": "E3qQWXzV4H2m",
1478 | "colab_type": "code",
1479 | "colab": {}
1480 | },
1481 | "source": [
1482 | "#https://www.marktechpost.com/2019/06/12/logistic-regression-with-a-real-world-example-in-python/?fbclid=IwAR31FyvXdFxxWam-n6lCKmsBxA7m_MIHdrhwerqpqow1-V9dx2ZeQ_gq-s0\n",
1483 | "classifier = LogisticRegression(random_state=0)\n",
1484 | "classifier.fit(X_train, y_train)\n",
1485 | "y_pred = classifier.predict(X_test)\n",
1486 | "print(y_pred)\n"
1487 | ],
1488 | "execution_count": 0,
1489 | "outputs": []
1490 | },
1491 | {
1492 | "cell_type": "markdown",
1493 | "metadata": {
1494 | "id": "CJHwOa0KRB4X",
1495 | "colab_type": "text"
1496 | },
1497 | "source": [
1498 | "9.4 **Random Forest Classifier**"
1499 | ]
1500 | },
1501 | {
1502 | "cell_type": "code",
1503 | "metadata": {
1504 | "id": "gTZGi49RT3la",
1505 | "colab_type": "code",
1506 | "colab": {}
1507 | },
1508 | "source": [
1509 | "# =========================================================================> RandomForestClassifier================\n",
1510 | "#https://www.kaggle.com/willkoehrsen/visualize-a-decision-tree-w-python-scikit-learn\n",
1511 | "#https://www.kaggle.com/willkoehrsen/visualize-a-decision-tree-w-python-scikit-learn\n",
1512 | "# Limit max depth\n",
1513 | "model = RandomForestClassifier(max_depth = 3, n_estimators=12)\n",
1514 | "# Train\n",
1515 | "model.fit(X_train, y_train)\n",
1516 | "#Predict the response for test dataset\n",
1517 | "y_pred = model.predict(X_test)"
1518 | ],
1519 | "execution_count": 0,
1520 | "outputs": []
1521 | },
1522 | {
1523 | "cell_type": "code",
1524 | "metadata": {
1525 | "id": "BVb6yvuLLdVn",
1526 | "colab_type": "code",
1527 | "colab": {}
1528 | },
1529 | "source": [
1530 | "# Extract single tree from the depth-limited forest trained in the previous cell\n",
1531 | "estimator_limited = model.estimators_[5]\n",
1532 | "estimator_limited\n",
1533 | "# No max depth: refit without a depth limit so the two exported trees can be compared\n",
1534 | "model = RandomForestClassifier(max_depth = None, n_estimators=10)\n",
1535 | "model.fit(X_train, y_train)\n",
1536 | "estimator_nonlimited = model.estimators_[5]\n",
1537 | "\n",
1538 | "from sklearn.tree import export_graphviz\n",
1539 | "export_graphviz(estimator_limited, out_file='tree_limited.dot', feature_names =feature_cols,\n",
1540 | "                class_names = [str(c) for c in model.classes_],  # one name per class in sorted order, not the per-sample label vector y\n",
1541 | "                rounded = True, proportion = False, precision = 2, filled = True)"
1542 | ],
1543 | "execution_count": 0,
1544 | "outputs": []
1545 | },
1546 | {
1547 | "cell_type": "code",
1548 | "metadata": {
1549 | "id": "m5b3WT4hViYx",
1550 | "colab_type": "code",
1551 | "colab": {}
1552 | },
1553 | "source": [
1554 | "export_graphviz(estimator_nonlimited, out_file='tree_nonlimited.dot', feature_names = feature_cols,\n",
1555 | "                class_names = [str(c) for c in model.classes_],  # one name per class in sorted order, not the per-sample label vector y\n",
1556 | "                rounded = True, proportion = False, precision = 2, filled = True)"
1557 | ],
1558 | "execution_count": 0,
1559 | "outputs": []
1560 | },
1561 | {
1562 | "cell_type": "code",
1563 | "metadata": {
1564 | "id": "aXMsWMjgWnsm",
1565 | "colab_type": "code",
1566 | "colab": {}
1567 | },
1568 | "source": [
1569 | "#Convert to png from the command line\n",
1570 | "!dot -Tpng tree_limited.dot -o tree_limited.png -Gdpi=600"
1571 | ],
1572 | "execution_count": 0,
1573 | "outputs": []
1574 | },
1575 | {
1576 | "cell_type": "markdown",
1577 | "metadata": {
1578 | "id": "n284gqLAQzSb",
1579 | "colab_type": "text"
1580 | },
1581 | "source": [
1582 | "9.5 **ANN** "
1583 | ]
1584 | },
1585 | {
1586 | "cell_type": "code",
1587 | "metadata": {
1588 | "id": "ZaPXtuXvRbt9",
1589 | "colab_type": "code",
1590 | "colab": {}
1591 | },
1592 | "source": [
1593 | "#==============================================================================================================================================================\n",
1594 | "# Evaluating Model\n",
1595 | "#===============================================================================================================================================================\n",
1596 | "\n",
1597 | "#=============================================================> accuracy \n",
1598 | "print(\"Accuracy:\",metrics.accuracy_score(y_test, y_pred))"
1599 | ],
1600 | "execution_count": 0,
1601 | "outputs": []
1602 | },
1603 | {
1604 | "cell_type": "code",
1605 | "metadata": {
1606 | "id": "Sdd3fQaQWrNO",
1607 | "colab_type": "code",
1608 | "colab": {}
1609 | },
1610 | "source": [
1611 | "from IPython.display import Image\n",
1612 | "Image(filename = 'tree_limited.png')\n",
1613 | "\n",
1614 | "#======================================================== build CNN\n",
1615 | "#https://www.tensorflow.org/tutorials/estimators/cnn\n",
1616 | "import tensorflow as tf\n",
1617 | "def cnn_model_f(features,labels, mode):\n",
1618 | "    # Estimator model_fn: builds a two-conv-layer CNN and returns the EstimatorSpec for `mode`. Input layer: reshape features['x'] to [-1, 28, 28, 1].\n",
1619 | "    input_layer = tf.reshape(features[\"x\"], [-1, 28, 28, 1])\n",
1620 | "    # Convolutional Layer #1\n",
1621 | "    conv1 = tf.layers.conv2d(\n",
1622 | "        inputs=input_layer,\n",
1623 | "        filters=32,\n",
1624 | "        kernel_size=[5, 5],\n",
1625 | "        padding=\"same\",\n",
1626 | "        activation=tf.nn.relu)\n",
1627 | "    # Pooling Layer #1\n",
1628 | "    pool1 = tf.layers.max_pooling2d(inputs=conv1, pool_size=[2, 2], strides=2)\n",
1629 | "    # Convolutional Layer #2 and Pooling Layer #2\n",
1630 | "    conv2 = tf.layers.conv2d(\n",
1631 | "        inputs=pool1,\n",
1632 | "        filters=64,\n",
1633 | "        kernel_size=[5, 5],\n",
1634 | "        padding=\"same\",\n",
1635 | "        activation=tf.nn.relu)\n",
1636 | "    pool2 = tf.layers.max_pooling2d(inputs=conv2, pool_size=[2, 2], strides=2)\n",
1637 | "    # Dense Layer\n",
1638 | "    pool2_flat = tf.reshape(pool2, [-1, 7 * 7 * 64])\n",
1639 | "    dense = tf.layers.dense(inputs=pool2_flat, units=1024, activation=tf.nn.relu)\n",
1640 | "    dropout = tf.layers.dropout(\n",
1641 | "        inputs=dense, rate=0.4, training=mode == tf.estimator.ModeKeys.TRAIN)\n",
1642 | "    # Logits Layer\n",
1643 | "    logits = tf.layers.dense(inputs=dropout, units=10)\n",
1644 | "    predictions = {\n",
1645 | "        # Generate predictions (for PREDICT and EVAL mode)\n",
1646 | "        \"classes\": tf.argmax(input=logits, axis=1),\n",
1647 | "        # Add `softmax_tensor` to the graph. It is used for PREDICT and by the\n",
1648 | "        # `logging_hook`.\n",
1649 | "        \"probabilities\": tf.nn.softmax(logits, name=\"softmax_tensor\")\n",
1650 | "    }\n",
1651 | "    if mode == tf.estimator.ModeKeys.PREDICT:\n",
1652 | "        return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)\n",
1653 | "\n",
1654 | "    # Calculate Loss (for both TRAIN and EVAL modes)\n",
1655 | "    loss = tf.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits)\n",
1656 | "\n",
1657 | "    # Configure the Training Op (for TRAIN mode)\n",
1658 | "    if mode == tf.estimator.ModeKeys.TRAIN:\n",
1659 | "        optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.001)\n",
1660 | "        train_op = optimizer.minimize(\n",
1661 | "            loss=loss,\n",
1662 | "            global_step=tf.train.get_global_step())\n",
1663 | "        return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op)\n",
1664 | "\n",
1665 | "    # Add evaluation metrics (for EVAL mode)\n",
1666 | "    eval_metric_ops = {\n",
1667 | "        \"accuracy\": tf.metrics.accuracy(\n",
1668 | "            labels=labels, predictions=predictions[\"classes\"])\n",
1669 | "    }\n",
1670 | "    return tf.estimator.EstimatorSpec(\n",
1671 | "        mode=mode, loss=loss, eval_metric_ops=eval_metric_ops)"
1672 | ],
1673 | "execution_count": 0,
1674 | "outputs": []
1675 | },
1676 | {
1677 | "cell_type": "code",
1678 | "metadata": {
1679 | "id": "JdWT57wSmxP4",
1680 | "colab_type": "code",
1681 | "colab": {}
1682 | },
1683 | "source": [
1684 | "#==============================================================> Making the Confusion Matrix\n",
1685 | "# https://www.marktechpost.com/2019/06/12/logistic-regression-with-a-real-world-example-in-python/?fbclid=IwAR31FyvXdFxxWam-n6lCKmsBxA7m_MIHdrhwerqpqow1-V9dx2ZeQ_gq-s0\n",
1686 | "# https://www.geeksforgeeks.org/confusion-matrix-machine-learning/\n",
1687 | "#https://stackoverflow.com/questions/30746460/how-to-interpret-scikits-learn-confusion-matrix-and-classification-report\n",
1688 | "from sklearn.metrics import accuracy_score, classification_report, confusion_matrix\n",
1689 | "cm = confusion_matrix(y_test, y_pred)\n",
1690 | "print(y_test.size)\n",
1691 | "ACC = cm.trace() / cm.sum()  # correct predictions (diagonal) over all test samples, instead of a hard-coded sample count\n",
1692 | "print('Accuracy',ACC)\n",
1693 | "\n",
1694 | "print('Accuracy Score :',accuracy_score(y_test, y_pred)) \n",
1695 | "print('Report : ', classification_report(y_test, y_pred)) \n",
1696 | "print(cm)  # print the computed matrix, not the confusion_matrix function object\n",
1697 | "print('\\nConfussion matrix:\\n',cm)"
1698 | ],
1699 | "execution_count": 0,
1700 | "outputs": []
1701 | },
1702 | {
1703 | "cell_type": "code",
1704 | "metadata": {
1705 | "id": "FKhofH_7Rswy",
1706 | "colab_type": "code",
1707 | "colab": {}
1708 | },
1709 | "source": [
1710 | "# Visualizing Decision Trees\n",
1711 | "dot_data = StringIO()\n",
1712 | "export_graphviz(clf, out_file=dot_data, \n",
1713 | " filled=True, rounded=True,\n",
1714 | " special_characters=True,feature_names = feature_cols,class_names=['Not-submitted','Submitted'])\n",
1715 | "graph = pydotplus.graph_from_dot_data(dot_data.getvalue()) \n",
1716 | "graph.write_png('StudentNextSessionf.png')\n",
1717 | "Image(graph.create_png())"
1718 | ],
1719 | "execution_count": 0,
1720 | "outputs": []
1721 | },
1722 | {
1723 | "cell_type": "markdown",
1724 | "metadata": {
1725 | "id": "dDDTGM0Nkg1E",
1726 | "colab_type": "text"
1727 | },
1728 | "source": [
1729 | " **References**\n",
1730 | " \n",
1731 | "1. [1] Stock Price Prediction Using Python & Machine Learning\n",
1732 | " https://www.youtube.com/watch?v=QIUxPv5PJOY\n",
1733 | "\n",
1734 | "2. [2] Time-series data analysis using LSTM (Tutorial)\n",
1735 | "https://www.kaggle.com/amirrezaeian/time-series-data-analysis-using-lstm-tutorial\n",
1736 | "3. [3] Learn by example RNN/LSTM/GRU time series\n",
1737 | "https://www.kaggle.com/charel/learn-by-example-rnn-lstm-gru-time-series\n",
1738 | "4. [4] Stock Market Predictions with LSTM in Python\n",
1739 | "https://www.datacamp.com/community/tutorials/lstm-python-stock-market\n",
1740 | "5. [5] LSTM Time Series Prediction Tutorial using PyTorch in Python | Coronavirus Daily Cases Forecasting\n",
1741 | "https://www.youtube.com/watch?v=8A6TEjG2DNw\n",
1742 | "6. [6] LSTM Time Series Prediction Tutorial using PyTorch in Python | Coronavirus Daily Cases Forecasting\n",
1743 | "https://morioh.com/p/5a74f94cfd6b\n",
1744 | "7. [7] Pandas Dataframe: Plot Examples with Matplotlib and Pyplot\n",
1745 | "http://queirozf.com/entries/pandas-dataframe-plot-examples-with-matplotlib-pyplot\n",
1746 | "8. [8] Dataframe Visualization with Pandas Plot\n",
1747 | "https://kanoki.org/2019/09/16/dataframe-visualization-with-pandas-plot/\n",
1748 | "9. [9] How to Use Power Transforms for Time Series Forecast Data with Python\n",
1749 | "https://machinelearningmastery.com/power-transform-time-series-forecast-data-python/\n",
1750 | "10. [10] Time Series Forecasting with LSTMs for Daily Coronavirus Cases using PyTorch in Python\n",
1751 | "https://www.curiousily.com/posts/time-series-forecasting-with-lstm-for-daily-coronavirus-cases/\n",
1752 | "11. [11] Building COVID-19 interactive dashboard from Jupyter Notebook https://morioh.com/p/127b5a302cb1?fbclid=IwAR28dsncAPM184LyzEhJCMxsCx5im2X4XJp3gSULd9Tq70XTivsMH75piLg\n",
1753 | "12. [12] How to Calculate Precision, Recall, F1, and More for Deep Learning Models\n",
1754 | "https://machinelearningmastery.com/how-to-calculate-precision-recall-f1-and-more-for-deep-learning-models/?fbclid=IwAR30_eKuKXmIYbyiaMZdCDzhe9YPLaI-ATGjRj83U8mcjySuwikcaQt6Vfw\n",
1755 | "13. [13] Data ETL & Analysis on the global and Mexican datasets of the COVID-19 pandemic\n",
1756 | "https://github.com/PhantomInsights/covid-19\n",
1757 | "14. [14] Regression Models with Keras\n",
1758 | "https://colab.research.google.com/github/hussain0048/Deep-Learning-with-Keras/blob/master/DL0101EN-3-1-Regression-with-Keras-py-v1.0.ipynb#scrollTo=4ORZhKfduSrw\n",
1759 | "15. [15] Pandas: Find Rows Where Column/Field Is Null\n",
1760 | "https://dzone.com/articles/pandas-find-rows-where-columnfield-is-null\n",
1761 | "16. [16] Analysis and Predicting Stock Trends with Python\n",
1762 | "https://morioh.com/p/8e9d9d4161c5?f=5c21f93bc16e2556b555ab2f&fbclid=IwAR2oIByWVFfg-b7f1Phd3cZ84mwmp9LriMb0pobqdJa9TWV1-pK04YkHqH4\n",
1763 | "\n",
1764 | "\n",
1765 | "\n"
1766 | ]
1767 | }
1768 | ]
1769 | }
--------------------------------------------------------------------------------
/Implementation/5_3_2020_Covid_Prophet_Final.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "nbformat": 4,
3 | "nbformat_minor": 0,
4 | "metadata": {
5 | "kernelspec": {
6 | "display_name": "Python 3",
7 | "language": "python",
8 | "name": "python3"
9 | },
10 | "language_info": {
11 | "codemirror_mode": {
12 | "name": "ipython",
13 | "version": 3
14 | },
15 | "file_extension": ".py",
16 | "mimetype": "text/x-python",
17 | "name": "python",
18 | "nbconvert_exporter": "python",
19 | "pygments_lexer": "ipython3",
20 | "version": "3.6.6"
21 | },
22 | "colab": {
23 | "name": " 5-3-2020 Covid_Prophet_Final.ipynb",
24 | "provenance": [],
25 | "collapsed_sections": [],
26 | "toc_visible": true,
27 | "machine_shape": "hm",
28 | "include_colab_link": true
29 | }
30 | },
31 | "cells": [
32 | {
33 | "cell_type": "markdown",
34 | "metadata": {
35 | "id": "view-in-github",
36 | "colab_type": "text"
37 | },
38 | "source": [
39 | "
"
40 | ]
41 | },
42 | {
43 | "cell_type": "markdown",
44 | "metadata": {
45 | "id": "FfcjS4CKTyAP"
46 | },
47 | "source": [
48 | "# **1-Introduction** \n",
49 | "\n",
50 | "Predicting confirmed cases of COVID-19 with Prophet"
51 | ]
52 | },
53 | {
54 | "cell_type": "markdown",
55 | "metadata": {
56 | "id": "r609ISbsmq2t"
57 | },
58 | "source": [
59 | "# **2-Importing necessary libraries**"
60 | ]
61 | },
62 | {
63 | "cell_type": "code",
64 | "metadata": {
65 | "id": "FxdLWHIJTyAU"
66 | },
67 | "source": [
68 | "import pandas as pd\n",
69 | "import numpy as np\n",
70 | "from fbprophet import Prophet\n",
71 | "import matplotlib.pyplot as plt\n",
72 | " \n",
73 | "plt.rcParams['figure.figsize']=(20,10)\n",
74 | "plt.style.use('ggplot')"
75 | ],
76 | "execution_count": 2,
77 | "outputs": []
78 | },
79 | {
80 | "cell_type": "code",
81 | "metadata": {
82 | "id": "5NeSSztObv-7"
83 | },
84 | "source": [
85 | "plt.rcParams['figure.figsize']=(20,10)\n",
86 | "plt.style.use('ggplot')"
87 | ],
88 | "execution_count": 3,
89 | "outputs": []
90 | },
91 | {
92 | "cell_type": "code",
93 | "metadata": {
94 | "id": "XhadCtyRcFFY"
95 | },
96 | "source": [
97 | "pd.plotting.register_matplotlib_converters()"
98 | ],
99 | "execution_count": 4,
100 | "outputs": []
101 | },
102 | {
103 | "cell_type": "code",
104 | "metadata": {
105 | "id": "-vcGxjHQGL1c"
106 | },
107 | "source": [
108 | "#[3]\n",
109 | "import warnings\n",
110 | "warnings.filterwarnings('ignore')\n",
111 | "\n",
112 | "import numpy as np\n",
113 | "import pandas as pd\n",
114 | "from scipy import stats\n",
115 | "import statsmodels.api as sm\n",
116 | "import matplotlib.pyplot as plt\n",
117 | "\n",
118 | "%matplotlib inline"
119 | ],
120 | "execution_count": 5,
121 | "outputs": []
122 | },
123 | {
124 | "cell_type": "code",
125 | "metadata": {
126 | "id": "d17uQ_kpKn5w"
127 | },
128 | "source": [
129 | "from fbprophet import Prophet\n",
130 | "\n",
131 | "import logging\n",
132 | "logging.getLogger().setLevel(logging.ERROR)"
133 | ],
134 | "execution_count": 6,
135 | "outputs": []
136 | },
137 | {
138 | "cell_type": "markdown",
139 | "metadata": {
140 | "id": "X5p8oL_STyAg"
141 | },
142 | "source": [
143 | "# **3-Import dataset**"
144 | ]
145 | },
146 | {
147 | "cell_type": "code",
148 | "metadata": {
149 | "id": "4lsb_UGenA-Y",
150 | "outputId": "70bd3db4-797d-407c-88b1-0cf5182a48d2",
151 | "colab": {
152 | "base_uri": "https://localhost:8080/"
153 | }
154 | },
155 | "source": [
156 | "from google.colab import drive\n",
157 | "drive.mount('/content/drive')"
158 | ],
159 | "execution_count": 7,
160 | "outputs": [
161 | {
162 | "output_type": "stream",
163 | "text": [
164 | "Mounted at /content/drive\n"
165 | ],
166 | "name": "stdout"
167 | }
168 | ]
169 | },
170 | {
171 | "cell_type": "code",
172 | "metadata": {
173 | "id": "RDKPEa7keRWE"
174 | },
175 | "source": [
176 | "data = pd.read_csv(\"/content/drive/My Drive/Datasets/Covid-19/data_cases1.csv\", usecols=['Date','Confirmed'],index_col='Date', parse_dates=True )\n",
177 | "data.head()"
178 | ],
179 | "execution_count": null,
180 | "outputs": []
181 | },
182 | {
183 | "cell_type": "code",
184 | "metadata": {
185 | "id": "_qRu1k2Dyt2O"
186 | },
187 | "source": [
188 | "data.head()"
189 | ],
190 | "execution_count": null,
191 | "outputs": []
192 | },
193 | {
194 | "cell_type": "code",
195 | "metadata": {
196 | "id": "t-E75RQRTyAx"
197 | },
198 | "source": [
199 | "data.shape"
200 | ],
201 | "execution_count": null,
202 | "outputs": []
203 | },
204 | {
205 | "cell_type": "markdown",
206 | "metadata": {
207 | "id": "R8BthJfITyBS"
208 | },
209 | "source": [
210 | "# **4-Data cleaning and feature engineering**"
211 | ]
212 | },
213 | {
214 | "cell_type": "code",
215 | "metadata": {
216 | "id": "Ns2tk6CCGzDo"
217 | },
218 | "source": [
219 | "df = data.reset_index()"
220 | ],
221 | "execution_count": 11,
222 | "outputs": []
223 | },
224 | {
225 | "cell_type": "code",
226 | "metadata": {
227 | "id": "WGxkU_bZ_eoq"
228 | },
229 | "source": [
230 | "df.head()"
231 | ],
232 | "execution_count": null,
233 | "outputs": []
234 | },
235 | {
236 | "cell_type": "code",
237 | "metadata": {
238 | "id": "wiEfm8xA9Cb4"
239 | },
240 | "source": [
241 | "#we get rid of possible duplicates and missing values in the data [3]\n",
242 | "df = df[['Date', 'Confirmed']].dropna().drop_duplicates()"
243 | ],
244 | "execution_count": 13,
245 | "outputs": []
246 | },
247 | {
248 | "cell_type": "code",
249 | "metadata": {
250 | "id": "fqj24WXrFwbV",
251 | "outputId": "8bd45a73-a772-48f3-c96f-c05f27829a71",
252 | "colab": {
253 | "base_uri": "https://localhost:8080/"
254 | }
255 | },
256 | "source": [
257 | "df.shape"
258 | ],
259 | "execution_count": 14,
260 | "outputs": [
261 | {
262 | "output_type": "execute_result",
263 | "data": {
264 | "text/plain": [
265 | "(1157, 2)"
266 | ]
267 | },
268 | "metadata": {
269 | "tags": []
270 | },
271 | "execution_count": 14
272 | }
273 | ]
274 | },
275 | {
276 | "cell_type": "code",
277 | "metadata": {
278 | "id": "8oKf0uV2GG02"
279 | },
280 | "source": [
281 | "#we need to convert Date to the datetime format because by default pandas treats this field as string-valued[3]\n",
282 | "df['Date'] = pd.to_datetime(df['Date'])"
283 | ],
284 | "execution_count": 15,
285 | "outputs": []
286 | },
287 | {
288 | "cell_type": "code",
289 | "metadata": {
290 | "id": "ijc8tkqbGqpE"
291 | },
292 | "source": [
293 | "df.head()"
294 | ],
295 | "execution_count": null,
296 | "outputs": []
297 | },
298 | {
299 | "cell_type": "code",
300 | "metadata": {
301 | "id": "-cnFzmAAHIWq"
302 | },
303 | "source": [
304 | "#Let's sort the dataframe by time and take a look at what we've got [3]\n",
305 | "df.sort_values(by=['Date']).head(n=3)"
306 | ],
307 | "execution_count": null,
308 | "outputs": []
309 | },
310 | {
311 | "cell_type": "code",
312 | "metadata": {
313 | "id": "vegTUwkq9kw8"
314 | },
315 | "source": [
316 | "#We will just trim our time series to keep only those rows that fall onto the period from March 10, 2020 to March 31, 2020[3]\n",
317 | "df = df[(df['Date'] > '2020-03-09') & (df['Date'] < '2020-04-01')].sort_values(by=['Date'])\n",
318 | "df.head(n=10)"
319 | ],
320 | "execution_count": null,
321 | "outputs": []
322 | },
323 | {
324 | "cell_type": "code",
325 | "metadata": {
326 | "id": "rahdPkyK-81A"
327 | },
328 | "source": [
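329 | "#As we are going to predict the number of confirmed cases of Covid-19, we group the rows by date and count the non-null 'Confirmed' records at each point in time (note: count() gives the number of rows per date, not the sum of the 'Confirmed' values). We will name the corresponding new column Confirmed_Cases:\n",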
330 | "aggr_df = df.groupby('Date')[['Confirmed']].count()\n",
331 | "aggr_df.columns = ['Confirmed_Cases']"
332 | ],
333 | "execution_count": 19,
334 | "outputs": []
335 | },
336 | {
337 | "cell_type": "code",
338 | "metadata": {
339 | "id": "K-av4_rg_pfa"
340 | },
341 | "source": [
342 | "aggr_df.head(n=3)\n"
343 | ],
344 | "execution_count": null,
345 | "outputs": []
346 | },
347 | {
348 | "cell_type": "code",
349 | "metadata": {
350 | "id": "8WPfYYLU_6dD"
351 | },
352 | "source": [
353 | "#In this practice, we are interested in the number of confirmed cases a day. But at this moment all our data is divided into irregular time intervals that are less than a day. \n",
354 | "#This is called a sub-daily time series.To fix this, we need to aggregate the Cases counts by \"bins\" of a date size. \n",
355 | "#In time series analysis, this process is referred to as resampling. And if we reduce the sampling rate of data it is often called downsampling.\n",
356 | "#Luckily, pandas has a built-in functionality for this task. We will resample our time index down to 1-day bins [3]\n",
357 | "daily_df = aggr_df.resample('D').apply(sum)\n",
358 | "daily_df.head(n=3)"
359 | ],
360 | "execution_count": null,
361 | "outputs": []
362 | },
363 | {
364 | "cell_type": "code",
365 | "metadata": {
366 | "id": "9n0GTxkvAeq5"
367 | },
368 | "source": [
369 | "weekly_df = daily_df.resample('W').apply(sum)"
370 | ],
371 | "execution_count": 22,
372 | "outputs": []
373 | },
374 | {
375 | "cell_type": "code",
376 | "metadata": {
377 | "id": "SUR6GRnOJp5T"
378 | },
379 | "source": [
380 | "weekly_df.head()"
381 | ],
382 | "execution_count": null,
383 | "outputs": []
384 | },
385 | {
386 | "cell_type": "code",
387 | "metadata": {
388 | "id": "lVCbk-vsAz-n"
389 | },
390 | "source": [
391 | "daily_df = daily_df.loc[daily_df.index >= '2020-03-10']\n",
392 | "daily_df.head(n=3)"
393 | ],
394 | "execution_count": null,
395 | "outputs": []
396 | },
397 | {
398 | "cell_type": "code",
399 | "metadata": {
400 | "id": "ZQLlg3g4OvWh"
401 | },
402 | "source": [
403 | "plt.figure(figsize=(16,8))\n",
404 | "plt.title('Daily Confirmed Cases of COVID-19 (World)')\n",
405 | "plt.plot(daily_df['Confirmed_Cases'])\n",
406 | "plt.xlabel('Date', fontsize=18)\n",
407 | "plt.ylabel('Confirmed Cases', fontsize=18)\n",
408 | "plt.show()"
409 | ],
410 | "execution_count": null,
411 | "outputs": []
412 | },
413 | {
414 | "cell_type": "markdown",
415 | "metadata": {
416 | "id": "C8ivtMFNTyEs"
417 | },
418 | "source": [
419 | "# **5- Modelling**\n",
420 | "\n",
421 | "Let's focus on predicting the confirmed cases of COVID-19. Prophet's API is very similar to the one you can find in sklearn. First we create a model, then call the method fit, and, finally, make a forecast. The input to the method fit is a DataFrame with two columns:\n",
422 | "\n",
423 | "ds (datestamp) must be of type date or datetime.\n",
424 | "y is a numeric value we want to predict."
425 | ]
426 | },
427 | {
428 | "cell_type": "code",
429 | "metadata": {
430 | "id": "IVFBqHaJBM2J"
431 | },
432 | "source": [
433 | "#[3]\n",
434 | "df = daily_df.reset_index()\n",
435 | "df.columns = ['ds', 'y']\n",
436 | "df.tail(n=3)"
437 | ],
438 | "execution_count": null,
439 | "outputs": []
440 | },
441 | {
442 | "cell_type": "code",
443 | "metadata": {
444 | "id": "0_egvgIBTB2U"
445 | },
446 | "source": [
447 | "df.set_index('ds').y.plot().get_figure()"
448 | ],
449 | "execution_count": null,
450 | "outputs": []
451 | },
452 | {
453 | "cell_type": "code",
454 | "metadata": {
455 | "id": "m2sSEx8DLfGB"
456 | },
457 | "source": [
458 | "#To measure the quality of our forecast, we need to split our dataset into the historical part, which is the first and biggest slice of our data,\n",
459 | "# and the prediction part, which will be located at the end of the timeline. \n",
460 | "#We will remove the last 3 days from the dataset in order to use it later as a prediction target[3]\n",
461 | "prediction_size = 3\n",
462 | "train_df = df[:-prediction_size]\n",
463 | "train_df.tail(n=3)"
464 | ],
465 | "execution_count": null,
466 | "outputs": []
467 | },
468 | {
469 | "cell_type": "code",
470 | "metadata": {
471 | "id": "Yi-9PTvvMaOg"
472 | },
473 | "source": [
474 | "# now we create model [3]\n",
475 | "m = Prophet()\n",
476 | "m.fit(train_df);"
477 | ],
478 | "execution_count": null,
479 | "outputs": []
480 | },
481 | {
482 | "cell_type": "code",
483 | "metadata": {
484 | "id": "a_ki-3zQMxf9",
485 | "outputId": "c429c6f7-1b09-4972-9991-29d0041948ed",
486 | "colab": {
487 | "base_uri": "https://localhost:8080/",
488 | "height": 142
489 | }
490 | },
491 | "source": [
492 | "#[3]\n",
493 | "future = m.make_future_dataframe(periods=prediction_size)\n",
494 | "future.tail(n=3)"
495 | ],
496 | "execution_count": 30,
497 | "outputs": [
498 | {
499 | "output_type": "execute_result",
500 | "data": {
501 | "text/html": [
502 | "\n",
503 | "\n",
516 | "
\n",
517 | " \n",
518 | " \n",
519 | " | \n",
520 | " ds | \n",
521 | "
\n",
522 | " \n",
523 | " \n",
524 | " \n",
525 | " | 14 | \n",
526 | " 2020-03-24 | \n",
527 | "
\n",
528 | " \n",
529 | " | 15 | \n",
530 | " 2020-03-25 | \n",
531 | "
\n",
532 | " \n",
533 | " | 16 | \n",
534 | " 2020-03-26 | \n",
535 | "
\n",
536 | " \n",
537 | "
\n",
538 | "
"
539 | ],
540 | "text/plain": [
541 | " ds\n",
542 | "14 2020-03-24\n",
543 | "15 2020-03-25\n",
544 | "16 2020-03-26"
545 | ]
546 | },
547 | "metadata": {
548 | "tags": []
549 | },
550 | "execution_count": 30
551 | }
552 | ]
553 | },
554 | {
555 | "cell_type": "code",
556 | "metadata": {
557 | "id": "CTCGpSIMNM8X"
558 | },
559 | "source": [
560 | "#[3]\n",
561 | "forecast = m.predict(future)\n",
562 | "forecast.tail(n=3)"
563 | ],
564 | "execution_count": null,
565 | "outputs": []
566 | },
567 | {
568 | "cell_type": "code",
569 | "metadata": {
570 | "id": "qJSMTYcAOl4M",
571 | "outputId": "cbdbfb3e-0699-469c-9f33-2938a4106d99",
572 | "colab": {
573 | "base_uri": "https://localhost:8080/",
574 | "height": 204
575 | }
576 | },
577 | "source": [
578 | "#[1]\n",
579 | "forecast[['ds', 'yhat', 'yhat_lower', 'yhat_upper']].tail()"
580 | ],
581 | "execution_count": 33,
582 | "outputs": [
583 | {
584 | "output_type": "execute_result",
585 | "data": {
586 | "text/html": [
587 | "\n",
588 | "\n",
601 | "
\n",
602 | " \n",
603 | " \n",
604 | " | \n",
605 | " ds | \n",
606 | " yhat | \n",
607 | " yhat_lower | \n",
608 | " yhat_upper | \n",
609 | "
\n",
610 | " \n",
611 | " \n",
612 | " \n",
613 | " | 12 | \n",
614 | " 2020-03-22 | \n",
615 | " 83.510020 | \n",
616 | " 56.517001 | \n",
617 | " 104.968454 | \n",
618 | "
\n",
619 | " \n",
620 | " | 13 | \n",
621 | " 2020-03-23 | \n",
622 | " 89.762552 | \n",
623 | " 66.425144 | \n",
624 | " 113.424199 | \n",
625 | "
\n",
626 | " \n",
627 | " | 14 | \n",
628 | " 2020-03-24 | \n",
629 | " 96.015084 | \n",
630 | " 74.384712 | \n",
631 | " 121.402555 | \n",
632 | "
\n",
633 | " \n",
634 | " | 15 | \n",
635 | " 2020-03-25 | \n",
636 | " 102.267617 | \n",
637 | " 76.591055 | \n",
638 | " 125.693865 | \n",
639 | "
\n",
640 | " \n",
641 | " | 16 | \n",
642 | " 2020-03-26 | \n",
643 | " 108.520149 | \n",
644 | " 83.544659 | \n",
645 | " 133.043532 | \n",
646 | "
\n",
647 | " \n",
648 | "
\n",
649 | "
"
650 | ],
651 | "text/plain": [
652 | " ds yhat yhat_lower yhat_upper\n",
653 | "12 2020-03-22 83.510020 56.517001 104.968454\n",
654 | "13 2020-03-23 89.762552 66.425144 113.424199\n",
655 | "14 2020-03-24 96.015084 74.384712 121.402555\n",
656 | "15 2020-03-25 102.267617 76.591055 125.693865\n",
657 | "16 2020-03-26 108.520149 83.544659 133.043532"
658 | ]
659 | },
660 | "metadata": {
661 | "tags": []
662 | },
663 | "execution_count": 33
664 | }
665 | ]
666 | },
667 | {
668 | "cell_type": "markdown",
669 | "metadata": {
670 | "id": "DPnDCm1jeFIp"
671 | },
672 | "source": [
673 | "# **6- Result Visualization**"
674 | ]
675 | },
676 | {
677 | "cell_type": "code",
678 | "metadata": {
679 | "id": "BB5Kty-_N7k1"
680 | },
681 | "source": [
682 | "#[3]forecasting\n",
683 | "m.plot(forecast);\n",
684 | "plt.title(\"Forecasting Covid-19 confirmed Cases(world) \")\n",
685 | "plt.show()"
686 | ],
687 | "execution_count": null,
688 | "outputs": []
689 | },
690 | {
691 | "cell_type": "code",
692 | "metadata": {
693 | "id": "Auutf8VWdchF"
694 | },
695 | "source": [
696 | "#[3]\n",
697 | "m.plot_components(forecast);\n",
698 | "plt.show()"
699 | ],
700 | "execution_count": null,
701 | "outputs": []
702 | },
703 | {
704 | "cell_type": "code",
705 | "metadata": {
706 | "id": "Jirxk0dDTs-n"
707 | },
708 | "source": [
709 | "# [1]\n",
710 | "df.set_index('ds', inplace=True)\n",
711 | "forecast.set_index('ds', inplace=True)"
712 | ],
713 | "execution_count": 36,
714 | "outputs": []
715 | },
716 | {
717 | "cell_type": "code",
718 | "metadata": {
719 | "id": "fnUxJ25aTztl"
720 | },
721 | "source": [
722 | "viz_df = daily_df.join(forecast[['yhat', 'yhat_lower','yhat_upper']], how = 'outer')"
723 | ],
724 | "execution_count": 37,
725 | "outputs": []
726 | },
727 | {
728 | "cell_type": "code",
729 | "metadata": {
730 | "id": "pgD7XbqwT_LV",
731 | "outputId": "0db294b6-f0e2-436a-e438-07f711201add",
732 | "colab": {
733 | "base_uri": "https://localhost:8080/",
734 | "height": 235
735 | }
736 | },
737 | "source": [
738 | "viz_df.head()"
739 | ],
740 | "execution_count": 38,
741 | "outputs": [
742 | {
743 | "output_type": "execute_result",
744 | "data": {
745 | "text/html": [
746 | "\n",
747 | "\n",
760 | "
\n",
761 | " \n",
762 | " \n",
763 | " | \n",
764 | " Confirmed_Cases | \n",
765 | " yhat | \n",
766 | " yhat_lower | \n",
767 | " yhat_upper | \n",
768 | "
\n",
769 | " \n",
770 | " | Date | \n",
771 | " | \n",
772 | " | \n",
773 | " | \n",
774 | " | \n",
775 | "
\n",
776 | " \n",
777 | " \n",
778 | " \n",
779 | " | 2020-03-10 | \n",
780 | " 17 | \n",
781 | " 8.479636 | \n",
782 | " -15.735117 | \n",
783 | " 31.833502 | \n",
784 | "
\n",
785 | " \n",
786 | " | 2020-03-11 | \n",
787 | " 24 | \n",
788 | " 14.732168 | \n",
789 | " -11.257902 | \n",
790 | " 38.105296 | \n",
791 | "
\n",
792 | " \n",
793 | " | 2020-03-12 | \n",
794 | " 28 | \n",
795 | " 20.984699 | \n",
796 | " -2.669664 | \n",
797 | " 46.315372 | \n",
798 | "
\n",
799 | " \n",
800 | " | 2020-03-13 | \n",
801 | " 30 | \n",
802 | " 27.237231 | \n",
803 | " 4.537930 | \n",
804 | " 51.516875 | \n",
805 | "
\n",
806 | " \n",
807 | " | 2020-03-14 | \n",
808 | " 37 | \n",
809 | " 33.489763 | \n",
810 | " 9.121017 | \n",
811 | " 57.230878 | \n",
812 | "
\n",
813 | " \n",
814 | "
\n",
815 | "
"
816 | ],
817 | "text/plain": [
818 | " Confirmed_Cases yhat yhat_lower yhat_upper\n",
819 | "Date \n",
820 | "2020-03-10 17 8.479636 -15.735117 31.833502\n",
821 | "2020-03-11 24 14.732168 -11.257902 38.105296\n",
822 | "2020-03-12 28 20.984699 -2.669664 46.315372\n",
823 | "2020-03-13 30 27.237231 4.537930 51.516875\n",
824 | "2020-03-14 37 33.489763 9.121017 57.230878"
825 | ]
826 | },
827 | "metadata": {
828 | "tags": []
829 | },
830 | "execution_count": 38
831 | }
832 | ]
833 | },
834 | {
835 | "cell_type": "code",
836 | "metadata": {
837 | "id": "TacZpRviVQlQ",
838 | "outputId": "bcbf0e0f-e8af-4977-f6e7-7104a1a079f0",
839 | "colab": {
840 | "base_uri": "https://localhost:8080/",
841 | "height": 595
842 | }
843 | },
844 | "source": [
845 | "viz_df[['Confirmed_Cases', 'yhat']].plot().get_figure()"
846 | ],
847 | "execution_count": 39,
848 | "outputs": [
849 | {
850 | "output_type": "execute_result",
851 | "data": {
852 | "image/png": "\n",
853 | "text/plain": [
854 | ""
855 | ]
856 | },
857 | "metadata": {
858 | "tags": []
859 | },
860 | "execution_count": 39
861 | },
862 | {
863 | "output_type": "display_data",
864 | "data": {
865 | "image/png": "\n",
866 | "text/plain": [
867 | ""
868 | ]
869 | },
870 | "metadata": {
871 | "tags": [],
872 | "needs_background": "light"
873 | }
874 | }
875 | ]
876 | },
877 | {
878 | "cell_type": "markdown",
879 | "metadata": {
880 | "id": "_3WYU3uraZOQ"
881 | },
882 | "source": [
883 | "# **7- Forecast quality evaluation** \n",
884 | "Let's evaluate the quality of the algorithm by calculating the error metrics for the last 3 days that we held out and predicted (`prediction_size = 3`). For this, we will need the observations $y_i$ and the corresponding predicted values $\\hat{y}_i$."
885 | ]
886 | },
887 | {
888 | "cell_type": "code",
889 | "metadata": {
890 | "id": "V294mk5-PUcV"
891 | },
892 | "source": [
893 | "def make_comparison_dataframe(historical, forecast):\n",
894 | " \"\"\"Join the history with the forecast.\n",
895 | " \n",
896 | " The resulting dataset will contain columns 'yhat', 'yhat_lower', 'yhat_upper' and 'y'.\n",
897 | " \"\"\"\n",
898 | " return forecast.set_index('ds')[['yhat', 'yhat_lower', 'yhat_upper']].join(historical.set_index('ds'))"
899 | ],
900 | "execution_count": 40,
901 | "outputs": []
902 | },
903 | {
904 | "cell_type": "code",
905 | "metadata": {
906 | "id": "7rkQB64bPdTp"
907 | },
908 | "source": [
909 | "cmp_df = make_comparison_dataframe(df, forecast)\n",
910 | "cmp_df.tail(n=3)"
911 | ],
912 | "execution_count": null,
913 | "outputs": []
914 | },
915 | {
916 | "cell_type": "code",
917 | "metadata": {
918 | "id": "hHZkxBo2PvyI"
919 | },
920 | "source": [
921 | "def calculate_forecast_errors(df, prediction_size):\n",
922 | " \"\"\"Calculate MAPE and MAE of the forecast.\n",
923 | " \n",
924 | " Args:\n",
925 | " df: joined dataset with 'y' and 'yhat' columns.\n",
926 | " prediction_size: number of days at the end to predict.\n",
927 | " \"\"\"\n",
928 | " \n",
929 | " # Make a copy\n",
930 | " df = df.copy()\n",
931 | " \n",
932 | " # Now we calculate the values of e_i and p_i according to the formulas given in the article above.\n",
933 | " df['e'] = df['y'] - df['yhat']\n",
934 | " df['p'] = 100 * df['e'] / df['y']\n",
935 | " \n",
936 | " # Recall that we held out the values of the last `prediction_size` days\n",
937 | " # in order to predict them and measure the quality of the model. \n",
938 | " \n",
939 | " # Now cut out the part of the data which we made our prediction for.\n",
940 | " predicted_part = df[-prediction_size:]\n",
941 | " \n",
942 | " # Define the function that averages absolute error values over the predicted part.\n",
943 | " error_mean = lambda error_name: np.mean(np.abs(predicted_part[error_name]))\n",
944 | " \n",
945 | " # Now we can calculate MAPE and MAE and return the resulting dictionary of errors.\n",
946 | " return {'MAPE': error_mean('p'), 'MAE': error_mean('e')}\n"
947 | ],
948 | "execution_count": 42,
949 | "outputs": []
950 | },
951 | {
952 | "cell_type": "code",
953 | "metadata": {
954 | "id": "6lpRKrLhP4NQ",
955 | "outputId": "ac59aaa7-0ac1-41d7-9c17-d19c296134f7",
956 | "colab": {
957 | "base_uri": "https://localhost:8080/",
958 | "height": 51
959 | }
960 | },
961 | "source": [
962 | "for err_name, err_value in calculate_forecast_errors(cmp_df, prediction_size).items():\n",
963 | " print(err_name, err_value)"
964 | ],
965 | "execution_count": null,
966 | "outputs": [
967 | {
968 | "output_type": "stream",
969 | "text": [
970 | "MAPE 34.774518380154696\n",
971 | "MAE 55.0657164657149\n"
972 | ],
973 | "name": "stdout"
974 | }
975 | ]
976 | },
977 | {
978 | "cell_type": "markdown",
979 | "metadata": {
980 | "id": "HLConDmcvWW-"
981 | },
982 | "source": [
983 | "# **8- RMSE**"
984 | ]
985 | },
986 | {
987 | "cell_type": "code",
988 | "metadata": {
989 | "id": "3mn74pRGqITN"
990 | },
991 | "source": [
992 | "se = np.square(forecast.loc[:, 'yhat'] - df['y'])\n",
993 | "mse = np.mean(se)\n",
994 | "rmse = np.sqrt(mse)"
995 | ],
996 | "execution_count": 46,
997 | "outputs": []
998 | },
999 | {
1000 | "cell_type": "code",
1001 | "metadata": {
1002 | "id": "P3nGmOylu3nv",
1003 | "outputId": "d4b1e353-79a6-4435-99a0-c41b86a7ddbb",
1004 | "colab": {
1005 | "base_uri": "https://localhost:8080/"
1006 | }
1007 | },
1008 | "source": [
1009 | "print(rmse)"
1010 | ],
1011 | "execution_count": 47,
1012 | "outputs": [
1013 | {
1014 | "output_type": "stream",
1015 | "text": [
1016 | "29.077235826002735\n"
1017 | ],
1018 | "name": "stdout"
1019 | }
1020 | ]
1021 | },
1022 | {
1023 | "cell_type": "markdown",
1024 | "metadata": {
1025 | "id": "QmrbiUI8sjn8"
1026 | },
1027 | "source": [
1028 | "# **References** "
1029 | ]
1030 | },
1031 | {
1032 | "cell_type": "code",
1033 | "metadata": {
1034 | "id": "KZy09JqTtWPt"
1035 | },
1036 | "source": [
1037 | "#[1]Forecasting Time Series Data With Prophet I & II\n",
1038 | "# https://nextjournal.com/eric-brown/forecasting-with-prophet \n",
1039 | "#[2]Forecasting Time Series Data With Prophet IV\n",
1040 | "#https://nextjournal.com/eric-brown/forecasting-with-prophet-part-4\n",
1041 | "#[3]Time series analysis in Python\n",
1042 | "#https://mlcourse.ai/articles/topic9-part2-prophet/\n",
1043 | "#https://www.kaggle.com/kashnitsky/topic-9-part-2-time-series-with-facebook-prophet\n",
1044 | "#[4] Covid-19 Deaths Predictions (Time Series - Prophet) - DIY\n",
1045 | "#https://www.youtube.com/watch?v=zlFKgMnaDXk&feature=youtu.be&fbclid=IwAR3vSH3rUeeQRvhxSzADMlqK4CY06GaSbJSaeq-mAYfHXZb9E0M8Dlor4wU\n"
1046 | ],
1047 | "execution_count": null,
1048 | "outputs": []
1049 | }
1050 | ]
1051 | }
--------------------------------------------------------------------------------
/Implementation/5_13_2020_Covid_LSTM.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "nbformat": 4,
3 | "nbformat_minor": 0,
4 | "metadata": {
5 | "kernelspec": {
6 | "display_name": "Python 3",
7 | "language": "python",
8 | "name": "python3"
9 | },
10 | "language_info": {
11 | "codemirror_mode": {
12 | "name": "ipython",
13 | "version": 3
14 | },
15 | "file_extension": ".py",
16 | "mimetype": "text/x-python",
17 | "name": "python",
18 | "nbconvert_exporter": "python",
19 | "pygments_lexer": "ipython3",
20 | "version": "3.6.6"
21 | },
22 | "colab": {
23 | "name": " 5-13-2020 Covid_LSTM.ipynb",
24 | "provenance": [],
25 | "collapsed_sections": [],
26 | "toc_visible": true,
27 | "machine_shape": "hm",
28 | "include_colab_link": true
29 | }
30 | },
31 | "cells": [
32 | {
33 | "cell_type": "markdown",
34 | "metadata": {
35 | "id": "view-in-github",
36 | "colab_type": "text"
37 | },
38 | "source": [
39 | "
"
40 | ]
41 | },
42 | {
43 | "cell_type": "markdown",
44 | "metadata": {
45 | "id": "FfcjS4CKTyAP"
46 | },
47 | "source": [
48 | "# **1- Introduction** \n",
49 | "\n",
50 | "Predicting confirmed cases of Covid-19 with LSTM"
51 | ]
52 | },
53 | {
54 | "cell_type": "code",
55 | "metadata": {
56 | "id": "uTYReJPf9UFm",
57 | "outputId": "ccf747f0-ee30-4185-91e5-de0485b17e52",
58 | "colab": {
59 | "base_uri": "https://localhost:8080/"
60 | }
61 | },
62 | "source": [
63 | "!git clone https://github.com/hussain0048/Machine-Learning-Driven-Approach-for-2019-nCoV-Warning-System"
64 | ],
65 | "execution_count": 1,
66 | "outputs": [
67 | {
68 | "output_type": "stream",
69 | "text": [
70 | "Cloning into 'Machine-Learning-Driven-Approach-for-2019-nCoV-Warning-System'...\n",
71 | "fatal: could not read Username for 'https://github.com': No such device or address\n"
72 | ],
73 | "name": "stdout"
74 | }
75 | ]
76 | },
77 | {
78 | "cell_type": "markdown",
79 | "metadata": {
80 | "id": "85cNlo5f1-Z3"
81 | },
82 | "source": [
83 | "# **2- Import Necessary Libraries**"
84 | ]
85 | },
86 | {
87 | "cell_type": "code",
88 | "metadata": {
89 | "id": "FxdLWHIJTyAU"
90 | },
91 | "source": [
92 | "import pandas as pd\n",
93 | "import numpy as np\n",
94 | "from fbprophet import Prophet\n",
95 | "import matplotlib.pyplot as plt\n",
96 | " \n",
97 | "plt.rcParams['figure.figsize']=(20,10)\n",
98 | "plt.style.use('ggplot')"
99 | ],
100 | "execution_count": 3,
101 | "outputs": []
102 | },
103 | {
104 | "cell_type": "code",
105 | "metadata": {
106 | "id": "5NeSSztObv-7"
107 | },
108 | "source": [
109 | "plt.rcParams['figure.figsize']=(20,10)\n",
110 | "plt.style.use('ggplot')"
111 | ],
112 | "execution_count": 4,
113 | "outputs": []
114 | },
115 | {
116 | "cell_type": "code",
117 | "metadata": {
118 | "id": "XhadCtyRcFFY"
119 | },
120 | "source": [
121 | "pd.plotting.register_matplotlib_converters()"
122 | ],
123 | "execution_count": 5,
124 | "outputs": []
125 | },
126 | {
127 | "cell_type": "code",
128 | "metadata": {
129 | "id": "-vcGxjHQGL1c"
130 | },
131 | "source": [
132 | "#[3]\n",
133 | "import warnings\n",
134 | "warnings.filterwarnings('ignore')\n",
135 | "\n",
136 | "import numpy as np\n",
137 | "import pandas as pd\n",
138 | "from scipy import stats\n",
139 | "import statsmodels.api as sm\n",
140 | "import matplotlib.pyplot as plt\n",
141 | "\n",
142 | "%matplotlib inline"
143 | ],
144 | "execution_count": 6,
145 | "outputs": []
146 | },
147 | {
148 | "cell_type": "code",
149 | "metadata": {
150 | "id": "d17uQ_kpKn5w"
151 | },
152 | "source": [
153 | "from fbprophet import Prophet\n",
154 | "\n",
155 | "import logging\n",
156 | "logging.getLogger().setLevel(logging.ERROR)"
157 | ],
158 | "execution_count": 7,
159 | "outputs": []
160 | },
161 | {
162 | "cell_type": "code",
163 | "metadata": {
164 | "id": "Ia255xaIs1iD"
165 | },
166 | "source": [
167 | "# LSTM\n",
168 | "import math\n",
169 | "import pandas_datareader as web\n",
170 | "import numpy as np\n",
171 | "import pandas as pd\n",
172 | "from sklearn.preprocessing import MinMaxScaler\n",
173 | "from keras.models import Sequential\n",
174 | "from keras.layers import Dense,LSTM\n",
175 | "import matplotlib.pyplot as plt\n",
176 | "plt.style.use('fivethirtyeight')"
177 | ],
178 | "execution_count": 8,
179 | "outputs": []
180 | },
181 | {
182 | "cell_type": "code",
183 | "metadata": {
184 | "id": "NwsfXTJk2R0M",
185 | "outputId": "e03f8c93-a8da-420e-b702-f430e02607b8",
186 | "colab": {
187 | "base_uri": "https://localhost:8080/"
188 | }
189 | },
190 | "source": [
191 | "from google.colab import drive\n",
192 | "drive.mount('/content/drive')"
193 | ],
194 | "execution_count": 9,
195 | "outputs": [
196 | {
197 | "output_type": "stream",
198 | "text": [
199 | "Mounted at /content/drive\n"
200 | ],
201 | "name": "stdout"
202 | }
203 | ]
204 | },
205 | {
206 | "cell_type": "markdown",
207 | "metadata": {
208 | "id": "X5p8oL_STyAg"
209 | },
210 | "source": [
211 | "# **3-Import dataset**"
212 | ]
213 | },
214 | {
215 | "cell_type": "code",
216 | "metadata": {
217 | "id": "RDKPEa7keRWE"
218 | },
219 | "source": [
220 | "df = pd.read_csv(\"/content/drive/My Drive/Datasets/Covid-19/data_cases1.csv\", usecols=['Date','Confirmed'],index_col='Date', parse_dates=True )\n",
221 | "df.head()"
222 | ],
223 | "execution_count": null,
224 | "outputs": []
225 | },
226 | {
227 | "cell_type": "code",
228 | "metadata": {
229 | "id": "_qRu1k2Dyt2O"
230 | },
231 | "source": [
232 | "df.head()"
233 | ],
234 | "execution_count": null,
235 | "outputs": []
236 | },
237 | {
238 | "cell_type": "code",
239 | "metadata": {
240 | "id": "t-E75RQRTyAx"
241 | },
242 | "source": [
243 | "df.shape"
244 | ],
245 | "execution_count": null,
246 | "outputs": []
247 | },
248 | {
249 | "cell_type": "markdown",
250 | "metadata": {
251 | "id": "R8BthJfITyBS"
252 | },
253 | "source": [
254 | "# **4-Data cleaning and feature engineering**"
255 | ]
256 | },
257 | {
258 | "cell_type": "code",
259 | "metadata": {
260 | "id": "Ns2tk6CCGzDo"
261 | },
262 | "source": [
263 | "df1 = df.reset_index()"
264 | ],
265 | "execution_count": 68,
266 | "outputs": []
267 | },
268 | {
269 | "cell_type": "code",
270 | "metadata": {
271 | "id": "WGxkU_bZ_eoq"
272 | },
273 | "source": [
274 | "df1.head()"
275 | ],
276 | "execution_count": null,
277 | "outputs": []
278 | },
279 | {
280 | "cell_type": "code",
281 | "metadata": {
282 | "id": "wiEfm8xA9Cb4"
283 | },
284 | "source": [
285 | "#we get rid of possible duplicates and missing values in the data [3]\n",
286 | "df1 = df1[['Date', 'Confirmed']].dropna().drop_duplicates()"
287 | ],
288 | "execution_count": 70,
289 | "outputs": []
290 | },
291 | {
292 | "cell_type": "code",
293 | "metadata": {
294 | "id": "fqj24WXrFwbV"
295 | },
296 | "source": [
297 | "df1.shape"
298 | ],
299 | "execution_count": null,
300 | "outputs": []
301 | },
302 | {
303 | "cell_type": "code",
304 | "metadata": {
305 | "id": "8oKf0uV2GG02"
306 | },
307 | "source": [
308 | "#we need to convert Date to the datetime format because by default pandas treats this field as string-valued[3]\n",
309 | "df1['Date'] = pd.to_datetime(df1['Date'])"
310 | ],
311 | "execution_count": 72,
312 | "outputs": []
313 | },
314 | {
315 | "cell_type": "code",
316 | "metadata": {
317 | "id": "ijc8tkqbGqpE"
318 | },
319 | "source": [
320 | "df1.head()"
321 | ],
322 | "execution_count": null,
323 | "outputs": []
324 | },
325 | {
326 | "cell_type": "code",
327 | "metadata": {
328 | "id": "-cnFzmAAHIWq"
329 | },
330 | "source": [
331 | "#Let's sort the dataframe by time and take a look at what we've got [3]\n",
332 | "df1.sort_values(by=['Date']).head(n=3)"
333 | ],
334 | "execution_count": null,
335 | "outputs": []
336 | },
337 | {
338 | "cell_type": "code",
339 | "metadata": {
340 | "id": "vegTUwkq9kw8"
341 | },
342 | "source": [
343 | "#We will just trim our time series to keep only those rows that fall onto the period from March 10, 2020 to March 31, 2020[3]\n",
344 | "df1 = df1[(df1['Date'] > '2020-03-09') & (df1['Date'] < '2020-04-01')].sort_values(by=['Date'])\n",
345 | "df1.head(n=10)"
346 | ],
347 | "execution_count": null,
348 | "outputs": []
349 | },
350 | {
351 | "cell_type": "code",
352 | "metadata": {
353 | "id": "rahdPkyK-81A"
354 | },
355 | "source": [
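356 | "#As we are going to predict the number of confirmed cases of Covid-19, we group the rows by date and count the non-null 'Confirmed' records at each point in time (note: count() gives the number of rows per date, not the sum of the 'Confirmed' values). We will name the corresponding new column Confirmed_Cases:\n",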
357 | "aggr_df = df1.groupby('Date')[['Confirmed']].count()\n",
358 | "aggr_df.columns = ['Confirmed_Cases']"
359 | ],
360 | "execution_count": 77,
361 | "outputs": []
362 | },
363 | {
364 | "cell_type": "code",
365 | "metadata": {
366 | "id": "K-av4_rg_pfa"
367 | },
368 | "source": [
369 | "aggr_df.head(n=3)\n"
370 | ],
371 | "execution_count": null,
372 | "outputs": []
373 | },
374 | {
375 | "cell_type": "code",
376 | "metadata": {
377 | "id": "8WPfYYLU_6dD"
378 | },
379 | "source": [
380 | "#In this practice, we are interested in the number of confirmed cases a day. But at this moment all our data is divided into irregular time intervals that are less than a day. \n",
381 | "#This is called a sub-daily time series.To fix this, we need to aggregate the Cases counts by \"bins\" of a date size. \n",
382 | "#In time series analysis, this process is referred to as resampling. And if we reduce the sampling rate of data it is often called downsampling.\n",
383 | "#Luckily, pandas has a built-in functionality for this task. We will resample our time index down to 1-day bins [3]\n",
384 | "daily_df = aggr_df.resample('D').apply(sum)\n",
385 | "daily_df.head(n=3)"
386 | ],
387 | "execution_count": null,
388 | "outputs": []
389 | },
390 | {
391 | "cell_type": "code",
392 | "metadata": {
393 | "id": "9n0GTxkvAeq5"
394 | },
395 | "source": [
396 | "weekly_df = df1.resample('W').apply(sum)"
397 | ],
398 | "execution_count": null,
399 | "outputs": []
400 | },
401 | {
402 | "cell_type": "code",
403 | "metadata": {
404 | "id": "SUR6GRnOJp5T",
405 | "outputId": "de7058f9-8c63-4574-9f99-7d16f1ad130a",
406 | "colab": {
407 | "base_uri": "https://localhost:8080/",
408 | "height": 164
409 | }
410 | },
411 | "source": [
412 | "weekly_df.head()"
413 | ],
414 | "execution_count": 24,
415 | "outputs": [
416 | {
417 | "output_type": "error",
418 | "ename": "NameError",
419 | "evalue": "ignored",
420 | "traceback": [
421 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
422 | "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
423 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mweekly_df\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mhead\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
424 | "\u001b[0;31mNameError\u001b[0m: name 'weekly_df' is not defined"
425 | ]
426 | }
427 | ]
428 | },
429 | {
430 | "cell_type": "code",
431 | "metadata": {
432 | "id": "lVCbk-vsAz-n",
433 | "outputId": "10a05488-8331-4da1-9360-b60fd39a21cc",
434 | "colab": {
435 | "base_uri": "https://localhost:8080/",
436 | "height": 181
437 | }
438 | },
439 | "source": [
440 | "daily_df = daily_df.loc[daily_df.index >= '2020-03-10']\n",
441 | "daily_df.head(n=3)"
442 | ],
443 | "execution_count": 25,
444 | "outputs": [
445 | {
446 | "output_type": "error",
447 | "ename": "NameError",
448 | "evalue": "ignored",
449 | "traceback": [
450 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
451 | "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
452 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mdaily_df\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdaily_df\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mloc\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mdaily_df\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mindex\u001b[0m \u001b[0;34m>=\u001b[0m \u001b[0;34m'2020-03-10'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2\u001b[0m \u001b[0mdaily_df\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mhead\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mn\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m3\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
453 | "\u001b[0;31mNameError\u001b[0m: name 'daily_df' is not defined"
454 | ]
455 | }
456 | ]
457 | },
458 | {
459 | "cell_type": "code",
460 | "metadata": {
461 | "id": "ZQLlg3g4OvWh"
462 | },
463 | "source": [
464 | "plt.figure(figsize=(16,8))\n",
465 | "plt.title('Daily Confirmed Cases of COVID-19 (World)')\n",
466 | "plt.plot(daily_df['Confirmed_Cases'])\n",
467 | "plt.xlabel('Date', fontsize=18)\n",
468 | "plt.ylabel('Confirmed Cases', fontsize=18)\n",
469 | "plt.show()"
470 | ],
471 | "execution_count": null,
472 | "outputs": []
473 | },
474 | {
475 | "cell_type": "markdown",
476 | "metadata": {
477 | "id": "C8ivtMFNTyEs"
478 | },
479 | "source": [
480 | "# **5- Modelling**\n"
481 | ]
482 | },
483 | {
484 | "cell_type": "code",
485 | "metadata": {
486 | "id": "IVFBqHaJBM2J"
487 | },
488 | "source": [
489 | "# LSTM: keep only the target column (it was renamed to 'Confirmed_Cases' when aggr_df was built)\n",
490 | "data=aggr_df.filter(['Confirmed_Cases'])"
491 | ],
492 | "execution_count": 79,
493 | "outputs": []
494 | },
495 | {
496 | "cell_type": "code",
497 | "metadata": {
498 | "id": "Uucawvui-7iT"
499 | },
500 | "source": [
501 | "dataset=aggr_df.values"
502 | ],
503 | "execution_count": 80,
504 | "outputs": []
505 | },
506 | {
507 | "cell_type": "code",
508 | "metadata": {
509 | "id": "BMJiPPce_C3A"
510 | },
511 | "source": [
512 | "training_data_len=math.ceil(len(dataset)*.8)"
513 | ],
514 | "execution_count": 81,
515 | "outputs": []
516 | },
517 | {
518 | "cell_type": "code",
519 | "metadata": {
520 | "id": "LqoNjyXg_qjc"
521 | },
522 | "source": [
523 | "training_data_len"
524 | ],
525 | "execution_count": null,
526 | "outputs": []
527 | },
528 | {
529 | "cell_type": "code",
530 | "metadata": {
531 | "id": "YgLle6F7_0DK"
532 | },
533 | "source": [
534 | "scaler=MinMaxScaler(feature_range=(0,1))"
535 | ],
536 | "execution_count": 83,
537 | "outputs": []
538 | },
539 | {
540 | "cell_type": "code",
541 | "metadata": {
542 | "id": "QUPaY-3DAE9v"
543 | },
544 | "source": [
545 | "scaled_data=scaler.fit_transform(dataset)"
546 | ],
547 | "execution_count": 84,
548 | "outputs": []
549 | },
550 | {
551 | "cell_type": "code",
552 | "metadata": {
553 | "id": "CQgo0uoxAZT9"
554 | },
555 | "source": [
556 | "scaled_data"
557 | ],
558 | "execution_count": null,
559 | "outputs": []
560 | },
561 | {
562 | "cell_type": "code",
563 | "metadata": {
564 | "id": "2TtxrK2kBS8F",
565 | "outputId": "87e86f76-7a27-476d-c702-a8b717dcb985",
566 | "colab": {
567 | "base_uri": "https://localhost:8080/"
568 | }
569 | },
570 | "source": [
571 | "# create the training dataset\n",
572 | "# create the scaled training dataset\n",
573 | "train_data=scaled_data[0:training_data_len,:]\n",
574 | "# split the data into x_train and y_train dataset\n",
575 | "x_train = []\n",
576 | "y_train = []\n",
577 | "\n",
578 | "for i in range (12, len(train_data)):\n",
579 | " x_train.append(train_data[i-12:i,0])\n",
580 | " y_train.append(train_data[i,0])\n",
581 | " if i<= 12:\n",
582 | " print(x_train)\n",
583 | " print(y_train)\n",
584 | " print()"
585 | ],
586 | "execution_count": 86,
587 | "outputs": [
588 | {
589 | "output_type": "stream",
590 | "text": [
591 | "[array([0. , 0.04458599, 0.07006369, 0.08280255, 0.12738854,\n",
592 | " 0.12101911, 0.15923567, 0.15286624, 0.15923567, 0.17197452,\n",
593 | " 0.1910828 , 0.21656051])]\n",
594 | "[0.624203821656051]\n",
595 | "\n"
596 | ],
597 | "name": "stdout"
598 | }
599 | ]
600 | },
601 | {
602 | "cell_type": "code",
603 | "metadata": {
604 | "id": "oZl7H0pBChBy",
605 | "outputId": "1e2b2a96-c815-4174-f808-9f137cddfc95",
606 | "colab": {
607 | "base_uri": "https://localhost:8080/"
608 | }
609 | },
610 | "source": [
611 | "# Convert the x_train and y_train to numpy arrays\n",
612 | "x_train, y_train = np.array(x_train), np.array(y_train)\n",
613 | "x_train"
614 | ],
615 | "execution_count": 87,
616 | "outputs": [
617 | {
618 | "output_type": "execute_result",
619 | "data": {
620 | "text/plain": [
621 | "array([[0. , 0.04458599, 0.07006369, 0.08280255, 0.12738854,\n",
622 | " 0.12101911, 0.15923567, 0.15286624, 0.15923567, 0.17197452,\n",
623 | " 0.1910828 , 0.21656051],\n",
624 | " [0.04458599, 0.07006369, 0.08280255, 0.12738854, 0.12101911,\n",
625 | " 0.15923567, 0.15286624, 0.15923567, 0.17197452, 0.1910828 ,\n",
626 | " 0.21656051, 0.62420382]])"
627 | ]
628 | },
629 | "metadata": {
630 | "tags": []
631 | },
632 | "execution_count": 87
633 | }
634 | ]
635 | },
636 | {
637 | "cell_type": "code",
638 | "metadata": {
639 | "id": "901KPDDeCpu5",
640 | "outputId": "380d8dd1-a08a-4a7c-e397-7a5832ac87d8",
641 | "colab": {
642 | "base_uri": "https://localhost:8080/"
643 | }
644 | },
645 | "source": [
646 | "# Reshape the data to (samples, timesteps, 1), the 3-D layout the LSTM layer takes as input\n",
647 | "x_train=np.reshape(x_train, (x_train.shape[0], x_train.shape[1],1))\n",
648 | "x_train.shape"
649 | ],
650 | "execution_count": 88,
651 | "outputs": [
652 | {
653 | "output_type": "execute_result",
654 | "data": {
655 | "text/plain": [
656 | "(2, 12, 1)"
657 | ]
658 | },
659 | "metadata": {
660 | "tags": []
661 | },
662 | "execution_count": 88
663 | }
664 | ]
665 | },
666 | {
667 | "cell_type": "code",
668 | "metadata": {
669 | "id": "Qo5bwUE-Cz9g"
670 | },
671 | "source": [
672 | "# build the LSTM model\n",
673 | "model=Sequential()\n",
674 | "model.add(LSTM(40, return_sequences=True,input_shape=(x_train.shape[1],1)))\n",
675 | "model.add(LSTM(40, return_sequences=False))\n",
676 | "model.add(Dense(25))\n",
677 | "model.add(Dense(1))"
678 | ],
679 | "execution_count": 89,
680 | "outputs": []
681 | },
682 | {
683 | "cell_type": "code",
684 | "metadata": {
685 | "id": "Mxp-MSYPDOzz"
686 | },
687 | "source": [
688 | "# compile the model (this cell must run before model.fit)\n",
689 | "model.compile(optimizer='adam',loss='mean_squared_error')"
690 | ],
691 | "execution_count": 90,
692 | "outputs": []
693 | },
694 | {
695 | "cell_type": "code",
696 | "metadata": {
697 | "id": "Ph3rquB-DZsX",
698 | "outputId": "a754b201-7b40-4edc-81b9-41b542a57d36",
699 | "colab": {
700 | "base_uri": "https://localhost:8080/"
701 | }
702 | },
703 | "source": [
704 | "# train the model\n",
705 | "model.fit(x_train,y_train,batch_size=1,epochs=1)"
706 | ],
707 | "execution_count": 91,
708 | "outputs": [
709 | {
710 | "output_type": "stream",
711 | "text": [
712 | "2/2 [==============================] - 0s 6ms/step - loss: 0.4434\n"
713 | ],
714 | "name": "stdout"
715 | },
716 | {
717 | "output_type": "execute_result",
718 | "data": {
719 | "text/plain": [
720 | ""
721 | ]
722 | },
723 | "metadata": {
724 | "tags": []
725 | },
726 | "execution_count": 91
727 | }
728 | ]
729 | },
730 | {
731 | "cell_type": "code",
732 | "metadata": {
733 | "id": "I8b3sFhSEDiL"
734 | },
735 | "source": [
736 | "test_data=scaled_data[training_data_len -12:,:]\n",
737 | "x_test= []\n",
738 | "y_test = dataset[training_data_len:,:]\n",
739 | "for i in range(12, len(test_data)):\n",
740 | " x_test.append(test_data[i -12:i,0])"
741 | ],
742 | "execution_count": 92,
743 | "outputs": []
744 | },
745 | {
746 | "cell_type": "code",
747 | "metadata": {
748 | "id": "RuY3KUkGEOYA"
749 | },
750 | "source": [
751 | "x_test=np.array(x_test)"
752 | ],
753 | "execution_count": 93,
754 | "outputs": []
755 | },
756 | {
757 | "cell_type": "code",
758 | "metadata": {
759 | "id": "IrrrGaW1EQuf",
760 | "outputId": "8bc16ae1-9e25-4d7e-d53b-bed388093491",
761 | "colab": {
762 | "base_uri": "https://localhost:8080/"
763 | }
764 | },
765 | "source": [
766 | "x_test"
767 | ],
768 | "execution_count": 94,
769 | "outputs": [
770 | {
771 | "output_type": "execute_result",
772 | "data": {
773 | "text/plain": [
774 | "array([[0.07006369, 0.08280255, 0.12738854, 0.12101911, 0.15923567,\n",
775 | " 0.15286624, 0.15923567, 0.17197452, 0.1910828 , 0.21656051,\n",
776 | " 0.62420382, 0.72611465],\n",
777 | " [0.08280255, 0.12738854, 0.12101911, 0.15923567, 0.15286624,\n",
778 | " 0.15923567, 0.17197452, 0.1910828 , 0.21656051, 0.62420382,\n",
779 | " 0.72611465, 0.78343949],\n",
780 | " [0.12738854, 0.12101911, 0.15923567, 0.15286624, 0.15923567,\n",
781 | " 0.17197452, 0.1910828 , 0.21656051, 0.62420382, 0.72611465,\n",
782 | " 0.78343949, 0.89808917]])"
783 | ]
784 | },
785 | "metadata": {
786 | "tags": []
787 | },
788 | "execution_count": 94
789 | }
790 | ]
791 | },
792 | {
793 | "cell_type": "code",
794 | "metadata": {
795 | "id": "dLKwuQ2mEUY-"
796 | },
797 | "source": [
798 | "x_test=np.reshape(x_test,(x_test.shape[0], x_test.shape[1],1))"
799 | ],
800 | "execution_count": 95,
801 | "outputs": []
802 | },
803 | {
804 | "cell_type": "code",
805 | "metadata": {
806 | "id": "u2GibjoyEco3"
807 | },
808 | "source": [
809 | "predictions=model.predict(x_test)\n",
810 | "predictions=scaler.inverse_transform(predictions)"
811 | ],
812 | "execution_count": 96,
813 | "outputs": []
814 | },
815 | {
816 | "cell_type": "code",
817 | "metadata": {
818 | "id": "oCknKvIiDSvC"
819 | },
820 | "source": [
821 | "rmse=np.sqrt(np.mean((predictions - y_test)**2))  # RMSE: square each residual first, then average, then take the root"
822 | ],
823 | "execution_count": 97,
824 | "outputs": []
825 | },
826 | {
827 | "cell_type": "code",
828 | "metadata": {
829 | "id": "4UAIiiUzDStS",
830 | "outputId": "3446c8d6-c16c-49fb-eecf-bb3d06d0f811",
831 | "colab": {
832 | "base_uri": "https://localhost:8080/"
833 | }
834 | },
835 | "source": [
836 | "rmse"
837 | ],
838 | "execution_count": 98,
839 | "outputs": [
840 | {
841 | "output_type": "execute_result",
842 | "data": {
843 | "text/plain": [
844 | "130.11727078755698"
845 | ]
846 | },
847 | "metadata": {
848 | "tags": []
849 | },
850 | "execution_count": 98
851 | }
852 | ]
853 | },
854 | {
855 | "cell_type": "code",
856 | "metadata": {
857 | "id": "auMST9VMFXil"
858 | },
859 | "source": [
860 | "train=aggr_df[:training_data_len]\n",
861 | "valid=aggr_df[training_data_len:].copy()  # independent copy, so adding the 'Predictions' column does not write into a slice of aggr_df"
862 | ],
863 | "execution_count": 100,
864 | "outputs": []
865 | },
866 | {
867 | "cell_type": "code",
868 | "metadata": {
869 | "id": "9U1h6R3DGG_4"
870 | },
871 | "source": [
872 | "valid['Predictions']=predictions"
873 | ],
874 | "execution_count": 101,
875 | "outputs": []
876 | },
877 | {
878 | "cell_type": "code",
879 | "metadata": {
880 | "id": "L0bIwuB0J-nn",
881 | "outputId": "1ac7531b-a6d0-4236-a2c7-61c51c05eaeb",
882 | "colab": {
883 | "base_uri": "https://localhost:8080/",
884 | "height": 173
885 | }
886 | },
887 | "source": [
888 | "valid.head()"
889 | ],
890 | "execution_count": 102,
891 | "outputs": [
892 | {
893 | "output_type": "execute_result",
894 | "data": {
895 | "text/html": [
896 | "\n",
897 | "\n",
910 | "
\n",
911 | " \n",
912 | " \n",
913 | " | \n",
914 | " Confirmed_Cases | \n",
915 | " Predictions | \n",
916 | "
\n",
917 | " \n",
918 | " | Date | \n",
919 | " | \n",
920 | " | \n",
921 | "
\n",
922 | " \n",
923 | " \n",
924 | " \n",
925 | " | 2020-03-24 | \n",
926 | " 140 | \n",
927 | " 26.088554 | \n",
928 | "
\n",
929 | " \n",
930 | " | 2020-03-25 | \n",
931 | " 158 | \n",
932 | " 27.168270 | \n",
933 | "
\n",
934 | " \n",
935 | " | 2020-03-26 | \n",
936 | " 174 | \n",
937 | " 28.391363 | \n",
938 | "
\n",
939 | " \n",
940 | "
\n",
941 | "
"
942 | ],
943 | "text/plain": [
944 | " Confirmed_Cases Predictions\n",
945 | "Date \n",
946 | "2020-03-24 140 26.088554\n",
947 | "2020-03-25 158 27.168270\n",
948 | "2020-03-26 174 28.391363"
949 | ]
950 | },
951 | "metadata": {
952 | "tags": []
953 | },
954 | "execution_count": 102
955 | }
956 | ]
957 | },
958 | {
959 | "cell_type": "code",
960 | "metadata": {
961 | "id": "S9o1cWAPG85L",
962 | "outputId": "e85abd2f-70b0-4105-ad29-aed877108e7d",
963 | "colab": {
964 | "base_uri": "https://localhost:8080/",
965 | "height": 468
966 | }
967 | },
968 | "source": [
969 | "plt.figure(figsize=(16,8))\n",
970 | "plt.title('Model')\n",
971 | "plt.xlabel('Date',fontsize=18)\n",
972 | "plt.ylabel('Confirmed Cases',fontsize=18)\n",
973 | "plt.plot(train['Confirmed_Cases'])\n",
974 | "plt.plot(valid[['Confirmed_Cases','Predictions']])\n",
975 | "plt.legend(['Train','Val','Predictions'],loc='lower right')"
976 | ],
977 | "execution_count": 103,
978 | "outputs": [
979 | {
980 | "output_type": "execute_result",
981 | "data": {
982 | "text/plain": [
983 | ""
984 | ]
985 | },
986 | "metadata": {
987 | "tags": []
988 | },
989 | "execution_count": 103
990 | },
991 | {
992 | "output_type": "display_data",
993 | "data": {
994 | "image/png": "\n",
995 | "text/plain": [
996 | ""
997 | ]
998 | },
999 | "metadata": {
1000 | "tags": []
1001 | }
1002 | }
1003 | ]
1004 | },
1005 | {
1006 | "cell_type": "markdown",
1007 | "metadata": {
1008 | "id": "QmrbiUI8sjn8"
1009 | },
1010 | "source": [
1011 | "# **References**\n",
1012 | "\n",
1013 | "[1] Forecasting Time Series Data With Prophet I & II\n",
1014 | "https://nextjournal.com/eric-brown/forecasting-with-prophet\n",
1015 | "\n",
1016 | "[2] Forecasting Time Series Data With Prophet IV\n",
1017 | "https://nextjournal.com/eric-brown/forecasting-with-prophet-part-4\n",
1018 | "\n",
1019 | "[3] Time series analysis in Python\n",
1020 | "https://mlcourse.ai/articles/topic9-part2-prophet/\n",
1021 | "\n",
1022 | "[4] https://www.kaggle.com/kashnitsky/topic-9-part-2-time-series-with-facebook-prophet\n",
1023 | "\n",
1024 | "[5] How To Predict Coronavirus (COVID-19) Cases Using Deep Learning in Python\n",
1025 | "https://laconicml.com/predict-coronavirus-cases-deep-learning/"
1026 | ]
1027 | }
1028 | ]
1029 | }
--------------------------------------------------------------------------------