├── .gitignore
├── requirements.txt
├── collect_weather.py
├── preprocess.py
├── train_test_dnn.py
├── train_test.py
├── weather.py
├── README.md
└── Collect Weather Data API.ipynb
/.gitignore:
--------------------------------------------------------------------------------
1 | .ipynb_checkpoints/
2 | *-working.ipynb
3 | .vscode/
4 | tf_wx_model/
5 | *.csv
6 | *.pkl
7 | *.pxi
8 | __pycache__
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | absl-py==0.7.0
2 | asn1crypto==0.24.0
3 | astor==0.7.1
4 | astroid==2.1.0
5 | certifi==2018.11.29
6 | cffi==1.11.5
7 | chardet==3.0.4
8 | colorama==0.4.1
9 | cryptography==2.5
10 | gast==0.2.2
11 | grpcio==1.16.1
12 | h5py==2.9.0
13 | idna==2.8
14 | isort==4.3.4
15 | Keras-Applications==1.0.6
16 | Keras-Preprocessing==1.0.5
17 | lazy-object-proxy==1.3.1
18 | Markdown==3.0.1
19 | mccabe==0.6.1
20 | mkl-fft==1.0.10
21 | mkl-random==1.0.2
22 | numpy==1.15.4
23 | pandas==0.24.1
24 | patsy==0.5.1
25 | protobuf==3.6.1
26 | psutil==5.5.0
27 | pycparser==2.19
28 | pylint==2.2.2
29 | pyOpenSSL==19.0.0
30 | PyPrind==2.11.2
31 | pyreadline==2.1
32 | PySocks==1.6.8
33 | python-dateutil==2.7.5
34 | pytz==2018.9
35 | requests==2.21.0
36 | scikit-learn==0.20.2
37 | scipy==1.2.0
38 | six==1.12.0
39 | statsmodels==0.9.0
40 | tensorboard==1.12.2
41 | tensorflow==1.12.2
42 | termcolor==1.1.0
43 | typed-ast==1.1.0
44 | urllib3==1.24.2
45 | Werkzeug==0.15.3
46 | win-inet-pton==1.0.1
47 | wincertstore==0.2
48 | wrapt==1.11.1
49 |
--------------------------------------------------------------------------------
/collect_weather.py:
--------------------------------------------------------------------------------
1 | import os.path
2 | import pickle
3 | from datetime import timedelta
4 |
5 | from weather import API_KEY, BASE_URL, extract_weather_data, get_target_date
6 |
# Collection is split over two runs of 500 requests each; the presence of the
# pickle files on disk records which run has already happened.
filename1 = 'records_pt1.pkl'
filename2 = 'records_pt2.pkl'

if os.path.isfile(filename2):
    # both halves are already on disk, nothing left to fetch
    print('1000 records already collected from Dark Sky API')
else:
    second_run = os.path.isfile(filename1)

    if second_run:
        # resume one day after the last record saved by the first run
        with open(filename1, 'rb') as fp:
            records = pickle.load(fp)
        target_date = records[-1][0] + timedelta(days=1)
    else:
        records = []
        target_date = get_target_date()

    records.extend(extract_weather_data(BASE_URL, API_KEY, target_date, 500))

    records_length = len(records)
    print(f'{records_length} records collected from Dark Sky API')

    # the second run writes the combined list to the second file
    with open(filename2 if second_run else filename1, 'wb') as f:
        pickle.dump(records, f)

    if second_run:
        print(f'Weather records from day 2 saved to {filename2}.')
    else:
        print(f'Weather records from day 1 saved to {filename1}.')
41 |
--------------------------------------------------------------------------------
/preprocess.py:
--------------------------------------------------------------------------------
1 | import pickle
2 |
3 | import pandas as pd
4 |
5 | from weather import derive_nth_day_feature, features
6 |
# NOTE(review): pickle.load executes arbitrary code from the file; this is only
# acceptable because records_pt2.pkl is produced locally by collect_weather.py.
with open('records_pt2.pkl', 'rb') as fp:
    records = pickle.load(fp)

df = pd.DataFrame(records, columns=features).set_index('date')

# add <feature>_1, <feature>_2 and <feature>_3 columns holding each
# measurement from one, two and three rows earlier
for feature in features:
    if feature != 'date':
        for N in range(1, 4):
            derive_nth_day_feature(df, feature, N)

# make list of original features without temperatureMean, temperatureMin, and temperatureMax
to_remove = [
    feature
    for feature in features
    if feature not in ['temperatureMean', 'temperatureMin', 'temperatureMax']
]

# make a list of columns to keep
to_keep = [col for col in df.columns if col not in to_remove]

# select only the columns in to_keep and assign to df
df = df[to_keep]

# anything that cannot be parsed as a number becomes NaN
df = df.apply(pd.to_numeric, errors='coerce')

# Call describe on df and transpose it due to the large number of columns
spread = df.describe().T

# precalculate interquartile range for ease of use in next calculation
IQR = spread['75%'] - spread['25%']

# create an outliers column which is either 3 IQRs below the first quartile or
# 3 IQRs above the third quartile (diagnostic only: nothing below reads it)
spread['outliers'] = (spread['min'] < (spread['25%'] - (3 * IQR))) | (
    spread['max'] > (spread['75%'] + 3 * IQR)
)

# Treat a missing lagged precipitation probability as 0. Assign the columns
# back in one step: the previous df[col][mask] = 0 form is chained assignment,
# which may write to a temporary copy and leave df unchanged.
precip_cols = ['precipProbability_1', 'precipProbability_2', 'precipProbability_3']
df[precip_cols] = df[precip_cols].fillna(0)

# drop every row that still has a missing value (this includes the first rows,
# whose lagged columns have no earlier row to draw from)
df = df.dropna()

with open('end-part1_df.pkl', 'wb') as f:
    pickle.dump(df, f)
54 |
--------------------------------------------------------------------------------
/train_test_dnn.py:
--------------------------------------------------------------------------------
1 | import pickle
2 |
3 | import numpy as np
4 | import tensorflow as tf
5 | from sklearn.metrics import (
6 | explained_variance_score,
7 | mean_absolute_error,
8 | median_absolute_error,
9 | )
10 | from sklearn.model_selection import train_test_split
11 |
import os

# NOTE(review): pickle.load executes arbitrary code from the file; only load a
# pickle you produced yourself (this one is written by preprocess.py).
with open('end-part1_df.pkl', 'rb') as fp:
    df = pickle.load(fp)

# replace the index with its float representation
# (presumably so the estimator input function can handle it - TODO confirm)
df.index = df.index.values.astype(float)

# First drop the temperatureMax and temperatureMin from the dataframe
df = df.drop(['temperatureMin', 'temperatureMax'], axis=1)

# X will be a pandas dataframe of all columns except temperatureMean
X = df[[col for col in df.columns if col != 'temperatureMean']]

# y will be a pandas series of the temperatureMean
y = df['temperatureMean']

# split data into training set and a temporary set
X_train, X_tmp, y_train, y_tmp = train_test_split(X, y, test_size=0.2, random_state=23)

# split the remaining 20% of data evenly
X_test, X_val, y_test, y_val = train_test_split(
    X_tmp, y_tmp, test_size=0.5, random_state=23
)

print(
    f'Training instances   {X_train.shape[0]}, Training features   {X_train.shape[1]}'
)
print(f'Validation instances {X_val.shape[0]}, Validation features {X_val.shape[1]}')
print(f'Testing instances    {X_test.shape[0]}, Testing features    {X_test.shape[1]}')

feature_cols = [tf.feature_column.numeric_column(col) for col in X.columns]

# Expand "~" explicitly: a leading tilde in a plain path string is not
# resolved to the home directory on its own, so the checkpoints would end up
# in a directory literally named "~" under the current working directory.
regressor = tf.estimator.DNNRegressor(
    feature_columns=feature_cols,
    hidden_units=[50, 50],
    model_dir=os.path.expanduser(
        '~/Projects/machine-learning-predict-weather/tf_models/tf_wx_model'
    ),
)
48 |
49 |
def wx_input_fn(X, y=None, num_epochs=None, shuffle=True, batch_size=400):
    """Build an estimator input function from pandas data.

    All arguments are forwarded unchanged to
    ``tf.estimator.inputs.pandas_input_fn`` (``X`` as ``x``), and whatever
    that call returns is returned to the caller.
    """
    options = {
        'x': X,
        'y': y,
        'num_epochs': num_epochs,
        'shuffle': shuffle,
        'batch_size': batch_size,
    }
    return tf.estimator.inputs.pandas_input_fn(**options)
54 |
55 |
# Train in 100 rounds of STEPS steps each, scoring the validation set after
# every round and keeping each evaluation result.
STEPS = 400
evaluations = []
for _ in range(100):
    regressor.train(input_fn=wx_input_fn(X_train, y=y_train), steps=STEPS)
    validation_input = wx_input_fn(X_val, y_val, num_epochs=1, shuffle=False)
    evaluations.append(regressor.evaluate(input_fn=validation_input))

# single unshuffled pass over the held-out test set
test_input = wx_input_fn(X_test, num_epochs=1, shuffle=False)
pred = regressor.predict(input_fn=test_input)
predictions = np.array([p['predictions'][0] for p in pred])

print(f'The Explained Variance: {explained_variance_score(y_test, predictions):.2f}')
print(
    f'The Mean Absolute Error: {mean_absolute_error(y_test, predictions):.2f} degrees Celcius'
)
print(
    f'The Median Absolute Error: {median_absolute_error(y_test, predictions):.2f} degrees Celcius'
)
76 |
--------------------------------------------------------------------------------
/train_test.py:
--------------------------------------------------------------------------------
1 | import pickle
2 |
3 | import pandas as pd
4 | import statsmodels.api as sm
5 | from sklearn.linear_model import LinearRegression
6 | from sklearn.metrics import mean_absolute_error, median_absolute_error
7 | from sklearn.model_selection import train_test_split
8 |
with open('end-part1_df.pkl', 'rb') as fp:
    df = pickle.load(fp)

# correlation of every column with temperatureMean, sorted ascending
df_corr = df.corr()[['temperatureMean']].sort_values('temperatureMean')

# keep only the columns whose absolute correlation exceeds 0.55
strongly_correlated = df_corr['temperatureMean'].abs() > 0.55
df_corr_fil = df_corr[strongly_correlated]

# the target and the same-day min/max temperatures are not predictors
unwanted = ['temperatureMin', 'temperatureMax', 'temperatureMean']
predictors = [name for name in df_corr_fil.index.tolist() if name not in unwanted]

df2 = df[['temperatureMean'] + predictors]

X = df2[predictors]
y = df2['temperatureMean']
alpha = 0.05
24 |
25 |
def stepwise_selection(
    X, y, initial_list=predictors, threshold_out=alpha, verbose=True
):
    """Prune features by backward elimination on OLS p-values.

    An ordinary least squares model (statsmodels.api.OLS, with a constant
    added) is fitted on the currently included features. While the largest
    feature p-value is above threshold_out, that feature is dropped and the
    model is refitted.
    Arguments:
        X - pandas.DataFrame with candidate features
        y - list-like with the target
        initial_list - list of features to start with (column names of X)
        threshold_out - exclude a feature if its p-value > threshold_out
        verbose - whether to print each exclusion
    Returns: list of the features that remain
    See https://en.wikipedia.org/wiki/Stepwise_regression for the details
    """
    included = list(initial_list)
    while True:
        design = sm.add_constant(pd.DataFrame(X[included]))
        fitted = sm.OLS(y, design).fit()
        # skip the first coefficient, expected to be the added intercept
        pvalues = fitted.pvalues.iloc[1:]
        worst_pval = pvalues.max()  # NaN if pvalues is empty
        # a NaN comparison is False, so an empty feature set also stops here
        if not worst_pval > threshold_out:
            break
        worst_feature = pvalues.idxmax()
        included.remove(worst_feature)
        if verbose:
            print('Drop {:30} with p-value {:.6}'.format(worst_feature, worst_pval))
    return included
57 |
58 |
result = stepwise_selection(X, y)

print('resulting features:')
print(result)

X = X[result]

# Fit the summary model with an intercept, exactly as stepwise_selection does.
# Without the constant the reported p-values and R-squared describe a
# different (through-the-origin) model than the one used to pick the features.
model = sm.OLS(y, sm.add_constant(X)).fit()
print(model.summary())

# hold out 20% of the rows for testing; X itself has no constant column
# because LinearRegression fits its own intercept
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=12
)

regressor = LinearRegression()

regressor.fit(X_train, y_train)

prediction = regressor.predict(X_test)

print(f'The Explained Variance: {regressor.score(X_test, y_test):.2f}')
print(
    f'The Mean Absolute Error: {mean_absolute_error(y_test, prediction):.2f} degrees celcius'
)
print(
    f'The Median Absolute Error: {median_absolute_error(y_test, prediction):.2f} degrees celcius'
)
85 |
--------------------------------------------------------------------------------
/weather.py:
--------------------------------------------------------------------------------
1 | import os
2 | import time
3 | from collections import namedtuple
4 | from datetime import datetime, timedelta
5 |
6 | import requests
7 | from pyprind import ProgBar
8 |
# Location string substituted into BASE_URL for every request
# (presumably "latitude,longitude" - confirm against the API documentation)
loc = '30.578806,-97.853065'

# API key read from the MY_API_KEY environment variable; None when it is unset
API_KEY = os.environ.get('MY_API_KEY')
# URL template; extract_weather_data fills the three placeholders with the
# API key, loc and a formatted timestamp, in that order
BASE_URL = 'https://api.darksky.net/forecast/{}/{},{}'

# Field names of one daily record, in the order used for DailySummary;
# preprocess.py uses the same list as DataFrame columns with 'date' as index
features = [
    'date',
    'temperatureMean',
    'dewPoint',
    'pressure',
    'humidity',
    'temperatureMax',
    'temperatureMin',
    'precipProbability',
]
# Record type for a single day's weather summary
DailySummary = namedtuple('DailySummary', features)
25 |
26 |
def _mean_hourly_temperature(hourly):
    """Return the mean hourly temperature, carrying readings over gaps.

    An hour without a 'temperature' value reuses the most recent earlier
    reading; hours before the first reading are left out. The mean is taken
    over the hours that contributed a value, so days with other than 24
    hourly entries are averaged correctly. Returns NaN when no hour has a
    reading.
    """
    readings = []
    last = None
    for hour in hourly:
        value = hour.get('temperature')
        if value is not None:
            last = value
        if last is not None:
            readings.append(last)
    if not readings:
        return float('nan')
    return sum(readings) / len(readings)


def extract_weather_data(url, api_key, target_date, days):
    """Collect daily weather summaries from the Dark Sky API.

    One request is made per day, starting at ``target_date`` and moving
    forward one day at a time for ``days`` iterations. A day whose request
    does not return HTTP 200 is skipped, so fewer than ``days`` records may
    be returned.

    Args:
        url: URL template with three ``{}`` placeholders, filled in order
            with ``api_key``, the module-level ``loc`` string and the
            timestamp of the requested day.
        api_key: API key substituted into ``url``.
        target_date: ``datetime`` of the first day to request.
        days: number of consecutive days to request.

    Returns:
        A list of ``DailySummary`` records, one per successful request.

    Raises:
        KeyError: if a successful response lacks the daily or hourly data or
            any daily field other than ``precipProbability``.
    """
    records = []
    bar = ProgBar(days)
    for _ in range(days):
        # use the arguments rather than the module globals so callers can
        # actually choose the endpoint and key
        request = url.format(
            api_key, loc, target_date.strftime('%Y-%m-%dT%H:%M:%S')
        )
        response = requests.get(request)
        if response.status_code == 200:
            payload = response.json()  # parse the body once
            data = payload['daily']['data'][0]
            hdata = payload['hourly']['data']
            records.append(
                DailySummary(
                    date=target_date,
                    temperatureMean=_mean_hourly_temperature(hdata),
                    dewPoint=data['dewPoint'],
                    pressure=data['pressure'],
                    humidity=data['humidity'],
                    temperatureMax=data['temperatureMax'],
                    temperatureMin=data['temperatureMin'],
                    # a day without this field is recorded as 0
                    precipProbability=data.get('precipProbability', 0),
                )
            )
        bar.update()
        target_date += timedelta(days=1)
    return records
81 |
82 |
def get_target_date():
    """Return the datetime 1000 days before the current local time."""
    return datetime.now() - timedelta(days=1000)
88 |
89 |
def derive_nth_day_feature(df, feature, N):
    """Add a lagged copy of a column to ``df`` in place.

    The new column is named ``<feature>_<N>`` and holds ``df[feature]``
    shifted by ``N`` rows, so each row carries the value from ``N`` rows
    earlier and the first ``N`` rows are missing. Nothing is returned.
    """
    lagged = df[feature].shift(N)
    df[f'{feature}_{N}'] = lagged
94 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Using Machine Learning to Predict the Weather ([Powered by Dark Sky](https://darksky.net/poweredby/))
2 | This project is based on a three-part article written by Adam McQuistan in [stackabuse.com](http://stackabuse.com/using-machine-learning-to-predict-the-weather-part-1/).
3 |
4 | ## Update regarding the weather API
5 | My original disclaimer was Weather Underground ([wunderground.com](https://www.wunderground.com/)) was no longer providing free API accounts. At some point (I don't know exactly when), they discontinued their API service altogether. I have since signed up for a [Dark Sky API](https://darksky.net/dev). They don't have a free tier but they do have a trial account which allows 1,000 API calls per day to evaluate the service. Every API request over the free daily limit costs $0.0001.
6 |
7 | ## Summary
8 | I won't go into too much detail about the project since you can go to the original article on stackabuse.com; however, here is a little background if you wish to save time. (Although check out the series, it's worth the read.)
9 |
10 | The project is split into three separate Jupyter Notebooks: one to collect the weather data from the Wunderground.com developer's API (again I'm using Dark Sky's API), inspect it, and clean it; a second to further refine the features and fit the data to a Linear Regression model; and a third to train and evaluate a deep neural net regressor.
11 |
12 | ## Changes
13 | For the most part I did not deviate from the author's original process. I did seek to automate and streamline the code. For example, I added a progress bar to the data collection function and created another function to automatically set a target date that is 1000 days prior to the current date. I automated the code to remove features that did not show a strong correlation and implemented a stepwise regression function to automate removing features that had p-values that were too high. (The original author did this manually.)
14 |
15 | ## Added modules
16 | Automating the code allowed me to adapt the Python code in the Jupyter Notebooks to regular .py files. Jupyter Notebooks are fantastic tools but I believe the final product should be Python scripts that run in the background. Here are the scripts I added and a quick summary:
17 |
18 | 1. weather.py- a utility file that contains reused methods and variables
19 | 2. collect_weather.py- uses the Requests library to download weather data for 1000 days. Also uses ```os.path.isfile()``` and a ```if/elif/else``` statement to determine whether the data from the first 500 days should be collected, data from the second 500 days should be collected, or no data is to be collected. (This is no longer necessary since the daily limit is 1,000 calls.)
20 | 3. preprocess.py- creates a Pandas DataFrame from the weather records and cleans the data
21 | 4. train_test.py- performs some additional preprocessing and fits the data to a Linear Regression model
22 | 5. train_test_dnn.py- uses the same weather data to train, evaluate, and test a deep neural network regressor
23 |
24 | ## Still To Do
25 | * Update collect_weather.py to make 1,000 API calls at once instead of 500 over two days
26 | * Update the Jupyter Notebooks for the Dark Sky API
27 | * Replace/remove some deprecated methods in the train_test.py and train_test_dnn.py modules
28 | * Add better documentation in the form of markdown cells to the notebooks.
29 | * Apply the model to future forecasts and validate against actual weather data.
30 |
--------------------------------------------------------------------------------
/Collect Weather Data API.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "import os\n",
10 | "import pickle\n",
11 | "import time\n",
12 | "from collections import namedtuple\n",
13 | "from datetime import datetime, timedelta\n",
14 | "\n",
15 | "import pandas as pd\n",
16 | "import requests\n",
17 | "\n",
18 | "import matplotlib.pyplot as plt\n",
19 | "from pyprind import ProgBar\n",
20 | "\n",
21 | "%matplotlib inline"
22 | ]
23 | },
24 | {
25 | "cell_type": "code",
26 | "execution_count": 2,
27 | "metadata": {},
28 | "outputs": [],
29 | "source": [
30 | "API_KEY = os.environ.get('MY_API_KEY')\n",
31 | "BASE_URL = 'http://api.wunderground.com/api/{}/history_{}/q/TX/Round_Rock.json'"
32 | ]
33 | },
34 | {
35 | "cell_type": "code",
36 | "execution_count": 3,
37 | "metadata": {},
38 | "outputs": [],
39 | "source": [
40 | "features = [\n",
41 | " \"date\", \"meantempm\", \"meandewptm\", \"meanpressurem\", \"maxhumidity\",\n",
42 | " \"minhumidity\", \"maxtempm\", \"mintempm\", \"maxdewptm\", \"mindewptm\",\n",
43 | " \"maxpressurem\", \"minpressurem\", \"precipm\"\n",
44 | "]\n",
45 | "DailySummary = namedtuple('DailySummary', features)"
46 | ]
47 | },
48 | {
49 | "cell_type": "code",
50 | "execution_count": 4,
51 | "metadata": {},
52 | "outputs": [],
53 | "source": [
54 | "def extract_weather_data(url, api_key, target_date, days):\n",
55 | " \"\"\"Call Wunderground API to extract weather data.\"\"\"\n",
56 | " records = []\n",
57 | " bar = ProgBar(days)\n",
58 | " for _ in range(days):\n",
59 | " request = BASE_URL.format(API_KEY, target_date.strftime('%Y%m%d'))\n",
60 | " response = requests.get(request)\n",
61 | " if response.status_code == 200:\n",
62 | " data = response.json()['history']['dailysummary'][0]\n",
63 | " records.append(DailySummary(\n",
64 | " date=target_date,\n",
65 | " meantempm=data['meantempm'],\n",
66 | " meandewptm=data['meandewptm'],\n",
67 | " meanpressurem=data['meanpressurem'],\n",
68 | " maxhumidity=data['maxhumidity'],\n",
69 | " minhumidity=data['minhumidity'],\n",
70 | " maxtempm=data['maxtempm'],\n",
71 | " mintempm=data['mintempm'],\n",
72 | " maxdewptm=data['maxdewptm'],\n",
73 | " mindewptm=data['mindewptm'],\n",
74 | " maxpressurem=data['maxpressurem'],\n",
75 | " minpressurem=data['minpressurem'],\n",
76 | " precipm=data['precipm']))\n",
77 | " time.sleep(6)\n",
78 | " bar.update()\n",
79 | " target_date += timedelta(days=1)\n",
80 | " return records"
81 | ]
82 | },
83 | {
84 | "cell_type": "code",
85 | "execution_count": 5,
86 | "metadata": {},
87 | "outputs": [],
88 | "source": [
89 | "# Do not run this cell when collecting data on day 2\n",
90 | "def get_target_date():\n",
91 | " \"\"\"Return target date 1000 days prior to current date.\"\"\"\n",
92 | " current_date = datetime.now()\n",
93 | " target_date = current_date - timedelta(days=1000)\n",
94 | " return target_date\n",
95 | "\n",
96 | "target_date = get_target_date()"
97 | ]
98 | },
99 | {
100 | "cell_type": "code",
101 | "execution_count": 6,
102 | "metadata": {},
103 | "outputs": [
104 | {
105 | "name": "stderr",
106 | "output_type": "stream",
107 | "text": [
108 | "0% [##############################] 100% | ETA: 00:00:00\n",
109 | "Total time elapsed: 00:53:56\n"
110 | ]
111 | }
112 | ],
113 | "source": [
114 | "records = extract_weather_data(BASE_URL, API_KEY, target_date, 500)"
115 | ]
116 | },
117 | {
118 | "cell_type": "code",
119 | "execution_count": 7,
120 | "metadata": {},
121 | "outputs": [
122 | {
123 | "data": {
124 | "text/plain": [
125 | "[DailySummary(date=datetime.datetime(2015, 10, 3, 22, 13, 6, 559948), meantempm='21', meandewptm='6', meanpressurem='1012', maxhumidity='63', minhumidity='20', maxtempm='29', mintempm='14', maxdewptm='8', mindewptm='4', maxpressurem='1014', minpressurem='1010', precipm='0.00'),\n",
126 | " DailySummary(date=datetime.datetime(2015, 10, 4, 22, 13, 6, 559948), meantempm='22', meandewptm='8', meanpressurem='1015', maxhumidity='63', minhumidity='25', maxtempm='29', mintempm='15', maxdewptm='10', mindewptm='7', maxpressurem='1017', minpressurem='1013', precipm='0.00'),\n",
127 | " DailySummary(date=datetime.datetime(2015, 10, 5, 22, 13, 6, 559948), meantempm='24', meandewptm='11', meanpressurem='1018', maxhumidity='64', minhumidity='35', maxtempm='29', mintempm='19', maxdewptm='13', mindewptm='8', maxpressurem='1020', minpressurem='1015', precipm='0.00'),\n",
128 | " DailySummary(date=datetime.datetime(2015, 10, 6, 22, 13, 6, 559948), meantempm='23', meandewptm='11', meanpressurem='1019', maxhumidity='73', minhumidity='25', maxtempm='30', mintempm='17', maxdewptm='14', mindewptm='8', maxpressurem='1022', minpressurem='1017', precipm='0.00'),\n",
129 | " DailySummary(date=datetime.datetime(2015, 10, 7, 22, 13, 6, 559948), meantempm='24', meandewptm='13', meanpressurem='1017', maxhumidity='72', minhumidity='31', maxtempm='32', mintempm='17', maxdewptm='16', mindewptm='10', maxpressurem='1020', minpressurem='1015', precipm='0.00')]"
130 | ]
131 | },
132 | "execution_count": 7,
133 | "metadata": {},
134 | "output_type": "execute_result"
135 | }
136 | ],
137 | "source": [
138 | "# Look at first five records\n",
139 | "records[:5]"
140 | ]
141 | },
142 | {
143 | "cell_type": "code",
144 | "execution_count": 8,
145 | "metadata": {
146 | "scrolled": true
147 | },
148 | "outputs": [
149 | {
150 | "data": {
151 | "text/plain": [
152 | "500"
153 | ]
154 | },
155 | "execution_count": 8,
156 | "metadata": {},
157 | "output_type": "execute_result"
158 | }
159 | ],
160 | "source": [
161 | "len(records)"
162 | ]
163 | },
164 | {
165 | "cell_type": "code",
166 | "execution_count": 9,
167 | "metadata": {},
168 | "outputs": [],
169 | "source": [
170 | "# save records list\n",
171 | "with open('records_pt1.pkl', 'wb') as f:\n",
172 | " pickle.dump(records, f)"
173 | ]
174 | },
175 | {
176 | "cell_type": "code",
177 | "execution_count": 5,
178 | "metadata": {},
179 | "outputs": [],
180 | "source": [
181 | "# load records list - still need to run cells 1-4\n",
182 | "with open('records_pt1.pkl', 'rb') as fp:\n",
183 | " records = pickle.load(fp)"
184 | ]
185 | },
186 | {
187 | "cell_type": "code",
188 | "execution_count": 6,
189 | "metadata": {},
190 | "outputs": [
191 | {
192 | "data": {
193 | "text/plain": [
194 | "DailySummary(date=datetime.datetime(2017, 2, 13, 22, 13, 6, 559948), meantempm='20', meandewptm='13', meanpressurem='1018', maxhumidity='94', minhumidity='42', maxtempm='25', mintempm='16', maxdewptm='18', mindewptm='5', maxpressurem='1022', minpressurem='1012', precipm='0.00')"
195 | ]
196 | },
197 | "execution_count": 6,
198 | "metadata": {},
199 | "output_type": "execute_result"
200 | }
201 | ],
202 | "source": [
203 | "# Inspect last record to date; next target date should be plus one day\n",
204 | "records[-1]"
205 | ]
206 | },
207 | {
208 | "cell_type": "code",
209 | "execution_count": 7,
210 | "metadata": {},
211 | "outputs": [],
212 | "source": [
213 | "# set new target date based on date above plus one day\n",
214 | "target_date = datetime(2017, 2, 14)"
215 | ]
216 | },
217 | {
218 | "cell_type": "code",
219 | "execution_count": 8,
220 | "metadata": {},
221 | "outputs": [
222 | {
223 | "name": "stderr",
224 | "output_type": "stream",
225 | "text": [
226 | "0% [##############################] 100% | ETA: 00:00:00\n",
227 | "Total time elapsed: 00:53:38\n"
228 | ]
229 | }
230 | ],
231 | "source": [
232 | "records += extract_weather_data(BASE_URL, API_KEY, target_date, 500)"
233 | ]
234 | },
235 | {
236 | "cell_type": "code",
237 | "execution_count": 9,
238 | "metadata": {
239 | "scrolled": false
240 | },
241 | "outputs": [
242 | {
243 | "data": {
244 | "text/plain": [
245 | "1000"
246 | ]
247 | },
248 | "execution_count": 9,
249 | "metadata": {},
250 | "output_type": "execute_result"
251 | }
252 | ],
253 | "source": [
254 | "len(records)"
255 | ]
256 | },
257 | {
258 | "cell_type": "code",
259 | "execution_count": 10,
260 | "metadata": {},
261 | "outputs": [],
262 | "source": [
263 | "# with open('records_pt2.pkl', 'wb') as f:\n",
264 | "# pickle.dump(records, f)"
265 | ]
266 | },
267 | {
268 | "cell_type": "code",
269 | "execution_count": 11,
270 | "metadata": {},
271 | "outputs": [],
272 | "source": [
273 | "# load records list - still need to run cells 1 and 3\n",
274 | "# with open('records_pt2.pkl', 'rb') as fp:\n",
275 | "# records = pickle.load(fp)"
276 | ]
277 | },
278 | {
279 | "cell_type": "code",
280 | "execution_count": 12,
281 | "metadata": {},
282 | "outputs": [],
283 | "source": [
284 | "df = pd.DataFrame(records, columns=features).set_index('date')"
285 | ]
286 | },
287 | {
288 | "cell_type": "code",
289 | "execution_count": 13,
290 | "metadata": {},
291 | "outputs": [
292 | {
293 | "data": {
294 | "text/html": [
295 | "
\n",
296 | "\n",
309 | "
\n",
310 | " \n",
311 | " \n",
312 | " | \n",
313 | " meantempm | \n",
314 | " meandewptm | \n",
315 | "
\n",
316 | " \n",
317 | " | date | \n",
318 | " | \n",
319 | " | \n",
320 | "
\n",
321 | " \n",
322 | " \n",
323 | " \n",
324 | " | 2015-10-03 22:13:06.559948 | \n",
325 | " 21 | \n",
326 | " 6 | \n",
327 | "
\n",
328 | " \n",
329 | " | 2015-10-04 22:13:06.559948 | \n",
330 | " 22 | \n",
331 | " 8 | \n",
332 | "
\n",
333 | " \n",
334 | " | 2015-10-05 22:13:06.559948 | \n",
335 | " 24 | \n",
336 | " 11 | \n",
337 | "
\n",
338 | " \n",
339 | " | 2015-10-06 22:13:06.559948 | \n",
340 | " 23 | \n",
341 | " 11 | \n",
342 | "
\n",
343 | " \n",
344 | " | 2015-10-07 22:13:06.559948 | \n",
345 | " 24 | \n",
346 | " 13 | \n",
347 | "
\n",
348 | " \n",
349 | " | 2015-10-08 22:13:06.559948 | \n",
350 | " 26 | \n",
351 | " 17 | \n",
352 | "
\n",
353 | " \n",
354 | " | 2015-10-09 22:13:06.559948 | \n",
355 | " 26 | \n",
356 | " 17 | \n",
357 | "
\n",
358 | " \n",
359 | " | 2015-10-10 22:13:06.559948 | \n",
360 | " 24 | \n",
361 | " 14 | \n",
362 | "
\n",
363 | " \n",
364 | " | 2015-10-11 22:13:06.559948 | \n",
365 | " 26 | \n",
366 | " 16 | \n",
367 | "
\n",
368 | " \n",
369 | " | 2015-10-12 22:13:06.559948 | \n",
370 | " 28 | \n",
371 | " 19 | \n",
372 | "
\n",
373 | " \n",
374 | "
\n",
375 | "
"
376 | ],
377 | "text/plain": [
378 | " meantempm meandewptm\n",
379 | "date \n",
380 | "2015-10-03 22:13:06.559948 21 6\n",
381 | "2015-10-04 22:13:06.559948 22 8\n",
382 | "2015-10-05 22:13:06.559948 24 11\n",
383 | "2015-10-06 22:13:06.559948 23 11\n",
384 | "2015-10-07 22:13:06.559948 24 13\n",
385 | "2015-10-08 22:13:06.559948 26 17\n",
386 | "2015-10-09 22:13:06.559948 26 17\n",
387 | "2015-10-10 22:13:06.559948 24 14\n",
388 | "2015-10-11 22:13:06.559948 26 16\n",
389 | "2015-10-12 22:13:06.559948 28 19"
390 | ]
391 | },
392 | "execution_count": 13,
393 | "metadata": {},
394 | "output_type": "execute_result"
395 | }
396 | ],
397 | "source": [
398 | "tmp = df[['meantempm', 'meandewptm']].head(10)\n",
399 | "tmp"
400 | ]
401 | },
402 | {
403 | "cell_type": "code",
404 | "execution_count": 14,
405 | "metadata": {},
406 | "outputs": [
407 | {
408 | "data": {
409 | "text/html": [
410 | "\n",
411 | "\n",
424 | "
\n",
425 | " \n",
426 | " \n",
427 | " | \n",
428 | " meantempm | \n",
429 | " meandewptm | \n",
430 | " meantempm_1 | \n",
431 | "
\n",
432 | " \n",
433 | " | date | \n",
434 | " | \n",
435 | " | \n",
436 | " | \n",
437 | "
\n",
438 | " \n",
439 | " \n",
440 | " \n",
441 | " | 2015-10-03 22:13:06.559948 | \n",
442 | " 21 | \n",
443 | " 6 | \n",
444 | " NaN | \n",
445 | "
\n",
446 | " \n",
447 | " | 2015-10-04 22:13:06.559948 | \n",
448 | " 22 | \n",
449 | " 8 | \n",
450 | " 21 | \n",
451 | "
\n",
452 | " \n",
453 | " | 2015-10-05 22:13:06.559948 | \n",
454 | " 24 | \n",
455 | " 11 | \n",
456 | " 22 | \n",
457 | "
\n",
458 | " \n",
459 | " | 2015-10-06 22:13:06.559948 | \n",
460 | " 23 | \n",
461 | " 11 | \n",
462 | " 24 | \n",
463 | "
\n",
464 | " \n",
465 | " | 2015-10-07 22:13:06.559948 | \n",
466 | " 24 | \n",
467 | " 13 | \n",
468 | " 23 | \n",
469 | "
\n",
470 | " \n",
471 | " | 2015-10-08 22:13:06.559948 | \n",
472 | " 26 | \n",
473 | " 17 | \n",
474 | " 24 | \n",
475 | "
\n",
476 | " \n",
477 | " | 2015-10-09 22:13:06.559948 | \n",
478 | " 26 | \n",
479 | " 17 | \n",
480 | " 26 | \n",
481 | "
\n",
482 | " \n",
483 | " | 2015-10-10 22:13:06.559948 | \n",
484 | " 24 | \n",
485 | " 14 | \n",
486 | " 26 | \n",
487 | "
\n",
488 | " \n",
489 | " | 2015-10-11 22:13:06.559948 | \n",
490 | " 26 | \n",
491 | " 16 | \n",
492 | " 24 | \n",
493 | "
\n",
494 | " \n",
495 | " | 2015-10-12 22:13:06.559948 | \n",
496 | " 28 | \n",
497 | " 19 | \n",
498 | " 26 | \n",
499 | "
\n",
500 | " \n",
501 | "
\n",
502 | "
"
503 | ],
504 | "text/plain": [
505 | " meantempm meandewptm meantempm_1\n",
506 | "date \n",
507 | "2015-10-03 22:13:06.559948 21 6 NaN\n",
508 | "2015-10-04 22:13:06.559948 22 8 21\n",
509 | "2015-10-05 22:13:06.559948 24 11 22\n",
510 | "2015-10-06 22:13:06.559948 23 11 24\n",
511 | "2015-10-07 22:13:06.559948 24 13 23\n",
512 | "2015-10-08 22:13:06.559948 26 17 24\n",
513 | "2015-10-09 22:13:06.559948 26 17 26\n",
514 | "2015-10-10 22:13:06.559948 24 14 26\n",
515 | "2015-10-11 22:13:06.559948 26 16 24\n",
516 | "2015-10-12 22:13:06.559948 28 19 26"
517 | ]
518 | },
519 | "execution_count": 14,
520 | "metadata": {},
521 | "output_type": "execute_result"
522 | }
523 | ],
524 | "source": [
525 | "# 1 day prior\n",
526 | "N = 1\n",
527 | "\n",
528 | "# target measurement of mean temperature\n",
529 | "feature = 'meantempm'\n",
530 | "\n",
531 | "# total number of rows\n",
532 | "rows = tmp.shape[0]\n",
533 | "\n",
534 | "# a list representing Nth prior measurements of feature\n",
535 | "nth_prior_measurements = tmp[feature].shift(periods=N)\n",
536 | "\n",
537 | "# make a new column name of feature_N and add to DataFrame\n",
538 | "col_name = f'{feature}_{N}'\n",
539 | "tmp[col_name] = nth_prior_measurements\n",
540 | "tmp"
541 | ]
542 | },
543 | {
544 | "cell_type": "code",
545 | "execution_count": 15,
546 | "metadata": {},
547 | "outputs": [],
548 | "source": [
549 | "def derive_nth_day_feature(df, feature, N):\n",
550 | " nth_prior_measurements = df[feature].shift(periods=N)\n",
551 | " col_name = f'{feature}_{N}'\n",
552 | " df[col_name] = nth_prior_measurements"
553 | ]
554 | },
555 | {
556 | "cell_type": "code",
557 | "execution_count": 16,
558 | "metadata": {},
559 | "outputs": [],
560 | "source": [
561 | "for feature in features:\n",
562 | " if feature != 'date':\n",
563 | " for N in range(1, 4):\n",
564 | " derive_nth_day_feature(df, feature, N)"
565 | ]
566 | },
567 | {
568 | "cell_type": "code",
569 | "execution_count": 17,
570 | "metadata": {},
571 | "outputs": [
572 | {
573 | "data": {
574 | "text/plain": [
575 | "Index(['meantempm', 'meandewptm', 'meanpressurem', 'maxhumidity',\n",
576 | " 'minhumidity', 'maxtempm', 'mintempm', 'maxdewptm', 'mindewptm',\n",
577 | " 'maxpressurem', 'minpressurem', 'precipm', 'meantempm_1', 'meantempm_2',\n",
578 | " 'meantempm_3', 'meandewptm_1', 'meandewptm_2', 'meandewptm_3',\n",
579 | " 'meanpressurem_1', 'meanpressurem_2', 'meanpressurem_3',\n",
580 | " 'maxhumidity_1', 'maxhumidity_2', 'maxhumidity_3', 'minhumidity_1',\n",
581 | " 'minhumidity_2', 'minhumidity_3', 'maxtempm_1', 'maxtempm_2',\n",
582 | " 'maxtempm_3', 'mintempm_1', 'mintempm_2', 'mintempm_3', 'maxdewptm_1',\n",
583 | " 'maxdewptm_2', 'maxdewptm_3', 'mindewptm_1', 'mindewptm_2',\n",
584 | " 'mindewptm_3', 'maxpressurem_1', 'maxpressurem_2', 'maxpressurem_3',\n",
585 | " 'minpressurem_1', 'minpressurem_2', 'minpressurem_3', 'precipm_1',\n",
586 | " 'precipm_2', 'precipm_3'],\n",
587 | " dtype='object')"
588 | ]
589 | },
590 | "execution_count": 17,
591 | "metadata": {},
592 | "output_type": "execute_result"
593 | }
594 | ],
595 | "source": [
596 | "df.columns"
597 | ]
598 | },
599 | {
600 | "cell_type": "code",
601 | "execution_count": 18,
602 | "metadata": {},
603 | "outputs": [
604 | {
605 | "data": {
606 | "text/plain": [
607 | "Index(['meantempm', 'maxtempm', 'mintempm', 'meantempm_1', 'meantempm_2',\n",
608 | " 'meantempm_3', 'meandewptm_1', 'meandewptm_2', 'meandewptm_3',\n",
609 | " 'meanpressurem_1', 'meanpressurem_2', 'meanpressurem_3',\n",
610 | " 'maxhumidity_1', 'maxhumidity_2', 'maxhumidity_3', 'minhumidity_1',\n",
611 | " 'minhumidity_2', 'minhumidity_3', 'maxtempm_1', 'maxtempm_2',\n",
612 | " 'maxtempm_3', 'mintempm_1', 'mintempm_2', 'mintempm_3', 'maxdewptm_1',\n",
613 | " 'maxdewptm_2', 'maxdewptm_3', 'mindewptm_1', 'mindewptm_2',\n",
614 | " 'mindewptm_3', 'maxpressurem_1', 'maxpressurem_2', 'maxpressurem_3',\n",
615 | " 'minpressurem_1', 'minpressurem_2', 'minpressurem_3', 'precipm_1',\n",
616 | " 'precipm_2', 'precipm_3'],\n",
617 | " dtype='object')"
618 | ]
619 | },
620 | "execution_count": 18,
621 | "metadata": {},
622 | "output_type": "execute_result"
623 | }
624 | ],
625 | "source": [
626 | "# make list of original features without meantempm, mintempm, and maxtempm\n",
627 | "to_remove = [feature\n",
628 | " for feature in features\n",
629 | " if feature not in ['meantempm', 'mintempm', 'maxtempm']]\n",
630 | "\n",
631 | "# make a list of columns to keep\n",
632 | "to_keep = [col for col in df.columns if col not in to_remove]\n",
633 | "\n",
634 | "# select only the columns in to_keep and assign to df\n",
635 | "df = df[to_keep]\n",
636 | "df.columns"
637 | ]
638 | },
639 | {
640 | "cell_type": "code",
641 | "execution_count": 19,
642 | "metadata": {},
643 | "outputs": [
644 | {
645 | "name": "stdout",
646 | "output_type": "stream",
647 | "text": [
648 | "\n",
649 | "DatetimeIndex: 1000 entries, 2015-10-03 22:13:06.559948 to 2018-06-28 00:00:00\n",
650 | "Data columns (total 39 columns):\n",
651 | "meantempm 1000 non-null object\n",
652 | "maxtempm 1000 non-null object\n",
653 | "mintempm 1000 non-null object\n",
654 | "meantempm_1 999 non-null object\n",
655 | "meantempm_2 998 non-null object\n",
656 | "meantempm_3 997 non-null object\n",
657 | "meandewptm_1 999 non-null object\n",
658 | "meandewptm_2 998 non-null object\n",
659 | "meandewptm_3 997 non-null object\n",
660 | "meanpressurem_1 999 non-null object\n",
661 | "meanpressurem_2 998 non-null object\n",
662 | "meanpressurem_3 997 non-null object\n",
663 | "maxhumidity_1 999 non-null object\n",
664 | "maxhumidity_2 998 non-null object\n",
665 | "maxhumidity_3 997 non-null object\n",
666 | "minhumidity_1 999 non-null object\n",
667 | "minhumidity_2 998 non-null object\n",
668 | "minhumidity_3 997 non-null object\n",
669 | "maxtempm_1 999 non-null object\n",
670 | "maxtempm_2 998 non-null object\n",
671 | "maxtempm_3 997 non-null object\n",
672 | "mintempm_1 999 non-null object\n",
673 | "mintempm_2 998 non-null object\n",
674 | "mintempm_3 997 non-null object\n",
675 | "maxdewptm_1 999 non-null object\n",
676 | "maxdewptm_2 998 non-null object\n",
677 | "maxdewptm_3 997 non-null object\n",
678 | "mindewptm_1 999 non-null object\n",
679 | "mindewptm_2 998 non-null object\n",
680 | "mindewptm_3 997 non-null object\n",
681 | "maxpressurem_1 999 non-null object\n",
682 | "maxpressurem_2 998 non-null object\n",
683 | "maxpressurem_3 997 non-null object\n",
684 | "minpressurem_1 999 non-null object\n",
685 | "minpressurem_2 998 non-null object\n",
686 | "minpressurem_3 997 non-null object\n",
687 | "precipm_1 999 non-null object\n",
688 | "precipm_2 998 non-null object\n",
689 | "precipm_3 997 non-null object\n",
690 | "dtypes: object(39)\n",
691 | "memory usage: 312.5+ KB\n"
692 | ]
693 | }
694 | ],
695 | "source": [
696 | "df.info()"
697 | ]
698 | },
699 | {
700 | "cell_type": "code",
701 | "execution_count": 20,
702 | "metadata": {},
703 | "outputs": [
704 | {
705 | "name": "stdout",
706 | "output_type": "stream",
707 | "text": [
708 | "\n",
709 | "DatetimeIndex: 1000 entries, 2015-10-03 22:13:06.559948 to 2018-06-28 00:00:00\n",
710 | "Data columns (total 39 columns):\n",
711 | "meantempm 997 non-null float64\n",
712 | "maxtempm 997 non-null float64\n",
713 | "mintempm 997 non-null float64\n",
714 | "meantempm_1 996 non-null float64\n",
715 | "meantempm_2 995 non-null float64\n",
716 | "meantempm_3 994 non-null float64\n",
717 | "meandewptm_1 996 non-null float64\n",
718 | "meandewptm_2 995 non-null float64\n",
719 | "meandewptm_3 994 non-null float64\n",
720 | "meanpressurem_1 994 non-null float64\n",
721 | "meanpressurem_2 993 non-null float64\n",
722 | "meanpressurem_3 992 non-null float64\n",
723 | "maxhumidity_1 996 non-null float64\n",
724 | "maxhumidity_2 995 non-null float64\n",
725 | "maxhumidity_3 994 non-null float64\n",
726 | "minhumidity_1 996 non-null float64\n",
727 | "minhumidity_2 995 non-null float64\n",
728 | "minhumidity_3 994 non-null float64\n",
729 | "maxtempm_1 996 non-null float64\n",
730 | "maxtempm_2 995 non-null float64\n",
731 | "maxtempm_3 994 non-null float64\n",
732 | "mintempm_1 996 non-null float64\n",
733 | "mintempm_2 995 non-null float64\n",
734 | "mintempm_3 994 non-null float64\n",
735 | "maxdewptm_1 996 non-null float64\n",
736 | "maxdewptm_2 995 non-null float64\n",
737 | "maxdewptm_3 994 non-null float64\n",
738 | "mindewptm_1 996 non-null float64\n",
739 | "mindewptm_2 995 non-null float64\n",
740 | "mindewptm_3 994 non-null float64\n",
741 | "maxpressurem_1 994 non-null float64\n",
742 | "maxpressurem_2 993 non-null float64\n",
743 | "maxpressurem_3 992 non-null float64\n",
744 | "minpressurem_1 994 non-null float64\n",
745 | "minpressurem_2 993 non-null float64\n",
746 | "minpressurem_3 992 non-null float64\n",
747 | "precipm_1 999 non-null float64\n",
748 | "precipm_2 998 non-null float64\n",
749 | "precipm_3 997 non-null float64\n",
750 | "dtypes: float64(39)\n",
751 | "memory usage: 312.5 KB\n"
752 | ]
753 | }
754 | ],
755 | "source": [
756 | "df = df.apply(pd.to_numeric, errors='coerce')\n",
757 | "df.info()"
758 | ]
759 | },
760 | {
761 | "cell_type": "code",
762 | "execution_count": 21,
763 | "metadata": {},
764 | "outputs": [
765 | {
766 | "data": {
767 | "text/html": [
768 | "\n",
769 | "\n",
782 | "
\n",
783 | " \n",
784 | " \n",
785 | " | \n",
786 | " count | \n",
787 | " mean | \n",
788 | " std | \n",
789 | " min | \n",
790 | " 25% | \n",
791 | " 50% | \n",
792 | " 75% | \n",
793 | " max | \n",
794 | " outliers | \n",
795 | "
\n",
796 | " \n",
797 | " \n",
798 | " \n",
799 | " | maxhumidity_1 | \n",
800 | " 996.0 | \n",
801 | " 94.326305 | \n",
802 | " 10.732047 | \n",
803 | " 45.0 | \n",
804 | " 94.0 | \n",
805 | " 100.0 | \n",
806 | " 100.0 | \n",
807 | " 100.00 | \n",
808 | " True | \n",
809 | "
\n",
810 | " \n",
811 | " | maxhumidity_2 | \n",
812 | " 995.0 | \n",
813 | " 94.320603 | \n",
814 | " 10.735934 | \n",
815 | " 45.0 | \n",
816 | " 94.0 | \n",
817 | " 100.0 | \n",
818 | " 100.0 | \n",
819 | " 100.00 | \n",
820 | " True | \n",
821 | "
\n",
822 | " \n",
823 | " | maxhumidity_3 | \n",
824 | " 994.0 | \n",
825 | " 94.314889 | \n",
826 | " 10.739825 | \n",
827 | " 45.0 | \n",
828 | " 94.0 | \n",
829 | " 100.0 | \n",
830 | " 100.0 | \n",
831 | " 100.00 | \n",
832 | " True | \n",
833 | "
\n",
834 | " \n",
835 | " | minpressurem_1 | \n",
836 | " 994.0 | \n",
837 | " 1014.230382 | \n",
838 | " 5.858541 | \n",
839 | " 996.0 | \n",
840 | " 1011.0 | \n",
841 | " 1014.0 | \n",
842 | " 1017.0 | \n",
843 | " 1037.00 | \n",
844 | " True | \n",
845 | "
\n",
846 | " \n",
847 | " | minpressurem_2 | \n",
848 | " 993.0 | \n",
849 | " 1014.231621 | \n",
850 | " 5.861363 | \n",
851 | " 996.0 | \n",
852 | " 1011.0 | \n",
853 | " 1014.0 | \n",
854 | " 1017.0 | \n",
855 | " 1037.00 | \n",
856 | " True | \n",
857 | "
\n",
858 | " \n",
859 | " | minpressurem_3 | \n",
860 | " 992.0 | \n",
861 | " 1014.231855 | \n",
862 | " 5.864315 | \n",
863 | " 996.0 | \n",
864 | " 1011.0 | \n",
865 | " 1014.0 | \n",
866 | " 1017.0 | \n",
867 | " 1037.00 | \n",
868 | " True | \n",
869 | "
\n",
870 | " \n",
871 | " | precipm_1 | \n",
872 | " 999.0 | \n",
873 | " 1.419109 | \n",
874 | " 7.958652 | \n",
875 | " 0.0 | \n",
876 | " 0.0 | \n",
877 | " 0.0 | \n",
878 | " 0.0 | \n",
879 | " 131.57 | \n",
880 | " True | \n",
881 | "
\n",
882 | " \n",
883 | " | precipm_2 | \n",
884 | " 998.0 | \n",
885 | " 1.420531 | \n",
886 | " 7.962515 | \n",
887 | " 0.0 | \n",
888 | " 0.0 | \n",
889 | " 0.0 | \n",
890 | " 0.0 | \n",
891 | " 131.57 | \n",
892 | " True | \n",
893 | "
\n",
894 | " \n",
895 | " | precipm_3 | \n",
896 | " 997.0 | \n",
897 | " 1.421956 | \n",
898 | " 7.966384 | \n",
899 | " 0.0 | \n",
900 | " 0.0 | \n",
901 | " 0.0 | \n",
902 | " 0.0 | \n",
903 | " 131.57 | \n",
904 | " True | \n",
905 | "
\n",
906 | " \n",
907 | "
\n",
908 | "
"
909 | ],
910 | "text/plain": [
911 | " count mean std min 25% 50% 75% \\\n",
912 | "maxhumidity_1 996.0 94.326305 10.732047 45.0 94.0 100.0 100.0 \n",
913 | "maxhumidity_2 995.0 94.320603 10.735934 45.0 94.0 100.0 100.0 \n",
914 | "maxhumidity_3 994.0 94.314889 10.739825 45.0 94.0 100.0 100.0 \n",
915 | "minpressurem_1 994.0 1014.230382 5.858541 996.0 1011.0 1014.0 1017.0 \n",
916 | "minpressurem_2 993.0 1014.231621 5.861363 996.0 1011.0 1014.0 1017.0 \n",
917 | "minpressurem_3 992.0 1014.231855 5.864315 996.0 1011.0 1014.0 1017.0 \n",
918 | "precipm_1 999.0 1.419109 7.958652 0.0 0.0 0.0 0.0 \n",
919 | "precipm_2 998.0 1.420531 7.962515 0.0 0.0 0.0 0.0 \n",
920 | "precipm_3 997.0 1.421956 7.966384 0.0 0.0 0.0 0.0 \n",
921 | "\n",
922 | " max outliers \n",
923 | "maxhumidity_1 100.00 True \n",
924 | "maxhumidity_2 100.00 True \n",
925 | "maxhumidity_3 100.00 True \n",
926 | "minpressurem_1 1037.00 True \n",
927 | "minpressurem_2 1037.00 True \n",
928 | "minpressurem_3 1037.00 True \n",
929 | "precipm_1 131.57 True \n",
930 | "precipm_2 131.57 True \n",
931 | "precipm_3 131.57 True "
932 | ]
933 | },
934 | "execution_count": 21,
935 | "metadata": {},
936 | "output_type": "execute_result"
937 | }
938 | ],
939 | "source": [
940 | "# Call describe on df and transpose it due to the large number of columns\n",
941 | "spread = df.describe().T\n",
942 | "\n",
943 | "# precalculate interquartile range for ease of use in next calculation\n",
944 | "IQR = spread['75%'] - spread['25%']\n",
945 | "\n",
946 | "# flag a feature as having outliers when its min is more than 3 IQRs below\n",
947 | "# the first quartile or its max is more than 3 IQRs above the third quartile\n",
948 | "spread['outliers'] = (spread['min'] <\n",
949 | " (spread['25%'] -\n",
950 | " (3 * IQR))) | (spread['max'] >\n",
951 | " (spread['75%'] + 3 * IQR))\n",
952 | "\n",
953 | "# just display the features containing extreme outliers\n",
954 | "spread.loc[spread.outliers, ]"
955 | ]
956 | },
957 | {
958 | "cell_type": "code",
959 | "execution_count": 22,
960 | "metadata": {},
961 | "outputs": [],
962 | "source": [
963 | "# iterate over the precip columns\n",
964 | "for precip_col in ['precipm_1', 'precipm_2', 'precipm_3']:\n",
965 | " # create a boolean array of values representing nans\n",
966 | " missing_vals = pd.isnull(df[precip_col])\n",
967 | " df[precip_col][missing_vals] = 0"
968 | ]
969 | },
970 | {
971 | "cell_type": "code",
972 | "execution_count": 23,
973 | "metadata": {},
974 | "outputs": [],
975 | "source": [
976 | "df = df.dropna()"
977 | ]
978 | },
979 | {
980 | "cell_type": "code",
981 | "execution_count": 24,
982 | "metadata": {},
983 | "outputs": [
984 | {
985 | "data": {
986 | "image/png": "\n",
987 | "text/plain": [
988 | ""
989 | ]
990 | },
991 | "metadata": {},
992 | "output_type": "display_data"
993 | }
994 | ],
995 | "source": [
996 | "fig, ax = plt.subplots(figsize = (14, 8))\n",
997 | "ax.hist(df.maxhumidity_1)\n",
998 | "ax.set_title('Distribution of maxhumidity_1')\n",
999 | "ax.set_xlabel('maxhumidity_1')\n",
1000 | "ax.grid()"
1001 | ]
1002 | },
1003 | {
1004 | "cell_type": "code",
1005 | "execution_count": 25,
1006 | "metadata": {},
1007 | "outputs": [
1008 | {
1009 | "data": {
1010 | "image/png": "\n",
1011 | "text/plain": [
1012 | ""
1013 | ]
1014 | },
1015 | "metadata": {},
1016 | "output_type": "display_data"
1017 | }
1018 | ],
1019 | "source": [
1020 | "fig, ax = plt.subplots(figsize = (14, 8))\n",
1021 | "ax.hist(df.minpressurem_1)\n",
1022 | "ax.set_title('Distribution of minpressurem_1')\n",
1023 | "ax.set_xlabel('minpressurem_1')\n",
1024 | "ax.grid()"
1025 | ]
1026 | },
1027 | {
1028 | "cell_type": "code",
1029 | "execution_count": 26,
1030 | "metadata": {},
1031 | "outputs": [],
1032 | "source": [
1033 | "# import pickle\n",
1034 | "with open('end-part1_df.pkl', 'wb') as f:\n",
1035 | " pickle.dump(df, f)"
1036 | ]
1037 | }
1038 | ],
1039 | "metadata": {
1040 | "kernelspec": {
1041 | "display_name": "Python 3",
1042 | "language": "python",
1043 | "name": "python3"
1044 | },
1045 | "language_info": {
1046 | "codemirror_mode": {
1047 | "name": "ipython",
1048 | "version": 3
1049 | },
1050 | "file_extension": ".py",
1051 | "mimetype": "text/x-python",
1052 | "name": "python",
1053 | "nbconvert_exporter": "python",
1054 | "pygments_lexer": "ipython3",
1055 | "version": "3.6.5"
1056 | }
1057 | },
1058 | "nbformat": 4,
1059 | "nbformat_minor": 2
1060 | }
1061 |
--------------------------------------------------------------------------------