├── .gitignore ├── requirements.txt ├── collect_weather.py ├── preprocess.py ├── train_test_dnn.py ├── train_test.py ├── weather.py ├── README.md └── Collect Weather Data API.ipynb /.gitignore: -------------------------------------------------------------------------------- 1 | .ipynb_checkpoints/ 2 | *-working.ipynb 3 | .vscode/ 4 | tf_wx_model/ 5 | *.csv 6 | *.pkl 7 | *.pxi 8 | __pycache__ -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | absl-py==0.7.0 2 | asn1crypto==0.24.0 3 | astor==0.7.1 4 | astroid==2.1.0 5 | certifi==2018.11.29 6 | cffi==1.11.5 7 | chardet==3.0.4 8 | colorama==0.4.1 9 | cryptography==2.5 10 | gast==0.2.2 11 | grpcio==1.16.1 12 | h5py==2.9.0 13 | idna==2.8 14 | isort==4.3.4 15 | Keras-Applications==1.0.6 16 | Keras-Preprocessing==1.0.5 17 | lazy-object-proxy==1.3.1 18 | Markdown==3.0.1 19 | mccabe==0.6.1 20 | mkl-fft==1.0.10 21 | mkl-random==1.0.2 22 | numpy==1.15.4 23 | pandas==0.24.1 24 | patsy==0.5.1 25 | protobuf==3.6.1 26 | psutil==5.5.0 27 | pycparser==2.19 28 | pylint==2.2.2 29 | pyOpenSSL==19.0.0 30 | PyPrind==2.11.2 31 | pyreadline==2.1 32 | PySocks==1.6.8 33 | python-dateutil==2.7.5 34 | pytz==2018.9 35 | requests==2.21.0 36 | scikit-learn==0.20.2 37 | scipy==1.2.0 38 | six==1.12.0 39 | statsmodels==0.9.0 40 | tensorboard==1.12.2 41 | tensorflow==1.12.2 42 | termcolor==1.1.0 43 | typed-ast==1.1.0 44 | urllib3==1.24.2 45 | Werkzeug==0.15.3 46 | win-inet-pton==1.0.1 47 | wincertstore==0.2 48 | wrapt==1.11.1 49 | -------------------------------------------------------------------------------- /collect_weather.py: -------------------------------------------------------------------------------- 1 | import os.path 2 | import pickle 3 | from datetime import timedelta 4 | 5 | from weather import API_KEY, BASE_URL, extract_weather_data, get_target_date 6 | 7 | filename1 = 'records_pt1.pkl' 8 | filename2 = 'records_pt2.pkl' 9 | 10 | if os.path.isfile(filename2): 11 | print('1000 records already collected from Dark Sky API') 12 | 13 | elif os.path.isfile(filename1): 14 | with open(filename1, 'rb') as fp: 15 | records = pickle.load(fp) 16 | 17 | target_date = records[-1][0] + timedelta(days=1) 18 | 19 | records += extract_weather_data(BASE_URL, API_KEY, target_date, 500) 20 | 21 | records_length = len(records) 22 | print(f'{records_length} records collected from Dark Sky API') 23 | 24 | with open(filename2, 'wb') as f: 25 | pickle.dump(records, f) 26 | 27 | print(f'Weather records from day 2 saved to {filename2}.') 28 | 29 | else: 30 | target_date = get_target_date() 31 | 32 | records = extract_weather_data(BASE_URL, API_KEY, target_date, 500) 33 | 34 | records_length = len(records) 35 | print(f'{records_length} records collected from Dark Sky API') 36 | 37 | with open(filename1, 'wb') as f: 38 | pickle.dump(records, f) 39 | 40 | print(f'Weather records from day 1 saved to {filename1}.') 41 | -------------------------------------------------------------------------------- /preprocess.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | 3 | import pandas as pd 4 | 5 | from weather import derive_nth_day_feature, features 6 | 7 | with open('records_pt2.pkl', 'rb') as fp: 8 | records = pickle.load(fp) 9 | 10 | df = pd.DataFrame(records, columns=features).set_index('date') 11 | 12 | for feature in features: 13 | if feature != 'date': 14 | for N in range(1, 4): 15 | derive_nth_day_feature(df, feature, N) 16 | 17 | # make list of original features without temperatureMean, temperatureMin, and temperatureMax 18 | to_remove = [ 19 | feature 20 | for feature in features 21 | if feature not in ['temperatureMean', 'temperatureMin', 'temperatureMax'] 22 | ] 23 | 24 | # make a list of columns to keep 25 | to_keep = [col for col in df.columns if col not in to_remove] 26 | 27 | # select only the columns in to_keep and assign to df 28 | df = df[to_keep] 29 | 30 | df = df.apply(pd.to_numeric, errors='coerce') 31 | 32 | # Call describe on df and transpose it due to the large number of columns 33 | spread = df.describe().T 34 | 35 | # precalculate interquartile range for ease of use in next calculation 36 | IQR = spread['75%'] - spread['25%'] 37 | 38 | # create an outliers column which is either 3 IQRs below the first quartile or 39 | # 3 IQRs above the third quartile 40 | spread['outliers'] = (spread['min'] < (spread['25%'] - (3 * IQR))) | ( 41 | spread['max'] > (spread['75%'] + 3 * IQR) 42 | ) 43 | 44 | # iterate over the precip columns 45 | for precip_col in ['precipProbability_1', 'precipProbability_2', 'precipProbability_3']: 46 | # create a boolean array of values representing nans 47 | missing_vals = pd.isnull(df[precip_col]) 48 | df[precip_col][missing_vals] = 0 49 | 50 | df = df.dropna() 51 | 52 | with open('end-part1_df.pkl', 'wb') as f: 53 | pickle.dump(df, f) 54 | -------------------------------------------------------------------------------- /train_test_dnn.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | 3 | import numpy as np 4 | import tensorflow as tf 5 | from sklearn.metrics import ( 6 | explained_variance_score, 7 | mean_absolute_error, 8 | median_absolute_error, 9 | ) 10 | from sklearn.model_selection import train_test_split 11 | 12 | with open('end-part1_df.pkl', 'rb') as fp: 13 | df = pickle.load(fp) 14 | 15 | df.index = df.index.values.astype(float) 16 | 17 | # First drop the temperatureMax and temperatureMin from the dataframe 18 | df = df.drop(['temperatureMin', 'temperatureMax'], axis=1) 19 | 20 | # X will be a pandas dataframe of all columns except temperatureMean 21 | X = df[[col for col in df.columns if col != 'temperatureMean']] 22 | 23 | # y will be a pandas series of the temperatureMean 24 | y = df['temperatureMean'] 25 | 26 | # split data into training set and a temporary set 27 | X_train, X_tmp, y_train, y_tmp = train_test_split(X, y, test_size=0.2, random_state=23) 28 | 29 | # split the remaining 20% of data evenly 30 | X_test, X_val, y_test, y_val = train_test_split( 31 | X_tmp, y_tmp, test_size=0.5, random_state=23 32 | ) 33 | 34 | X_train.shape, X_test.shape, X_val.shape 35 | print( 36 | f'Training instances {X_train.shape[0]}, Training features {X_train.shape[1]}' 37 | ) 38 | print(f'Validation instances {X_val.shape[0]}, Validation features {X_val.shape[1]}') 39 | print(f'Testing instances {X_test.shape[0]}, Testing features {X_test.shape[1]}') 40 | 41 | feature_cols = [tf.feature_column.numeric_column(col) for col in X.columns] 42 | 43 | regressor = tf.estimator.DNNRegressor( 44 | feature_columns=feature_cols, 45 | hidden_units=[50, 50], 46 | model_dir='~/Projects/machine-learning-predict-weather/tf_models/tf_wx_model', 47 | ) 48 | 49 | 50 | def wx_input_fn(X, y=None, num_epochs=None, shuffle=True, batch_size=400): 51 | return tf.estimator.inputs.pandas_input_fn( 52 | x=X, y=y, num_epochs=num_epochs, shuffle=shuffle, batch_size=batch_size 53 | ) 54 | 55 | 56 | evaluations = [] 57 | STEPS = 400 58 | for i in range(100): 59 | regressor.train(input_fn=wx_input_fn(X_train, y=y_train), steps=STEPS) 60 | evaluations.append( 61 | regressor.evaluate( 62 | input_fn=wx_input_fn(X_val, y_val, num_epochs=1, shuffle=False) 63 | ) 64 | ) 65 | 66 | pred = regressor.predict(input_fn=wx_input_fn(X_test, num_epochs=1, shuffle=False)) 67 | predictions = np.array([p['predictions'][0] for p in pred]) 68 | 69 | print(f'The Explained Variance: {explained_variance_score(y_test, predictions):.2f}') 70 | print( 71 | f'The Mean Absolute Error: {mean_absolute_error(y_test, predictions):.2f} degrees Celcius' 72 | ) 73 | print( 74 | f'The Median Absolute Error: {median_absolute_error(y_test, predictions):.2f} degrees Celcius' 75 | ) 76 | -------------------------------------------------------------------------------- /train_test.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | 3 | import pandas as pd 4 | import statsmodels.api as sm 5 | from sklearn.linear_model import LinearRegression 6 | from sklearn.metrics import mean_absolute_error, median_absolute_error 7 | from sklearn.model_selection import train_test_split 8 | 9 | with open('end-part1_df.pkl', 'rb') as fp: 10 | df = pickle.load(fp) 11 | 12 | df_corr = df.corr()[['temperatureMean']].sort_values('temperatureMean') 13 | df_corr_fil = df_corr[abs(df_corr['temperatureMean']) > 0.55] 14 | 15 | unwanted = ['temperatureMin', 'temperatureMax', 'temperatureMean'] 16 | predictors = df_corr_fil.index.tolist() 17 | predictors = [i for i in predictors if i not in unwanted] 18 | 19 | df2 = df[['temperatureMean'] + predictors] 20 | 21 | X = df2[predictors] 22 | y = df2['temperatureMean'] 23 | alpha = 0.05 24 | 25 | 26 | def stepwise_selection( 27 | X, y, initial_list=predictors, threshold_out=alpha, verbose=True 28 | ): 29 | """ Perform a forward-backward feature selection 30 | based on p-value from statsmodels.api.OLS 31 | Arguments: 32 | X - pandas.DataFrame with candidate features 33 | y - list-like with the target 34 | initial_list - list of features to start with (column names of X) 35 | threshold_in - include a feature if its p-value < threshold_in 36 | threshold_out - exclude a feature if its p-value > threshold_out 37 | verbose - whether to print the sequence of inclusions and exclusions 38 | Returns: list of selected features 39 | See https://en.wikipedia.org/wiki/Stepwise_regression for the details 40 | """ 41 | included = list(initial_list) 42 | while True: 43 | changed = False 44 | model = sm.OLS(y, sm.add_constant(pd.DataFrame(X[included]))).fit() 45 | # use all coefs except intercept 46 | pvalues = model.pvalues.iloc[1:] 47 | worst_pval = pvalues.max() # null if pvalues is empty 48 | if worst_pval > threshold_out: 49 | changed = True 50 | worst_feature = pvalues.idxmax() 51 | included.remove(worst_feature) 52 | if verbose: 53 | print('Drop {:30} with p-value {:.6}'.format(worst_feature, worst_pval)) 54 | if not changed: 55 | break 56 | return included 57 | 58 | 59 | result = stepwise_selection(X, y) 60 | 61 | print('resulting features:') 62 | print(result) 63 | 64 | X = X[result] 65 | model = sm.OLS(y, X).fit() 66 | print(model.summary()) 67 | 68 | X_train, X_test, y_train, y_test = train_test_split( 69 | X, y, test_size=0.2, random_state=12 70 | ) 71 | 72 | regressor = LinearRegression() 73 | 74 | regressor.fit(X_train, y_train) 75 | 76 | prediction = regressor.predict(X_test) 77 | 78 | print(f'The Explained Variance: {regressor.score(X_test, y_test):.2f}') 79 | print( 80 | f'The Mean Absolute Error: {mean_absolute_error(y_test, prediction):.2f} degrees celcius' 81 | ) 82 | print( 83 | f'The Median Absolute Error: {median_absolute_error(y_test, prediction):.2f} degrees celcius' 84 | ) 85 | -------------------------------------------------------------------------------- /weather.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | from collections import namedtuple 4 | from datetime import datetime, timedelta 5 | 6 | import requests 7 | from pyprind import ProgBar 8 | 9 | loc = '30.578806,-97.853065' 10 | 11 | API_KEY = os.environ.get('MY_API_KEY') 12 | BASE_URL = 'https://api.darksky.net/forecast/{}/{},{}' 13 | 14 | features = [ 15 | 'date', 16 | 'temperatureMean', 17 | 'dewPoint', 18 | 'pressure', 19 | 'humidity', 20 | 'temperatureMax', 21 | 'temperatureMin', 22 | 'precipProbability', 23 | ] 24 | DailySummary = namedtuple('DailySummary', features) 25 | 26 | 27 | def extract_weather_data(url, api_key, target_date, days): 28 | """Call Wunderground API to extract weather data.""" 29 | records = [] 30 | bar = ProgBar(days) 31 | for _ in range(days): 32 | request = BASE_URL.format( 33 | API_KEY, loc, target_date.strftime('%Y-%m-%dT%H:%M:%S') 34 | ) 35 | response = requests.get(request) 36 | if response.status_code == 200: 37 | 38 | def get_mean_temp(): 39 | """Return average temperature across a 24 hour period.""" 40 | total_temp = 0 41 | for i in range(len(hdata)): 42 | try: 43 | total_temp += hdata[i]['temperature'] 44 | except KeyError: 45 | total_temp += hdata[i-1]['temperature'] 46 | meanTemp = total_temp / 24 47 | return meanTemp 48 | 49 | data = response.json()['daily']['data'][0] 50 | hdata = response.json()['hourly']['data'] 51 | try: 52 | records.append( 53 | DailySummary( 54 | date=target_date, 55 | temperatureMean=get_mean_temp(), 56 | dewPoint=data['dewPoint'], 57 | pressure=data['pressure'], 58 | humidity=data['humidity'], 59 | temperatureMax=data['temperatureMax'], 60 | temperatureMin=data['temperatureMin'], 61 | precipProbability=data['precipProbability'], 62 | ) 63 | ) 64 | except KeyError: 65 | records.append( 66 | DailySummary( 67 | date=target_date, 68 | temperatureMean=get_mean_temp(), 69 | dewPoint=data['dewPoint'], 70 | pressure=data['pressure'], 71 | humidity=data['humidity'], 72 | temperatureMax=data['temperatureMax'], 73 | temperatureMin=data['temperatureMin'], 74 | precipProbability=0, 75 | ) 76 | ) 77 | # time.sleep(6) 78 | bar.update() 79 | target_date += timedelta(days=1) 80 | return records 81 | 82 | 83 | def get_target_date(): 84 | """Return target date 1000 days prior to current date.""" 85 | current_date = datetime.now() 86 | target_date = current_date - timedelta(days=1000) 87 | return target_date 88 | 89 | 90 | def derive_nth_day_feature(df, feature, N): 91 | nth_prior_measurements = df[feature].shift(periods=N) 92 | col_name = f'{feature}_{N}' 93 | df[col_name] = nth_prior_measurements 94 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Using Machine Learning to Predict the Weather ([Powered by Dark Sky](https://darksky.net/poweredby/)) 2 | This project is based on a three-part article written by Adam McQuistan in [stackabuse.com](http://stackabuse.com/using-machine-learning-to-predict-the-weather-part-1/). 3 | 4 | ## Update regarding the weather API 5 | My original disclaimer was Weather Underground ([wunderground.com](https://www.wunderground.com/)) was no longer providing free API accounts. At some point (I don't know exactly when), they discontinued their API service altogether. I have since signed up for a [Dark Sky API](https://darksky.net/dev). They don't have a free tier but they do have a trial account which allows 1,000 API calls per day to evaluate the service. Every API request over the free daily limit costs $0.0001. 6 | 7 | ## Summary 8 | I won't go into too much detail about the project since you can go to the original article on stackabuse.com; however, here is a little background if you wish to save time. (Although checkout the series, it's worth the read.) 9 | 10 | The project is split into three separate Jupyter Notebooks: one to collect the weather data from the Wunderground.com developer's API (again I'm using Dark Sky's API), inspect it, and clean it; a second to further refine the features and fit the data to a Linear Regression model; and a third to train and evaluate a deep neural net regressor. 11 | 12 | ## Changes 13 | For the most part I did not deviate from the author's original process. I did seek to automate and streamline the code. For example, I added a progress bar to the data collection function and created another function to automatically set a target date that is 1000 days prior to the current date. I automated the code to remove features that did not show a strong correlation and implemented a stepwise regression function to automate removing features that had p-values that were too high. (The original author did this manually.) 14 | 15 | ## Added modules 16 | Automating the code allowed me to adapt the Python code in the Jupyter Notebooks to regular .py files. Jupyter Notebooks are fantastic tools but I believe the final product should be Python scripts that run in the background. Here are the scripts I added and a quick summary: 17 | 18 | 1. weather.py- a utility file that contains reused methods and variables 19 | 2. collect_weather.py- uses the Requests library to download weather data for 1000 days. Also uses ```os.path.isfile()``` and a ```if/elif/else``` statement to determine whether the data from the first 500 days should be collected, data from the second 500 days should be collected, or no data is to be collected. (This no longer necessary since the daily limit is 1,000 calls.) 20 | 3. preprocess.py- creates a Pandas DataFrame from the weather records and cleans the data 21 | 4. train_test.py- performs some additional preprocessing and fits the data to a Linear Regression model 22 | 5. train_test_dnn- uses the same weather data to train, evaluate, and test a deep neural network regressor 23 | 24 | ## Still To Do 25 | * Update collect_weather.py to make 1,000 API calls at once instead of 500 over two days 26 | * Update the Jupyter Notebooks for the Dark Sky API 27 | * Replace/remove some deprecated methods in the train_test.py and train_test_dnn.py modules 28 | * Add better documentation in the form of markdown cells to the notebooks. 29 | * Apply the model to future forecasts and validate against actual weather data. 30 | -------------------------------------------------------------------------------- /Collect Weather Data API.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import os\n", 10 | "import pickle\n", 11 | "import time\n", 12 | "from collections import namedtuple\n", 13 | "from datetime import datetime, timedelta\n", 14 | "\n", 15 | "import pandas as pd\n", 16 | "import requests\n", 17 | "\n", 18 | "import matplotlib.pyplot as plt\n", 19 | "from pyprind import ProgBar\n", 20 | "\n", 21 | "%matplotlib inline" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": 2, 27 | "metadata": {}, 28 | "outputs": [], 29 | "source": [ 30 | "API_KEY = os.environ.get('MY_API_KEY')\n", 31 | "BASE_URL = 'http://api.wunderground.com/api/{}/history_{}/q/TX/Round_Rock.json'" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": 3, 37 | "metadata": {}, 38 | "outputs": [], 39 | "source": [ 40 | "features = [\n", 41 | " \"date\", \"meantempm\", \"meandewptm\", \"meanpressurem\", \"maxhumidity\",\n", 42 | " \"minhumidity\", \"maxtempm\", \"mintempm\", \"maxdewptm\", \"mindewptm\",\n", 43 | " \"maxpressurem\", \"minpressurem\", \"precipm\"\n", 44 | "]\n", 45 | "DailySummary = namedtuple('DailySummary', features)" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": 4, 51 | "metadata": {}, 52 | "outputs": [], 53 | "source": [ 54 | "def extract_weather_data(url, api_key, target_date, days):\n", 55 | " \"\"\"Call Wunderground API to extract weather data.\"\"\"\n", 56 | " records = []\n", 57 | " bar = ProgBar(days)\n", 58 | " for _ in range(days):\n", 59 | " request = BASE_URL.format(API_KEY, target_date.strftime('%Y%m%d'))\n", 60 | " response = requests.get(request)\n", 61 | " if response.status_code == 200:\n", 62 | " data = response.json()['history']['dailysummary'][0]\n", 63 | " records.append(DailySummary(\n", 64 | " date=target_date,\n", 65 | " meantempm=data['meantempm'],\n", 66 | " meandewptm=data['meandewptm'],\n", 67 | " meanpressurem=data['meanpressurem'],\n", 68 | " maxhumidity=data['maxhumidity'],\n", 69 | " minhumidity=data['minhumidity'],\n", 70 | " maxtempm=data['maxtempm'],\n", 71 | " mintempm=data['mintempm'],\n", 72 | " maxdewptm=data['maxdewptm'],\n", 73 | " mindewptm=data['mindewptm'],\n", 74 | " maxpressurem=data['maxpressurem'],\n", 75 | " minpressurem=data['minpressurem'],\n", 76 | " precipm=data['precipm']))\n", 77 | " time.sleep(6)\n", 78 | " bar.update()\n", 79 | " target_date += timedelta(days=1)\n", 80 | " return records" 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": 5, 86 | "metadata": {}, 87 | "outputs": [], 88 | "source": [ 89 | "# Do not run this cell when collecting data on day 2\n", 90 | "def get_target_date():\n", 91 | " \"\"\"Return target date 1000 days prior to current date.\"\"\"\n", 92 | " current_date = datetime.now()\n", 93 | " target_date = current_date - timedelta(days=1000)\n", 94 | " return target_date\n", 95 | "\n", 96 | "target_date = get_target_date()" 97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": 6, 102 | "metadata": {}, 103 | "outputs": [ 104 | { 105 | "name": "stderr", 106 | "output_type": "stream", 107 | "text": [ 108 | "0% [##############################] 100% | ETA: 00:00:00\n", 109 | "Total time elapsed: 00:53:56\n" 110 | ] 111 | } 112 | ], 113 | "source": [ 114 | "records = extract_weather_data(BASE_URL, API_KEY, target_date, 500)" 115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": 7, 120 | "metadata": {}, 121 | "outputs": [ 122 | { 123 | "data": { 124 | "text/plain": [ 125 | "[DailySummary(date=datetime.datetime(2015, 10, 3, 22, 13, 6, 559948), meantempm='21', meandewptm='6', meanpressurem='1012', maxhumidity='63', minhumidity='20', maxtempm='29', mintempm='14', maxdewptm='8', mindewptm='4', maxpressurem='1014', minpressurem='1010', precipm='0.00'),\n", 126 | " DailySummary(date=datetime.datetime(2015, 10, 4, 22, 13, 6, 559948), meantempm='22', meandewptm='8', meanpressurem='1015', maxhumidity='63', minhumidity='25', maxtempm='29', mintempm='15', maxdewptm='10', mindewptm='7', maxpressurem='1017', minpressurem='1013', precipm='0.00'),\n", 127 | " DailySummary(date=datetime.datetime(2015, 10, 5, 22, 13, 6, 559948), meantempm='24', meandewptm='11', meanpressurem='1018', maxhumidity='64', minhumidity='35', maxtempm='29', mintempm='19', maxdewptm='13', mindewptm='8', maxpressurem='1020', minpressurem='1015', precipm='0.00'),\n", 128 | " DailySummary(date=datetime.datetime(2015, 10, 6, 22, 13, 6, 559948), meantempm='23', meandewptm='11', meanpressurem='1019', maxhumidity='73', minhumidity='25', maxtempm='30', mintempm='17', maxdewptm='14', mindewptm='8', maxpressurem='1022', minpressurem='1017', precipm='0.00'),\n", 129 | " DailySummary(date=datetime.datetime(2015, 10, 7, 22, 13, 6, 559948), meantempm='24', meandewptm='13', meanpressurem='1017', maxhumidity='72', minhumidity='31', maxtempm='32', mintempm='17', maxdewptm='16', mindewptm='10', maxpressurem='1020', minpressurem='1015', precipm='0.00')]" 130 | ] 131 | }, 132 | "execution_count": 7, 133 | "metadata": {}, 134 | "output_type": "execute_result" 135 | } 136 | ], 137 | "source": [ 138 | "# Look at first five records\n", 139 | "records[:5]" 140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "execution_count": 8, 145 | "metadata": { 146 | "scrolled": true 147 | }, 148 | "outputs": [ 149 | { 150 | "data": { 151 | "text/plain": [ 152 | "500" 153 | ] 154 | }, 155 | "execution_count": 8, 156 | "metadata": {}, 157 | "output_type": "execute_result" 158 | } 159 | ], 160 | "source": [ 161 | "len(records)" 162 | ] 163 | }, 164 | { 165 | "cell_type": "code", 166 | "execution_count": 9, 167 | "metadata": {}, 168 | "outputs": [], 169 | "source": [ 170 | "# save records list\n", 171 | "with open('records_pt1.pkl', 'wb') as f:\n", 172 | " pickle.dump(records, f)" 173 | ] 174 | }, 175 | { 176 | "cell_type": "code", 177 | "execution_count": 5, 178 | "metadata": {}, 179 | "outputs": [], 180 | "source": [ 181 | "# load records list - still need to run cells 1-4\n", 182 | "with open('records_pt1.pkl', 'rb') as fp:\n", 183 | " records = pickle.load(fp)" 184 | ] 185 | }, 186 | { 187 | "cell_type": "code", 188 | "execution_count": 6, 189 | "metadata": {}, 190 | "outputs": [ 191 | { 192 | "data": { 193 | "text/plain": [ 194 | "DailySummary(date=datetime.datetime(2017, 2, 13, 22, 13, 6, 559948), meantempm='20', meandewptm='13', meanpressurem='1018', maxhumidity='94', minhumidity='42', maxtempm='25', mintempm='16', maxdewptm='18', mindewptm='5', maxpressurem='1022', minpressurem='1012', precipm='0.00')" 195 | ] 196 | }, 197 | "execution_count": 6, 198 | "metadata": {}, 199 | "output_type": "execute_result" 200 | } 201 | ], 202 | "source": [ 203 | "# Inspect last record to date; next target date should be plus one day\n", 204 | "records[-1]" 205 | ] 206 | }, 207 | { 208 | "cell_type": "code", 209 | "execution_count": 7, 210 | "metadata": {}, 211 | "outputs": [], 212 | "source": [ 213 | "# set new target date based on date above plus one day\n", 214 | "target_date = datetime(2017, 2, 14)" 215 | ] 216 | }, 217 | { 218 | "cell_type": "code", 219 | "execution_count": 8, 220 | "metadata": {}, 221 | "outputs": [ 222 | { 223 | "name": "stderr", 224 | "output_type": "stream", 225 | "text": [ 226 | "0% [##############################] 100% | ETA: 00:00:00\n", 227 | "Total time elapsed: 00:53:38\n" 228 | ] 229 | } 230 | ], 231 | "source": [ 232 | "records += extract_weather_data(BASE_URL, API_KEY, target_date, 500)" 233 | ] 234 | }, 235 | { 236 | "cell_type": "code", 237 | "execution_count": 9, 238 | "metadata": { 239 | "scrolled": false 240 | }, 241 | "outputs": [ 242 | { 243 | "data": { 244 | "text/plain": [ 245 | "1000" 246 | ] 247 | }, 248 | "execution_count": 9, 249 | "metadata": {}, 250 | "output_type": "execute_result" 251 | } 252 | ], 253 | "source": [ 254 | "len(records)" 255 | ] 256 | }, 257 | { 258 | "cell_type": "code", 259 | "execution_count": 10, 260 | "metadata": {}, 261 | "outputs": [], 262 | "source": [ 263 | "# with open('records_pt2.pkl', 'wb') as f:\n", 264 | "# pickle.dump(records, f)" 265 | ] 266 | }, 267 | { 268 | "cell_type": "code", 269 | "execution_count": 11, 270 | "metadata": {}, 271 | "outputs": [], 272 | "source": [ 273 | "# load records list - still need to run cells 1 and 3\n", 274 | "# with open('records_pt2.pkl', 'rb') as fp:\n", 275 | "# records = pickle.load(fp)" 276 | ] 277 | }, 278 | { 279 | "cell_type": "code", 280 | "execution_count": 12, 281 | "metadata": {}, 282 | "outputs": [], 283 | "source": [ 284 | "df = pd.DataFrame(records, columns=features).set_index('date')" 285 | ] 286 | }, 287 | { 288 | "cell_type": "code", 289 | "execution_count": 13, 290 | "metadata": {}, 291 | "outputs": [ 292 | { 293 | "data": { 294 | "text/html": [ 295 | "
\n", 296 | "\n", 309 | "\n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | "
meantempmmeandewptm
date
2015-10-03 22:13:06.559948216
2015-10-04 22:13:06.559948228
2015-10-05 22:13:06.5599482411
2015-10-06 22:13:06.5599482311
2015-10-07 22:13:06.5599482413
2015-10-08 22:13:06.5599482617
2015-10-09 22:13:06.5599482617
2015-10-10 22:13:06.5599482414
2015-10-11 22:13:06.5599482616
2015-10-12 22:13:06.5599482819
\n", 375 | "
" 376 | ], 377 | "text/plain": [ 378 | " meantempm meandewptm\n", 379 | "date \n", 380 | "2015-10-03 22:13:06.559948 21 6\n", 381 | "2015-10-04 22:13:06.559948 22 8\n", 382 | "2015-10-05 22:13:06.559948 24 11\n", 383 | "2015-10-06 22:13:06.559948 23 11\n", 384 | "2015-10-07 22:13:06.559948 24 13\n", 385 | "2015-10-08 22:13:06.559948 26 17\n", 386 | "2015-10-09 22:13:06.559948 26 17\n", 387 | "2015-10-10 22:13:06.559948 24 14\n", 388 | "2015-10-11 22:13:06.559948 26 16\n", 389 | "2015-10-12 22:13:06.559948 28 19" 390 | ] 391 | }, 392 | "execution_count": 13, 393 | "metadata": {}, 394 | "output_type": "execute_result" 395 | } 396 | ], 397 | "source": [ 398 | "tmp = df[['meantempm', 'meandewptm']].head(10)\n", 399 | "tmp" 400 | ] 401 | }, 402 | { 403 | "cell_type": "code", 404 | "execution_count": 14, 405 | "metadata": {}, 406 | "outputs": [ 407 | { 408 | "data": { 409 | "text/html": [ 410 | "
\n", 411 | "\n", 424 | "\n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | " \n", 430 | " \n", 431 | " \n", 432 | " \n", 433 | " \n", 434 | " \n", 435 | " \n", 436 | " \n", 437 | " \n", 438 | " \n", 439 | " \n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | " \n", 451 | " \n", 452 | " \n", 453 | " \n", 454 | " \n", 455 | " \n", 456 | " \n", 457 | " \n", 458 | " \n", 459 | " \n", 460 | " \n", 461 | " \n", 462 | " \n", 463 | " \n", 464 | " \n", 465 | " \n", 466 | " \n", 467 | " \n", 468 | " \n", 469 | " \n", 470 | " \n", 471 | " \n", 472 | " \n", 473 | " \n", 474 | " \n", 475 | " \n", 476 | " \n", 477 | " \n", 478 | " \n", 479 | " \n", 480 | " \n", 481 | " \n", 482 | " \n", 483 | " \n", 484 | " \n", 485 | " \n", 486 | " \n", 487 | " \n", 488 | " \n", 489 | " \n", 490 | " \n", 491 | " \n", 492 | " \n", 493 | " \n", 494 | " \n", 495 | " \n", 496 | " \n", 497 | " \n", 498 | " \n", 499 | " \n", 500 | " \n", 501 | "
meantempmmeandewptmmeantempm_1
date
2015-10-03 22:13:06.559948216NaN
2015-10-04 22:13:06.55994822821
2015-10-05 22:13:06.559948241122
2015-10-06 22:13:06.559948231124
2015-10-07 22:13:06.559948241323
2015-10-08 22:13:06.559948261724
2015-10-09 22:13:06.559948261726
2015-10-10 22:13:06.559948241426
2015-10-11 22:13:06.559948261624
2015-10-12 22:13:06.559948281926
\n", 502 | "
" 503 | ], 504 | "text/plain": [ 505 | " meantempm meandewptm meantempm_1\n", 506 | "date \n", 507 | "2015-10-03 22:13:06.559948 21 6 NaN\n", 508 | "2015-10-04 22:13:06.559948 22 8 21\n", 509 | "2015-10-05 22:13:06.559948 24 11 22\n", 510 | "2015-10-06 22:13:06.559948 23 11 24\n", 511 | "2015-10-07 22:13:06.559948 24 13 23\n", 512 | "2015-10-08 22:13:06.559948 26 17 24\n", 513 | "2015-10-09 22:13:06.559948 26 17 26\n", 514 | "2015-10-10 22:13:06.559948 24 14 26\n", 515 | "2015-10-11 22:13:06.559948 26 16 24\n", 516 | "2015-10-12 22:13:06.559948 28 19 26" 517 | ] 518 | }, 519 | "execution_count": 14, 520 | "metadata": {}, 521 | "output_type": "execute_result" 522 | } 523 | ], 524 | "source": [ 525 | "# 1 day prior\n", 526 | "N = 1\n", 527 | "\n", 528 | "# target measurement of mean temperature\n", 529 | "feature = 'meantempm'\n", 530 | "\n", 531 | "# total number of rows\n", 532 | "rows = tmp.shape[0]\n", 533 | "\n", 534 | "# a list representing Nth prior measurements of feature\n", 535 | "nth_prior_measurements = tmp[feature].shift(periods=N)\n", 536 | "\n", 537 | "# makee a new column name of feature_N and add to DataFrame\n", 538 | "col_name = f'{feature}_{N}'\n", 539 | "tmp[col_name] = nth_prior_measurements\n", 540 | "tmp" 541 | ] 542 | }, 543 | { 544 | "cell_type": "code", 545 | "execution_count": 15, 546 | "metadata": {}, 547 | "outputs": [], 548 | "source": [ 549 | "def derive_nth_day_feature(df, feature, N):\n", 550 | " nth_prior_measurements = df[feature].shift(periods=N)\n", 551 | " col_name = f'{feature}_{N}'\n", 552 | " df[col_name] = nth_prior_measurements" 553 | ] 554 | }, 555 | { 556 | "cell_type": "code", 557 | "execution_count": 16, 558 | "metadata": {}, 559 | "outputs": [], 560 | "source": [ 561 | "for feature in features:\n", 562 | " if feature != 'date':\n", 563 | " for N in range(1, 4):\n", 564 | " derive_nth_day_feature(df, feature, N)" 565 | ] 566 | }, 567 | { 568 | "cell_type": "code", 569 | "execution_count": 17, 570 | "metadata": {}, 571 | "outputs": [ 572 | { 573 | "data": { 574 | "text/plain": [ 575 | "Index(['meantempm', 'meandewptm', 'meanpressurem', 'maxhumidity',\n", 576 | " 'minhumidity', 'maxtempm', 'mintempm', 'maxdewptm', 'mindewptm',\n", 577 | " 'maxpressurem', 'minpressurem', 'precipm', 'meantempm_1', 'meantempm_2',\n", 578 | " 'meantempm_3', 'meandewptm_1', 'meandewptm_2', 'meandewptm_3',\n", 579 | " 'meanpressurem_1', 'meanpressurem_2', 'meanpressurem_3',\n", 580 | " 'maxhumidity_1', 'maxhumidity_2', 'maxhumidity_3', 'minhumidity_1',\n", 581 | " 'minhumidity_2', 'minhumidity_3', 'maxtempm_1', 'maxtempm_2',\n", 582 | " 'maxtempm_3', 'mintempm_1', 'mintempm_2', 'mintempm_3', 'maxdewptm_1',\n", 583 | " 'maxdewptm_2', 'maxdewptm_3', 'mindewptm_1', 'mindewptm_2',\n", 584 | " 'mindewptm_3', 'maxpressurem_1', 'maxpressurem_2', 'maxpressurem_3',\n", 585 | " 'minpressurem_1', 'minpressurem_2', 'minpressurem_3', 'precipm_1',\n", 586 | " 'precipm_2', 'precipm_3'],\n", 587 | " dtype='object')" 588 | ] 589 | }, 590 | "execution_count": 17, 591 | "metadata": {}, 592 | "output_type": "execute_result" 593 | } 594 | ], 595 | "source": [ 596 | "df.columns" 597 | ] 598 | }, 599 | { 600 | "cell_type": "code", 601 | "execution_count": 18, 602 | "metadata": {}, 603 | "outputs": [ 604 | { 605 | "data": { 606 | "text/plain": [ 607 | "Index(['meantempm', 'maxtempm', 'mintempm', 'meantempm_1', 'meantempm_2',\n", 608 | " 'meantempm_3', 'meandewptm_1', 'meandewptm_2', 'meandewptm_3',\n", 609 | " 'meanpressurem_1', 'meanpressurem_2', 'meanpressurem_3',\n", 610 | " 'maxhumidity_1', 'maxhumidity_2', 'maxhumidity_3', 'minhumidity_1',\n", 611 | " 'minhumidity_2', 'minhumidity_3', 'maxtempm_1', 'maxtempm_2',\n", 612 | " 'maxtempm_3', 'mintempm_1', 'mintempm_2', 'mintempm_3', 'maxdewptm_1',\n", 613 | " 'maxdewptm_2', 'maxdewptm_3', 'mindewptm_1', 'mindewptm_2',\n", 614 | " 'mindewptm_3', 'maxpressurem_1', 'maxpressurem_2', 'maxpressurem_3',\n", 615 | " 'minpressurem_1', 'minpressurem_2', 'minpressurem_3', 'precipm_1',\n", 616 | " 'precipm_2', 'precipm_3'],\n", 617 | " dtype='object')" 618 | ] 619 | }, 620 | "execution_count": 18, 621 | "metadata": {}, 622 | "output_type": "execute_result" 623 | } 624 | ], 625 | "source": [ 626 | "# make list of original features without meantempm, mintempm, and maxtempm\n", 627 | "to_remove = [feature\n", 628 | " for feature in features\n", 629 | " if feature not in ['meantempm', 'mintempm', 'maxtempm']]\n", 630 | "\n", 631 | "# make a list of columns to keep\n", 632 | "to_keep = [col for col in df.columns if col not in to_remove]\n", 633 | "\n", 634 | "# select only the columns in to_keep and assign to df\n", 635 | "df = df[to_keep]\n", 636 | "df.columns" 637 | ] 638 | }, 639 | { 640 | "cell_type": "code", 641 | "execution_count": 19, 642 | "metadata": {}, 643 | "outputs": [ 644 | { 645 | "name": "stdout", 646 | "output_type": "stream", 647 | "text": [ 648 | "\n", 649 | "DatetimeIndex: 1000 entries, 2015-10-03 22:13:06.559948 to 2018-06-28 00:00:00\n", 650 | "Data columns (total 39 columns):\n", 651 | "meantempm 1000 non-null object\n", 652 | "maxtempm 1000 non-null object\n", 653 | "mintempm 1000 non-null object\n", 654 | "meantempm_1 999 non-null object\n", 655 | "meantempm_2 998 non-null object\n", 656 | "meantempm_3 997 non-null object\n", 657 | "meandewptm_1 999 non-null object\n", 658 | "meandewptm_2 998 non-null object\n", 659 | "meandewptm_3 997 non-null object\n", 660 | "meanpressurem_1 999 non-null object\n", 661 | "meanpressurem_2 998 non-null object\n", 662 | "meanpressurem_3 997 non-null object\n", 663 | "maxhumidity_1 999 non-null object\n", 664 | "maxhumidity_2 998 non-null object\n", 665 | "maxhumidity_3 997 non-null object\n", 666 | "minhumidity_1 999 non-null object\n", 667 | "minhumidity_2 998 non-null object\n", 668 | "minhumidity_3 997 non-null object\n", 669 | "maxtempm_1 999 non-null object\n", 670 | "maxtempm_2 998 non-null object\n", 671 | "maxtempm_3 997 non-null object\n", 672 | "mintempm_1 999 non-null object\n", 673 | "mintempm_2 998 non-null object\n", 674 | "mintempm_3 997 non-null object\n", 675 | "maxdewptm_1 999 non-null object\n", 676 | "maxdewptm_2 998 non-null object\n", 677 | "maxdewptm_3 997 non-null object\n", 678 | "mindewptm_1 999 non-null object\n", 679 | "mindewptm_2 998 non-null object\n", 680 | "mindewptm_3 997 non-null object\n", 681 | "maxpressurem_1 999 non-null object\n", 682 | "maxpressurem_2 998 non-null object\n", 683 | "maxpressurem_3 997 non-null object\n", 684 | "minpressurem_1 999 non-null object\n", 685 | "minpressurem_2 998 non-null object\n", 686 | "minpressurem_3 997 non-null object\n", 687 | "precipm_1 999 non-null object\n", 688 | "precipm_2 998 non-null object\n", 689 | "precipm_3 997 non-null object\n", 690 | "dtypes: object(39)\n", 691 | "memory usage: 312.5+ KB\n" 692 | ] 693 | } 694 | ], 695 | "source": [ 696 | "df.info()" 697 | ] 698 | }, 699 | { 700 | "cell_type": "code", 701 | "execution_count": 20, 702 | "metadata": {}, 703 | "outputs": [ 704 | { 705 | "name": "stdout", 706 | "output_type": "stream", 707 | "text": [ 708 | "\n", 709 | "DatetimeIndex: 1000 entries, 2015-10-03 22:13:06.559948 to 2018-06-28 00:00:00\n", 710 | "Data columns (total 39 columns):\n", 711 | "meantempm 997 non-null float64\n", 712 | "maxtempm 997 non-null float64\n", 713 | "mintempm 997 non-null float64\n", 714 | "meantempm_1 996 non-null float64\n", 715 | "meantempm_2 995 non-null float64\n", 716 | "meantempm_3 994 non-null float64\n", 717 | "meandewptm_1 996 non-null float64\n", 718 | "meandewptm_2 995 non-null float64\n", 719 | "meandewptm_3 994 non-null float64\n", 720 | "meanpressurem_1 994 non-null float64\n", 721 | "meanpressurem_2 993 non-null float64\n", 722 | "meanpressurem_3 992 non-null float64\n", 723 | "maxhumidity_1 996 non-null float64\n", 724 | "maxhumidity_2 995 non-null float64\n", 725 | "maxhumidity_3 994 non-null float64\n", 726 | "minhumidity_1 996 non-null float64\n", 727 | "minhumidity_2 995 non-null float64\n", 728 | "minhumidity_3 994 non-null float64\n", 729 | "maxtempm_1 996 non-null float64\n", 730 | "maxtempm_2 995 non-null float64\n", 731 | "maxtempm_3 994 non-null float64\n", 732 | "mintempm_1 996 non-null float64\n", 733 | "mintempm_2 995 non-null float64\n", 734 | "mintempm_3 994 non-null float64\n", 735 | "maxdewptm_1 996 non-null float64\n", 736 | "maxdewptm_2 995 non-null float64\n", 737 | "maxdewptm_3 994 non-null float64\n", 738 | "mindewptm_1 996 non-null float64\n", 739 | "mindewptm_2 995 non-null float64\n", 740 | "mindewptm_3 994 non-null float64\n", 741 | "maxpressurem_1 994 non-null float64\n", 742 | "maxpressurem_2 993 non-null float64\n", 743 | "maxpressurem_3 992 non-null float64\n", 744 | "minpressurem_1 994 non-null float64\n", 745 | "minpressurem_2 993 non-null float64\n", 746 | "minpressurem_3 992 non-null float64\n", 747 | "precipm_1 999 non-null float64\n", 748 | "precipm_2 998 non-null float64\n", 749 | "precipm_3 997 non-null float64\n", 750 | "dtypes: float64(39)\n", 751 | "memory usage: 312.5 KB\n" 752 | ] 753 | } 754 | ], 755 | "source": [ 756 | "df = df.apply(pd.to_numeric, errors='coerce')\n", 757 | "df.info()" 758 | ] 759 | }, 760 | { 761 | "cell_type": "code", 762 | "execution_count": 21, 763 | "metadata": {}, 764 | "outputs": [ 765 | { 766 | "data": { 767 | "text/html": [ 768 | "
\n", 769 | "\n", 782 | "\n", 783 | " \n", 784 | " \n", 785 | " \n", 786 | " \n", 787 | " \n", 788 | " \n", 789 | " \n", 790 | " \n", 791 | " \n", 792 | " \n", 793 | " \n", 794 | " \n", 795 | " \n", 796 | " \n", 797 | " \n", 798 | " \n", 799 | " \n", 800 | " \n", 801 | " \n", 802 | " \n", 803 | " \n", 804 | " \n", 805 | " \n", 806 | " \n", 807 | " \n", 808 | " \n", 809 | " \n", 810 | " \n", 811 | " \n", 812 | " \n", 813 | " \n", 814 | " \n", 815 | " \n", 816 | " \n", 817 | " \n", 818 | " \n", 819 | " \n", 820 | " \n", 821 | " \n", 822 | " \n", 823 | " \n", 824 | " \n", 825 | " \n", 826 | " \n", 827 | " \n", 828 | " \n", 829 | " \n", 830 | " \n", 831 | " \n", 832 | " \n", 833 | " \n", 834 | " \n", 835 | " \n", 836 | " \n", 837 | " \n", 838 | " \n", 839 | " \n", 840 | " \n", 841 | " \n", 842 | " \n", 843 | " \n", 844 | " \n", 845 | " \n", 846 | " \n", 847 | " \n", 848 | " \n", 849 | " \n", 850 | " \n", 851 | " \n", 852 | " \n", 853 | " \n", 854 | " \n", 855 | " \n", 856 | " \n", 857 | " \n", 858 | " \n", 859 | " \n", 860 | " \n", 861 | " \n", 862 | " \n", 863 | " \n", 864 | " \n", 865 | " \n", 866 | " \n", 867 | " \n", 868 | " \n", 869 | " \n", 870 | " \n", 871 | " \n", 872 | " \n", 873 | " \n", 874 | " \n", 875 | " \n", 876 | " \n", 877 | " \n", 878 | " \n", 879 | " \n", 880 | " \n", 881 | " \n", 882 | " \n", 883 | " \n", 884 | " \n", 885 | " \n", 886 | " \n", 887 | " \n", 888 | " \n", 889 | " \n", 890 | " \n", 891 | " \n", 892 | " \n", 893 | " \n", 894 | " \n", 895 | " \n", 896 | " \n", 897 | " \n", 898 | " \n", 899 | " \n", 900 | " \n", 901 | " \n", 902 | " \n", 903 | " \n", 904 | " \n", 905 | " \n", 906 | " \n", 907 | "
countmeanstdmin25%50%75%maxoutliers
maxhumidity_1996.094.32630510.73204745.094.0100.0100.0100.00True
maxhumidity_2995.094.32060310.73593445.094.0100.0100.0100.00True
maxhumidity_3994.094.31488910.73982545.094.0100.0100.0100.00True
minpressurem_1994.01014.2303825.858541996.01011.01014.01017.01037.00True
minpressurem_2993.01014.2316215.861363996.01011.01014.01017.01037.00True
minpressurem_3992.01014.2318555.864315996.01011.01014.01017.01037.00True
precipm_1999.01.4191097.9586520.00.00.00.0131.57True
precipm_2998.01.4205317.9625150.00.00.00.0131.57True
precipm_3997.01.4219567.9663840.00.00.00.0131.57True
\n", 908 | "
" 909 | ], 910 | "text/plain": [ 911 | " count mean std min 25% 50% 75% \\\n", 912 | "maxhumidity_1 996.0 94.326305 10.732047 45.0 94.0 100.0 100.0 \n", 913 | "maxhumidity_2 995.0 94.320603 10.735934 45.0 94.0 100.0 100.0 \n", 914 | "maxhumidity_3 994.0 94.314889 10.739825 45.0 94.0 100.0 100.0 \n", 915 | "minpressurem_1 994.0 1014.230382 5.858541 996.0 1011.0 1014.0 1017.0 \n", 916 | "minpressurem_2 993.0 1014.231621 5.861363 996.0 1011.0 1014.0 1017.0 \n", 917 | "minpressurem_3 992.0 1014.231855 5.864315 996.0 1011.0 1014.0 1017.0 \n", 918 | "precipm_1 999.0 1.419109 7.958652 0.0 0.0 0.0 0.0 \n", 919 | "precipm_2 998.0 1.420531 7.962515 0.0 0.0 0.0 0.0 \n", 920 | "precipm_3 997.0 1.421956 7.966384 0.0 0.0 0.0 0.0 \n", 921 | "\n", 922 | " max outliers \n", 923 | "maxhumidity_1 100.00 True \n", 924 | "maxhumidity_2 100.00 True \n", 925 | "maxhumidity_3 100.00 True \n", 926 | "minpressurem_1 1037.00 True \n", 927 | "minpressurem_2 1037.00 True \n", 928 | "minpressurem_3 1037.00 True \n", 929 | "precipm_1 131.57 True \n", 930 | "precipm_2 131.57 True \n", 931 | "precipm_3 131.57 True " 932 | ] 933 | }, 934 | "execution_count": 21, 935 | "metadata": {}, 936 | "output_type": "execute_result" 937 | } 938 | ], 939 | "source": [ 940 | "# Call describe on df and transpose it due to the large number of columns\n", 941 | "spread = df.describe().T\n", 942 | "\n", 943 | "# precalculate interquartile range for ease of use in next calculation\n", 944 | "IQR = spread['75%'] - spread['25%']\n", 945 | "\n", 946 | "# create an outliers column which is either 3 IQRs below the first quartile or\n", 947 | "# 3 IQRs above the third quartile\n", 948 | "spread['outliers'] = (spread['min'] <\n", 949 | " (spread['25%'] -\n", 950 | " (3 * IQR))) | (spread['max'] >\n", 951 | " (spread['75%'] + 3 * IQR))\n", 952 | "\n", 953 | "# just display the features containing extreame outliers\n", 954 | "spread.loc[spread.outliers, ]" 955 | ] 956 | }, 957 | { 958 | "cell_type": "code", 959 | "execution_count": 22, 960 | "metadata": {}, 961 | "outputs": [], 962 | "source": [ 963 | "# iterate over the precip columns\n", 964 | "for precip_col in ['precipm_1', 'precipm_2', 'precipm_3']:\n", 965 | " # create a boolean array of values representing nans\n", 966 | " missing_vals = pd.isnull(df[precip_col])\n", 967 | " df[precip_col][missing_vals] = 0" 968 | ] 969 | }, 970 | { 971 | "cell_type": "code", 972 | "execution_count": 23, 973 | "metadata": {}, 974 | "outputs": [], 975 | "source": [ 976 | "df = df.dropna()" 977 | ] 978 | }, 979 | { 980 | "cell_type": "code", 981 | "execution_count": 24, 982 | "metadata": {}, 983 | "outputs": [ 984 | { 985 | "data": { 986 | "image/png": "\n", 987 | "text/plain": [ 988 | "
" 989 | ] 990 | }, 991 | "metadata": {}, 992 | "output_type": "display_data" 993 | } 994 | ], 995 | "source": [ 996 | "fig, ax = plt.subplots(figsize = (14, 8))\n", 997 | "ax.hist(df.maxhumidity_1)\n", 998 | "ax.set_title('Distribution of maxhumidity_1')\n", 999 | "ax.set_xlabel('maxhumidity_1')\n", 1000 | "ax.grid()" 1001 | ] 1002 | }, 1003 | { 1004 | "cell_type": "code", 1005 | "execution_count": 25, 1006 | "metadata": {}, 1007 | "outputs": [ 1008 | { 1009 | "data": { 1010 | "image/png": "\n", 1011 | "text/plain": [ 1012 | "
" 1013 | ] 1014 | }, 1015 | "metadata": {}, 1016 | "output_type": "display_data" 1017 | } 1018 | ], 1019 | "source": [ 1020 | "fig, ax = plt.subplots(figsize = (14, 8))\n", 1021 | "ax.hist(df.minpressurem_1)\n", 1022 | "ax.set_title('Distribution of minpressurem_1')\n", 1023 | "ax.set_xlabel('minpressurem_1')\n", 1024 | "ax.grid()" 1025 | ] 1026 | }, 1027 | { 1028 | "cell_type": "code", 1029 | "execution_count": 26, 1030 | "metadata": {}, 1031 | "outputs": [], 1032 | "source": [ 1033 | "# import pickle\n", 1034 | "with open('end-part1_df.pkl', 'wb') as f:\n", 1035 | " pickle.dump(df, f)" 1036 | ] 1037 | } 1038 | ], 1039 | "metadata": { 1040 | "kernelspec": { 1041 | "display_name": "Python 3", 1042 | "language": "python", 1043 | "name": "python3" 1044 | }, 1045 | "language_info": { 1046 | "codemirror_mode": { 1047 | "name": "ipython", 1048 | "version": 3 1049 | }, 1050 | "file_extension": ".py", 1051 | "mimetype": "text/x-python", 1052 | "name": "python", 1053 | "nbconvert_exporter": "python", 1054 | "pygments_lexer": "ipython3", 1055 | "version": "3.6.5" 1056 | } 1057 | }, 1058 | "nbformat": 4, 1059 | "nbformat_minor": 2 1060 | } 1061 | --------------------------------------------------------------------------------