├── .gitignore ├── solutions ├── reference_sol0.py ├── clean_sol1.py ├── setting_sol1.py ├── setting_sol2.py ├── numerical_sol4.py ├── reference_sol3.py ├── numerical_sol2.py ├── numerical_sol3.py ├── import_sol1.py ├── numerical_sol5.py ├── reference_sol1.py ├── numerical_sol6.py ├── clean_sol2.py ├── merge_sol2.py ├── numerical_sol1.py ├── time_sol1.py ├── import_sol2.py ├── resample_sol1.py ├── merge_sol3.py ├── merge_sol1.py ├── reference_sol2.py ├── apply_sol1.py └── cleaning_start.py ├── environment.yml ├── README.md ├── data └── trade.csv ├── session4.ipynb ├── session1.ipynb ├── session2.ipynb └── session3.ipynb /.gitignore: -------------------------------------------------------------------------------- 1 | .ipynb_checkpoints -------------------------------------------------------------------------------- /solutions/reference_sol0.py: -------------------------------------------------------------------------------- 1 | df_qte.loc[0] -------------------------------------------------------------------------------- /solutions/clean_sol1.py: -------------------------------------------------------------------------------- 1 | df_qte[df_qte.BidPrice > 0] -------------------------------------------------------------------------------- /solutions/setting_sol1.py: -------------------------------------------------------------------------------- 1 | df['D'] = np.log(df['A']) 2 | df -------------------------------------------------------------------------------- /solutions/setting_sol2.py: -------------------------------------------------------------------------------- 1 | df.loc[df.index[0], 'A'] = np.pi 2 | df -------------------------------------------------------------------------------- /solutions/numerical_sol4.py: -------------------------------------------------------------------------------- 1 | df['high_volatility'] = df['volatility'] > 20. 
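2 | # Added note (not part of the original solution): the comparison returns a boolean Series,
3 | # so the new 'high_volatility' column can be used directly as a row mask, e.g. df[df['high_volatility']].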
-------------------------------------------------------------------------------- /solutions/reference_sol3.py: -------------------------------------------------------------------------------- 1 | df.loc['Friday', 'volatility'] = 17.0 2 | df -------------------------------------------------------------------------------- /solutions/numerical_sol2.py: -------------------------------------------------------------------------------- 1 | df[['optiver_turnover', 'total_turnover']].sum() -------------------------------------------------------------------------------- /solutions/numerical_sol3.py: -------------------------------------------------------------------------------- 1 | df['good_market_share'] = df['market_share'] > 10 -------------------------------------------------------------------------------- /solutions/import_sol1.py: -------------------------------------------------------------------------------- 1 | df_qte = pd.read_csv('data/quote.csv') 2 | df_qte.head() -------------------------------------------------------------------------------- /solutions/numerical_sol5.py: -------------------------------------------------------------------------------- 1 | df['both'] = df['high_volatility'] & df['good_market_share'] -------------------------------------------------------------------------------- /solutions/reference_sol1.py: -------------------------------------------------------------------------------- 1 | df_qte.drop(columns='Index', inplace=True) 2 | df_qte.head() -------------------------------------------------------------------------------- /solutions/numerical_sol6.py: -------------------------------------------------------------------------------- 1 | df['Comment'] = np.where(df['high_volatility'], 'HighVol', 'LowVol') -------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- 1 | name: datacourse 2 | dependencies: 3 | - python=2.7 4 | - jupyter 5 | - seaborn 6 | -------------------------------------------------------------------------------- /solutions/clean_sol2.py: -------------------------------------------------------------------------------- 1 | df_qte = df_qte[df_qte.BidPrice > 0] 2 | df_qte = df_qte[df_qte.AskPrice > 0] -------------------------------------------------------------------------------- /solutions/merge_sol2.py: -------------------------------------------------------------------------------- 1 | df_right = pd.merge(df_currency, df_forex, on='Currency', how='left') 2 | df_right -------------------------------------------------------------------------------- /solutions/numerical_sol1.py: -------------------------------------------------------------------------------- 1 | df['market_share'] = 100. * df['optiver_turnover'] / df['market_turnover'] 2 | df -------------------------------------------------------------------------------- /solutions/time_sol1.py: -------------------------------------------------------------------------------- 1 | 100. 
* ((df['PRICE']!=df['BidPrice']) & (df['PRICE']!=df['AskPrice'])).sum() / len(df) -------------------------------------------------------------------------------- /solutions/import_sol2.py: -------------------------------------------------------------------------------- 1 | df_qte.columns = ['Index', 'Timestamp', 'BidPrice', 'BidSize', 'AskPrice', 'AskSize'] 2 | df_qte.head() -------------------------------------------------------------------------------- /solutions/resample_sol1.py: -------------------------------------------------------------------------------- 1 | df_vols.groupby('RELATIVE_EXPIRY').resample('10min').last()['VOLATILITY'].unstack(level='RELATIVE_EXPIRY').ffill() -------------------------------------------------------------------------------- /solutions/merge_sol3.py: -------------------------------------------------------------------------------- 1 | df = pd.merge(df_left, df_right, on='Underlying', how='left') 2 | df['Notional'] = df['Notional'] / df['Rate'] / 1e6 3 | df -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![Binder](http://mybinder.org/badge.svg)](http://mybinder.org/repo/optiver/data-course) 2 | 3 | Click the launch binder badge to get started! -------------------------------------------------------------------------------- /solutions/merge_sol1.py: -------------------------------------------------------------------------------- 1 | df_left = pd.merge(df_prices, df_turnovers) # on=['Underlying', 'Month'] is optional 2 | df_left['Notional'] = df_left['Price'] * df_left['Turnover'] 3 | df_left -------------------------------------------------------------------------------- /solutions/reference_sol2.py: -------------------------------------------------------------------------------- 1 | # Using loc 2 | df.loc['Wednesday', 'volatility'] 3 | 4 | # Using iloc 5 | df.iloc[2, 2] 6 | 7 | # Using a combination of labels and locations 8 | df.loc['Wednesday', df.columns[2]] 9 | df.loc[df.index[2], 'volatility'] 10 | -------------------------------------------------------------------------------- /solutions/apply_sol1.py: -------------------------------------------------------------------------------- 1 | df_bond['Price'] = df_bond.apply( 2 | lambda row: np.npv( 3 | rate=row['yield'], 4 | values=[row['2017_cashflow'], 5 | row['2018_cashflow'], 6 | row['2019_cashflow'], 7 | row['2020_cashflow']]), 8 | axis=1) 9 | 10 | df_bond -------------------------------------------------------------------------------- /solutions/cleaning_start.py: -------------------------------------------------------------------------------- 1 | df_qte = pd.read_csv('data/quote.csv') 2 | df_qte.columns = ['Index', 'Timestamp', 'BidPrice', 'BidSize', 'AskPrice', 'AskSize'] 3 | df_qte = df_qte.drop(columns='Index') 4 | df_qte['Timestamp'] = pd.to_datetime(df_qte['Timestamp']) 5 | df_qte = df_qte.set_index('Timestamp', drop=True) 6 | df_qte.head() -------------------------------------------------------------------------------- /data/trade.csv: -------------------------------------------------------------------------------- 1 | TIMESTAMP,PRICE,SIZE 2 | 2014-01-02 01:16:26.044001,233.25,1 3 | 2014-01-02 01:17:47.972599,233.48,1 4 | 2014-01-02 01:17:49.919139,233.52,1 5 | 2014-01-02 01:18:17.941876,233.71,1 6 | 2014-01-02 01:19:03.002624,233.7,1 7 | 2014-01-02 01:19:53.313990,233.54,4 8 | 2014-01-02 01:19:53.317000,233.54,1 9 | 2014-01-02 01:20:22.464612,233.41,4 10 | 
2014-01-02 01:20:22.464809,233.4,3 11 | 2014-01-02 01:20:22.464980,233.39,2 12 | 2014-01-02 01:20:22.464983,233.38,2 13 | 2014-01-02 01:20:22.465183,233.33,8 14 | 2014-01-02 01:20:22.466588,233.33,2 15 | 2014-01-02 01:20:22.468193,233.33,2 16 | 2014-01-02 01:20:22.538515,233.33,2 17 | 2014-01-02 01:20:22.575145,233.33,1 18 | 2014-01-02 01:20:23.073677,233.33,1 19 | 2014-01-02 01:20:23.339602,233.33,4 20 | 2014-01-02 01:20:24.923696,233.33,1 21 | 2014-01-02 01:20:26.710972,233.33,1 22 | 2014-01-02 01:20:26.842242,233.33,1 23 | 2014-01-02 01:20:26.845337,233.33,8 24 | 2014-01-02 01:20:26.854956,233.33,1 25 | 2014-01-02 01:20:27.015023,233.33,1 26 | 2014-01-02 01:20:27.203064,233.33,12 27 | 2014-01-02 01:20:27.947323,233.33,2 28 | 2014-01-02 01:20:28.262724,233.33,2 29 | 2014-01-02 01:20:29.248494,233.33,1 30 | 2014-01-02 01:20:39.748030,233.33,1 31 | 2014-01-02 01:20:39.754578,233.33,1 32 | 2014-01-02 01:20:39.759208,233.33,1 33 | 2014-01-02 01:20:40.637402,233.33,1 34 | 2014-01-02 01:20:40.742377,233.33,3 35 | 2014-01-02 01:20:40.751086,233.33,3 36 | 2014-01-02 01:20:40.841602,233.33,1 37 | 2014-01-02 01:20:40.857549,233.33,6 38 | 2014-01-02 01:20:41.522635,233.33,2 39 | 2014-01-02 01:20:42.298645,233.33,1 40 | 2014-01-02 01:20:42.578823,233.33,1 41 | 2014-01-02 01:20:42.580670,233.33,1 42 | 2014-01-02 01:20:42.740737,233.33,1 43 | 2014-01-02 01:20:42.848574,233.33,1 44 | 2014-01-02 01:20:42.851747,233.33,1 45 | 2014-01-02 01:20:42.969211,233.33,1 46 | 2014-01-02 01:20:42.970920,233.33,1 47 | 2014-01-02 01:20:42.975809,233.33,1 48 | 2014-01-02 01:20:47.833702,233.33,6 49 | 2014-01-02 01:20:50.733988,233.39,1 50 | 2014-01-02 01:21:05.162523,233.45,3 51 | 2014-01-02 01:21:05.162678,233.44,7 52 | 2014-01-02 01:21:13.246555,233.46,2 53 | 2014-01-02 01:21:13.246711,233.45,5 54 | 2014-01-02 01:21:14.363567,233.45,1 55 | 2014-01-02 01:21:16.942746,233.45,1 56 | 2014-01-02 01:21:17.786437,233.45,1 57 | 2014-01-02 01:21:19.442570,233.45,5 58 | 2014-01-02 01:21:36.624397,233.53,4 59 | 2014-01-02 01:21:36.624399,233.52,1 60 | 2014-01-02 01:21:36.624582,233.51,1 61 | 2014-01-02 01:21:36.624583,233.48,9 62 | 2014-01-02 01:21:52.637795,233.58,2 63 | 2014-01-02 01:21:52.637919,233.57,1 64 | 2014-01-02 01:21:52.638058,233.56,9 65 | 2014-01-02 01:21:56.366992,233.55,2 66 | 2014-01-02 01:21:56.414913,233.55,1 67 | 2014-01-02 01:22:02.454856,233.55,1 68 | 2014-01-02 01:22:08.262316,233.57,5 69 | 2014-01-02 01:22:08.262468,233.56,1 70 | 2014-01-02 01:22:08.710423,233.56,1 71 | 2014-01-02 01:22:09.205993,233.56,8 72 | 2014-01-02 01:22:22.188506,233.62,1 73 | 2014-01-02 01:22:22.188587,233.61,6 74 | 2014-01-02 01:22:22.188697,233.6,4 75 | 2014-01-02 01:22:22.188698,233.56,4 76 | 2014-01-02 01:22:35.438119,233.57,10 77 | 2014-01-02 01:22:35.648711,233.57,1 78 | 2014-01-02 01:22:36.083792,233.57,4 79 | 2014-01-02 01:22:46.225302,233.59,6 80 | 2014-01-02 01:22:46.225463,233.58,4 81 | 2014-01-02 01:22:46.225628,233.57,5 82 | 2014-01-02 01:22:48.950610,233.63,1 83 | 2014-01-02 01:22:58.549233,233.63,1 84 | 2014-01-02 01:22:58.549418,233.62,1 85 | 2014-01-02 01:22:58.549678,233.61,3 86 | 2014-01-02 01:22:58.549850,233.6,2 87 | 2014-01-02 01:22:58.549939,233.59,6 88 | 2014-01-02 01:22:58.550125,233.58,2 89 | 2014-01-02 01:23:14.675881,233.81,1 90 | 2014-01-02 01:23:15.063253,233.98,2 91 | 2014-01-02 01:23:15.064138,234.0,1 92 | 2014-01-02 01:23:15.064552,234.0,1 93 | 2014-01-02 01:23:53.010632,233.96,1 94 | 2014-01-02 01:24:03.303936,233.96,1 95 | 2014-01-02 01:24:03.642230,233.96,1 96 | 2014-01-02 
01:24:03.668364,233.96,1 97 | 2014-01-02 01:24:03.835391,233.96,10 98 | 2014-01-02 01:24:03.837066,233.96,1 99 | 2014-01-02 01:24:13.853747,234.07,1 100 | 2014-01-02 01:24:13.853779,234.06,3 101 | 2014-01-02 01:24:13.853958,234.05,1 102 | 2014-01-02 01:24:13.854047,234.03,3 103 | 2014-01-02 01:24:14.155417,234.03,1 104 | 2014-01-02 01:25:34.611623,233.94,2 105 | 2014-01-02 01:26:38.545414,233.96,1 106 | 2014-01-02 01:27:02.240667,234.03,6 107 | 2014-01-02 01:27:05.819041,234.1,1 108 | 2014-01-02 01:27:37.508351,234.15,1 109 | 2014-01-02 01:27:39.993286,234.18,1 110 | 2014-01-02 01:27:39.993440,234.17,3 111 | 2014-01-02 01:27:39.993764,234.13,6 112 | 2014-01-02 01:27:51.823029,234.21,3 113 | 2014-01-02 01:27:51.823178,234.2,6 114 | 2014-01-02 01:27:52.423179,234.2,1 115 | 2014-01-02 01:28:17.953408,234.18,1 116 | 2014-01-02 01:28:57.250726,234.2,1 117 | 2014-01-02 01:29:04.071206,234.2,1 118 | 2014-01-02 01:29:07.084489,234.2,6 119 | 2014-01-02 01:29:07.085038,234.2,2 120 | 2014-01-02 01:29:28.016118,234.26,2 121 | 2014-01-02 01:29:28.016292,234.25,1 122 | 2014-01-02 01:29:28.027625,234.25,7 123 | 2014-01-02 01:29:36.825111,234.3,1 124 | 2014-01-02 01:29:41.022708,234.28,1 125 | 2014-01-02 01:29:58.009578,234.28,1 126 | 2014-01-02 01:29:58.115158,234.28,13 127 | 2014-01-02 01:30:02.741455,234.26,1 128 | 2014-01-02 01:30:05.716287,234.42,7 129 | 2014-01-02 01:30:05.775705,234.42,1 130 | 2014-01-02 01:30:05.823506,234.42,3 131 | 2014-01-02 01:30:06.566083,234.42,7 132 | 2014-01-02 01:30:06.624855,234.42,1 133 | 2014-01-02 01:30:06.726910,234.42,1 134 | 2014-01-02 01:30:06.927916,234.42,1 135 | 2014-01-02 01:30:07.184972,234.42,7 136 | 2014-01-02 01:30:07.773351,234.42,2 137 | 2014-01-02 01:30:08.066172,234.42,4 138 | 2014-01-02 01:31:46.957918,233.94,6 139 | 2014-01-02 01:31:53.637943,233.94,6 140 | 2014-01-02 01:32:18.333852,234.0,1 141 | 2014-01-02 01:32:38.654624,234.06,26 142 | 2014-01-02 01:33:11.163967,234.12,1 143 | 2014-01-02 01:33:16.302907,234.19,32 144 | 2014-01-02 01:33:19.077985,234.2,30 145 | 2014-01-02 01:33:34.846016,234.25,20 146 | 2014-01-02 01:35:18.302349,233.85,1 147 | 2014-01-02 01:35:30.368511,233.85,30 148 | 2014-01-02 01:36:59.576543,233.75,20 149 | 2014-01-02 01:39:48.114283,233.58,1 150 | 2014-01-02 01:39:55.852318,233.55,1 151 | 2014-01-02 01:40:04.613399,233.33,1 152 | 2014-01-02 01:40:15.334491,233.12,2 153 | 2014-01-02 01:41:40.418417,233.5,1 154 | 2014-01-02 01:43:18.682817,233.36,1 155 | 2014-01-02 01:43:29.565308,233.18,1 156 | 2014-01-02 01:43:29.674331,233.18,1 157 | 2014-01-02 01:43:32.840962,233.12,1 158 | 2014-01-02 01:43:34.104197,233.0,2 159 | 2014-01-02 01:43:34.107478,233.0,1 160 | 2014-01-02 01:43:34.536456,232.96,1 161 | 2014-01-02 01:45:09.592781,233.06,1 162 | 2014-01-02 01:45:25.687330,233.03,1 163 | 2014-01-02 01:48:48.261272,233.11,1 164 | 2014-01-02 01:54:05.263906,233.2,1 165 | 2014-01-02 01:54:56.070653,233.23,2 166 | 2014-01-02 01:55:24.027855,233.22,1 167 | 2014-01-02 02:00:01.301379,233.07,1 168 | 2014-01-02 02:09:56.793689,232.7,1 169 | 2014-01-02 02:14:14.711283,232.5,1 170 | 2014-01-02 02:17:27.654789,232.7,1 171 | 2014-01-02 02:22:53.947514,232.84,2 172 | 2014-01-02 02:24:02.351302,232.8,2 173 | 2014-01-02 02:37:15.401003,232.22,1 174 | 2014-01-02 02:37:41.768814,232.01,1 175 | 2014-01-02 02:38:42.466675,231.78,1 176 | 2014-01-02 02:41:17.801405,232.03,2 177 | 2014-01-02 02:42:33.927575,231.85,1 178 | 2014-01-02 02:42:51.672083,231.8,1 179 | 2014-01-02 02:43:27.675855,231.45,1 180 | 2014-01-02 02:43:59.472432,231.57,1 181 | 
2014-01-02 02:48:25.204309,231.76,1 182 | 2014-01-02 02:49:23.831215,231.75,1 183 | 2014-01-02 02:50:29.638078,231.5,1 184 | 2014-01-02 02:50:32.182858,231.46,1 185 | 2014-01-02 02:50:54.498575,231.42,1 186 | 2014-01-02 02:51:03.251978,231.5,1 187 | 2014-01-02 03:01:51.683593,231.5,1 188 | 2014-01-02 03:02:09.137587,231.52,1 189 | 2014-01-02 03:06:17.678129,231.57,2 190 | 2014-01-02 03:11:56.595309,231.68,1 191 | 2014-01-02 03:17:38.788880,231.89,1 192 | 2014-01-02 03:18:58.085457,231.77,1 193 | 2014-01-02 03:19:57.796183,231.71,1 194 | 2014-01-02 03:21:32.130421,231.5,1 195 | 2014-01-02 03:21:44.706462,231.4,2 196 | 2014-01-02 03:25:18.761661,231.27,1 197 | 2014-01-02 03:25:22.873568,231.2,1 198 | 2014-01-02 03:25:59.067886,231.35,1 199 | 2014-01-02 03:27:35.372989,231.48,1 200 | 2014-01-02 03:32:02.444125,231.8,1 201 | 2014-01-02 03:41:31.690353,231.67,1 202 | 2014-01-02 03:42:45.182654,231.8,1 203 | 2014-01-02 03:48:03.876156,231.83,1 204 | 2014-01-02 03:48:07.057782,231.84,1 205 | 2014-01-02 03:50:48.327042,232.01,1 206 | 2014-01-02 03:51:21.260088,232.35,1 207 | 2014-01-02 03:52:22.613115,232.25,1 208 | 2014-01-02 05:03:09.695827,232.3,4 209 | 2014-01-02 05:03:12.709372,232.3,3 210 | 2014-01-02 05:03:14.081786,232.3,1 211 | 2014-01-02 05:03:18.071779,232.3,2 212 | 2014-01-02 05:03:29.101640,232.3,1 213 | 2014-01-02 05:03:38.447350,232.3,3 214 | 2014-01-02 05:07:24.259171,232.13,1 215 | 2014-01-02 05:07:25.715734,232.16,1 216 | 2014-01-02 05:15:11.338651,232.07,3 217 | 2014-01-02 05:15:11.442118,232.07,1 218 | 2014-01-02 05:15:11.443597,232.07,1 219 | 2014-01-02 05:15:11.535078,232.07,1 220 | 2014-01-02 05:15:11.574801,232.07,1 221 | 2014-01-02 05:50:18.618189,232.33,1 222 | 2014-01-02 05:54:31.097315,232.27,1 223 | 2014-01-02 05:55:27.057171,232.5,1 224 | 2014-01-02 05:55:27.118070,232.57,1 225 | 2014-01-02 05:56:15.097961,232.7,1 226 | 2014-01-02 05:56:17.769662,232.7,1 227 | 2014-01-02 05:56:46.987705,232.78,1 228 | 2014-01-02 05:57:27.354956,232.78,7 229 | 2014-01-02 05:57:40.822115,232.78,1 230 | 2014-01-02 05:57:41.214669,232.78,1 231 | 2014-01-02 05:57:41.786190,232.78,1 232 | 2014-01-02 05:57:53.733433,232.78,1 233 | 2014-01-02 05:59:58.584004,233.0,2 234 | 2014-01-02 06:01:38.525522,233.15,1 235 | 2014-01-02 06:02:09.588141,233.04,1 236 | 2014-01-02 06:05:02.324734,233.2,1 237 | 2014-01-02 06:05:03.106198,233.2,1 238 | 2014-01-02 06:06:00.883703,233.18,1 239 | 2014-01-02 06:25:19.709491,233.13,10 240 | 2014-01-02 06:27:44.064305,233.07,1 241 | 2014-01-02 06:31:48.477396,232.65,1 242 | 2014-01-02 06:35:35.904379,232.6,1 243 | 2014-01-02 06:43:36.951378,232.58,1 244 | 2014-01-02 06:43:59.193577,232.57,1 245 | 2014-01-02 06:45:51.320171,232.68,1 246 | 2014-01-02 06:49:03.477736,232.61,1 247 | 2014-01-02 06:49:17.161502,232.56,1 248 | 2014-01-02 06:53:20.726352,232.5,1 249 | 2014-01-02 06:57:07.432829,232.6,1 250 | 2014-01-02 07:00:54.931335,232.7,5 251 | 2014-01-02 07:02:18.234632,232.73,1 252 | 2014-01-02 07:02:21.901932,232.73,1 253 | 2014-01-02 07:02:27.271412,232.73,1 254 | 2014-01-02 07:02:33.697229,232.73,1 255 | 2014-01-02 07:02:52.363462,232.73,1 256 | 2014-01-02 07:06:58.922052,232.59,1 257 | 2014-01-02 07:06:58.922218,232.59,1 258 | 2014-01-02 07:06:59.018587,232.59,1 259 | 2014-01-02 07:07:23.753207,232.4,1 260 | 2014-01-02 07:08:41.044242,232.38,8 261 | 2014-01-02 07:11:30.754504,232.49,2 262 | 2014-01-02 07:18:49.549174,232.45,1 263 | 2014-01-02 07:29:32.202953,232.62,5 264 | 2014-01-02 07:41:40.082567,232.63,1 265 | 2014-01-02 07:48:14.298023,232.66,8 266 | 
2014-01-02 07:51:25.501573,232.6,1 267 | 2014-01-02 07:52:48.219426,232.58,1 268 | 2014-01-02 07:54:42.114379,232.6,1 269 | 2014-01-02 07:55:58.984604,232.74,1 270 | 2014-01-02 07:55:59.190695,232.74,1 271 | 2014-01-02 07:56:31.557125,232.71,1 272 | 2014-01-02 07:58:29.061510,232.82,1 273 | 2014-01-02 07:58:33.884088,232.81,1 274 | 2014-01-02 07:59:01.756782,232.85,1 275 | 2014-01-02 07:59:30.519842,232.9,1 276 | 2014-01-02 07:59:58.497827,232.85,2 277 | 2014-01-02 08:00:24.023655,232.85,1 278 | 2014-01-02 08:02:24.604034,232.81,1 279 | 2014-01-02 08:03:25.391236,232.79,5 280 | 2014-01-02 08:04:26.688300,232.77,1 281 | 2014-01-02 08:05:53.019171,232.75,1 282 | 2014-01-02 08:06:48.734532,232.72,1 283 | 2014-01-02 08:07:25.160328,232.69,1 284 | 2014-01-02 08:08:31.034361,232.69,1 285 | 2014-01-02 08:09:46.885938,232.67,1 286 | 2014-01-02 08:11:35.136284,232.69,1 287 | 2014-01-02 08:13:02.088329,232.76,1 288 | 2014-01-02 08:13:39.831056,232.76,1 289 | -------------------------------------------------------------------------------- /session4.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "------------------------------------------\n", 8 | "\n", 9 | "## Session contents\n", 10 | "### [10. Working with Timestamps](#timestamps)\n", 11 | "### [11. Resampling data](#resampling_data)\n", 12 | "### [12. Merging by time](#merge_time)\n", 13 | "### [Exercise set 4](#exercises4)" 14 | ] 15 | }, 16 | { 17 | "cell_type": "markdown", 18 | "metadata": {}, 19 | "source": [ 20 | "-------------------\n", 21 | "\n", 22 | "## 10. Working with Timestamps\n", 23 | "\n", 24 | "Since financial data are often time series, it makes sense to set the index of our DataFrame to be the timestamp of each event. Pandas provides a __pd.to_datetime()__ function to convert strings or other date objects into datetime64 objects that pandas likes to work with.\n", 25 | "\n", 26 | "When a column of timestamps (datetime64 objects) is set as the index, a lot of time-series methods for the DataFrame become available.\n", 27 | "\n", 28 | "
\n", 29 | "\n", 30 | "Additional resources\n", 31 | "\n", 32 | "http://pandas.pydata.org/pandas-docs/stable/timeseries.html\n", 33 | "http://nbviewer.jupyter.org/github/jakevdp/PythonDataScienceHandbook/blob/master/notebooks/03.11-Working-with-Time-Series.ipynb#Dates-and-times-in-pandas:-best-of-both-worlds\n", 34 | "\n", 35 | "http://pandas.pydata.org/pandas-docs/version/0.19.0/generated/pandas.DataFrame.asof.html\n" 36 | ] 37 | }, 38 | { 39 | "cell_type": "markdown", 40 | "metadata": {}, 41 | "source": [ 42 | "
\n", 43 | "\n", 44 | "If you have just started here or would like to refresh your df_qte object, run the line below." 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": null, 50 | "metadata": {}, 51 | "outputs": [], 52 | "source": [ 53 | "import pandas as pd\n", 54 | "import numpy as np\n", 55 | "import datetime as dt" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": null, 61 | "metadata": {}, 62 | "outputs": [], 63 | "source": [ 64 | "df_qte = pd.read_csv('data/quote.csv')\n", 65 | "df_qte.columns = ['Index', 'Time', 'BidPrice', 'BidSize', 'AskPrice', 'AskSize']\n", 66 | "df_qte.drop(columns='Index', inplace=True)" 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": null, 72 | "metadata": {}, 73 | "outputs": [], 74 | "source": [ 75 | "df_qte.head()" 76 | ] 77 | }, 78 | { 79 | "cell_type": "markdown", 80 | "metadata": {}, 81 | "source": [ 82 | "We would like to replace the index of our DataFrame with the values in the column 'Time', converting these values into datetime objects in the process. \n", 83 | "\n", 84 | "Check for yourself what type of an object the values in column 'Time' are currently stored as." 85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": null, 90 | "metadata": {}, 91 | "outputs": [], 92 | "source": [] 93 | }, 94 | { 95 | "cell_type": "markdown", 96 | "metadata": {}, 97 | "source": [ 98 | "Were we to set the Time column as the index right now, pandas would not recognise it as an index of timestamps.\n", 99 | "\n", 100 | "Let's try out the pd.to_datetime function on the first Time value." 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": null, 106 | "metadata": {}, 107 | "outputs": [], 108 | "source": [] 109 | }, 110 | { 111 | "cell_type": "markdown", 112 | "metadata": {}, 113 | "source": [ 114 | "This function works for a variety of time objects and formats." 115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": null, 120 | "metadata": {}, 121 | "outputs": [], 122 | "source": [ 123 | "pd.to_datetime(dt.datetime(2017, 5, 5, 12, 30))" 124 | ] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "execution_count": null, 129 | "metadata": {}, 130 | "outputs": [], 131 | "source": [ 132 | "pd.to_datetime('2017-05-05 12:30')" 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": null, 138 | "metadata": {}, 139 | "outputs": [], 140 | "source": [ 141 | "pd.to_datetime(1493987400000000000)" 142 | ] 143 | }, 144 | { 145 | "cell_type": "markdown", 146 | "metadata": {}, 147 | "source": [ 148 | "The latter format is called Unix time or Unix epoch and is the standard high-precision timestamp format in our trading systems. It is defined as the number of seconds (or milli/micro/nanoseconds) since midnight 1 January 1970 (1/1/1970 00:00:00 GMT)." 149 | ] 150 | }, 151 | { 152 | "cell_type": "markdown", 153 | "metadata": {}, 154 | "source": [ 155 | "We can set Time as a DateTimeIndex by first converting the column's values to pandas Timestamp objects, then setting it as the index in the usual way." 
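Added aside (not part of the original notebook): when converting Unix-epoch numbers, pd.to_datetime accepts a unit argument stating the resolution of the input; plain integers are interpreted as nanoseconds by default. A minimal sketch:

    import pandas as pd

    pd.to_datetime(1493987400, unit='s')             # seconds since 1970-01-01 -> Timestamp('2017-05-05 12:30:00')
    pd.to_datetime(1493987400000000000, unit='ns')   # nanoseconds, the default for plain integers -> same timestamp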
156 | ] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "execution_count": null, 161 | "metadata": {}, 162 | "outputs": [], 163 | "source": [ 164 | "df_qte['Time'] = pd.to_datetime(df_qte['Time'])\n", 165 | "df_qte = df_qte.set_index('Time')" 166 | ] 167 | }, 168 | { 169 | "cell_type": "code", 170 | "execution_count": null, 171 | "metadata": {}, 172 | "outputs": [], 173 | "source": [ 174 | "df_qte.head()" 175 | ] 176 | }, 177 | { 178 | "cell_type": "markdown", 179 | "metadata": {}, 180 | "source": [ 181 | "Check that the index of the DataFrame is now a DatetimeIndex." 182 | ] 183 | }, 184 | { 185 | "cell_type": "code", 186 | "execution_count": null, 187 | "metadata": {}, 188 | "outputs": [], 189 | "source": [] 190 | }, 191 | { 192 | "cell_type": "markdown", 193 | "metadata": { 194 | "collapsed": true 195 | }, 196 | "source": [ 197 | "The DateTimeIndex is a lot more flexible than a regular index in how data can be selected. We can use loc with a datetime object to return a row at that exact time." 198 | ] 199 | }, 200 | { 201 | "cell_type": "code", 202 | "execution_count": null, 203 | "metadata": {}, 204 | "outputs": [], 205 | "source": [ 206 | "df_qte.loc[dt.datetime(2014, 1, 2, 1, 8, 22, 692413)]" 207 | ] 208 | }, 209 | { 210 | "cell_type": "markdown", 211 | "metadata": {}, 212 | "source": [ 213 | "We can also use loc with a datetime string to return all rows that match the specified datetime, up to the level of detail provided." 214 | ] 215 | }, 216 | { 217 | "cell_type": "code", 218 | "execution_count": null, 219 | "metadata": {}, 220 | "outputs": [], 221 | "source": [ 222 | "df_qte.loc['2014-01-02 00:45']" 223 | ] 224 | }, 225 | { 226 | "cell_type": "markdown", 227 | "metadata": {}, 228 | "source": [ 229 | "Slicing with datetime strings allows for selection of a specified time period." 230 | ] 231 | }, 232 | { 233 | "cell_type": "code", 234 | "execution_count": null, 235 | "metadata": {}, 236 | "outputs": [], 237 | "source": [ 238 | "df_qte.loc['2014-01-02 00:00':'2014-01-02 01:14']" 239 | ] 240 | }, 241 | { 242 | "cell_type": "markdown", 243 | "metadata": {}, 244 | "source": [ 245 | "To select the same time period across multiple days, we can use the between_time method. This can be useful for separating out morning and afternoon trading sessions, or filtering out the auction period." 246 | ] 247 | }, 248 | { 249 | "cell_type": "code", 250 | "execution_count": null, 251 | "metadata": { 252 | "scrolled": true 253 | }, 254 | "outputs": [], 255 | "source": [ 256 | "df_qte.between_time('00:00', '01:15')" 257 | ] 258 | }, 259 | { 260 | "cell_type": "markdown", 261 | "metadata": {}, 262 | "source": [ 263 | "Finally, the df.asof() method takes a timestamp and returns the most recent non-NaN row. Try finding the last quote prices as at 01:15 with this method." 264 | ] 265 | }, 266 | { 267 | "cell_type": "code", 268 | "execution_count": null, 269 | "metadata": {}, 270 | "outputs": [], 271 | "source": [] 272 | }, 273 | { 274 | "cell_type": "markdown", 275 | "metadata": {}, 276 | "source": [ 277 | "
\n", 278 | "### Working with timezones - OneTick\n", 279 | "\n", 280 | "Depending on the timezone parameter selected in your otq query, your timestamps may be imported as a tz-aware object. This is an object that belongs to the datetime (dt) package. \n", 281 | "\n", 282 | "If you would like to convert a tz-aware index into a tz-naïve index, see the following Stackoverflow post. This is especially relevant when you are pulling data from different databases in OneTick.\n", 283 | "\n", 284 | "http://stackoverflow.com/questions/16628819/convert-pandas-timezone-aware-datetimeindex-to-naive-timestamp-but-in-certain-t\n" 285 | ] 286 | }, 287 | { 288 | "cell_type": "markdown", 289 | "metadata": {}, 290 | "source": [ 291 | "[Back to top](#top)" 292 | ] 293 | }, 294 | { 295 | "cell_type": "markdown", 296 | "metadata": {}, 297 | "source": [ 298 | "------------\n", 299 | "\n", 300 | "## 11. Resampling data\n", 301 | "\n", 302 | "The main method used to resample data is __df.resample__, which is available when the DataFrame/Series object has a valid DatetimeIndex.\n", 303 | "\n", 304 | "It is common to combine resampling with __dropna()__ or __fillna()__ methods as resampling upwards (higher frequency than the original data) will create null values." 305 | ] 306 | }, 307 | { 308 | "cell_type": "markdown", 309 | "metadata": {}, 310 | "source": [ 311 | "
\n", 312 | "\n", 313 | "YouTube video\n", 314 | "\n", 315 | "Watch the following video until around the 2 hour 13 minute mark.\n", 316 | "\n", 317 | "https://www.youtube.com/watch?v=JNfxr4BQrLk&start=6956" 318 | ] 319 | }, 320 | { 321 | "cell_type": "markdown", 322 | "metadata": {}, 323 | "source": [ 324 | "**Key methods covered:**\n", 325 | "\n", 326 | " df.resample - similar to .asfreq\n", 327 | " df.fillna - fills NaNs according to specified logic" 328 | ] 329 | }, 330 | { 331 | "cell_type": "markdown", 332 | "metadata": {}, 333 | "source": [ 334 | "
\n", 335 | "\n", 336 | "Additional resources\n", 337 | "\n", 338 | "http://nbviewer.jupyter.org/github/jakevdp/PythonDataScienceHandbook/blob/master/notebooks/03.11-Working-with-Time-Series.ipynb#Resampling,-Shifting,-and-Windowing" 339 | ] 340 | }, 341 | { 342 | "cell_type": "markdown", 343 | "metadata": {}, 344 | "source": [ 345 | "
\n", 346 | "\n", 347 | "If you have just started here or would like to refresh your df_qte object, run the line below." 348 | ] 349 | }, 350 | { 351 | "cell_type": "code", 352 | "execution_count": null, 353 | "metadata": {}, 354 | "outputs": [], 355 | "source": [ 356 | "import pandas as pd\n", 357 | "import numpy as np\n", 358 | "import datetime as dt" 359 | ] 360 | }, 361 | { 362 | "cell_type": "code", 363 | "execution_count": null, 364 | "metadata": {}, 365 | "outputs": [], 366 | "source": [ 367 | "df_qte = pd.read_csv('data/quote.csv')\n", 368 | "df_qte.columns = ['Index', 'Time', 'BidPrice', 'BidSize', 'AskPrice', 'AskSize']\n", 369 | "df_qte.drop(columns='Index', inplace=True)\n", 370 | "df_qte['Time'] = pd.to_datetime(df_qte['Time'])" 371 | ] 372 | }, 373 | { 374 | "cell_type": "markdown", 375 | "metadata": { 376 | "collapsed": true 377 | }, 378 | "source": [ 379 | "Let's try resampling the data into 5 minute buckets using the df.resample() method.\n", 380 | "\n", 381 | "http://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.resample.html" 382 | ] 383 | }, 384 | { 385 | "cell_type": "markdown", 386 | "metadata": {}, 387 | "source": [ 388 | "If we simply call .resample with a frequency (e.g. '100ms', '1s', '5min', '2h', '1D'), we will end up with a DatetimeIndexResampler object." 389 | ] 390 | }, 391 | { 392 | "cell_type": "code", 393 | "execution_count": null, 394 | "metadata": {}, 395 | "outputs": [], 396 | "source": [ 397 | "df_qte = df_qte.set_index('Time')" 398 | ] 399 | }, 400 | { 401 | "cell_type": "markdown", 402 | "metadata": {}, 403 | "source": [ 404 | "This is because we have yet to specify how we want to resample the data. Try applying a __.last(), .mean()__, or __.sum()__ operation after .resample. In what situations would each be the appropriate operation to use?" 405 | ] 406 | }, 407 | { 408 | "cell_type": "code", 409 | "execution_count": null, 410 | "metadata": { 411 | "scrolled": true 412 | }, 413 | "outputs": [], 414 | "source": [ 415 | "df_qte.resample('20min').mean()" 416 | ] 417 | }, 418 | { 419 | "cell_type": "code", 420 | "execution_count": null, 421 | "metadata": {}, 422 | "outputs": [], 423 | "source": [] 424 | }, 425 | { 426 | "cell_type": "code", 427 | "execution_count": null, 428 | "metadata": {}, 429 | "outputs": [], 430 | "source": [] 431 | }, 432 | { 433 | "cell_type": "markdown", 434 | "metadata": {}, 435 | "source": [ 436 | "Resampling often results in NaNs. This happens when no data comes through within that sample period. \n", 437 | "\n", 438 | "It is appropriate to forward fill the NaNs if the data represents updates to the state of some object (e.g. an order book, a volatility curve, an autotrader's parameters). If the data represents individual events like trade ticks, then forward filling will overcount trade volumes. In this case, it is more appropriate to fill NaNs with 0, to drop the NaNs, or to simply keep them in the DataFrame." 439 | ] 440 | }, 441 | { 442 | "cell_type": "markdown", 443 | "metadata": {}, 444 | "source": [ 445 | "Sometimes our data consists of distinct groups and we want to apply a resampling operation - for instance, resample all relative expiries of a volatility surface. Let's load up that data again." 
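Before reloading, a short illustration of the point above about filling NaNs (an added sketch, not part of the original notebook; it assumes the df_qte quote data prepared earlier in this session and the trade data in data/trade.csv):

    # state-like data (quotes): carry the last observed values forward into empty buckets
    quotes_1min = df_qte.resample('1min').last().ffill()

    # event-like data (trades): sum the traded size per bucket; an empty bucket simply means zero volume
    df_trd = pd.read_csv('data/trade.csv', parse_dates=['TIMESTAMP'], index_col='TIMESTAMP')
    volume_1min = df_trd['SIZE'].resample('1min').sum()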
446 | ] 447 | }, 448 | { 449 | "cell_type": "code", 450 | "execution_count": null, 451 | "metadata": {}, 452 | "outputs": [], 453 | "source": [ 454 | "df_vols = pd.read_csv('data/vols.csv')\n", 455 | "df_vols.TIMESTAMP = pd.to_datetime(df_vols.TIMESTAMP)\n", 456 | "df_vols = df_vols.set_index('TIMESTAMP', drop=True)\n", 457 | "df_vols.head()" 458 | ] 459 | }, 460 | { 461 | "cell_type": "markdown", 462 | "metadata": {}, 463 | "source": [ 464 | "Try resampling and getting the first update every 10 minutes." 465 | ] 466 | }, 467 | { 468 | "cell_type": "code", 469 | "execution_count": null, 470 | "metadata": {}, 471 | "outputs": [], 472 | "source": [] 473 | }, 474 | { 475 | "cell_type": "markdown", 476 | "metadata": {}, 477 | "source": [ 478 | "This operation isn't what we wanted, since we get a relative expiry at random for each time bucket. We can instead perform a resample after a __groupby__ before getting the first entry, so that we resample each relative expiry properly.\n", 479 | "\n", 480 | "Try getting the volatility values for each relative expiry every 10 mins by:\n", 481 | "1. Grouping by RELATIVE_EXPIRY\n", 482 | "2. Resampling\n", 483 | "3. Selecting only the VOLATILITY column\n", 484 | "4. Unstacking RELATIVE_EXPIRY from the index to the columns.\n", 485 | "5. Forward-filling NaNs, if any." 486 | ] 487 | }, 488 | { 489 | "cell_type": "code", 490 | "execution_count": null, 491 | "metadata": {}, 492 | "outputs": [], 493 | "source": [] 494 | }, 495 | { 496 | "cell_type": "code", 497 | "execution_count": null, 498 | "metadata": {}, 499 | "outputs": [], 500 | "source": [] 501 | }, 502 | { 503 | "cell_type": "code", 504 | "execution_count": null, 505 | "metadata": {}, 506 | "outputs": [], 507 | "source": [] 508 | }, 509 | { 510 | "cell_type": "code", 511 | "execution_count": null, 512 | "metadata": {}, 513 | "outputs": [], 514 | "source": [ 515 | "#solutions\n", 516 | "%load solutions/resample_sol1.py" 517 | ] 518 | }, 519 | { 520 | "cell_type": "markdown", 521 | "metadata": {}, 522 | "source": [ 523 | "[Back to top](#top)" 524 | ] 525 | }, 526 | { 527 | "cell_type": "markdown", 528 | "metadata": {}, 529 | "source": [ 530 | "------------\n", 531 | "\n", 532 | "# 12. Merging by time" 533 | ] 534 | }, 535 | { 536 | "cell_type": "markdown", 537 | "metadata": {}, 538 | "source": [ 539 | "The joins we covered in the previous session were __exact__ joins. That is, the data were only joined together if the values of each key were identical.\n", 540 | "\n", 541 | "If we want to join two time series together, we'll find that the timestamps rarely match exactly. The join we are after is usually along the lines of \"for each row in [left source], give me the most recent data from [right source]\".\n", 542 | "\n", 543 | "The classic example in trading data is joining trade and quote data together. We want to know, for each trade, what the state of the order book was at that time.\n", 544 | "\n", 545 | "Pandas implements this kind of join with the function\n", 546 | "\n", 547 | " pd.merge_asof()\n", 548 | " \n", 549 | "which is very similar to pd.merge. There a few additional arguments that are specific to pd.merge_asof:\n", 550 | "\n", 551 | " direction - whether to find the matching row from the right source, either 'backward' (default), 'forward' (i.e. next row), or 'nearest'\n", 552 | " tolerance - only match if the difference between indexes is below this number, e.g. 
tolerance=pd.Timedelta('1s') will only join the right source if its time was within 1 second of the left source\n", 553 | " by - do the time join for each value in these columns separately. E.g. by='FEEDCODE' will perform the join for each feedcode separately. " 554 | ] 555 | }, 556 | { 557 | "cell_type": "markdown", 558 | "metadata": {}, 559 | "source": [ 560 | "
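A minimal sketch of these arguments (added illustration, not part of the original notebook; it assumes the df_trd and df_qte DataFrames that are loaded below, both indexed by timestamp):

    pd.merge_asof(df_trd, df_qte,
                  left_index=True, right_index=True,
                  direction='backward',              # most recent quote at or before each trade
                  tolerance=pd.Timedelta('1s'))      # quote columns become NaN if the last quote is older than 1 second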
\n", 561 | "\n", 562 | "Additional resources\n", 563 | "\n", 564 | "https://pandas.pydata.org/pandas-docs/stable/generated/pandas.merge_asof.html" 565 | ] 566 | }, 567 | { 568 | "cell_type": "markdown", 569 | "metadata": {}, 570 | "source": [ 571 | "Let's load up both quote and trade data." 572 | ] 573 | }, 574 | { 575 | "cell_type": "code", 576 | "execution_count": null, 577 | "metadata": {}, 578 | "outputs": [], 579 | "source": [ 580 | "df_qte = pd.read_csv('data/quote.csv')\n", 581 | "df_qte.columns = ['Index', 'Time', 'BidPrice', 'BidSize', 'AskPrice', 'AskSize']\n", 582 | "df_qte.Time = pd.to_datetime(df_qte.Time)\n", 583 | "df_qte.drop(columns='Index', inplace=True)\n", 584 | "df_qte = df_qte.set_index('Time', drop=True)\n", 585 | "df_qte.head()" 586 | ] 587 | }, 588 | { 589 | "cell_type": "code", 590 | "execution_count": null, 591 | "metadata": {}, 592 | "outputs": [], 593 | "source": [ 594 | "df_trd = pd.read_csv('data/trade.csv')\n", 595 | "df_trd.TIMESTAMP = pd.to_datetime(df_trd.TIMESTAMP)\n", 596 | "df_trd = df_trd.set_index('TIMESTAMP', drop=True)\n", 597 | "df_trd.head()" 598 | ] 599 | }, 600 | { 601 | "cell_type": "markdown", 602 | "metadata": {}, 603 | "source": [ 604 | "For each trade, we can join on the most recent quote with pd.merge. Since our join key, the timestamps, are in each DataFrame's index, we have to set left_index=True and right_index=True." 605 | ] 606 | }, 607 | { 608 | "cell_type": "code", 609 | "execution_count": null, 610 | "metadata": {}, 611 | "outputs": [], 612 | "source": [ 613 | "df = pd.merge_asof(df_trd, df_qte, left_index=True, right_index=True)\n", 614 | "df" 615 | ] 616 | }, 617 | { 618 | "cell_type": "markdown", 619 | "metadata": {}, 620 | "source": [ 621 | "It looks like our trade prices don't always match up with the best bid or offer, i.e. there are synchronisation issues. Try to calculate the percentage of trades that don't line up with our quote data, using conditional expressions." 622 | ] 623 | }, 624 | { 625 | "cell_type": "code", 626 | "execution_count": null, 627 | "metadata": {}, 628 | "outputs": [], 629 | "source": [] 630 | }, 631 | { 632 | "cell_type": "code", 633 | "execution_count": null, 634 | "metadata": {}, 635 | "outputs": [], 636 | "source": [ 637 | "#solutions\n", 638 | "%load solutions/time_sol1.py" 639 | ] 640 | }, 641 | { 642 | "cell_type": "markdown", 643 | "metadata": {}, 644 | "source": [ 645 | "[Back to top](#top)" 646 | ] 647 | }, 648 | { 649 | "cell_type": "markdown", 650 | "metadata": {}, 651 | "source": [ 652 | "------------\n", 653 | "\n", 654 | "# Exercise set 4" 655 | ] 656 | }, 657 | { 658 | "cell_type": "markdown", 659 | "metadata": {}, 660 | "source": [ 661 | "Calculate the 30-minute price change for each trade, and plot the price changes as a time series and a histogram. Your 30-minute delayed price should be the midpoint of the nearest quote update. 
(Hint: Use __pd.merge_asof__, __df.tshift__, and __df.plot.hist__.)" 662 | ] 663 | }, 664 | { 665 | "cell_type": "code", 666 | "execution_count": null, 667 | "metadata": {}, 668 | "outputs": [], 669 | "source": [] 670 | }, 671 | { 672 | "cell_type": "code", 673 | "execution_count": null, 674 | "metadata": {}, 675 | "outputs": [], 676 | "source": [] 677 | }, 678 | { 679 | "cell_type": "code", 680 | "execution_count": null, 681 | "metadata": {}, 682 | "outputs": [], 683 | "source": [] 684 | }, 685 | { 686 | "cell_type": "code", 687 | "execution_count": null, 688 | "metadata": {}, 689 | "outputs": [], 690 | "source": [] 691 | }, 692 | { 693 | "cell_type": "code", 694 | "execution_count": null, 695 | "metadata": {}, 696 | "outputs": [], 697 | "source": [] 698 | }, 699 | { 700 | "cell_type": "code", 701 | "execution_count": null, 702 | "metadata": {}, 703 | "outputs": [], 704 | "source": [] 705 | } 706 | ], 707 | "metadata": { 708 | "anaconda-cloud": {}, 709 | "kernelspec": { 710 | "display_name": "Python 2", 711 | "language": "python", 712 | "name": "python2" 713 | }, 714 | "language_info": { 715 | "codemirror_mode": { 716 | "name": "ipython", 717 | "version": 2 718 | }, 719 | "file_extension": ".py", 720 | "mimetype": "text/x-python", 721 | "name": "python", 722 | "nbconvert_exporter": "python", 723 | "pygments_lexer": "ipython2", 724 | "version": "2.7.14" 725 | } 726 | }, 727 | "nbformat": 4, 728 | "nbformat_minor": 1 729 | } 730 | -------------------------------------------------------------------------------- /session1.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Data analysis with pandas\n", 8 | "\n", 9 | "# Introduction\n", 10 | "The goal of this course is to provide you with an overview of the pandas package and to introduce key functionalities. For each topic within the course, the plan is to begin by watching a brief video to gain a high-level understanding of the techniques. This is then supplemented with online documentation and hands-on exercises using data extracted from OneTick to deepen your knowledge. Please work through the course at your own pace and by making full use of the exercises contained in this notebook. Worked solutions are usually provided, but please do not refer to them until you've attempted the exercise yourself.\n" 11 | ] 12 | }, 13 | { 14 | "cell_type": "markdown", 15 | "metadata": {}, 16 | "source": [ 17 | "## Tips for using Jupyter notebooks efficiently" 18 | ] 19 | }, 20 | { 21 | "cell_type": "markdown", 22 | "metadata": {}, 23 | "source": [ 24 | "**Keyboard shortcuts**\n", 25 | "\n", 26 | "A list of keyboard shortcuts can be viewed by clicking Help > Keyboard Shortcuts in the notebook menu above. The most useful are\n", 27 | "\n", 28 | " Shift+Enter: run a cell\n", 29 | " Alt+Enter: run a cell and insert a new cell below\n", 30 | " Ctrl+/: comment/uncomment a line" 31 | ] 32 | }, 33 | { 34 | "cell_type": "markdown", 35 | "metadata": {}, 36 | "source": [ 37 | "**Autocompletion**\n", 38 | "\n", 39 | "To view and select from an object's available attributes, type a dot after the object's name then press the Tab key. First, let's import pandas." 
40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": null, 45 | "metadata": {}, 46 | "outputs": [], 47 | "source": [ 48 | "import pandas as pd" 49 | ] 50 | }, 51 | { 52 | "cell_type": "markdown", 53 | "metadata": {}, 54 | "source": [ 55 | "Now uncomment the line below with Ctrl+/, place the cursor after the dot, then press Tab." 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": null, 61 | "metadata": {}, 62 | "outputs": [], 63 | "source": [ 64 | "# pd." 65 | ] 66 | }, 67 | { 68 | "cell_type": "markdown", 69 | "metadata": {}, 70 | "source": [ 71 | "**Documentation**\n", 72 | "\n", 73 | "To see a method's documentation without visiting the online page, open a pair of parentheses after the method's name, place the cursor between them, then press Shift+Tab. This brings up the method's \"signature\" (its list of inputs), and documentation. While still holding Shift, each futher press of the Tab key enlarges the window.\n", 74 | "\n", 75 | "Select the __pd.read_csv()__ method using the autocomplete menu, then bring up the method's documentation.\n", 76 | "\n", 77 | "Alternatively, you can run __help(pd.read_csv)__ to display the entire documentation as output in the cell." 78 | ] 79 | }, 80 | { 81 | "cell_type": "markdown", 82 | "metadata": {}, 83 | "source": [ 84 | "**Error messages and debugging**\n", 85 | "\n", 86 | "When starting out with a new package, your code will likely contain bugs that will throw error messages. Please do not give up immediately - take the time to read what the error message says! It will usually spell out what went wrong and therefore how to fix it.\n", 87 | "\n", 88 | "It is often a good idea to isolate the issue to the smallest snippet of code that reproduces the error (a \"Minimal Working Example\").\n", 89 | "\n", 90 | "If you cannot figure out what the error message means, just run a Google search on your problem. Since pandas is such a widely used package, the first result will often be a Stack Overflow page with an accepted solution to your question. Also, check the documentation to make sure you actually understand the functionality that you're trying to use.\n", 91 | "\n", 92 | "If you still cannot solve the issue, then ask for assistance from your instructor or colleagues.\n", 93 | "\n", 94 | "This procedure will initially be less efficient than asking your colleagues to debug your code for you, but will help you improve and become more independent in your programming." 95 | ] 96 | }, 97 | { 98 | "cell_type": "markdown", 99 | "metadata": {}, 100 | "source": [ 101 | "**Trying something new in pandas**\n", 102 | "\n", 103 | "Pandas has been developed in such a way that it is very forgiving - if you think an operation should work, it most likely will! Don't be afraid to just try out the most obvious solution and see if it works.\n", 104 | "\n", 105 | "Pandas also has extensive mathematical and data analysis functionality, so don't re-invent the wheel - try searching through the documentation to see if a function already exists before writing your own code." 106 | ] 107 | }, 108 | { 109 | "cell_type": "markdown", 110 | "metadata": {}, 111 | "source": [ 112 | "**Saving your notebooks**\n", 113 | "\n", 114 | "You can save your notebooks with File > Save and Checkpoint or with Esc then s. Notebooks can also be downloaded with File > Download as > Notebook. If you're running the notebook on an external server, you'll need to download your notebook when you finish your session and re-upload it to continue your work." 
115 | ] 116 | }, 117 | { 118 | "cell_type": "markdown", 119 | "metadata": {}, 120 | "source": [ 121 | "------------------------------------------\n", 122 | "\n", 123 | "## Session contents\n", 124 | "### [1. Data structures](#data_structures)\n", 125 | "### [2. Importing data](#importing_data)\n", 126 | "### [3. Selecting data](#selecting_data)\n", 127 | "### [Exercise set 1](#exercises1)" 128 | ] 129 | }, 130 | { 131 | "cell_type": "markdown", 132 | "metadata": {}, 133 | "source": [ 134 | "--------------------------------------\n", 135 | "\n", 136 | "\n", 137 | "## 1. Data structures\n", 138 | "\n", 139 | "There are two main data structures in the pandas library – a DataFrame object and a Series object. Both objects have axis labels that are collectively referred to as the index. You can think of a Series object as a 1-dimensional array where each entry has an *index label*. Similarly, a DataFrame is a 2-dimensional structure, akin to a matrix or a spreadsheet, but each row has a corresponding *index label*. The *index* is a sequential list of these *index labels*. It is possible to reference a subset of the data by calling the corresponding *index labels*." 140 | ] 141 | }, 142 | { 143 | "cell_type": "markdown", 144 | "metadata": {}, 145 | "source": [ 146 | "
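A minimal sketch of the two structures (added illustration, not part of the original notebook):

    import pandas as pd

    s = pd.Series([12.5, 14.0, 21.5], index=['Monday', 'Tuesday', 'Wednesday'])  # 1-D array with index labels
    df = pd.DataFrame({'volatility': s})                                         # 2-D table sharing the same index
    s.loc['Tuesday']   # select a value by its index label -> 14.0
    df.shape           # (3, 1): three rows, one column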
\n", 147 | "\n", 148 | "YouTube video\n", 149 | "\n", 150 | "Watch until around the 22 minute mark to get a better understanding of data structures:\n", 151 | "\n", 152 | "https://www.youtube.com/watch?v=dye7rDktJ2E&start=678" 153 | ] 154 | }, 155 | { 156 | "cell_type": "markdown", 157 | "metadata": {}, 158 | "source": [ 159 | "**Key methods and attributes covered:**\n", 160 | "\n", 161 | " pd.read_csv() - imports csv as a DataFrame or Series object\n", 162 | " df.head() - returns the first few rows of a DataFrame\n", 163 | " type() - returns an object's type\n", 164 | " df.shape - the shape (dimension) of a DataFrame\n", 165 | " df.columns - a list of the DataFrame's columns\n", 166 | " df.dtypes - a list of the object types by column" 167 | ] 168 | }, 169 | { 170 | "cell_type": "markdown", 171 | "metadata": {}, 172 | "source": [ 173 | "
\n", 174 | "\n", 175 | "Additional resources\n", 176 | "\n", 177 | "http://pandas.pydata.org/pandas-docs/stable/dsintro.html\n", 178 | "http://nbviewer.jupyter.org/github/jakevdp/PythonDataScienceHandbook/blob/master/notebooks/03.01-Introducing-Pandas-Objects.ipynb\n" 179 | ] 180 | }, 181 | { 182 | "cell_type": "markdown", 183 | "metadata": {}, 184 | "source": [ 185 | "[Back to top](#top)" 186 | ] 187 | }, 188 | { 189 | "cell_type": "markdown", 190 | "metadata": { 191 | "collapsed": true 192 | }, 193 | "source": [ 194 | "----------------------------\n", 195 | "\n", 196 | "## 2. Importing the datasets\n" 197 | ] 198 | }, 199 | { 200 | "cell_type": "markdown", 201 | "metadata": { 202 | "collapsed": true 203 | }, 204 | "source": [ 205 | "Probably the most common method to import data is __pd.read_csv__, which takes a CSV (comma-separated variable) file and returns a pandas DataFrame. Pandas also supports importing Excel workbooks, text files, and the results of SQL queries. See the linked documentation for a full list of import functions and their arguments." 206 | ] 207 | }, 208 | { 209 | "cell_type": "markdown", 210 | "metadata": {}, 211 | "source": [ 212 | "You can load CSV files by using either the relative path from your notebook's working directory (e.g. 'name.csv'), or by giving the absolute path to the CSV file (e.g. 'H:/Desktop/name.csv').\n", 213 | "\n", 214 | " pd.read_csv('name.csv', names=[column1, column2, ...], index_col=True/False)\n", 215 | "\n", 216 | "\n", 217 | "
\n", 218 | "\n", 219 | "Additional resources\n", 220 | "\n", 221 | "http://pandas.pydata.org/pandas-docs/stable/io.html\n" 222 | ] 223 | }, 224 | { 225 | "cell_type": "markdown", 226 | "metadata": {}, 227 | "source": [ 228 | "Let's start by importing __quote.csv__ in the __data/__ folder into a DataFrame called __df_qte__, then calling __.head()__ on the DataFrame to view the first few rows." 229 | ] 230 | }, 231 | { 232 | "cell_type": "code", 233 | "execution_count": null, 234 | "metadata": {}, 235 | "outputs": [], 236 | "source": [] 237 | }, 238 | { 239 | "cell_type": "code", 240 | "execution_count": null, 241 | "metadata": { 242 | "scrolled": true 243 | }, 244 | "outputs": [], 245 | "source": [ 246 | "# solutions\n", 247 | "%load solutions/import_sol1.py" 248 | ] 249 | }, 250 | { 251 | "cell_type": "markdown", 252 | "metadata": {}, 253 | "source": [ 254 | "As you can see, we now have a DataFrame that contains a day of quote data for a security.\n", 255 | "\n", 256 | "Bids are buy orders resting in the market, and asks/offers are the sell orders. Bids and offers have an associated price and size (\n", 257 | "\n", 258 | "Let's rename the columns so they're easier to read - change 'TIMESTAMP' to 'Timestamp', 'BID_SIZE' to 'BidSize', and so on.\n", 259 | "\n", 260 | "Try this in the cell below setting the __.columns__ attribute to equal a new list of column names." 261 | ] 262 | }, 263 | { 264 | "cell_type": "code", 265 | "execution_count": null, 266 | "metadata": {}, 267 | "outputs": [], 268 | "source": [] 269 | }, 270 | { 271 | "cell_type": "code", 272 | "execution_count": null, 273 | "metadata": {}, 274 | "outputs": [], 275 | "source": [ 276 | "# solutions\n", 277 | "%load solutions/import_sol2.py" 278 | ] 279 | }, 280 | { 281 | "cell_type": "markdown", 282 | "metadata": {}, 283 | "source": [ 284 | "Let's double check that the type of data structure is indeed a DataFrame." 285 | ] 286 | }, 287 | { 288 | "cell_type": "code", 289 | "execution_count": null, 290 | "metadata": {}, 291 | "outputs": [], 292 | "source": [ 293 | "type(df_qte)" 294 | ] 295 | }, 296 | { 297 | "cell_type": "markdown", 298 | "metadata": {}, 299 | "source": [ 300 | "Also check the object type of the 'BidPrice' column." 301 | ] 302 | }, 303 | { 304 | "cell_type": "code", 305 | "execution_count": null, 306 | "metadata": {}, 307 | "outputs": [], 308 | "source": [ 309 | "type(df_qte.BidPrice)" 310 | ] 311 | }, 312 | { 313 | "cell_type": "markdown", 314 | "metadata": {}, 315 | "source": [ 316 | "[Back to top](#top)" 317 | ] 318 | }, 319 | { 320 | "cell_type": "markdown", 321 | "metadata": {}, 322 | "source": [ 323 | "--------------------\n", 324 | "\n", 325 | "## 3. Selecting the data" 326 | ] 327 | }, 328 | { 329 | "cell_type": "markdown", 330 | "metadata": {}, 331 | "source": [ 332 | "You can select columns in a DataFrame object df with either of the following:\n", 333 | "\n", 334 | " df['column_name']\n", 335 | " df.column_name\n", 336 | " \n", 337 | "The former will always work, whereas the latter has some caveats:\n", 338 | "1. It can't be used if the column name contains spaces, e.g. a column named 'white space' cannot be accessed with df.white space.\n", 339 | "2. It can only be used to refer to an existing column, not create a new column.\n", 340 | "\n", 341 | "Since Series and DataFrame objects are ordered, we can slice the data by specifying either label (using the __.loc__ method) or position (using the __.iloc__ method). 
There is also the __.ix__ method, which tries label-based indexing first but reverts to position-based indexing if a label is not found.\n", 342 | "\n", 343 | "Columns should usually be selected by their label rather than position, since column ordering is often arbitrary (like in a spreadsheet). Rows, on the other hand, may be selected by label or position depending on the context, e.g. the 100th trade (position) or the trade at 12:30pm (label)." 344 | ] 345 | }, 346 | { 347 | "cell_type": "markdown", 348 | "metadata": {}, 349 | "source": [ 350 | "
\n", 351 | "It is best to avoid using __.ix__ if possible, in favour of the more explicit .loc and .iloc methods. See the following page for more details about how to avoid using .ix.\n", 352 | "https://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated" 353 | ] 354 | }, 355 | { 356 | "cell_type": "markdown", 357 | "metadata": {}, 358 | "source": [ 359 | "
\n", 360 | "\n", 361 | "YouTube video\n", 362 | "\n", 363 | "Watch the video until around the 40 minute mark for more detail about these three methods of slicing.\n", 364 | "\n", 365 | "\n", 366 | "https://www.youtube.com/watch?v=dye7rDktJ2E&start=1602" 367 | ] 368 | }, 369 | { 370 | "cell_type": "markdown", 371 | "metadata": {}, 372 | "source": [ 373 | "
\n", 374 | "If you would like to modify data within your DataFrame using the above methods (eg. df.ix[0,0] = ‘newvalue’), be wary of the difference between working on a copy of the DataFrame and the DataFrame itself. See the answer to the following Stackoverflow question for more details.\n", 375 | "http://stackoverflow.com/questions/17995328/changing-values-in-pandas-dataframe-doenst-work\n" 376 | ] 377 | }, 378 | { 379 | "cell_type": "markdown", 380 | "metadata": {}, 381 | "source": [ 382 | "**Key methods covered:**\n", 383 | "\n", 384 | " df.loc - selects by explicit index (index name)\n", 385 | " df.iloc - selects by implicit index (index number)\n", 386 | " df.ix - combines loc and iloc functionality" 387 | ] 388 | }, 389 | { 390 | "cell_type": "markdown", 391 | "metadata": {}, 392 | "source": [ 393 | "
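To make the copy-versus-view warning above concrete (an added sketch, not part of the original notebook; it assumes a DataFrame df with a numeric 'volatility' column, like the one built later in this section):

    # chained indexing may assign to a temporary copy, so the change can silently be lost
    df[df.volatility > 15]['volatility'] = 0      # may raise SettingWithCopyWarning; df is possibly unchanged
    # a single .loc call addresses the original DataFrame directly
    df.loc[df.volatility > 15, 'volatility'] = 0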
\n", 394 | "\n", 395 | "Additional resources\n", 396 | "\n", 397 | "http://nbviewer.jupyter.org/github/jakevdp/PythonDataScienceHandbook/blob/master/notebooks/03.02-Data-Indexing-and-Selection.ipynb\n", 398 | "\n", 399 | "http://pandas.pydata.org/pandas-docs/stable/10min.html#viewing-data\n" 400 | ] 401 | }, 402 | { 403 | "cell_type": "markdown", 404 | "metadata": {}, 405 | "source": [ 406 | "To familiarise ourselves with the loc and iloc methods, let's have a play around with this DataFrame.\n", 407 | "\n", 408 | "First, try to use .loc to access the first row." 409 | ] 410 | }, 411 | { 412 | "cell_type": "code", 413 | "execution_count": null, 414 | "metadata": {}, 415 | "outputs": [], 416 | "source": [] 417 | }, 418 | { 419 | "cell_type": "code", 420 | "execution_count": null, 421 | "metadata": {}, 422 | "outputs": [], 423 | "source": [ 424 | "#solutions\n", 425 | "%load solutions/reference_sol0.py" 426 | ] 427 | }, 428 | { 429 | "cell_type": "markdown", 430 | "metadata": {}, 431 | "source": [ 432 | "You can see that the data contained a column called 'Index', which is separate from the actual index of this DataFrame.\n", 433 | "\n", 434 | "Let's drop this 'Index' column using the __.drop__ method to avoid confusion." 435 | ] 436 | }, 437 | { 438 | "cell_type": "code", 439 | "execution_count": null, 440 | "metadata": {}, 441 | "outputs": [], 442 | "source": [] 443 | }, 444 | { 445 | "cell_type": "code", 446 | "execution_count": null, 447 | "metadata": { 448 | "scrolled": true 449 | }, 450 | "outputs": [], 451 | "source": [ 452 | "#solutions\n", 453 | "%load solutions/reference_sol1.py" 454 | ] 455 | }, 456 | { 457 | "cell_type": "markdown", 458 | "metadata": {}, 459 | "source": [ 460 | "Let's use a simpler DataFrame to illustrate the difference between these methods. Say we have a dictionary called my_dict:" 461 | ] 462 | }, 463 | { 464 | "cell_type": "code", 465 | "execution_count": null, 466 | "metadata": {}, 467 | "outputs": [], 468 | "source": [ 469 | "my_dict = {\n", 470 | " 'Index': ['Monday', 'Tuesday', 'Wednesday', 'Thursday'],\n", 471 | " 'optiver_turnover': [ 46386, 43775, 75742, 17474],\n", 472 | " 'total_turnover': [278837, 439771, 583722, 358834],\n", 473 | " 'volatility': [ 12.5, 14.0, 21.5, 16.0],\n", 474 | " }\n", 475 | "my_dict" 476 | ] 477 | }, 478 | { 479 | "cell_type": "markdown", 480 | "metadata": {}, 481 | "source": [ 482 | "We can create a DataFrame from this dictionary by simply passing this dict into pd.DataFrame" 483 | ] 484 | }, 485 | { 486 | "cell_type": "code", 487 | "execution_count": null, 488 | "metadata": {}, 489 | "outputs": [], 490 | "source": [ 491 | "df = pd.DataFrame(my_dict)\n", 492 | "df" 493 | ] 494 | }, 495 | { 496 | "cell_type": "markdown", 497 | "metadata": {}, 498 | "source": [ 499 | "However, it is quite clear that instead of using the default index range(4), it would make more sense to set the column named 'Index' as the index of the DataFrame. \n", 500 | "\n", 501 | "\n", 502 | "There are two ways to do this. Either we can replace the .index of this DataFrame with the column 'Index'\n", 503 | "\n", 504 | " df.index = df['Index']\n", 505 | " " 506 | ] 507 | }, 508 | { 509 | "cell_type": "code", 510 | "execution_count": null, 511 | "metadata": {}, 512 | "outputs": [], 513 | "source": [ 514 | "df.index = df['Index']\n", 515 | "df" 516 | ] 517 | }, 518 | { 519 | "cell_type": "markdown", 520 | "metadata": {}, 521 | "source": [ 522 | "but then we would have 'Index' as both our index and as a duplicated column 'Index'. 
This is why we use the __.set_index__ method of a DataFrame (with drop=True to delete the current index column instead of moving it back into the DataFrame).\n", 523 | " \n", 524 | " df.set_index('Index', drop=True)\n", 525 | "\n", 526 | "Note that df.set_index creates a new DataFrame object by default (so df will be unchanged). We can either overwrite df\n", 527 | "\n", 528 | " df = df.set_index('Index', drop=True)\n", 529 | " \n", 530 | "or modify our existing DataFrame instead with inplace=True\n", 531 | " \n", 532 | " df.set_index('Index', drop=True, inplace=True).\n", 533 | " \n", 534 | "Be aware that inplace operations do not work when \"chaining\" methods together, e.g.\n", 535 | "\n", 536 | " df.drop('Index', 1).set_index('Timestamp', drop=True).\n", 537 | " \n", 538 | "If you don't yet see when and why inplace operations can fail, it's better to avoid them for now." 539 | ] 540 | }, 541 | { 542 | "cell_type": "code", 543 | "execution_count": null, 544 | "metadata": {}, 545 | "outputs": [], 546 | "source": [ 547 | "df.set_index('Index', drop=True, inplace=True)\n", 548 | "df" 549 | ] 550 | }, 551 | { 552 | "cell_type": "markdown", 553 | "metadata": {}, 554 | "source": [ 555 | "If we decide later that we want 'Index' to be a regular column again, we can use the __.reset_index__ to do this.\n", 556 | " \n", 557 | " df.reset_index(inplace=True)\n", 558 | " \n", 559 | "A new default index will be generated." 560 | ] 561 | }, 562 | { 563 | "cell_type": "markdown", 564 | "metadata": {}, 565 | "source": [ 566 | "Using the __.loc__ and/or __.iloc__ methods, try to return Wednesday's volatility." 567 | ] 568 | }, 569 | { 570 | "cell_type": "code", 571 | "execution_count": null, 572 | "metadata": {}, 573 | "outputs": [], 574 | "source": [] 575 | }, 576 | { 577 | "cell_type": "code", 578 | "execution_count": null, 579 | "metadata": {}, 580 | "outputs": [], 581 | "source": [] 582 | }, 583 | { 584 | "cell_type": "code", 585 | "execution_count": null, 586 | "metadata": {}, 587 | "outputs": [], 588 | "source": [ 589 | "#solutions\n", 590 | "%load solutions/reference_sol2.py" 591 | ] 592 | }, 593 | { 594 | "cell_type": "markdown", 595 | "metadata": {}, 596 | "source": [ 597 | "Now, use __.loc__ to set a volatility of 17.0 on Friday." 598 | ] 599 | }, 600 | { 601 | "cell_type": "code", 602 | "execution_count": null, 603 | "metadata": {}, 604 | "outputs": [], 605 | "source": [] 606 | }, 607 | { 608 | "cell_type": "code", 609 | "execution_count": null, 610 | "metadata": {}, 611 | "outputs": [], 612 | "source": [ 613 | "#solutions\n", 614 | "%load solutions/reference_sol3.py" 615 | ] 616 | }, 617 | { 618 | "cell_type": "markdown", 619 | "metadata": {}, 620 | "source": [ 621 | "You'll notice that the turnover columns are missing data on Friday (indicated by __NaN__). We will look at methods for dealing with missing data later in the course." 622 | ] 623 | }, 624 | { 625 | "cell_type": "markdown", 626 | "metadata": {}, 627 | "source": [ 628 | "[Back to top](#top)" 629 | ] 630 | }, 631 | { 632 | "cell_type": "markdown", 633 | "metadata": {}, 634 | "source": [ 635 | "-------------------\n", 636 | "\n", 637 | "## Exercise set 1\n", 638 | "\n", 639 | "The following exercises refer to quote.csv (df_qte) data. They should done with code, rather than just viewing the contents of the DataFrame in a cell. Your code should return the answer, and not any extra unneccesary data.\n", 640 | "\n", 641 | "1. How many quote updates were received during the day?\n", 642 | "2. 
What is the midpoint (average price) of the quotes at the 500th update of the day?\n", 643 | "3. What are the bid and ask sizes at the 100th last update of the day? (Do this in a single line of code.)" 644 | ] 645 | }, 646 | { 647 | "cell_type": "code", 648 | "execution_count": null, 649 | "metadata": {}, 650 | "outputs": [], 651 | "source": [] 652 | }, 653 | { 654 | "cell_type": "code", 655 | "execution_count": null, 656 | "metadata": {}, 657 | "outputs": [], 658 | "source": [] 659 | }, 660 | { 661 | "cell_type": "code", 662 | "execution_count": null, 663 | "metadata": {}, 664 | "outputs": [], 665 | "source": [] 666 | }, 667 | { 668 | "cell_type": "code", 669 | "execution_count": null, 670 | "metadata": {}, 671 | "outputs": [], 672 | "source": [] 673 | }, 674 | { 675 | "cell_type": "code", 676 | "execution_count": null, 677 | "metadata": {}, 678 | "outputs": [], 679 | "source": [] 680 | }, 681 | { 682 | "cell_type": "code", 683 | "execution_count": null, 684 | "metadata": {}, 685 | "outputs": [], 686 | "source": [] 687 | }, 688 | { 689 | "cell_type": "code", 690 | "execution_count": null, 691 | "metadata": {}, 692 | "outputs": [], 693 | "source": [] 694 | }, 695 | { 696 | "cell_type": "code", 697 | "execution_count": null, 698 | "metadata": {}, 699 | "outputs": [], 700 | "source": [] 701 | }, 702 | { 703 | "cell_type": "code", 704 | "execution_count": null, 705 | "metadata": {}, 706 | "outputs": [], 707 | "source": [] 708 | }, 709 | { 710 | "cell_type": "markdown", 711 | "metadata": {}, 712 | "source": [ 713 | "[Back to top](#top)" 714 | ] 715 | } 716 | ], 717 | "metadata": { 718 | "anaconda-cloud": {}, 719 | "kernelspec": { 720 | "display_name": "Python 2", 721 | "language": "python", 722 | "name": "python2" 723 | }, 724 | "language_info": { 725 | "codemirror_mode": { 726 | "name": "ipython", 727 | "version": 2 728 | }, 729 | "file_extension": ".py", 730 | "mimetype": "text/x-python", 731 | "name": "python", 732 | "nbconvert_exporter": "python", 733 | "pygments_lexer": "ipython2", 734 | "version": "2.7.14" 735 | } 736 | }, 737 | "nbformat": 4, 738 | "nbformat_minor": 1 739 | } 740 | -------------------------------------------------------------------------------- /session2.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "------------------------------------------\n", 8 | "\n", 9 | "## Session contents\n", 10 | "### [4. Setting data](#setting_data)\n", 11 | "### [5. Numerical operations and aggregations](#numerical)\n", 12 | "### [6. Cleaning and filtering data](#cleaning_and_filtering)\n", 13 | "### [Exercise set 2](#exercises2)\n" 14 | ] 15 | }, 16 | { 17 | "cell_type": "markdown", 18 | "metadata": {}, 19 | "source": [ 20 | "----------------------------\n", 21 | "\n", 22 | "## 4. Setting data\n", 23 | "\n", 24 | "When working with a data set it is useful to be able to append data, modify existing data, or create new data that is derived in some way from existing data. We can do this by using the indexing operations from the previous session, as well as some new pandas functions." 
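,
"\n",
"As a small preview of what this looks like (a sketch only, with a hypothetical DataFrame df that already has numeric columns 'A' and 'B'):\n",
"\n",
"    df['C'] = df['A'] + df['B']    # create a new column derived from existing ones\n",
"    df.loc[0, 'A'] = 100           # modify a single existing value\n"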
25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "metadata": {}, 30 | "source": [ 31 | "**Key methods covered:**\n", 32 | "\n", 33 | " pd.concat() - concatenates a list of Series or DataFrame objects together\n", 34 | " df.append() - like pd.concat() but as a method on a DataFrame" 35 | ] 36 | }, 37 | { 38 | "cell_type": "markdown", 39 | "metadata": {}, 40 | "source": [ 41 | "
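For reference, both of these look roughly like the following (a minimal sketch, where df1 and df2 are hypothetical DataFrames with the same columns):\n",
"\n",
"    combined = df1.append(df2)          # returns a new DataFrame\n",
"    combined = pd.concat([df1, df2])    # equivalent, and generalises to a whole list of objects\n",
"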
\n", 42 | "\n", 43 | "Additional resources\n", 44 | "\n", 45 | "http://pandas.pydata.org/pandas-docs/stable/10min.html#setting\n", 46 | "\n", 47 | "http://pandas.pydata.org/pandas-docs/stable/merging.html" 48 | ] 49 | }, 50 | { 51 | "cell_type": "markdown", 52 | "metadata": {}, 53 | "source": [ 54 | "We'll try some basic operations on a DataFrame that contains a few different data types." 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": null, 60 | "metadata": {}, 61 | "outputs": [], 62 | "source": [ 63 | "import pandas as pd\n", 64 | "import numpy as np\n", 65 | "import datetime as dt" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": null, 71 | "metadata": {}, 72 | "outputs": [], 73 | "source": [ 74 | "df = pd.DataFrame(\n", 75 | " index=['First', 'Second', 'Third', 'Fourth', 'Fifth'],\n", 76 | " data={'A': range(1, 6),\n", 77 | " 'B': np.random.random(5),\n", 78 | " 'C': list('UVWXY')})\n", 79 | "df" 80 | ] 81 | }, 82 | { 83 | "cell_type": "markdown", 84 | "metadata": {}, 85 | "source": [ 86 | "Let's check the datatypes of each column before proceeding." 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": null, 92 | "metadata": {}, 93 | "outputs": [], 94 | "source": [ 95 | "df.dtypes" 96 | ] 97 | }, 98 | { 99 | "cell_type": "markdown", 100 | "metadata": {}, 101 | "source": [ 102 | "As a simple example of derived values, try multiplying the entire DataFrame by two - is the result what you expected?" 103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": null, 108 | "metadata": {}, 109 | "outputs": [], 110 | "source": [ 111 | "df * 2" 112 | ] 113 | }, 114 | { 115 | "cell_type": "markdown", 116 | "metadata": {}, 117 | "source": [ 118 | "Now try dividing the DataFrame by two and take note of the error message. Which lines of the error message are most valuable in debugging your code?" 119 | ] 120 | }, 121 | { 122 | "cell_type": "code", 123 | "execution_count": null, 124 | "metadata": {}, 125 | "outputs": [], 126 | "source": [ 127 | "df / 2" 128 | ] 129 | }, 130 | { 131 | "cell_type": "markdown", 132 | "metadata": {}, 133 | "source": [ 134 | "Try making a new column 'D' that contains the natural logarithm of column 'A'. You will need to use the numpy library." 135 | ] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "execution_count": null, 140 | "metadata": {}, 141 | "outputs": [], 142 | "source": [] 143 | }, 144 | { 145 | "cell_type": "code", 146 | "execution_count": null, 147 | "metadata": {}, 148 | "outputs": [], 149 | "source": [ 150 | "#solutions\n", 151 | "%load solutions/setting_sol1.py" 152 | ] 153 | }, 154 | { 155 | "cell_type": "markdown", 156 | "metadata": {}, 157 | "source": [ 158 | "Data can be overwritten by using loc - try overwriting row 0, column A with the value $\\pi$ and view the DataFrame again." 159 | ] 160 | }, 161 | { 162 | "cell_type": "code", 163 | "execution_count": null, 164 | "metadata": {}, 165 | "outputs": [], 166 | "source": [] 167 | }, 168 | { 169 | "cell_type": "code", 170 | "execution_count": null, 171 | "metadata": {}, 172 | "outputs": [], 173 | "source": [ 174 | "#solutions\n", 175 | "%load solutions/setting_sol2.py" 176 | ] 177 | }, 178 | { 179 | "cell_type": "markdown", 180 | "metadata": {}, 181 | "source": [ 182 | "Now check the data types." 
183 | ] 184 | }, 185 | { 186 | "cell_type": "code", 187 | "execution_count": null, 188 | "metadata": {}, 189 | "outputs": [], 190 | "source": [ 191 | "df.dtypes" 192 | ] 193 | }, 194 | { 195 | "cell_type": "markdown", 196 | "metadata": {}, 197 | "source": [ 198 | "You should notice that column A is now a float64 rather than an int64 type. Numpy and pandas automatically convert your columns to the most general data type that your data shares - in this case, a floating point number in the first value resulted in __all__ values being floating point numbers." 199 | ] 200 | }, 201 | { 202 | "cell_type": "code", 203 | "execution_count": null, 204 | "metadata": {}, 205 | "outputs": [], 206 | "source": [ 207 | "df" 208 | ] 209 | }, 210 | { 211 | "cell_type": "markdown", 212 | "metadata": {}, 213 | "source": [ 214 | "Using __.loc__ with a new index value appends new rows and/or columns as necessary (called \"setting with enlargement\")." 215 | ] 216 | }, 217 | { 218 | "cell_type": "code", 219 | "execution_count": null, 220 | "metadata": {}, 221 | "outputs": [], 222 | "source": [ 223 | "df.loc['Sixth'] = [6, np.random.random(), 'Z', np.log(6)]\n", 224 | "df" 225 | ] 226 | }, 227 | { 228 | "cell_type": "code", 229 | "execution_count": null, 230 | "metadata": {}, 231 | "outputs": [], 232 | "source": [ 233 | "df.loc['Seventh', 'E'] = dt.datetime.now()\n", 234 | "df" 235 | ] 236 | }, 237 | { 238 | "cell_type": "markdown", 239 | "metadata": {}, 240 | "source": [ 241 | "As an alternative to loc, we can use the append method to add new rows to a DataFrame (where the 'other' argument is a dict/Series or another DataFrame)\n", 242 | "\n", 243 | " df = df.append(other)\n", 244 | " \n", 245 | "Note that, unlike a list's append method which is an inplace operation, a DataFrame's append method returns a new DataFrame object." 246 | ] 247 | }, 248 | { 249 | "cell_type": "markdown", 250 | "metadata": {}, 251 | "source": [ 252 | "Try append now by creating a new Series object and appending it to df." 253 | ] 254 | }, 255 | { 256 | "cell_type": "code", 257 | "execution_count": null, 258 | "metadata": {}, 259 | "outputs": [], 260 | "source": [] 261 | }, 262 | { 263 | "cell_type": "markdown", 264 | "metadata": {}, 265 | "source": [ 266 | "If we have multiple Series or DataFrame objects to concatenate, we can avoid multiple append statements by using a single concat statement\n", 267 | "\n", 268 | " df = pd.concat([df1, df2, df3])\n", 269 | " \n", 270 | "This function has many more options than append, but we'll skip over these for now." 271 | ] 272 | }, 273 | { 274 | "cell_type": "markdown", 275 | "metadata": {}, 276 | "source": [ 277 | "Try using concat in the cells below." 278 | ] 279 | }, 280 | { 281 | "cell_type": "code", 282 | "execution_count": null, 283 | "metadata": {}, 284 | "outputs": [], 285 | "source": [] 286 | }, 287 | { 288 | "cell_type": "code", 289 | "execution_count": null, 290 | "metadata": {}, 291 | "outputs": [], 292 | "source": [] 293 | }, 294 | { 295 | "cell_type": "code", 296 | "execution_count": null, 297 | "metadata": {}, 298 | "outputs": [], 299 | "source": [] 300 | }, 301 | { 302 | "cell_type": "markdown", 303 | "metadata": {}, 304 | "source": [ 305 | "[Back to top](#top)" 306 | ] 307 | }, 308 | { 309 | "cell_type": "markdown", 310 | "metadata": {}, 311 | "source": [ 312 | "-------------------\n", 313 | "\n", 314 | "## 5. 
Numerical operations and aggregations\n", 315 | "\n", 316 | "Since pandas objects are constructed from numpy arrays, we can use numerical functions from the numpy package on our Series and DataFrame objects. It is conventional to import numpy in the following way\n", 317 | "\n", 318 | "    import numpy as np\n", 319 | " \n", 320 | "Pandas objects also have a variety of useful numerical and statistical methods." 321 | ] 322 | }, 323 | { 324 | "cell_type": "markdown", 325 | "metadata": {}, 326 | "source": [ 327 | "**Key methods covered:**\n", 328 | " \n", 329 | "    np.where()\n", 330 | "    df.where()\n", 331 | "    df.mean(), df.min(), df.max(), etc.\n", 332 | "    df.diff(), df.sum()\n", 333 | "    " 334 | ] 335 | }, 336 | { 337 | "cell_type": "markdown", 338 | "metadata": {}, 339 | "source": [ 340 | "
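Before diving in, here is a tiny sketch of the two 'where' variants listed above (self-contained, using a throwaway Series):\n",
"\n",
"    s = pd.Series([5, 25, 15])\n",
"    np.where(s > 20, 'HighVol', 'LowVol')    # picks one of two values element-wise\n",
"    s.where(s > 20, 0)                       # keeps values where the condition is True, replaces the rest with 0\n",
"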
\n", 341 | "\n", 342 | "Additional resources\n", 343 | "\n", 344 | "http://pandas.pydata.org/pandas-docs/stable/10min.html#operations" 345 | ] 346 | }, 347 | { 348 | "cell_type": "markdown", 349 | "metadata": {}, 350 | "source": [ 351 | "Let's return to our DataFrame from earlier." 352 | ] 353 | }, 354 | { 355 | "cell_type": "code", 356 | "execution_count": null, 357 | "metadata": {}, 358 | "outputs": [], 359 | "source": [ 360 | "df = pd.DataFrame(index=['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday'],\n", 361 | " data={'optiver_turnover': [ 46386, 43775, 75742, 17474, np.nan],\n", 362 | " 'total_turnover': [278837, 439771, 583722, 358834, np.nan],\n", 363 | " 'volatility': [ 12.5, 14.0, 21.5, 16.0, 17.0]})\n", 364 | "df" 365 | ] 366 | }, 367 | { 368 | "cell_type": "markdown", 369 | "metadata": {}, 370 | "source": [ 371 | "Try adding Optiver's market share (optiver_turnover divided by total_turnover as a percentage) as a column in df called 'market_share':" 372 | ] 373 | }, 374 | { 375 | "cell_type": "code", 376 | "execution_count": null, 377 | "metadata": {}, 378 | "outputs": [], 379 | "source": [] 380 | }, 381 | { 382 | "cell_type": "code", 383 | "execution_count": null, 384 | "metadata": {}, 385 | "outputs": [], 386 | "source": [ 387 | "#solutions\n", 388 | "%load solutions/numerical_sol1.py" 389 | ] 390 | }, 391 | { 392 | "cell_type": "markdown", 393 | "metadata": {}, 394 | "source": [ 395 | "Almost all numpy functions can be applied directly to a Series or DataFrame. For example, let's round the market share percentage to two decimal places using __np.round__." 396 | ] 397 | }, 398 | { 399 | "cell_type": "code", 400 | "execution_count": null, 401 | "metadata": {}, 402 | "outputs": [], 403 | "source": [ 404 | "df['market_share'] = np.round(df['market_share'], 2)\n", 405 | "df" 406 | ] 407 | }, 408 | { 409 | "cell_type": "markdown", 410 | "metadata": {}, 411 | "source": [ 412 | "DataFrames have a number of basic statistical methods, such as __mean()__, __min()__, __max()__, __std()__, and __quantile(q)__ (where q=0.5 is the median). By default they are applied column-by-column and ignore missing data. Try a few of them in the cells below." 413 | ] 414 | }, 415 | { 416 | "cell_type": "code", 417 | "execution_count": null, 418 | "metadata": {}, 419 | "outputs": [], 420 | "source": [ 421 | "df.mean()" 422 | ] 423 | }, 424 | { 425 | "cell_type": "code", 426 | "execution_count": null, 427 | "metadata": {}, 428 | "outputs": [], 429 | "source": [] 430 | }, 431 | { 432 | "cell_type": "code", 433 | "execution_count": null, 434 | "metadata": {}, 435 | "outputs": [], 436 | "source": [] 437 | }, 438 | { 439 | "cell_type": "code", 440 | "execution_count": null, 441 | "metadata": {}, 442 | "outputs": [], 443 | "source": [] 444 | }, 445 | { 446 | "cell_type": "code", 447 | "execution_count": null, 448 | "metadata": {}, 449 | "outputs": [], 450 | "source": [] 451 | }, 452 | { 453 | "cell_type": "markdown", 454 | "metadata": {}, 455 | "source": [ 456 | "They can also be applied row-by-row, by adding the argument axis=1. Again, try a few of these below." 
457 | ] 458 | }, 459 | { 460 | "cell_type": "code", 461 | "execution_count": null, 462 | "metadata": {}, 463 | "outputs": [], 464 | "source": [] 465 | }, 466 | { 467 | "cell_type": "code", 468 | "execution_count": null, 469 | "metadata": {}, 470 | "outputs": [], 471 | "source": [] 472 | }, 473 | { 474 | "cell_type": "code", 475 | "execution_count": null, 476 | "metadata": {}, 477 | "outputs": [], 478 | "source": [] 479 | }, 480 | { 481 | "cell_type": "markdown", 482 | "metadata": {}, 483 | "source": [ 484 | "DataFrames also have a __describe__ method that calculates a number of summary statistics simultaneously." 485 | ] 486 | }, 487 | { 488 | "cell_type": "code", 489 | "execution_count": null, 490 | "metadata": {}, 491 | "outputs": [], 492 | "source": [ 493 | "df.describe()" 494 | ] 495 | }, 496 | { 497 | "cell_type": "markdown", 498 | "metadata": {}, 499 | "source": [ 500 | "As well as statistical methods, basic aggregation methods are also available. Try calculating the weekly optiver turnover and total turnover using __sum__." 501 | ] 502 | }, 503 | { 504 | "cell_type": "code", 505 | "execution_count": null, 506 | "metadata": {}, 507 | "outputs": [], 508 | "source": [] 509 | }, 510 | { 511 | "cell_type": "code", 512 | "execution_count": null, 513 | "metadata": {}, 514 | "outputs": [], 515 | "source": [ 516 | "#solutions\n", 517 | "%load solutions/numerical_sol2.py" 518 | ] 519 | }, 520 | { 521 | "cell_type": "markdown", 522 | "metadata": {}, 523 | "source": [ 524 | "Other useful methods include __diff__, __cumsum__, and __rank__. Try these below and interpret the meaning of the output." 525 | ] 526 | }, 527 | { 528 | "cell_type": "code", 529 | "execution_count": null, 530 | "metadata": {}, 531 | "outputs": [], 532 | "source": [] 533 | }, 534 | { 535 | "cell_type": "code", 536 | "execution_count": null, 537 | "metadata": {}, 538 | "outputs": [], 539 | "source": [] 540 | }, 541 | { 542 | "cell_type": "code", 543 | "execution_count": null, 544 | "metadata": {}, 545 | "outputs": [], 546 | "source": [] 547 | }, 548 | { 549 | "cell_type": "markdown", 550 | "metadata": {}, 551 | "source": [ 552 | "Comparison operators (<, <=, >, >=, ==, !=) will turn out to be very useful when filtering DataFrames. As an example, let's create a column called 'good_market_share' which is True when market share exceeded 10% and false otherwise." 553 | ] 554 | }, 555 | { 556 | "cell_type": "code", 557 | "execution_count": null, 558 | "metadata": {}, 559 | "outputs": [], 560 | "source": [] 561 | }, 562 | { 563 | "cell_type": "code", 564 | "execution_count": null, 565 | "metadata": {}, 566 | "outputs": [], 567 | "source": [ 568 | "#solutions\n", 569 | "%load solutions/numerical_sol3.py" 570 | ] 571 | }, 572 | { 573 | "cell_type": "markdown", 574 | "metadata": {}, 575 | "source": [ 576 | "The resulting data type is Boolean (True or False). Any comparison with missing data returns False. One interesting way to use Boolean columns is to apply the sum operator - True and False are converted to 1 and 0 respectively, so the result is the number of times the condition was true." 577 | ] 578 | }, 579 | { 580 | "cell_type": "code", 581 | "execution_count": null, 582 | "metadata": {}, 583 | "outputs": [], 584 | "source": [ 585 | "print 'Number of days with good market share:', df['good_market_share'].sum()" 586 | ] 587 | }, 588 | { 589 | "cell_type": "markdown", 590 | "metadata": {}, 591 | "source": [ 592 | "Try making a column called 'high_volatility' which is True when volatility is greater than 20." 
593 | ] 594 | }, 595 | { 596 | "cell_type": "code", 597 | "execution_count": null, 598 | "metadata": {}, 599 | "outputs": [], 600 | "source": [] 601 | }, 602 | { 603 | "cell_type": "code", 604 | "execution_count": null, 605 | "metadata": {}, 606 | "outputs": [], 607 | "source": [ 608 | "#solutions\n", 609 | "%load solutions/numerical_sol4.py" 610 | ] 611 | }, 612 | { 613 | "cell_type": "markdown", 614 | "metadata": {}, 615 | "source": [ 616 | "We can combine Boolean arrays together with all the usual logical operators.\n", 617 | "\n", 618 | " low_volatility = ~ df['high_volatility'] # NOT operator\n", 619 | " positive_and_even = (df.A > 0) & (mod(df.A, 2) == 0) # AND operator (note: the brackets around the conditional statements are necessary)\n", 620 | " positive_or_even = (df.A > 0) | (mod(df.A, 2) == 0) # OR operator (not exclusive-or/XOR)\n", 621 | " positive_or_even_but_not_both = (df.A > 0) ^ (mod(df.A, 2) == 0) # XOR operator\n", 622 | " \n", 623 | "Try creating a column which is True when we have high volatility and good market share." 624 | ] 625 | }, 626 | { 627 | "cell_type": "code", 628 | "execution_count": null, 629 | "metadata": {}, 630 | "outputs": [], 631 | "source": [] 632 | }, 633 | { 634 | "cell_type": "code", 635 | "execution_count": null, 636 | "metadata": {}, 637 | "outputs": [], 638 | "source": [ 639 | "#solutions\n", 640 | "%load solutions/numerical_sol5.py" 641 | ] 642 | }, 643 | { 644 | "cell_type": "markdown", 645 | "metadata": {}, 646 | "source": [ 647 | "Finally, __np.where__ provides a useful way of mapping True/False values to other values. See if you can make a 'commentary' column which has the value 'LowVol' if volatility is low and 'HighVol' if volatility is high." 648 | ] 649 | }, 650 | { 651 | "cell_type": "code", 652 | "execution_count": null, 653 | "metadata": {}, 654 | "outputs": [], 655 | "source": [] 656 | }, 657 | { 658 | "cell_type": "code", 659 | "execution_count": null, 660 | "metadata": {}, 661 | "outputs": [], 662 | "source": [ 663 | "#solutions\n", 664 | "%load solutions/numerical_sol6.py" 665 | ] 666 | }, 667 | { 668 | "cell_type": "markdown", 669 | "metadata": {}, 670 | "source": [ 671 | "The DataFrame method __df.where__ is similar to np.where, but can only modify values where the condition is False." 672 | ] 673 | }, 674 | { 675 | "cell_type": "markdown", 676 | "metadata": {}, 677 | "source": [ 678 | "[Back to top](#top)" 679 | ] 680 | }, 681 | { 682 | "cell_type": "markdown", 683 | "metadata": {}, 684 | "source": [ 685 | "--------------\n", 686 | "\n", 687 | "## 6. Cleaning and filtering data\n", 688 | "\n", 689 | "Cleaning data prior to analysis is one of the most essential steps in ensuring the outputs of our analysis are accurate and can be relied upon.\n", 690 | "\n", 691 | "For example, during the auction period the bid and ask prices can often take unrealistic values. It might even be a good idea for some projects to remove the volatile auction period altogether. We can use some techniques to remove certain blocks of data from our DataFrame.\n", 692 | "\n", 693 | "The easiest way to do this is to apply a Boolean mask. For example, the line of code below would only return the subset of the original DataFrame where column1 was positive.\n", 694 | "\n", 695 | " df[df[column1] > 0]\n", 696 | "\n", 697 | "It works by applying your Boolean criteria to column1 which yields a list of True and False values. 
This is then used to determine which rows of the DataFrame are retained (keep True, discard False).\n", 698 | "Alternatively, you may wish to remove segments of your data (for example, to avoid the opening and closing auctions). The code below shows how Boolean masks can be applied to the index of a DataFrame too.\n", 699 | "\n", 700 | " df[df.index.hour > 9] \n", 701 | "\n", 702 | "The key methods are __df.isnull(), df.dropna(), df.fillna()__. " 703 | ] 704 | }, 705 | { 706 | "cell_type": "markdown", 707 | "metadata": {}, 708 | "source": [ 709 | "
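For instance, masks built from one or several conditions look like this (a sketch using the column names of the df_qte DataFrame loaded below):\n",
"\n",
"    df_qte[df_qte['BidPrice'] > 0]                                  # keep rows where the bid price is positive\n",
"    df_qte[(df_qte['BidPrice'] > 0) & (df_qte['AskPrice'] > 0)]     # combine two conditions\n",
"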
\n", 710 | "\n", 711 | "YouTube video\n", 712 | "\n", 713 | "Watch the video below until the 2 hour and 5 minute mark.\n", 714 | "\n", 715 | "\n", 716 | "https://www.youtube.com/watch?v=6ohWS7J1hVA&start=6661" 717 | ] 718 | }, 719 | { 720 | "cell_type": "markdown", 721 | "metadata": {}, 722 | "source": [ 723 | "**Key methods covered:**\n", 724 | "\n", 725 | " df.drop - drops certain rows/columns\n", 726 | " masking - returning a subsection of the DataFrame according to certain criteria" 727 | ] 728 | }, 729 | { 730 | "cell_type": "markdown", 731 | "metadata": {}, 732 | "source": [ 733 | "\n", 734 | "Refer to the Python Data Science Handbook below for more information.\n" 735 | ] 736 | }, 737 | { 738 | "cell_type": "markdown", 739 | "metadata": {}, 740 | "source": [ 741 | "
\n", 742 | "\n", 743 | "Additional resources\n", 744 | "\n", 745 | "http://pandas.pydata.org/pandas-docs/stable/indexing.html#boolean-indexing\n", 746 | "http://nbviewer.jupyter.org/github/jakevdp/PythonDataScienceHandbook/blob/master/notebooks/03.04-Missing-Values.ipynb\n" 747 | ] 748 | }, 749 | { 750 | "cell_type": "markdown", 751 | "metadata": {}, 752 | "source": [ 753 | "
\n", 754 | "\n", 755 | "If you have just started here or would like to refresh your df_qte object, run the line below." 756 | ] 757 | }, 758 | { 759 | "cell_type": "code", 760 | "execution_count": null, 761 | "metadata": {}, 762 | "outputs": [], 763 | "source": [ 764 | "%load solutions/cleaning_start.py" 765 | ] 766 | }, 767 | { 768 | "cell_type": "markdown", 769 | "metadata": {}, 770 | "source": [ 771 | "
" 772 | ] 773 | }, 774 | { 775 | "cell_type": "markdown", 776 | "metadata": {}, 777 | "source": [ 778 | "So far the df_qte DataFrame appears to be behaving as it should - but are there any hidden gotchas?\n", 779 | "\n", 780 | "A good way to check this is to visualise the data. Pandas has some basic built in plotting functionality which allows us to plot DataFrame or Series objects. First run \n", 781 | ">%matplotlib inline \n", 782 | "\n", 783 | "to ensure figures display inline rather than as pop-up windows, and then try to __.plot()__ the bid and ask prices. (see: http://pandas.pydata.org/pandas-docs/stable/visualization.html)\n", 784 | "\n", 785 | "What do you notice? " 786 | ] 787 | }, 788 | { 789 | "cell_type": "code", 790 | "execution_count": null, 791 | "metadata": {}, 792 | "outputs": [], 793 | "source": [] 794 | }, 795 | { 796 | "cell_type": "code", 797 | "execution_count": null, 798 | "metadata": {}, 799 | "outputs": [], 800 | "source": [ 801 | "%matplotlib inline" 802 | ] 803 | }, 804 | { 805 | "cell_type": "code", 806 | "execution_count": null, 807 | "metadata": {}, 808 | "outputs": [], 809 | "source": [ 810 | "df_qte[['BidPrice', 'AskPrice']].plot()" 811 | ] 812 | }, 813 | { 814 | "cell_type": "markdown", 815 | "metadata": {}, 816 | "source": [ 817 | "It's quite clear from the above charts that we've got some 0 values in our data. Check for yourself by using a mask." 818 | ] 819 | }, 820 | { 821 | "cell_type": "code", 822 | "execution_count": null, 823 | "metadata": {}, 824 | "outputs": [], 825 | "source": [] 826 | }, 827 | { 828 | "cell_type": "code", 829 | "execution_count": null, 830 | "metadata": { 831 | "scrolled": true 832 | }, 833 | "outputs": [], 834 | "source": [ 835 | "#solutions\n", 836 | "%load solutions/clean_sol1.py" 837 | ] 838 | }, 839 | { 840 | "cell_type": "markdown", 841 | "metadata": { 842 | "collapsed": true 843 | }, 844 | "source": [ 845 | "It would probably be a good idea (in most cases) to filter out these entries with a value 0 or less. We can do this by replacing the DataFrame with a subset of itself where bid and ask prices are positive." 846 | ] 847 | }, 848 | { 849 | "cell_type": "code", 850 | "execution_count": null, 851 | "metadata": {}, 852 | "outputs": [], 853 | "source": [] 854 | }, 855 | { 856 | "cell_type": "code", 857 | "execution_count": null, 858 | "metadata": {}, 859 | "outputs": [], 860 | "source": [ 861 | "#solutions\n", 862 | "%load solutions/clean_sol2.py" 863 | ] 864 | }, 865 | { 866 | "cell_type": "markdown", 867 | "metadata": {}, 868 | "source": [ 869 | "If we revisit the charts of bid and ask prices, we should find the numbers much more reasonable now." 870 | ] 871 | }, 872 | { 873 | "cell_type": "code", 874 | "execution_count": null, 875 | "metadata": {}, 876 | "outputs": [], 877 | "source": [] 878 | }, 879 | { 880 | "cell_type": "code", 881 | "execution_count": null, 882 | "metadata": {}, 883 | "outputs": [], 884 | "source": [ 885 | "df_qte.BidPrice.plot()" 886 | ] 887 | }, 888 | { 889 | "cell_type": "code", 890 | "execution_count": null, 891 | "metadata": {}, 892 | "outputs": [], 893 | "source": [ 894 | "df_qte.AskPrice.plot()" 895 | ] 896 | }, 897 | { 898 | "cell_type": "markdown", 899 | "metadata": {}, 900 | "source": [ 901 | "Since masks are simply lists of Boolean operators, we should be able to apply **multiple masks** to a DataFrame with ease. 
For example, if we have 2 criteria - A and B, and A returns a Series of \n", 902 | "\n", 903 | " [True, False] \n", 904 | " \n", 905 | "and B returns a Series of \n", 906 | "\n", 907 | " [False, False]\n", 908 | " \n", 909 | "applying both masks A and B will simply return \n", 910 | "\n", 911 | " [False, False]\n", 912 | "\n", 913 | "Remember, when we combine Boolean operators only 2 Trues will return a True.\n", 914 | "\n", 915 | "Let's try and apply 2 masks to our df_qte DataFrame - **BidSize >= 30 and BidSize <= 40**. Recall the format of masks!\n", 916 | "\n", 917 | " mask_name = df['column'] *criterion*\n", 918 | " \n", 919 | " df[mask_1 & mask_2]" 920 | ] 921 | }, 922 | { 923 | "cell_type": "code", 924 | "execution_count": null, 925 | "metadata": {}, 926 | "outputs": [], 927 | "source": [] 928 | }, 929 | { 930 | "cell_type": "markdown", 931 | "metadata": {}, 932 | "source": [ 933 | "Missing data may be represented in pandas using either Python's None or numpy's np.nan (rendered as NaN when displayed). We may encounter missing data if there are data quality issues, or we if we choose to set some (wrong or uninteresting) data as missing.\n", 934 | "\n", 935 | "There are a few options for dealing with missing data:\n", 936 | "\n", 937 | "Set to another number:\n", 938 | "\n", 939 | " - A fixed value (.fillna)\n", 940 | " - Fill forwards from the previous non-NaN value (.ffill)\n", 941 | " - Fill backwards from the next non-NaN value (.bfill)\n", 942 | " \n", 943 | "Remove rows or columns:\n", 944 | "\n", 945 | " df.dropna(how='any', axis=0) # drop row if any of its values are NaN\n", 946 | " df.dropna(how='any', axis=1) # drop column if any of its values are NaN\n", 947 | " df.dropna(how='all') # drop row if all values are NaN\n", 948 | " df.dropna(thresh=2) # keep row if 2 or more entries are not NaN\n", 949 | " \n", 950 | "Forward-fill is usually better to use than back-fill for trading data, since the current (missing) value is likely to be the most recent non-NaN value.\n", 951 | "\n", 952 | "If you need to do more complicated operations with NaNs, there are methods that return True or False if the data is or is not NaN.\n", 953 | "\n", 954 | " df.isnull() # or df.isna() - returns True if value is NaN and False otherwise\n", 955 | " df.notnull() # or df.notna() - returns False if value is NaN and True otherwise" 956 | ] 957 | }, 958 | { 959 | "cell_type": "markdown", 960 | "metadata": {}, 961 | "source": [ 962 | "Let's try dealing with zero-priced bids/offers with missing-data operations instead. We'll reload the data again." 963 | ] 964 | }, 965 | { 966 | "cell_type": "code", 967 | "execution_count": null, 968 | "metadata": {}, 969 | "outputs": [], 970 | "source": [ 971 | "%load solutions/cleaning_start.py" 972 | ] 973 | }, 974 | { 975 | "cell_type": "markdown", 976 | "metadata": {}, 977 | "source": [ 978 | "This time, set invalid bids and offers to np.nan." 979 | ] 980 | }, 981 | { 982 | "cell_type": "code", 983 | "execution_count": null, 984 | "metadata": {}, 985 | "outputs": [], 986 | "source": [] 987 | }, 988 | { 989 | "cell_type": "markdown", 990 | "metadata": {}, 991 | "source": [ 992 | "Now, forward fill the data." 993 | ] 994 | }, 995 | { 996 | "cell_type": "code", 997 | "execution_count": null, 998 | "metadata": {}, 999 | "outputs": [], 1000 | "source": [] 1001 | }, 1002 | { 1003 | "cell_type": "markdown", 1004 | "metadata": {}, 1005 | "source": [ 1006 | "Then drop any remaining NaNs." 
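,
"\n",
"If you get stuck, one possible sequence looks like this (a sketch only; it assumes zero or negative prices are the only invalid values):\n",
"\n",
"    df_qte.loc[df_qte['BidPrice'] <= 0, 'BidPrice'] = np.nan\n",
"    df_qte.loc[df_qte['AskPrice'] <= 0, 'AskPrice'] = np.nan\n",
"    df_qte = df_qte.ffill()     # forward-fill from the previous valid quote\n",
"    df_qte = df_qte.dropna()    # drop anything still missing (e.g. leading rows)\n"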
1007 | ] 1008 | }, 1009 | { 1010 | "cell_type": "code", 1011 | "execution_count": null, 1012 | "metadata": {}, 1013 | "outputs": [], 1014 | "source": [] 1015 | }, 1016 | { 1017 | "cell_type": "markdown", 1018 | "metadata": {}, 1019 | "source": [ 1020 | "Finally, check the plots again to make sure our bids and offers are correct." 1021 | ] 1022 | }, 1023 | { 1024 | "cell_type": "code", 1025 | "execution_count": null, 1026 | "metadata": {}, 1027 | "outputs": [], 1028 | "source": [] 1029 | }, 1030 | { 1031 | "cell_type": "markdown", 1032 | "metadata": {}, 1033 | "source": [ 1034 | "[Back to top](#top)" 1035 | ] 1036 | }, 1037 | { 1038 | "cell_type": "markdown", 1039 | "metadata": {}, 1040 | "source": [ 1041 | "-------------------\n", 1042 | "\n", 1043 | "## Exercise set 2" 1044 | ] 1045 | }, 1046 | { 1047 | "cell_type": "markdown", 1048 | "metadata": {}, 1049 | "source": [ 1050 | "1. Using the cleaned df_qte dataframe, add the bid-ask midpoint as a column.\n", 1051 | "2. Calculate the following quantities.\n", 1052 | " - The average bid-ask spread in the product.\n", 1053 | " - The open/low/high/close of the midpoint." 1054 | ] 1055 | }, 1056 | { 1057 | "cell_type": "code", 1058 | "execution_count": null, 1059 | "metadata": {}, 1060 | "outputs": [], 1061 | "source": [] 1062 | }, 1063 | { 1064 | "cell_type": "code", 1065 | "execution_count": null, 1066 | "metadata": {}, 1067 | "outputs": [], 1068 | "source": [] 1069 | }, 1070 | { 1071 | "cell_type": "code", 1072 | "execution_count": null, 1073 | "metadata": {}, 1074 | "outputs": [], 1075 | "source": [] 1076 | }, 1077 | { 1078 | "cell_type": "code", 1079 | "execution_count": null, 1080 | "metadata": {}, 1081 | "outputs": [], 1082 | "source": [] 1083 | }, 1084 | { 1085 | "cell_type": "code", 1086 | "execution_count": null, 1087 | "metadata": {}, 1088 | "outputs": [], 1089 | "source": [] 1090 | } 1091 | ], 1092 | "metadata": { 1093 | "anaconda-cloud": {}, 1094 | "kernelspec": { 1095 | "display_name": "Python 2", 1096 | "language": "python", 1097 | "name": "python2" 1098 | }, 1099 | "language_info": { 1100 | "codemirror_mode": { 1101 | "name": "ipython", 1102 | "version": 2 1103 | }, 1104 | "file_extension": ".py", 1105 | "mimetype": "text/x-python", 1106 | "name": "python", 1107 | "nbconvert_exporter": "python", 1108 | "pygments_lexer": "ipython2", 1109 | "version": "2.7.14" 1110 | } 1111 | }, 1112 | "nbformat": 4, 1113 | "nbformat_minor": 1 1114 | } 1115 | -------------------------------------------------------------------------------- /session3.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "------------------------------------------\n", 8 | "\n", 9 | "## Session contents\n", 10 | "### [7. Map, ApplyMap, and Apply](#applying)\n", 11 | "### [8. Groupby](#aggregating)\n", 12 | "### [9. Merge/Join](#merging)\n", 13 | "### [Exercise set 3](#)" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": null, 19 | "metadata": {}, 20 | "outputs": [], 21 | "source": [ 22 | "import pandas as pd" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": null, 28 | "metadata": {}, 29 | "outputs": [], 30 | "source": [ 31 | "pd.merge()" 32 | ] 33 | }, 34 | { 35 | "cell_type": "markdown", 36 | "metadata": {}, 37 | "source": [ 38 | "----------------------\n", 39 | "\n", 40 | "## 7. 
Map, ApplyMap, and Apply\n", 41 | "\n", 42 | "We've seen that pandas objects come with simple aggregation methods, and that numpy functions allow numerical operations on pandas objects. You might wonder whether we can use functions from other packages, or apply our own user-defined functions, in a similar way.\n", 43 | "\n", 44 | "Pandas provides this functionality through the following methods:\n", 45 | "\n", 46 | " srs.map(f) - apply a function f element-wise to a Series (or DataFrame column)\n", 47 | " df.applymap(f) - apply a function f element-wise to the entire DataFrame (i.e. the DataFrame equivalent of .map)\n", 48 | " df.apply(f, axis) - apply a function f along columns (axis=0) or rows (axis=1) of a DataFrame\n", 49 | " \n", 50 | "\n", 51 | "\n", 52 | "Many of the functions we covered in the previous sessions are shorthand for these more general methods:\n", 53 | "\n", 54 | " np.log(df['A']) --> df['A'].map(np.log)\n", 55 | " df * 2 --> df.applymap(lambda x: x*2)\n", 56 | " df.sum(axis=1) --> df.apply(sum, axis=1)\n", 57 | " \n", 58 | "Generally, you should use map, apply, and applymap only if there is no Series or DataFrame method available. In this way, your code will be more readable and use any optimisations that pandas may have for these methods." 59 | ] 60 | }, 61 | { 62 | "cell_type": "markdown", 63 | "metadata": {}, 64 | "source": [ 65 | "
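To see the three side by side, here is a minimal sketch (df is any numeric DataFrame, for example the one created just below):\n",
"\n",
"    df['A'].map(abs)               # element-wise on one column (a Series)\n",
"    df.applymap(lambda x: x * 2)   # element-wise on every cell of the DataFrame\n",
"    df.apply(np.sum, axis=1)       # one aggregated result per row\n",
"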
\n", 66 | "\n", 67 | "Additional resources\n", 68 | "\n", 69 | "http://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.apply.html\n", 70 | "\n", 71 | "http://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.applymap.html#pandas.DataFrame.applymap\n" 72 | ] 73 | }, 74 | { 75 | "cell_type": "markdown", 76 | "metadata": {}, 77 | "source": [ 78 | "Let's first load up a test DataFrame." 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": null, 84 | "metadata": {}, 85 | "outputs": [], 86 | "source": [ 87 | "import numpy as np\n", 88 | "import pandas as pd" 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": null, 94 | "metadata": {}, 95 | "outputs": [], 96 | "source": [ 97 | "df = pd.DataFrame(columns=list('ABCD'), data=np.random.randn(4, 4)*10)\n", 98 | "df" 99 | ] 100 | }, 101 | { 102 | "cell_type": "markdown", 103 | "metadata": {}, 104 | "source": [ 105 | "### ApplyMap" 106 | ] 107 | }, 108 | { 109 | "cell_type": "markdown", 110 | "metadata": {}, 111 | "source": [ 112 | "This is probably the simplest of the apply functions to understand. Let's convert every number in df to an int (dropping the decimal)." 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": null, 118 | "metadata": {}, 119 | "outputs": [], 120 | "source": [ 121 | "df = df.applymap(int)\n", 122 | "df" 123 | ] 124 | }, 125 | { 126 | "cell_type": "markdown", 127 | "metadata": {}, 128 | "source": [ 129 | "We can create our own functions to apply to the DataFrame as well. Simple one-line functions can be declared using lambda functions. \n", 130 | "\n", 131 | "See http://www.secnetix.de/olli/Python/lambda_functions.hawk\n", 132 | "\n", 133 | "In the example below, we have formatted each element of the DataFrame as a percent." 134 | ] 135 | }, 136 | { 137 | "cell_type": "code", 138 | "execution_count": null, 139 | "metadata": {}, 140 | "outputs": [], 141 | "source": [ 142 | "df.applymap(lambda x: str(x)+'%')" 143 | ] 144 | }, 145 | { 146 | "cell_type": "markdown", 147 | "metadata": {}, 148 | "source": [ 149 | "More complex functions can be applied by defining a function in the usual way." 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": null, 155 | "metadata": {}, 156 | "outputs": [], 157 | "source": [ 158 | "def half_or_three_plus_one(x):\n", 159 | " \"\"\"Halve if even, triple and add one if odd\"\"\"\n", 160 | " if x==1:\n", 161 | " y = 1\n", 162 | " elif np.mod(x, 2)==0:\n", 163 | " y = x / 2\n", 164 | " else:\n", 165 | " y = 3*x + 1\n", 166 | " return int(y)" 167 | ] 168 | }, 169 | { 170 | "cell_type": "code", 171 | "execution_count": null, 172 | "metadata": {}, 173 | "outputs": [], 174 | "source": [ 175 | "df.abs().applymap(int).applymap(half_or_three_plus_one)" 176 | ] 177 | }, 178 | { 179 | "cell_type": "markdown", 180 | "metadata": {}, 181 | "source": [ 182 | "### Map" 183 | ] 184 | }, 185 | { 186 | "cell_type": "markdown", 187 | "metadata": {}, 188 | "source": [ 189 | "This method provides the same functionality as applymap, but for Series objects. Try some of the same functions above but on a single column or row of the DataFrame only." 
190 | ] 191 | }, 192 | { 193 | "cell_type": "code", 194 | "execution_count": null, 195 | "metadata": {}, 196 | "outputs": [], 197 | "source": [] 198 | }, 199 | { 200 | "cell_type": "code", 201 | "execution_count": null, 202 | "metadata": {}, 203 | "outputs": [], 204 | "source": [] 205 | }, 206 | { 207 | "cell_type": "markdown", 208 | "metadata": {}, 209 | "source": [ 210 | "### Apply" 211 | ] 212 | }, 213 | { 214 | "cell_type": "markdown", 215 | "metadata": {}, 216 | "source": [ 217 | "The apply method is a little more difficult to understand. It applies a particular function, often an aggregation, to each row or column independently. We've encountered a few examples of an apply-like method already, e.g., the df.sum() method." 218 | ] 219 | }, 220 | { 221 | "cell_type": "markdown", 222 | "metadata": {}, 223 | "source": [ 224 | "If we call .apply and use the sum function with axis=0, we will be summing up the rows (or, summing \"along the columns\") of our DataFrame." 225 | ] 226 | }, 227 | { 228 | "cell_type": "code", 229 | "execution_count": null, 230 | "metadata": {}, 231 | "outputs": [], 232 | "source": [ 233 | "df.apply(sum, axis=0)" 234 | ] 235 | }, 236 | { 237 | "cell_type": "markdown", 238 | "metadata": {}, 239 | "source": [ 240 | "To sum along rows, simply pass axis=1 instead." 241 | ] 242 | }, 243 | { 244 | "cell_type": "code", 245 | "execution_count": null, 246 | "metadata": {}, 247 | "outputs": [], 248 | "source": [ 249 | "df.apply(sum, axis=1)" 250 | ] 251 | }, 252 | { 253 | "cell_type": "markdown", 254 | "metadata": {}, 255 | "source": [ 256 | "When using .apply, the argument to your function is the row or column itself (which is of course a Series)." 257 | ] 258 | }, 259 | { 260 | "cell_type": "code", 261 | "execution_count": null, 262 | "metadata": {}, 263 | "outputs": [], 264 | "source": [ 265 | "df" 266 | ] 267 | }, 268 | { 269 | "cell_type": "code", 270 | "execution_count": null, 271 | "metadata": {}, 272 | "outputs": [], 273 | "source": [ 274 | "df.apply(lambda x: x['C'], axis=1) # x is each row of the DataFrame" 275 | ] 276 | }, 277 | { 278 | "cell_type": "code", 279 | "execution_count": null, 280 | "metadata": {}, 281 | "outputs": [], 282 | "source": [ 283 | "df.apply(lambda x: x.iloc[-1], axis=0) # x is each column of the DataFrame" 284 | ] 285 | }, 286 | { 287 | "cell_type": "code", 288 | "execution_count": null, 289 | "metadata": {}, 290 | "outputs": [], 291 | "source": [ 292 | "def second_largest(srs):\n", 293 | " srs = srs.copy() # to make a local copy of the input\n", 294 | " srs = srs.abs()\n", 295 | " srs = srs.sort_values(ascending=True)\n", 296 | " return srs.iloc[-2]" 297 | ] 298 | }, 299 | { 300 | "cell_type": "code", 301 | "execution_count": null, 302 | "metadata": {}, 303 | "outputs": [], 304 | "source": [ 305 | "df.apply(second_largest, axis=0)" 306 | ] 307 | }, 308 | { 309 | "cell_type": "markdown", 310 | "metadata": {}, 311 | "source": [ 312 | "If function that we using in the apply method returns a dict or Series, the resulting output is a DataFrame." 
313 | ] 314 | }, 315 | { 316 | "cell_type": "code", 317 | "execution_count": null, 318 | "metadata": {}, 319 | "outputs": [], 320 | "source": [ 321 | "df.apply(lambda x: pd.Series({'Median': x.median(), 'Mean': x.mean()}), axis=0)" 322 | ] 323 | }, 324 | { 325 | "cell_type": "code", 326 | "execution_count": null, 327 | "metadata": {}, 328 | "outputs": [], 329 | "source": [ 330 | "df.apply(lambda x: pd.Series({'Median': x.median(), 'Mean': x.mean()}), axis=1)" 331 | ] 332 | }, 333 | { 334 | "cell_type": "markdown", 335 | "metadata": {}, 336 | "source": [ 337 | "The **np.npv** function has 2 arguments, rate (a *float* which is the discount rate) and values (a *list* of future cashflows).\n", 338 | "\n", 339 | "For example,\n", 340 | "\n", 341 | " np.npv(rate=0.05,values=[1,1,1,1,101])\n", 342 | " \n", 343 | "will find the price of a 5 year bond with 1% annual coupons at a yield of 5%.\n", 344 | "\n", 345 | "Run the cell below to initialise the DataFrame we'll be working with next." 346 | ] 347 | }, 348 | { 349 | "cell_type": "code", 350 | "execution_count": null, 351 | "metadata": {}, 352 | "outputs": [], 353 | "source": [ 354 | "df_bond = pd.DataFrame({'bond_name': ['Bank_2020', 'Retailer_2018', 'JGB_3Y'], 'yield': [0.0465, 0.0573, 0.00347],\n", 355 | " '2017_cashflow': [1.5, 2.5, 0.125], '2018_cashflow':[1.5, 102.5, 0.125],\n", 356 | " '2019_cashflow': [1.5, 0, 100.125], '2020_cashflow':[101.5, 0, 0]})\n", 357 | "df_bond.set_index('bond_name', drop=True, inplace=True)\n", 358 | "df_bond" 359 | ] 360 | }, 361 | { 362 | "cell_type": "markdown", 363 | "metadata": { 364 | "collapsed": true 365 | }, 366 | "source": [ 367 | "Using the **.apply, lambda functions and the np.npv function**, calculate the price of each of the three bonds above.\n", 368 | "\n", 369 | "Hint: use a lambda function where the variable is a **row** from your DataFrame.\n", 370 | "\n", 371 | "This exercise is fairly difficult so take a look at the solution below if you get stuck." 372 | ] 373 | }, 374 | { 375 | "cell_type": "code", 376 | "execution_count": null, 377 | "metadata": {}, 378 | "outputs": [], 379 | "source": [] 380 | }, 381 | { 382 | "cell_type": "code", 383 | "execution_count": null, 384 | "metadata": {}, 385 | "outputs": [], 386 | "source": [ 387 | "#solutions\n", 388 | "%load solutions/apply_sol1.py" 389 | ] 390 | }, 391 | { 392 | "cell_type": "markdown", 393 | "metadata": { 394 | "collapsed": true 395 | }, 396 | "source": [ 397 | "[Back to top](#top)" 398 | ] 399 | }, 400 | { 401 | "cell_type": "markdown", 402 | "metadata": {}, 403 | "source": [ 404 | "----------\n", 405 | "\n", 406 | "# 8. GroupBy" 407 | ] 408 | }, 409 | { 410 | "cell_type": "markdown", 411 | "metadata": {}, 412 | "source": [ 413 | "We have now looked at applying functions and performing aggregations over an entire DataFrame. Often though, we are interested in aggregations among particular subsets of the data. For example, finding the turnover for each security, or median latency of different proccesses in our trading systems. Pandas allows this type of aggregation through the __df.groupby()__ method, which implements a \"split-apply-combine\" paradigm. The process is explained in the diagram below, which groups by the key and applies the sum method.\n", 414 | "\n", 415 | "\n", 416 | "\n", 417 | "
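In code, the pattern looks like this (a self-contained sketch with a made-up DataFrame, so the column names are placeholders):\n",
"\n",
"    df_demo = pd.DataFrame({'key': ['A', 'B', 'A', 'B'], 'data': [1, 2, 3, 4]})\n",
"    df_demo.groupby('key').sum()    # split on 'key', sum each group, combine the results\n",
"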
\n", 418 | "\n", 419 | "Additional resources\n", 420 | "\n", 421 | "http://nbviewer.jupyter.org/github/jakevdp/PythonDataScienceHandbook/blob/master/notebooks/03.08-Aggregation-and-Grouping.ipynb" 422 | ] 423 | }, 424 | { 425 | "cell_type": "markdown", 426 | "metadata": {}, 427 | "source": [ 428 | "We'll be working with volatility data, df_vols, for this section. Run the code below to import and clean the data." 429 | ] 430 | }, 431 | { 432 | "cell_type": "code", 433 | "execution_count": null, 434 | "metadata": {}, 435 | "outputs": [], 436 | "source": [ 437 | "import pandas as pd\n", 438 | "import numpy as np" 439 | ] 440 | }, 441 | { 442 | "cell_type": "code", 443 | "execution_count": null, 444 | "metadata": {}, 445 | "outputs": [], 446 | "source": [ 447 | "df_vols = pd.read_csv('data/vols.csv')\n", 448 | "df_vols.TIMESTAMP = pd.to_datetime(df_vols.TIMESTAMP)\n", 449 | "df_vols = df_vols.set_index('TIMESTAMP', drop=True)\n", 450 | "df_vols.head()" 451 | ] 452 | }, 453 | { 454 | "cell_type": "markdown", 455 | "metadata": {}, 456 | "source": [ 457 | "Let's do some summarisation first with the .describe() method." 458 | ] 459 | }, 460 | { 461 | "cell_type": "code", 462 | "execution_count": null, 463 | "metadata": {}, 464 | "outputs": [], 465 | "source": [ 466 | "df_vols.describe()" 467 | ] 468 | }, 469 | { 470 | "cell_type": "markdown", 471 | "metadata": {}, 472 | "source": [ 473 | "Suppose we want to find the mean of each column, but **per relative expiry**. We can first do a groupby on the DataFrame." 474 | ] 475 | }, 476 | { 477 | "cell_type": "code", 478 | "execution_count": null, 479 | "metadata": {}, 480 | "outputs": [], 481 | "source": [ 482 | "g = df_vols.groupby('RELATIVE_EXPIRY')\n", 483 | "g" 484 | ] 485 | }, 486 | { 487 | "cell_type": "markdown", 488 | "metadata": {}, 489 | "source": [ 490 | "The output is a DataFrameGroupBy object. Let's look at the attributes of this object with the Tab button or by running the __dir__ function on g." 491 | ] 492 | }, 493 | { 494 | "cell_type": "code", 495 | "execution_count": null, 496 | "metadata": {}, 497 | "outputs": [], 498 | "source": [] 499 | }, 500 | { 501 | "cell_type": "markdown", 502 | "metadata": {}, 503 | "source": [ 504 | "You'll notice that it shares many of the same attributes and methods of the original DataFrame object. For instance, try running a few of the aggregation methods to see how they work." 505 | ] 506 | }, 507 | { 508 | "cell_type": "code", 509 | "execution_count": null, 510 | "metadata": {}, 511 | "outputs": [], 512 | "source": [ 513 | "g.sum()" 514 | ] 515 | }, 516 | { 517 | "cell_type": "code", 518 | "execution_count": null, 519 | "metadata": {}, 520 | "outputs": [], 521 | "source": [] 522 | }, 523 | { 524 | "cell_type": "code", 525 | "execution_count": null, 526 | "metadata": {}, 527 | "outputs": [], 528 | "source": [] 529 | }, 530 | { 531 | "cell_type": "code", 532 | "execution_count": null, 533 | "metadata": {}, 534 | "outputs": [], 535 | "source": [] 536 | }, 537 | { 538 | "cell_type": "markdown", 539 | "metadata": {}, 540 | "source": [ 541 | "If we want to perform an aggregation over a subset of columns, we can select those columns with the dict-like syntax in the usual way. Try a few of these below." 
542 | ] 543 | }, 544 | { 545 | "cell_type": "code", 546 | "execution_count": null, 547 | "metadata": {}, 548 | "outputs": [], 549 | "source": [ 550 | "g[['FUTURE', 'VOLATILITY']].mean()" 551 | ] 552 | }, 553 | { 554 | "cell_type": "code", 555 | "execution_count": null, 556 | "metadata": {}, 557 | "outputs": [], 558 | "source": [] 559 | }, 560 | { 561 | "cell_type": "code", 562 | "execution_count": null, 563 | "metadata": {}, 564 | "outputs": [], 565 | "source": [] 566 | }, 567 | { 568 | "cell_type": "code", 569 | "execution_count": null, 570 | "metadata": {}, 571 | "outputs": [], 572 | "source": [] 573 | }, 574 | { 575 | "cell_type": "markdown", 576 | "metadata": {}, 577 | "source": [ 578 | "What if we wanted to group over __ranges__ of values instead? Pandas has very useful functions __pd.cut__ and __pd.qcut__ that can bin the data into value ranges and quantile ranges respectively. Let's create 10 bins around the minimum and maximum forward price." 579 | ] 580 | }, 581 | { 582 | "cell_type": "code", 583 | "execution_count": null, 584 | "metadata": {}, 585 | "outputs": [], 586 | "source": [ 587 | "print df_vols['FUTURE'].min(), df_vols['FUTURE'].max()" 588 | ] 589 | }, 590 | { 591 | "cell_type": "code", 592 | "execution_count": null, 593 | "metadata": {}, 594 | "outputs": [], 595 | "source": [ 596 | "bins = np.linspace(210, 240, 11)\n", 597 | "bins" 598 | ] 599 | }, 600 | { 601 | "cell_type": "code", 602 | "execution_count": null, 603 | "metadata": {}, 604 | "outputs": [], 605 | "source": [ 606 | "df_vols['FUTURE_RANGE'] = pd.cut(df_vols['FUTURE'], bins)\n", 607 | "df_vols.head()" 608 | ] 609 | }, 610 | { 611 | "cell_type": "markdown", 612 | "metadata": {}, 613 | "source": [ 614 | "We can group by these to find the average ATM vol in each forward price bucket - is the result what you expected?" 615 | ] 616 | }, 617 | { 618 | "cell_type": "code", 619 | "execution_count": null, 620 | "metadata": {}, 621 | "outputs": [], 622 | "source": [ 623 | "df_vols.groupby('FUTURE_RANGE')['VOLATILITY'].mean()" 624 | ] 625 | }, 626 | { 627 | "cell_type": "markdown", 628 | "metadata": {}, 629 | "source": [ 630 | "Now, the above calculation is actually not very informative, because we have lumped all relative expiries. We really should group over __both__ the future prices __and__ the expiries at the same time. All we need to do is to provide a list of keys/columns to groupby." 631 | ] 632 | }, 633 | { 634 | "cell_type": "code", 635 | "execution_count": null, 636 | "metadata": {}, 637 | "outputs": [], 638 | "source": [ 639 | "avg_vol = df_vols.groupby(['RELATIVE_EXPIRY', 'FUTURE_RANGE'])['VOLATILITY'].mean()\n", 640 | "avg_vol" 641 | ] 642 | }, 643 | { 644 | "cell_type": "markdown", 645 | "metadata": {}, 646 | "source": [ 647 | "The output is a Series with a MultiIndex, where relative expiry and forward price are different levels of the index. This turns out to be a much easier way of working with data than a \"3D spreadsheet\" kind of structure." 648 | ] 649 | }, 650 | { 651 | "cell_type": "markdown", 652 | "metadata": { 653 | "collapsed": true 654 | }, 655 | "source": [ 656 | "Whenever we get a stacked object like above, we can call the .unstack() method to turn it back into a DataFrame." 
657 | ] 658 | }, 659 | { 660 | "cell_type": "code", 661 | "execution_count": null, 662 | "metadata": {}, 663 | "outputs": [], 664 | "source": [ 665 | "df_avg_vol = avg_vol.unstack(level='RELATIVE_EXPIRY') # or level=0\n", 666 | "df_avg_vol" 667 | ] 668 | }, 669 | { 670 | "cell_type": "markdown", 671 | "metadata": {}, 672 | "source": [ 673 | "Groupby objects also have a __.apply__ method, except the apply acts on each key's DataFrame. For instance, calculating the daily change of a few columns for each relative expiry." 674 | ] 675 | }, 676 | { 677 | "cell_type": "code", 678 | "execution_count": null, 679 | "metadata": {}, 680 | "outputs": [], 681 | "source": [ 682 | "df_vols.groupby('RELATIVE_EXPIRY')[['FUTURE', 'VOLATILITY']].apply(lambda df: df.iloc[-1] - df.iloc[0])" 683 | ] 684 | }, 685 | { 686 | "cell_type": "markdown", 687 | "metadata": { 688 | "collapsed": true 689 | }, 690 | "source": [ 691 | "[Back to top](#top)" 692 | ] 693 | }, 694 | { 695 | "cell_type": "markdown", 696 | "metadata": {}, 697 | "source": [ 698 | "----------------------------\n", 699 | "\n", 700 | "# 9. Merge/Join\n", 701 | "\n", 702 | "Sometimes we will want to complement one data set with information from another data set. For example, joining a DataFrame of trades (price, size, time) with a DataFrame of instrument properties (underlying, expiry date, strike price). Pandas' main method for joining two DataFrames is __pd.merge__:\n", 703 | "\n", 704 | " pd.merge(df_left, df_right, on=..., how=...)\n", 705 | " \n", 706 | "The 'on' argument determines which column(s) to join on. If left empty, the columns that df_left and df_right share will be used as join keys. If the columns to join on have different names between df_left and df_right, we can use the 'left_on' and 'right_on' arguments instead. To join on the index instead of a column, we use left_index=True and/or right_index=True. Alternatively, df_left.join(df_right) performs a join on the indexes.\n", 707 | "\n", 708 | "The ‘how’ argument determines the style of join to use. Options for this argument are ‘inner’, 'outer', 'left', and 'right'. An inner join contains the intersection of the two sets of inputs. An outer join returns a join over the union of the input columns, and fills in all missing values with NaNs. The left and right joins return joins over the left and right indices respectively. Note that a right join is identical to a left join with the left/right labels swapped - so we usually just use left joins.\n", 709 | "\n", 710 | "\n", 711 | "\n", 712 | "\n", 713 | "" 714 | ] 715 | }, 716 | { 717 | "cell_type": "markdown", 718 | "metadata": {}, 719 | "source": [ 720 | "
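As a quick illustration of the 'how' options (a sketch with two made-up DataFrames):\n",
"\n",
"    left = pd.DataFrame({'key': ['A', 'B'], 'x': [1, 2]})\n",
"    right = pd.DataFrame({'key': ['B', 'C'], 'y': [3, 4]})\n",
"    pd.merge(left, right, on='key', how='inner')   # key 'B' only\n",
"    pd.merge(left, right, on='key', how='outer')   # keys 'A', 'B', 'C', with NaNs where data is missing\n",
"    pd.merge(left, right, on='key', how='left')    # keys 'A', 'B'; y is NaN for 'A'\n",
"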
\n", 721 | "\n", 722 | "YouTube video\n", 723 | "\n", 724 | "Watch the following video until the 1 hour 13 minute mark to get a better idea of these methods.\n", 725 | "\n", 726 | "https://www.youtube.com/watch?v=dye7rDktJ2E&start=3180" 727 | ] 728 | }, 729 | { 730 | "cell_type": "markdown", 731 | "metadata": {}, 732 | "source": [ 733 | "**Key methods covered:**\n", 734 | "\n", 735 | " pd.concat - combines two objects into a single DataFrame\n", 736 | " pd.merge - merges existing DataFrames" 737 | ] 738 | }, 739 | { 740 | "cell_type": "markdown", 741 | "metadata": {}, 742 | "source": [ 743 | "
\n", 744 | "\n", 745 | "Additional resources\n", 746 | "\n", 747 | "http://nbviewer.jupyter.org/github/jakevdp/PythonDataScienceHandbook/blob/master/notebooks/03.07-Merge-and-Join.ipynb" 748 | ] 749 | }, 750 | { 751 | "cell_type": "markdown", 752 | "metadata": {}, 753 | "source": [ 754 | "Let's load up the following data of prices and turnovers (assume a multiplier of 1)." 755 | ] 756 | }, 757 | { 758 | "cell_type": "code", 759 | "execution_count": null, 760 | "metadata": {}, 761 | "outputs": [], 762 | "source": [ 763 | "import pandas as pd\n", 764 | "import numpy as np" 765 | ] 766 | }, 767 | { 768 | "cell_type": "code", 769 | "execution_count": null, 770 | "metadata": {}, 771 | "outputs": [], 772 | "source": [ 773 | "df_turnovers = pd.DataFrame(columns=['Underlying', 'Month', 'Turnover'],\n", 774 | " data={'Underlying': ['HSI']*3 + ['NK225']*3 + ['HHI']*3,\n", 775 | " 'Month': ['Jan', 'Feb', 'Mar']*3,\n", 776 | " 'Turnover': [1000, 1100, 900,\n", 777 | " 300, 350, 400,\n", 778 | " 6000, 7000, np.nan]})\n", 779 | "df_turnovers" 780 | ] 781 | }, 782 | { 783 | "cell_type": "code", 784 | "execution_count": null, 785 | "metadata": {}, 786 | "outputs": [], 787 | "source": [ 788 | "df_prices = pd.DataFrame(columns=['Underlying', 'Month', 'Price'],\n", 789 | " data={'Underlying': ['HSI']*3 + ['HHI']*3 + ['NK225']*3,\n", 790 | " 'Month': ['Jan', 'Feb', 'Mar']*3,\n", 791 | " 'Price': [28000, 29000, 30000,\n", 792 | " 11000, 12000, 115000,\n", 793 | " 22000, 21000, 20000]})\n", 794 | "df_prices" 795 | ] 796 | }, 797 | { 798 | "cell_type": "markdown", 799 | "metadata": {}, 800 | "source": [ 801 | "Calculate the notional turnover (price times size) in local currency by joining on the appropriate key(s) with __pd.merge__." 802 | ] 803 | }, 804 | { 805 | "cell_type": "code", 806 | "execution_count": null, 807 | "metadata": {}, 808 | "outputs": [], 809 | "source": [] 810 | }, 811 | { 812 | "cell_type": "code", 813 | "execution_count": null, 814 | "metadata": {}, 815 | "outputs": [], 816 | "source": [ 817 | "#solutions\n", 818 | "%load solutions/merge_sol1.py" 819 | ] 820 | }, 821 | { 822 | "cell_type": "markdown", 823 | "metadata": {}, 824 | "source": [ 825 | "Now load the following currency data, and join them together to get the forex rates for each underlying." 826 | ] 827 | }, 828 | { 829 | "cell_type": "code", 830 | "execution_count": null, 831 | "metadata": {}, 832 | "outputs": [], 833 | "source": [ 834 | "df_currency = pd.DataFrame({'Underlying': ['NK225', 'HSI', 'HHI'], 'Currency': ['JPY', 'HKD', 'HKD']})\n", 835 | "df_currency" 836 | ] 837 | }, 838 | { 839 | "cell_type": "code", 840 | "execution_count": null, 841 | "metadata": {}, 842 | "outputs": [], 843 | "source": [ 844 | "df_forex = pd.DataFrame({'Currency': ['HKD', 'JPY', 'KRW'], 'Rate': [6, 80, 850]})\n", 845 | "df_forex" 846 | ] 847 | }, 848 | { 849 | "cell_type": "code", 850 | "execution_count": null, 851 | "metadata": {}, 852 | "outputs": [], 853 | "source": [] 854 | }, 855 | { 856 | "cell_type": "code", 857 | "execution_count": null, 858 | "metadata": {}, 859 | "outputs": [], 860 | "source": [ 861 | "#solutions\n", 862 | "%load solutions/merge_sol2.py" 863 | ] 864 | }, 865 | { 866 | "cell_type": "markdown", 867 | "metadata": {}, 868 | "source": [ 869 | "Finally, join currency data onto the notional turnover data and convert the notional turnover to AUD (in millions)." 
870 | ] 871 | }, 872 | { 873 | "cell_type": "code", 874 | "execution_count": null, 875 | "metadata": {}, 876 | "outputs": [], 877 | "source": [] 878 | }, 879 | { 880 | "cell_type": "code", 881 | "execution_count": null, 882 | "metadata": {}, 883 | "outputs": [], 884 | "source": [ 885 | "#solutions\n", 886 | "%load solutions/merge_sol3.py" 887 | ] 888 | }, 889 | { 890 | "cell_type": "markdown", 891 | "metadata": {}, 892 | "source": [ 893 | "[Back to top](#top)" 894 | ] 895 | }, 896 | { 897 | "cell_type": "markdown", 898 | "metadata": {}, 899 | "source": [ 900 | "## Exercise set 3 (unavailable externally)" 901 | ] 902 | }, 903 | { 904 | "cell_type": "markdown", 905 | "metadata": {}, 906 | "source": [ 907 | "For this exercise set we'll use our HSI options data. For now, just run the following cells to get our trade data from OneTick - we'll spend time learning how to use OneTick later. You'll need to install the following package first:\n", 908 | ">pip install optiver.etl" 909 | ] 910 | }, 911 | { 912 | "cell_type": "code", 913 | "execution_count": null, 914 | "metadata": {}, 915 | "outputs": [], 916 | "source": [ 917 | "import pandas as pd\n", 918 | "from etl.onetick import otq, query\n", 919 | "import datetime as dt" 920 | ] 921 | }, 922 | { 923 | "cell_type": "code", 924 | "execution_count": null, 925 | "metadata": {}, 926 | "outputs": [], 927 | "source": [ 928 | "start = dt.datetime(2018, 3, 13, 0, 0)\n", 929 | "end = dt.datetime(2018, 3, 13, 23, 59)" 930 | ] 931 | }, 932 | { 933 | "cell_type": "code", 934 | "execution_count": null, 935 | "metadata": {}, 936 | "outputs": [], 937 | "source": [ 938 | "# Trade data\n", 939 | "q = query.tick_query('Trade_Tick_Analysis', 'ATLAS_IN',\n", 940 | " start, end, 'Australia/Sydney',\n", 941 | " symbol_regex='ATLAS_IN::opa_in_hsi_tko_001.XHKF',\n", 942 | " columns=['EEID_TIMESTAMP', 'FEEDCODE', 'TRADE_PRICE', 'TRADE_VOLUME', 'THEO_PRICE', 'DELTA'])\n", 943 | "\n", 944 | "df_trd = otq.query(q)\n", 945 | "df_trd = df_trd.drop(columns=['Time', 'SYMBOL_NAME'])\n", 946 | "df_trd['EEID_TIMESTAMP'] = pd.to_datetime(df_trd['EEID_TIMESTAMP'])\n", 947 | "df_trd = df_trd.set_index('EEID_TIMESTAMP')\n", 948 | "df_trd = df_trd[df_trd['FEEDCODE'].str.startswith('HSI')]" 949 | ] 950 | }, 951 | { 952 | "cell_type": "code", 953 | "execution_count": null, 954 | "metadata": {}, 955 | "outputs": [], 956 | "source": [ 957 | "# Instrument data\n", 958 | "q = query.tick_query('Instrument', 'XHKF',\n", 959 | " start, end, 'Australia/Sydney',\n", 960 | " symbol_regex='XHKF::HSI',\n", 961 | " columns=['FEEDCODE', 'KIND', 'STRIKE_PRICE', 'EXPIRY_DATE'])\n", 962 | "\n", 963 | "df_ins = otq.query(q)\n", 964 | "df_ins = df_ins.drop(columns=['Time', 'SYMBOL_NAME'])" 965 | ] 966 | }, 967 | { 968 | "cell_type": "markdown", 969 | "metadata": {}, 970 | "source": [ 971 | "1. Add a new column EDGE to df_trd that contains the total edge of that trade in AUD.\n", 972 | "2. Merge the instrument data into the trade data.\n", 973 | "3. Calculate the total edge and trade volumes per delta bucket (delta 0-10, 10-20, 20-30, etc.), expiry date, and instrument kind.\n", 974 | "4. Unstack that dataframe so that it's easier to view.\n", 975 | "5. Sort the dataframe in descending order of edge.\n", 976 | "6. What was the total edge for the day?" 
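If you get stuck, the rough outline below sketches one possible approach to these steps. It is not the intended solution: it assumes the df_trd and df_ins frames from the cells above, that DELTA is quoted in percent, that per-trade edge can be approximated as the absolute theo-versus-trade price difference times volume, and it uses a placeholder HKD-to-AUD rate.

    import numpy as np
    import pandas as pd

    HKD_TO_AUD = 0.17  # placeholder conversion rate, assumption for illustration only

    # 1. Edge in AUD (assumed definition: |theo - trade price| * volume, converted to AUD)
    df_trd['EDGE'] = (df_trd['THEO_PRICE'] - df_trd['TRADE_PRICE']).abs() * df_trd['TRADE_VOLUME'] * HKD_TO_AUD

    # 2. Merge the instrument properties onto the trades (both frames carry FEEDCODE)
    df_all = pd.merge(df_trd.reset_index(), df_ins, on='FEEDCODE', how='left')

    # 3. Total edge and volume per delta bucket, expiry date and kind (assumes DELTA in percent)
    df_all['DELTA_BUCKET'] = pd.cut(df_all['DELTA'].abs(), bins=np.arange(0, 110, 10))
    summary = df_all.groupby(['EXPIRY_DATE', 'KIND', 'DELTA_BUCKET'])[['EDGE', 'TRADE_VOLUME']].sum()

    # 4. Unstack a level so the table is easier to scan
    summary['EDGE'].unstack('DELTA_BUCKET')

    # 5. Sort by edge, descending
    summary.sort_values('EDGE', ascending=False)

    # 6. Total edge for the day
    df_all['EDGE'].sum()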
977 | ] 978 | }, 979 | { 980 | "cell_type": "code", 981 | "execution_count": null, 982 | "metadata": {}, 983 | "outputs": [], 984 | "source": [] 985 | }, 986 | { 987 | "cell_type": "code", 988 | "execution_count": null, 989 | "metadata": {}, 990 | "outputs": [], 991 | "source": [] 992 | }, 993 | { 994 | "cell_type": "code", 995 | "execution_count": null, 996 | "metadata": {}, 997 | "outputs": [], 998 | "source": [] 999 | }, 1000 | { 1001 | "cell_type": "code", 1002 | "execution_count": null, 1003 | "metadata": {}, 1004 | "outputs": [], 1005 | "source": [] 1006 | }, 1007 | { 1008 | "cell_type": "code", 1009 | "execution_count": null, 1010 | "metadata": {}, 1011 | "outputs": [], 1012 | "source": [] 1013 | }, 1014 | { 1015 | "cell_type": "code", 1016 | "execution_count": null, 1017 | "metadata": {}, 1018 | "outputs": [], 1019 | "source": [] 1020 | }, 1021 | { 1022 | "cell_type": "code", 1023 | "execution_count": null, 1024 | "metadata": {}, 1025 | "outputs": [], 1026 | "source": [] 1027 | }, 1028 | { 1029 | "cell_type": "code", 1030 | "execution_count": null, 1031 | "metadata": {}, 1032 | "outputs": [], 1033 | "source": [] 1034 | }, 1035 | { 1036 | "cell_type": "code", 1037 | "execution_count": null, 1038 | "metadata": {}, 1039 | "outputs": [], 1040 | "source": [] 1041 | } 1042 | ], 1043 | "metadata": { 1044 | "anaconda-cloud": {}, 1045 | "kernelspec": { 1046 | "display_name": "Python 2", 1047 | "language": "python", 1048 | "name": "python2" 1049 | }, 1050 | "language_info": { 1051 | "codemirror_mode": { 1052 | "name": "ipython", 1053 | "version": 2 1054 | }, 1055 | "file_extension": ".py", 1056 | "mimetype": "text/x-python", 1057 | "name": "python", 1058 | "nbconvert_exporter": "python", 1059 | "pygments_lexer": "ipython2", 1060 | "version": "2.7.14" 1061 | } 1062 | }, 1063 | "nbformat": 4, 1064 | "nbformat_minor": 1 1065 | } 1066 | --------------------------------------------------------------------------------