├── .gitignore
├── APT_FammaMacbeth.ipynb
├── Alpha Trading Workflow.pdf
├── KalmanFilterIntro.ipynb
├── README.md
├── README.pdf
├── README_old.md
├── Step1_FactorPretest.ipynb
├── Step2_FactorsScreening-Copy1.ipynb
├── Step2_FactorsScreening.ipynb
├── Step3_FactorCombination_AdaBoost_Quantopian.ipynb
├── Step3_FactorCombination_AdaBoost_Quantopian_old.ipynb
├── Step3_FactorCombination_BarraKalmanFilter.ipynb
├── output
└── factor_ic_analysis.csv
├── report
├── Alpha Trading Workflow.md
├── Corr_matrix_for_factor_ranks.png
├── Corr_matrix_for_raw_factors.png
├── Quantitative Strategy Workflow.pptx
├── Step3_FactorCombination_AdaBoost_Quantopian.html
├── adaboost_algorithm.png
├── corr_comparison_after_pca_analysis.png
├── mean_spearmans_rank_IC.png
├── mean_spearmans_rank_IC_absolute_value.png
├── rank_of_mean_spearmans_rank_IC_absolute_value.png
├── test_accuracy_bar.png
├── test_score_dist.png
├── train_accuracy_bar.png
├── train_score_dist.png
└── train_score_dist2.png
├── rqdata_utils.py
└── source
├── DownloadData.ipynb
├── DownloadData_bak.ipynb
├── FactorAnalysis.ipynb
├── FactorModeling.ipynb
├── FactorsScreening.ipynb
├── KalmanFilter.ipynb
├── MultiFactorModel.ipynb
└── rqdata_utils.py
/.gitignore:
--------------------------------------------------------------------------------
1 | # Specific files #
2 | ###################
3 | cn_*.csv
4 | *-checkpoint.ipynb
5 |
6 | # Specific directories #
7 | # #################
8 | .idea/
9 | .ipython_checkpoints/
10 | __pycache__/
11 |
12 | # Backup #
13 | ###################
14 | *.pyo
15 | *.pyc
16 | *~
17 | *.bak
18 | *.swp
19 | *#
20 |
21 | # Images #
22 | ###################
23 | #*.jpg
24 | *.gif
25 | #*.png
26 | *.svg
27 | *.ico
28 |
29 | # Compiled source #
30 | ###################
31 | *.com
32 | *.class
33 | *.dll
34 | *.exe
35 | *.o
36 | *.so
37 |
38 | # Packages #
39 | ############
40 | # it's better to unpack these files and commit the raw source
41 | # git has its own built in compression methods
42 | *.7z
43 | *.dmg
44 | *.gz
45 | *.iso
46 | *.jar
47 | *.rar
48 | *.tar
49 | *.zip
50 |
51 | # Logs and databases #
52 | ######################
53 | *.log
54 | *.sql
55 | *.sqlite
56 |
57 | # OS generated files #
58 | ######################
59 | .DS_Store
60 | .DS_Store?
61 | ._*
62 | .Spotlight-V100
63 | .Trashes
64 | ehthumbs.db
65 | Thumbs.db
66 |
--------------------------------------------------------------------------------
/APT_FammaMacbeth.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 |     "# APT model: Fama-MacBeth Regression"
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": 1,
13 | "metadata": {
14 | "collapsed": true
15 | },
16 | "outputs": [],
17 | "source": [
18 | "from rqdata_utils import *\n",
19 | "import pandas\n",
20 | "import numpy as np\n",
21 | "import scipy as sp\n",
22 | "import alphalens as al\n",
23 | "%matplotlib inline"
24 | ]
25 | },
26 | {
27 | "cell_type": "markdown",
28 | "metadata": {},
29 | "source": [
30 | "## Loading Data"
31 | ]
32 | },
33 | {
34 | "cell_type": "code",
35 | "execution_count": 2,
36 | "metadata": {
37 | "collapsed": true
38 | },
39 | "outputs": [],
40 | "source": [
41 | "price_df,instrument_df,equity_df = get_price_instrument_equity(\"cn_stock_price_2012_2018.csv\",\"cn_instrument_info_2012_2018.csv\",\"cn_equity_daily_2012_2018.csv\",\"sectorCode\")"
42 | ]
43 | },
44 | {
45 | "cell_type": "code",
46 | "execution_count": 3,
47 | "metadata": {
48 | "collapsed": false
49 | },
50 | "outputs": [
51 | {
52 | "data": {
53 | "text/html": [
54 | "
\n",
55 | "
\n",
56 | " \n",
57 | " \n",
58 | " | \n",
59 | " | \n",
60 | " return | \n",
61 | " close | \n",
62 | " total_turnover | \n",
63 | " volume | \n",
64 | " week | \n",
65 | " month | \n",
66 | " report_quarter | \n",
67 | " market_cap | \n",
68 | " a_share_market_val_2 | \n",
69 | " cash_received_from_sales_of_goods | \n",
70 | " pb_ratio | \n",
71 | " net_profit | \n",
72 | " ps_ratio | \n",
73 | " sectorCode | \n",
74 | "
\n",
75 | " \n",
76 | " date | \n",
77 | " order_book_id | \n",
78 | " | \n",
79 | " | \n",
80 | " | \n",
81 | " | \n",
82 | " | \n",
83 | " | \n",
84 | " | \n",
85 | " | \n",
86 | " | \n",
87 | " | \n",
88 | " | \n",
89 | " | \n",
90 | " | \n",
91 | " | \n",
92 | "
\n",
93 | " \n",
94 | " \n",
95 | " \n",
96 | " 2012-01-04 | \n",
97 | " 000001.XSHE | \n",
98 | " -0.027582 | \n",
99 | " 5.1224 | \n",
100 | " 2.275637e+08 | \n",
101 | " 40894428.0 | \n",
102 | " 0.5775 | \n",
103 | " 0.4331 | \n",
104 | " NaN | \n",
105 | " NaN | \n",
106 | " NaN | \n",
107 | " NaN | \n",
108 | " NaN | \n",
109 | " NaN | \n",
110 | " NaN | \n",
111 | " Financials | \n",
112 | "
\n",
113 | " \n",
114 | " 000002.XSHE | \n",
115 | " -0.018742 | \n",
116 | " 6.0525 | \n",
117 | " 3.559891e+08 | \n",
118 | " 47432958.0 | \n",
119 | " 0.3711 | \n",
120 | " 0.4030 | \n",
121 | " 2011q3 | \n",
122 | " 8.059489e+10 | \n",
123 | " 7.082120e+10 | \n",
124 | " 7.516785e+10 | \n",
125 | " 1.5216 | \n",
126 | " 4.106349e+09 | \n",
127 | " 0.8679 | \n",
128 | " Financials | \n",
129 | "
\n",
130 | " \n",
131 | " 000004.XSHE | \n",
132 | " -0.022250 | \n",
133 | " 7.9100 | \n",
134 | " 3.763833e+06 | \n",
135 | " 465469.0 | \n",
136 | " 0.5720 | \n",
137 | " 0.7506 | \n",
138 | " 2011q3 | \n",
139 | " 6.642556e+08 | \n",
140 | " 6.634549e+08 | \n",
141 | " 5.949968e+07 | \n",
142 | " 8.8175 | \n",
143 | " 4.500363e+06 | \n",
144 | " 37.5796 | \n",
145 | " HealthCare | \n",
146 | "
\n",
147 | " \n",
148 | " 000005.XSHE | \n",
149 | " 0.000000 | \n",
150 | " 3.8600 | \n",
151 | " 0.000000e+00 | \n",
152 | " 0.0 | \n",
153 | " 0.0000 | \n",
154 | " 0.0000 | \n",
155 | " 2011q3 | \n",
156 | " 3.529328e+09 | \n",
157 | " 3.527048e+09 | \n",
158 | " 2.565851e+07 | \n",
159 | " 5.3480 | \n",
160 | " 1.365665e+07 | \n",
161 | " -347.2191 | \n",
162 | " Industrials | \n",
163 | "
\n",
164 | " \n",
165 | " 000006.XSHE | \n",
166 | " -0.009756 | \n",
167 | " 2.6766 | \n",
168 | " 7.619286e+06 | \n",
169 | " 2513811.0 | \n",
170 | " 0.1416 | \n",
171 | " 0.1667 | \n",
172 | " 2011q3 | \n",
173 | " 4.015370e+09 | \n",
174 | " 3.929464e+09 | \n",
175 | " 2.531436e+09 | \n",
176 | " 1.4348 | \n",
177 | " 2.763917e+08 | \n",
178 | " 1.4139 | \n",
179 | " Financials | \n",
180 | "
\n",
181 | " \n",
182 | "
\n",
183 | "
"
184 | ],
185 | "text/plain": [
186 | " return close total_turnover volume \\\n",
187 | "date order_book_id \n",
188 | "2012-01-04 000001.XSHE -0.027582 5.1224 2.275637e+08 40894428.0 \n",
189 | " 000002.XSHE -0.018742 6.0525 3.559891e+08 47432958.0 \n",
190 | " 000004.XSHE -0.022250 7.9100 3.763833e+06 465469.0 \n",
191 | " 000005.XSHE 0.000000 3.8600 0.000000e+00 0.0 \n",
192 | " 000006.XSHE -0.009756 2.6766 7.619286e+06 2513811.0 \n",
193 | "\n",
194 | " week month report_quarter market_cap \\\n",
195 | "date order_book_id \n",
196 | "2012-01-04 000001.XSHE 0.5775 0.4331 NaN NaN \n",
197 | " 000002.XSHE 0.3711 0.4030 2011q3 8.059489e+10 \n",
198 | " 000004.XSHE 0.5720 0.7506 2011q3 6.642556e+08 \n",
199 | " 000005.XSHE 0.0000 0.0000 2011q3 3.529328e+09 \n",
200 | " 000006.XSHE 0.1416 0.1667 2011q3 4.015370e+09 \n",
201 | "\n",
202 | " a_share_market_val_2 \\\n",
203 | "date order_book_id \n",
204 | "2012-01-04 000001.XSHE NaN \n",
205 | " 000002.XSHE 7.082120e+10 \n",
206 | " 000004.XSHE 6.634549e+08 \n",
207 | " 000005.XSHE 3.527048e+09 \n",
208 | " 000006.XSHE 3.929464e+09 \n",
209 | "\n",
210 | " cash_received_from_sales_of_goods pb_ratio \\\n",
211 | "date order_book_id \n",
212 | "2012-01-04 000001.XSHE NaN NaN \n",
213 | " 000002.XSHE 7.516785e+10 1.5216 \n",
214 | " 000004.XSHE 5.949968e+07 8.8175 \n",
215 | " 000005.XSHE 2.565851e+07 5.3480 \n",
216 | " 000006.XSHE 2.531436e+09 1.4348 \n",
217 | "\n",
218 | " net_profit ps_ratio sectorCode \n",
219 | "date order_book_id \n",
220 | "2012-01-04 000001.XSHE NaN NaN Financials \n",
221 | " 000002.XSHE 4.106349e+09 0.8679 Financials \n",
222 | " 000004.XSHE 4.500363e+06 37.5796 HealthCare \n",
223 | " 000005.XSHE 1.365665e+07 -347.2191 Industrials \n",
224 | " 000006.XSHE 2.763917e+08 1.4139 Financials "
225 | ]
226 | },
227 | "execution_count": 3,
228 | "metadata": {},
229 | "output_type": "execute_result"
230 | }
231 | ],
232 | "source": [
233 | "equity_df.head()"
234 | ]
235 | },
236 | {
237 | "cell_type": "code",
238 | "execution_count": 4,
239 | "metadata": {
240 | "collapsed": false
241 | },
242 | "outputs": [
243 | {
244 | "data": {
245 | "text/plain": [
246 | "164"
247 | ]
248 | },
249 | "execution_count": 4,
250 | "metadata": {},
251 | "output_type": "execute_result"
252 | }
253 | ],
254 | "source": [
255 | "healthcareUniverse = instrument_df.index[instrument_df.sectorCode=='HealthCare'].values\n",
256 | "len(healthcareUniverse)"
257 | ]
258 | },
259 | {
260 | "cell_type": "code",
261 | "execution_count": 5,
262 | "metadata": {
263 | "collapsed": true
264 | },
265 | "outputs": [],
266 | "source": [
267 | "def equity_universe_filtering(equity_df, universe):\n",
268 | " universeFilter = [book_id in set(universe) for book_id in equity_df.index.get_level_values(level=1).values]\n",
269 | " return equity_df[universeFilter]"
270 | ]
271 | },
272 | {
273 | "cell_type": "code",
274 | "execution_count": 6,
275 | "metadata": {
276 | "collapsed": false
277 | },
278 | "outputs": [
279 | {
280 | "data": {
281 | "text/html": [
282 | "\n",
283 | "
\n",
284 | " \n",
285 | " \n",
286 | " | \n",
287 | " | \n",
288 | " return | \n",
289 | " close | \n",
290 | " total_turnover | \n",
291 | " volume | \n",
292 | " week | \n",
293 | " month | \n",
294 | " report_quarter | \n",
295 | " market_cap | \n",
296 | " a_share_market_val_2 | \n",
297 | " cash_received_from_sales_of_goods | \n",
298 | " pb_ratio | \n",
299 | " net_profit | \n",
300 | " ps_ratio | \n",
301 | " sectorCode | \n",
302 | "
\n",
303 | " \n",
304 | " date | \n",
305 | " order_book_id | \n",
306 | " | \n",
307 | " | \n",
308 | " | \n",
309 | " | \n",
310 | " | \n",
311 | " | \n",
312 | " | \n",
313 | " | \n",
314 | " | \n",
315 | " | \n",
316 | " | \n",
317 | " | \n",
318 | " | \n",
319 | " | \n",
320 | "
\n",
321 | " \n",
322 | " \n",
323 | " \n",
324 | " 2012-01-04 | \n",
325 | " 000004.XSHE | \n",
326 | " -0.022250 | \n",
327 | " 7.9100 | \n",
328 | " 3763832.88 | \n",
329 | " 465469.0 | \n",
330 | " 0.5720 | \n",
331 | " 0.7506 | \n",
332 | " 2011q3 | \n",
333 | " 6.642556e+08 | \n",
334 | " 6.634549e+08 | \n",
335 | " 5.949968e+07 | \n",
336 | " 8.8175 | \n",
337 | " 4.500363e+06 | \n",
338 | " 37.5796 | \n",
339 | " HealthCare | \n",
340 | "
\n",
341 | " \n",
342 | " 000028.XSHE | \n",
343 | " -0.045433 | \n",
344 | " 19.8422 | \n",
345 | " 9326924.28 | \n",
346 | " 450553.0 | \n",
347 | " 0.4201 | \n",
348 | " 0.2722 | \n",
349 | " 2011q3 | \n",
350 | " 5.872485e+09 | \n",
351 | " 4.753820e+09 | \n",
352 | " 1.053298e+10 | \n",
353 | " 4.3493 | \n",
354 | " 2.481834e+08 | \n",
355 | " 0.3414 | \n",
356 | " HealthCare | \n",
357 | "
\n",
358 | " \n",
359 | " 000150.XSHE | \n",
360 | " -0.030295 | \n",
361 | " 3.1737 | \n",
362 | " 3109304.50 | \n",
363 | " 952600.0 | \n",
364 | " 0.3460 | \n",
365 | " 0.3610 | \n",
366 | " 2011q3 | \n",
367 | " 1.036800e+09 | \n",
368 | " 1.036800e+09 | \n",
369 | " 4.913279e+07 | \n",
370 | " 1.4763 | \n",
371 | " 3.657858e+06 | \n",
372 | " 7.8956 | \n",
373 | " HealthCare | \n",
374 | "
\n",
375 | " \n",
376 | " 000153.XSHE | \n",
377 | " -0.028053 | \n",
378 | " 5.7700 | \n",
379 | " 9673054.49 | \n",
380 | " 1596020.0 | \n",
381 | " 0.6830 | \n",
382 | " 2.4594 | \n",
383 | " 2011q3 | \n",
384 | " 1.531454e+09 | \n",
385 | " 1.360856e+09 | \n",
386 | " 1.329425e+09 | \n",
387 | " 2.1169 | \n",
388 | " 1.560397e+07 | \n",
389 | " 0.7818 | \n",
390 | " HealthCare | \n",
391 | "
\n",
392 | " \n",
393 | " 000403.XSHE | \n",
394 | " 0.000000 | \n",
395 | " 3.1625 | \n",
396 | " 0.00 | \n",
397 | " 0.0 | \n",
398 | " 0.0000 | \n",
399 | " 0.0000 | \n",
400 | " NaN | \n",
401 | " NaN | \n",
402 | " NaN | \n",
403 | " NaN | \n",
404 | " NaN | \n",
405 | " NaN | \n",
406 | " NaN | \n",
407 | " HealthCare | \n",
408 | "
\n",
409 | " \n",
410 | "
\n",
411 | "
"
412 | ],
413 | "text/plain": [
414 | " return close total_turnover volume \\\n",
415 | "date order_book_id \n",
416 | "2012-01-04 000004.XSHE -0.022250 7.9100 3763832.88 465469.0 \n",
417 | " 000028.XSHE -0.045433 19.8422 9326924.28 450553.0 \n",
418 | " 000150.XSHE -0.030295 3.1737 3109304.50 952600.0 \n",
419 | " 000153.XSHE -0.028053 5.7700 9673054.49 1596020.0 \n",
420 | " 000403.XSHE 0.000000 3.1625 0.00 0.0 \n",
421 | "\n",
422 | " week month report_quarter market_cap \\\n",
423 | "date order_book_id \n",
424 | "2012-01-04 000004.XSHE 0.5720 0.7506 2011q3 6.642556e+08 \n",
425 | " 000028.XSHE 0.4201 0.2722 2011q3 5.872485e+09 \n",
426 | " 000150.XSHE 0.3460 0.3610 2011q3 1.036800e+09 \n",
427 | " 000153.XSHE 0.6830 2.4594 2011q3 1.531454e+09 \n",
428 | " 000403.XSHE 0.0000 0.0000 NaN NaN \n",
429 | "\n",
430 | " a_share_market_val_2 \\\n",
431 | "date order_book_id \n",
432 | "2012-01-04 000004.XSHE 6.634549e+08 \n",
433 | " 000028.XSHE 4.753820e+09 \n",
434 | " 000150.XSHE 1.036800e+09 \n",
435 | " 000153.XSHE 1.360856e+09 \n",
436 | " 000403.XSHE NaN \n",
437 | "\n",
438 | " cash_received_from_sales_of_goods pb_ratio \\\n",
439 | "date order_book_id \n",
440 | "2012-01-04 000004.XSHE 5.949968e+07 8.8175 \n",
441 | " 000028.XSHE 1.053298e+10 4.3493 \n",
442 | " 000150.XSHE 4.913279e+07 1.4763 \n",
443 | " 000153.XSHE 1.329425e+09 2.1169 \n",
444 | " 000403.XSHE NaN NaN \n",
445 | "\n",
446 | " net_profit ps_ratio sectorCode \n",
447 | "date order_book_id \n",
448 | "2012-01-04 000004.XSHE 4.500363e+06 37.5796 HealthCare \n",
449 | " 000028.XSHE 2.481834e+08 0.3414 HealthCare \n",
450 | " 000150.XSHE 3.657858e+06 7.8956 HealthCare \n",
451 | " 000153.XSHE 1.560397e+07 0.7818 HealthCare \n",
452 | " 000403.XSHE NaN NaN HealthCare "
453 | ]
454 | },
455 | "execution_count": 6,
456 | "metadata": {},
457 | "output_type": "execute_result"
458 | }
459 | ],
460 | "source": [
461 | "healthcare_equity_df = equity_universe_filtering(equity_df, healthcareUniverse)\n",
462 | "healthcare_equity_df.head()"
463 | ]
464 | },
465 | {
466 | "cell_type": "code",
467 | "execution_count": 7,
468 | "metadata": {
469 | "collapsed": false
470 | },
471 | "outputs": [
472 | {
473 | "name": "stdout",
474 | "output_type": "stream",
475 | "text": [
476 | "universe ratio: 6.210331877919959%\n"
477 | ]
478 | }
479 | ],
480 | "source": [
481 | "print(\"universe ratio: {}%\".format(len(healthcare_equity_df)/len(equity_df)*100))"
482 | ]
483 | },
484 | {
485 | "cell_type": "markdown",
486 | "metadata": {},
487 | "source": [
488 | "### benchmark"
489 | ]
490 | },
491 | {
492 | "cell_type": "code",
493 | "execution_count": 27,
494 | "metadata": {
495 | "collapsed": true
496 | },
497 | "outputs": [],
498 | "source": [
499 | "benchmark_df = pd.read_csv(\"cn_SH_healthcare_index_2012_2018.csv\",names=['date','value'])\n",
500 | "benchmark_df = benchmark_df.set_index('date',drop=True)"
501 | ]
502 | },
503 | {
504 | "cell_type": "code",
505 | "execution_count": 33,
506 | "metadata": {
507 | "collapsed": false
508 | },
509 | "outputs": [
510 | {
511 | "data": {
512 | "text/html": [
513 | "\n",
514 | "
\n",
515 | " \n",
516 | " \n",
517 | " | \n",
518 | " value | \n",
519 | " return | \n",
520 | "
\n",
521 | " \n",
522 | " date | \n",
523 | " | \n",
524 | " | \n",
525 | "
\n",
526 | " \n",
527 | " \n",
528 | " \n",
529 | " 2012-01-04 | \n",
530 | " 2891.462 | \n",
531 | " 0.000000 | \n",
532 | "
\n",
533 | " \n",
534 | " 2012-01-05 | \n",
535 | " 2766.955 | \n",
536 | " 0.044015 | \n",
537 | "
\n",
538 | " \n",
539 | " 2012-01-06 | \n",
540 | " 2744.793 | \n",
541 | " 0.008042 | \n",
542 | "
\n",
543 | " \n",
544 | " 2012-01-09 | \n",
545 | " 2833.219 | \n",
546 | " -0.031708 | \n",
547 | "
\n",
548 | " \n",
549 | " 2012-01-10 | \n",
550 | " 2929.594 | \n",
551 | " -0.033450 | \n",
552 | "
\n",
553 | " \n",
554 | "
\n",
555 | "
"
556 | ],
557 | "text/plain": [
558 | " value return\n",
559 | "date \n",
560 | "2012-01-04 2891.462 0.000000\n",
561 | "2012-01-05 2766.955 0.044015\n",
562 | "2012-01-06 2744.793 0.008042\n",
563 | "2012-01-09 2833.219 -0.031708\n",
564 | "2012-01-10 2929.594 -0.033450"
565 | ]
566 | },
567 | "execution_count": 33,
568 | "metadata": {},
569 | "output_type": "execute_result"
570 | }
571 | ],
572 | "source": [
573 | "benchmark_df['return'] = np.log(benchmark_df.shift(1)/benchmark_df).fillna(0)\n",
574 | "benchmark_df.head()"
575 | ]
576 | },
577 | {
578 | "cell_type": "markdown",
579 | "metadata": {},
580 | "source": [
581 | "## Factor Returns"
582 | ]
583 | },
584 | {
585 | "cell_type": "code",
586 | "execution_count": 8,
587 | "metadata": {
588 | "collapsed": true
589 | },
590 | "outputs": [],
591 | "source": [
592 | "def equity_factor_return(equity_df, factorColumn, nAllocations, longTop=True):\n",
593 | " equity_copy = equity_df.copy()\n",
594 | "# equity_copy[\"{}_rank\".format(factorColumn)] = equity_copy.groupby(level='date')[factorColumn].rank()\n",
595 | "# equity_copy[equity_copy.groupby(level='date')[factorColumn].nlargest(nAllocations).index][\"biggest_{}_{}\".format(nAllocations,factorColumn)]=True\n",
596 | " largest = equity_copy[factorColumn].groupby(level='date').nlargest(nAllocations).reset_index(level=0,drop=True)\n",
597 | " smallest = equity_copy[factorColumn].groupby(level='date').nsmallest(nAllocations).reset_index(level=0,drop=True)\n",
598 | " r_largest = equity_copy.loc[largest.index,'return'].groupby(level='date').mean()\n",
599 | " r_smallest = equity_copy.loc[smallest.index,'return'].groupby(level='date').mean()\n",
600 | " LMS = r_largest - r_smallest\n",
601 | " if(longTop):\n",
602 | " return LMS\n",
603 | " else:\n",
604 | " return -LMS"
605 | ]
606 | },
607 | {
608 | "cell_type": "code",
609 | "execution_count": 9,
610 | "metadata": {
611 | "collapsed": false
612 | },
613 | "outputs": [
614 | {
615 | "data": {
616 | "text/plain": [
617 | "date\n",
618 | "2012-01-04 0.005983\n",
619 | "2012-01-05 -0.009098\n",
620 | "2012-01-06 -0.004155\n",
621 | "2012-01-09 0.014615\n",
622 | "2012-01-10 0.006728\n",
623 | "Name: return, dtype: float64"
624 | ]
625 | },
626 | "execution_count": 9,
627 | "metadata": {},
628 | "output_type": "execute_result"
629 | }
630 | ],
631 | "source": [
632 | "SMB = equity_factor_return(healthcare_equity_df, 'market_cap', 20,longTop=False)\n",
633 | "SMB.head()"
634 | ]
635 | },
636 | {
637 | "cell_type": "code",
638 | "execution_count": 10,
639 | "metadata": {
640 | "collapsed": false
641 | },
642 | "outputs": [
643 | {
644 | "data": {
645 | "text/plain": [
646 | "date\n",
647 | "2012-01-04 0.005302\n",
648 | "2012-01-05 -0.007223\n",
649 | "2012-01-06 0.006031\n",
650 | "2012-01-09 -0.002597\n",
651 | "2012-01-10 -0.010780\n",
652 | "Name: return, dtype: float64"
653 | ]
654 | },
655 | "execution_count": 10,
656 | "metadata": {},
657 | "output_type": "execute_result"
658 | }
659 | ],
660 | "source": [
661 | "HML = equity_factor_return(healthcare_equity_df, 'pb_ratio', 20,longTop=True)\n",
662 | "HML.head()"
663 | ]
664 | },
665 | {
666 | "cell_type": "code",
667 | "execution_count": 11,
668 | "metadata": {
669 | "collapsed": true
670 | },
671 | "outputs": [],
672 | "source": [
673 | "import itertools\n",
674 | "import statsmodels.api as sm\n",
675 | "from statsmodels import regression,stats\n",
676 | "import scipy\n",
677 | "\n",
678 | "data = healthcare_equity_df[['return']] # dataframe\n",
679 | "data = data.set_index(healthcare_equity_df.index) # elimilate redundant index (whole universe)\n",
680 | "asset_list_sizes = [group[1].size for group in data.groupby(level=0)]\n",
681 | "\n",
682 | "# Spreading the factor portfolio data across all assets for each day\n",
683 | "SMB_column = [[SMB.loc[group[0]]] * size for group, size \\\n",
684 | " in zip(data.groupby(level=0), asset_list_sizes)]\n",
685 | "data['SMB'] = list(itertools.chain(*SMB_column))\n",
686 | "\n",
687 | "HML_column = [[HML.loc[group[0]]] * size for group, size \\\n",
688 | " in zip(data.groupby(level=0), asset_list_sizes)]\n",
689 | "data['HML'] = list(itertools.chain(*HML_column))\n",
690 | "data = sm.add_constant(data.dropna())"
691 | ]
692 | },
693 | {
694 | "cell_type": "code",
695 | "execution_count": 12,
696 | "metadata": {
697 | "collapsed": false
698 | },
699 | "outputs": [
700 | {
701 | "data": {
702 | "text/html": [
703 | "\n",
704 | "
\n",
705 | " \n",
706 | " \n",
707 | " | \n",
708 | " | \n",
709 | " const | \n",
710 | " return | \n",
711 | " SMB | \n",
712 | " HML | \n",
713 | "
\n",
714 | " \n",
715 | " date | \n",
716 | " order_book_id | \n",
717 | " | \n",
718 | " | \n",
719 | " | \n",
720 | " | \n",
721 | "
\n",
722 | " \n",
723 | " \n",
724 | " \n",
725 | " 2012-01-04 | \n",
726 | " 000004.XSHE | \n",
727 | " 1.0 | \n",
728 | " -0.022250 | \n",
729 | " 0.005983 | \n",
730 | " 0.005302 | \n",
731 | "
\n",
732 | " \n",
733 | " 000028.XSHE | \n",
734 | " 1.0 | \n",
735 | " -0.045433 | \n",
736 | " 0.005983 | \n",
737 | " 0.005302 | \n",
738 | "
\n",
739 | " \n",
740 | " 000150.XSHE | \n",
741 | " 1.0 | \n",
742 | " -0.030295 | \n",
743 | " 0.005983 | \n",
744 | " 0.005302 | \n",
745 | "
\n",
746 | " \n",
747 | " 000153.XSHE | \n",
748 | " 1.0 | \n",
749 | " -0.028053 | \n",
750 | " 0.005983 | \n",
751 | " 0.005302 | \n",
752 | "
\n",
753 | " \n",
754 | " 000403.XSHE | \n",
755 | " 1.0 | \n",
756 | " 0.000000 | \n",
757 | " 0.005983 | \n",
758 | " 0.005302 | \n",
759 | "
\n",
760 | " \n",
761 | "
\n",
762 | "
"
763 | ],
764 | "text/plain": [
765 | " const return SMB HML\n",
766 | "date order_book_id \n",
767 | "2012-01-04 000004.XSHE 1.0 -0.022250 0.005983 0.005302\n",
768 | " 000028.XSHE 1.0 -0.045433 0.005983 0.005302\n",
769 | " 000150.XSHE 1.0 -0.030295 0.005983 0.005302\n",
770 | " 000153.XSHE 1.0 -0.028053 0.005983 0.005302\n",
771 | " 000403.XSHE 1.0 0.000000 0.005983 0.005302"
772 | ]
773 | },
774 | "execution_count": 12,
775 | "metadata": {},
776 | "output_type": "execute_result"
777 | }
778 | ],
779 | "source": [
780 | "data.head()"
781 | ]
782 | },
783 | {
784 | "cell_type": "markdown",
785 | "metadata": {},
786 | "source": [
787 | "## Factor Exposures ($\\beta$)"
788 | ]
789 | },
790 | {
791 | "cell_type": "code",
792 | "execution_count": 13,
793 | "metadata": {
794 | "collapsed": true
795 | },
796 | "outputs": [],
797 | "source": [
798 | "assets = data.index.levels[1].unique()\n",
799 | "Y = [data.xs(asset,level=1)['return'] for asset in assets]\n",
800 | "X = [data.xs(asset,level=1)[['SMB','HML','const']] for asset in assets]\n",
801 | "reg_results = [regression.linear_model.OLS(y,x).fit().params for y,x in zip(Y,X) if not(x.empty or y.empty)]\n",
802 | "indices = [asset for y, x, asset in zip(Y, X, assets) if not(x.empty or y.empty)]\n",
803 | "betas = pd.DataFrame(reg_results, index=indices)"
804 | ]
805 | },
806 | {
807 | "cell_type": "code",
808 | "execution_count": 15,
809 | "metadata": {
810 | "collapsed": false
811 | },
812 | "outputs": [
813 | {
814 | "data": {
815 | "text/html": [
816 | "\n",
817 | "
\n",
818 | " \n",
819 | " \n",
820 | " | \n",
821 | " SMB | \n",
822 | " HML | \n",
823 | " const | \n",
824 | "
\n",
825 | " \n",
826 | " \n",
827 | " \n",
828 | " 000004.XSHE | \n",
829 | " 0.883906 | \n",
830 | " 0.048757 | \n",
831 | " 0.002002 | \n",
832 | "
\n",
833 | " \n",
834 | " 000028.XSHE | \n",
835 | " -0.003029 | \n",
836 | " -0.064295 | \n",
837 | " 0.001073 | \n",
838 | "
\n",
839 | " \n",
840 | " 000150.XSHE | \n",
841 | " 0.354122 | \n",
842 | " 0.066071 | \n",
843 | " 0.002031 | \n",
844 | "
\n",
845 | " \n",
846 | " 000153.XSHE | \n",
847 | " 0.620706 | \n",
848 | " -0.082229 | \n",
849 | " 0.001405 | \n",
850 | "
\n",
851 | " \n",
852 | " 000403.XSHE | \n",
853 | " 2.032192 | \n",
854 | " 11.457418 | \n",
855 | " -0.017412 | \n",
856 | "
\n",
857 | " \n",
858 | "
\n",
859 | "
"
860 | ],
861 | "text/plain": [
862 | " SMB HML const\n",
863 | "000004.XSHE 0.883906 0.048757 0.002002\n",
864 | "000028.XSHE -0.003029 -0.064295 0.001073\n",
865 | "000150.XSHE 0.354122 0.066071 0.002031\n",
866 | "000153.XSHE 0.620706 -0.082229 0.001405\n",
867 | "000403.XSHE 2.032192 11.457418 -0.017412"
868 | ]
869 | },
870 | "execution_count": 15,
871 | "metadata": {},
872 | "output_type": "execute_result"
873 | }
874 | ],
875 | "source": [
876 | "betas.head()"
877 | ]
878 | },
879 | {
880 | "cell_type": "markdown",
881 | "metadata": {},
882 | "source": [
883 | "## Factor Premium"
884 | ]
885 | },
886 | {
887 | "cell_type": "code",
888 | "execution_count": 36,
889 | "metadata": {
890 | "collapsed": false
891 | },
892 | "outputs": [
893 | {
894 | "data": {
895 | "text/html": [
896 | "\n",
897 | "OLS Regression Results\n",
898 | "\n",
899 | " Dep. Variable: | return | R-squared: | 0.398 | \n",
900 | "
\n",
901 | "\n",
902 | " Model: | OLS | Adj. R-squared: | 0.391 | \n",
903 | "
\n",
904 | "\n",
905 | " Method: | Least Squares | F-statistic: | 53.26 | \n",
906 | "
\n",
907 | "\n",
908 | " Date: | Sat, 05 May 2018 | Prob (F-statistic): | 1.77e-18 | \n",
909 | "
\n",
910 | "\n",
911 | " Time: | 21:03:25 | Log-Likelihood: | 1012.1 | \n",
912 | "
\n",
913 | "\n",
914 | " No. Observations: | 164 | AIC: | -2018. | \n",
915 | "
\n",
916 | "\n",
917 | " Df Residuals: | 161 | BIC: | -2009. | \n",
918 | "
\n",
919 | "\n",
920 | " Df Model: | 2 | | | \n",
921 | "
\n",
922 | "\n",
923 | " Covariance Type: | nonrobust | | | \n",
924 | "
\n",
925 | "
\n",
926 | "\n",
927 | "\n",
928 | " | coef | std err | t | P>|t| | [0.025 | 0.975] | \n",
929 | "
\n",
930 | "\n",
931 | " const | 0.0017 | 6.72e-05 | 24.956 | 0.000 | 0.002 | 0.002 | \n",
932 | "
\n",
933 | "\n",
934 | " SMB | -7.597e-05 | 0.000 | -0.599 | 0.550 | -0.000 | 0.000 | \n",
935 | "
\n",
936 | "\n",
937 | " HML | 0.0005 | 4.81e-05 | 9.695 | 0.000 | 0.000 | 0.001 | \n",
938 | "
\n",
939 | "
\n",
940 | "\n",
941 | "\n",
942 | " Omnibus: | 39.154 | Durbin-Watson: | 1.906 | \n",
943 | "
\n",
944 | "\n",
945 | " Prob(Omnibus): | 0.000 | Jarque-Bera (JB): | 78.545 | \n",
946 | "
\n",
947 | "\n",
948 | " Skew: | 1.087 | Prob(JB): | 8.80e-18 | \n",
949 | "
\n",
950 | "\n",
951 | " Kurtosis: | 5.601 | Cond. No. | 3.92 | \n",
952 | "
\n",
953 | "
"
954 | ],
955 | "text/plain": [
956 | "\n",
957 | "\"\"\"\n",
958 | " OLS Regression Results \n",
959 | "==============================================================================\n",
960 | "Dep. Variable: return R-squared: 0.398\n",
961 | "Model: OLS Adj. R-squared: 0.391\n",
962 | "Method: Least Squares F-statistic: 53.26\n",
963 | "Date: Sat, 05 May 2018 Prob (F-statistic): 1.77e-18\n",
964 | "Time: 21:03:25 Log-Likelihood: 1012.1\n",
965 | "No. Observations: 164 AIC: -2018.\n",
966 | "Df Residuals: 161 BIC: -2009.\n",
967 | "Df Model: 2 \n",
968 | "Covariance Type: nonrobust \n",
969 | "==============================================================================\n",
970 | " coef std err t P>|t| [0.025 0.975]\n",
971 | "------------------------------------------------------------------------------\n",
972 | "const 0.0017 6.72e-05 24.956 0.000 0.002 0.002\n",
973 | "SMB -7.597e-05 0.000 -0.599 0.550 -0.000 0.000\n",
974 | "HML 0.0005 4.81e-05 9.695 0.000 0.000 0.001\n",
975 | "==============================================================================\n",
976 | "Omnibus: 39.154 Durbin-Watson: 1.906\n",
977 | "Prob(Omnibus): 0.000 Jarque-Bera (JB): 78.545\n",
978 | "Skew: 1.087 Prob(JB): 8.80e-18\n",
979 | "Kurtosis: 5.601 Cond. No. 3.92\n",
980 | "==============================================================================\n",
981 | "\n",
982 | "Warnings:\n",
983 | "[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.\n",
984 | "\"\"\""
985 | ]
986 | },
987 | "execution_count": 36,
988 | "metadata": {},
989 | "output_type": "execute_result"
990 | }
991 | ],
992 | "source": [
993 | "betas = sm.add_constant(betas.drop('const', axis=1))\n",
994 | "\n",
995 | "R = data['return'].mean(axis=0, level=1)\n",
996 | "\n",
997 | "# Second regression step: estimating the risk premia\n",
998 | "risk_free_rate = benchmark_df['return'].mean()\n",
999 | "\n",
1000 | "final_results = regression.linear_model.OLS(R - risk_free_rate, betas).fit()\n",
1001 | "\n",
1002 | "final_results.summary()"
1003 | ]
1004 | },
1005 | {
1006 | "cell_type": "markdown",
1007 | "metadata": {},
1008 | "source": [
1009 | "## Fama-Macbeth Test Conclusion: \n",
1010 | "although our individual factors are significant, we have a very low $R^2$ . What this may suggest is that there is a real link between our factors and the returns of our assets, but that there still remains a lot of unexplained noise!"
1011 | ]
1012 | },
1013 | {
1014 | "cell_type": "code",
1015 | "execution_count": null,
1016 | "metadata": {
1017 | "collapsed": true
1018 | },
1019 | "outputs": [],
1020 | "source": []
1021 | }
1022 | ],
1023 | "metadata": {
1024 | "kernelspec": {
1025 | "display_name": "Python 3",
1026 | "language": "python",
1027 | "name": "python3"
1028 | },
1029 | "language_info": {
1030 | "codemirror_mode": {
1031 | "name": "ipython",
1032 | "version": 3
1033 | },
1034 | "file_extension": ".py",
1035 | "mimetype": "text/x-python",
1036 | "name": "python",
1037 | "nbconvert_exporter": "python",
1038 | "pygments_lexer": "ipython3",
1039 | "version": "3.5.2"
1040 | }
1041 | },
1042 | "nbformat": 4,
1043 | "nbformat_minor": 2
1044 | }
1045 |
--------------------------------------------------------------------------------
/Alpha Trading Workflow.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jerryxyx/AlphaTrading/5e73923786297faeadb27c76f83ec81fad74af51/Alpha Trading Workflow.pdf
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Multi-Factor Models
2 |
3 | Author: Jerry Xia
4 |
5 | Date: 2018/07/27
6 |
7 | *Note: Advanced Markdown features such as math expressions may not render on GitHub; please see README.pdf instead if you want more details.*
8 |
9 |
10 |
11 | ## Project Introduction
12 | This is a research survey about alpha trading. In this project, I built up a pipeline of alpha trading including:
13 |
14 | * factor pretest
15 | * factor screening
16 | * factor combination (modeling)
17 |
18 | The models involved are APT models, Barra's risk model, and a dynamic factor model using a Kalman filter.
19 |
20 | ### Files
21 |
22 | * rqdata_utils.py: Utils dealing with the rice quant platform data
23 |
24 | * Step1_FactorPretest.ipynb: Factor returns profile visualization
25 |
26 | * Step2_FactorsScreening.ipynb: Factor returns turnover visualization and correlation coefficients
27 |
28 | * Step3\_FactorCombination\_AdaBoost\_Quantopian.ipynb: A Quantopian notebook file to combine alpha factors using Adaboost
29 |
30 | * Step3\_FactorCombination\_BarraKalmanFilter.ipynb: Barra's risk model with three calibration schemes:
31 | * Scheme 1: Cross-sectional regression and weighted average
32 | * Scheme 2: Optimization problem: minimize the exponential weighted average of squared error
33 | * Scheme 3: Dynamic linear model using Kalman filter
34 |
35 | * KalmanFilterIntro.ipynb: An introduction to the dynamic multi-factor model
36 | * APT_FammaMacbeth.ipynb: Using Fama-MacBeth regression to calibrate the APT model.
37 |
38 | ### Dataset
39 | The dataset is not available on GitHub as it is too large. Step3\_FactorCombination\_AdaBoost\_Quantopian.ipynb uses US stock data on Quantopian; the other files use Chinese A-share data downloaded from RiceQuant instead (free US equity data is hard to obtain).
40 |
41 | The data frame is multi-indexed, similar to Quantopian's format (see both the Alphalens GitHub code and rqdata_utils.py). However, feel free to cast and apply your own dataset.
42 |
43 |
44 | ## TODO
45 |
46 | * Input more effective factors: take advice from people and industry reports
47 | * Should add technical analysis, because it matters! People care about them and then make it good sentimental indexes.
48 | * Find well-known metrics to express results
49 |
50 | ## Workflow
51 | $\checkmark$ stands for finished and $\vartriangle$ stands for TODO
52 |
53 | * Universe definition
54 | * Factors collection and preprocessing
55 | * $\vartriangle$Factors collection
56 | - Sources
57 | - balance sheet
58 | - cash flow statement
59 | - income statement
60 | - earning report
61 | - Econometric Classifications
62 | - value
63 | - growth
64 | - profitability
65 | - market size
66 | - liquidity
67 | - volatility
68 |             - Momentum
69 | - Financial leverage (debt-to-equity ratio)
70 | * Factors preprocessing
71 |         - $\vartriangle$daily, quarterly, annually
72 | - continuous: rescale, outliers
73 | - $\checkmark$discrete: rank
74 | * Factors screening and combination
75 | * Factors screening
76 | - $\checkmark$Factors' correlation
77 |         - $\checkmark$Factors' foreseeability
78 | - Fama-Macbeth regression
79 | * $\vartriangle$Factors combination
80 | - PCA, FA
81 |         - Technical Analysis
82 | - Financial Modeling
83 | - $\checkmark$APT model
84 | - $\checkmark$Barra's risk model
85 | - $\checkmark$Dynamic multi-factors model
86 | - Linear combination to maximize Sharpe ratio
87 | - Non-linear learning algorithms
88 | - $\checkmark$AdaBoost
89 | - Reinforcement learning
90 |
91 | * Portfolio allocation
92 |
93 |
94 | ## Factors' Correlations
95 | Here, I use correlation matrix as the measure. The difference from the second result is that the correlation matrix is calculated by the rank data rather than the raw data
96 | ### Two ICs comparison
97 | * Pearson's IC: measures linear relationship between components
98 |
99 | * Spearman's IC: measures monotonic relationship between components. Since we only care about the monotonic relationships, Spearman's IC wins.
100 |
101 |
102 | ### Regular IC(Pearson's correlation coefficient) for each factors
103 | 
104 | ### Spearman's Rank correlation coefficient for each factors
105 | 
106 |
107 | ### How to rule out redundant factors and why Spearman's rank correlation coefficients?
108 | From the correlation coefficients below, we can again conclude that Spearman's rank IC is far more robust. Take ps_ratio and sales_yield as an example.
109 | $$ps\_ratio = \frac{\mbox{adjusted close price}}{\mbox{sales per share}}$$
110 | whereas
111 | $$sales\_yield = \frac{\mbox{sales per share}}{\mbox{price}}$$
112 | Although the price in the sales_yield formula is vague in our data source, we can see that, roughly speaking, these two variables should be inverses of each other. The Spearman's rank correlation coefficient is -0.98, which verifies this statement, and we should avoid using both of these factors, which would exaggerate the impact of this particular factor. However, we cannot see such an identity in the Pearson's regular correlation coefficients. It's quite misleading actually and that's why we choose Spearman's rank IC.
113 |
114 | ## Factors' Foreseeability
115 |
116 | ### Methods
117 | * Spearman's rank correlation coefficients
118 | * Fama-Macbeth regression: Not only consider the foreseeability of factors itself but also consider the co-vary of different factors, which means rule out factors if the returns can be explained by the recent factors.
119 |
120 |
121 | ### Spearman's rank IC for factors vs. forward returns
122 |
123 | 
124 |
125 | ### Spearman's rank IC (absolute value) for factors vs. forward returns
126 | 
127 |
128 | ### Rank of the Spearman's rank IC (absolute value) for factors vs. forward returns
129 | 
130 |
131 | ## Factors Preprocessing
132 | * Get ranked data
133 | * Obtain the valid stocks set
134 | * Reshape the data: only valid stocks set
135 | * Fill null: using daily average
136 | * Rescale the data: MinMaxScaler
137 | * Variable reduction: PCA analysis
138 | * Sanity check
139 |
140 | 
141 |
142 | Here, I use principal component analysis because it brings two benefits to our data - orthogonality and dimensionality reduction. Orthogonality makes data more separate, less dimensionality makes information more concentrated. Both of them are essential for machine learning algorithms.
143 |
144 | In the next part, I used this preprocessed data as the input to obtain a "mega alpha".
145 |
146 | ## Mega Alpha
147 | construct an aggregate alpha factor which has its return distribution profitable. The term "profitable" here means condense, little turnover, significant in the positive return.
148 | ### Methods
149 | #### linear methods
150 | * normalize factors and try a linear combination
151 | * rank each factor and then sum up
152 | * Financial modeling: **See the appendix and Step3\_FactorCombination\_BarraKalmanFilter.ipynb**
153 | * linear combination to maximize Sharpe ratio
154 |
155 | #### Non-linear methods
156 | * AdaBoost: **See Step3\_FactorCombination\_AdaBoost\_Quantopian.ipynb**
157 | * Reinforcement Learning
158 |
159 |
160 | Here we only introduce AdaBoost algorithm in this documentation. For more details about the linear models, please See the appendix and Step3\_FactorCombination\_BarraKalmanFilter.ipynb.
161 |
162 | ### AdaBoost
163 | #### Description
164 | The algorithm sequentially applies a weak classifier to modified versions of the data. By increasing the weights of the misclassified observations, each weak learner focuses on the error of the previous one. The predictions are aggregated through a weighted majority vote.
165 |
166 | #### Algorithm
167 |
168 | 
169 |
170 | #### Train set
171 | The AdaBoost classifier was applied to our fundamental dataset. The objective is to train a classifier which gives a score for the bunch of factors. Or in other words, the mega alpha. Pink for the positive forward returns observations and blue for the negative forward returns observations. A good score system is to make the two classes more separated.
172 | 
173 | We can see, in train set, AdaBoost classifier did so well! The next plot is the precision in each quantile of scores. In the top and bottom quantile, the predicted precision is nearly 100%!
174 | 
175 |
176 | #### Test set
177 | alpha values histogram
178 | 
179 | quantile precision bar plot
180 | 
181 | The precision in the top and bottom quantile is only slightly higher than 50%. Far from good if we considered transaction cost.
182 |
183 | So, I added some technical analysis factors to see if we can tackle this problem.
184 | 
185 | Surprisingly, even the average accuracy in test set is about 67%. What if we only trade the extreme quantile? That is around 80% accuracy! It literally shows that technical factors are really important in US stock market and can be used to find arbitrage opportunity.
186 |
187 | ## References
188 | * Jonathan Larkin, *A Professional Quant Equity Workflow*. August 31, 2016
189 | * *A Practitioner‘s Guide to Factor Models*. The Research Foundation of The Institute of Chartered Financial Analysts
190 | * Thomas Wiecki, Machine Learning on Quantopian
191 | * Inigo Fraser Jenkins, *Using factors with different alpha decay times: The case for non-linear combination*
192 | * PNC, *Factor Analysis: What Drives Performance?*
193 | * O’Shaughnessy, *Alpha or Assets? — Factor Alpha vs. Smart Beta*. April 2016
194 | * *O’Shaughnessy Quarterly Investor Letter Q1 2018*
195 | * Jiantao Zhu, Orient Securities, *Alpha Forecasting - Factor-Based Strategy Research Series 13*
196 | * Yang Song, Bohai Securities, *Multi-Factor Models Research: Single Factor Testing*, 2017/10/11
197 |
198 |
199 | ## Appendix: Notes on Factor Models
200 |
201 | ### CAPM
202 | * Author: Markovitz(1959)
203 | * single-factor:
204 | * explain: security returns
205 |
206 | ### APT
207 | * Author: Stephen A. Ross(1976)
208 | * multi-factor
209 | * explain: security returns
210 |
211 | #### Postulates:
212 | - The linear model
213 | $$r_i(t) - \alpha_i = \sum_{k=1}^K \beta_{ik} \cdot f_k(t) + \epsilon_i(t)$$
214 |
215 | where $f_k(t)$ is the realization(value) of risk factor at time t
216 |
217 | - No pure arbitrage profit
218 |
219 | #### Conclusion
220 | * Exposure of each security on each factor
221 | * Risk premium on each factor
222 | $$(Mean[r_i(t)])_i = P_0 + \sum_{k=1}^K \beta_{ik} \cdot P_k$$
223 | or make $\beta_{0,k}$ equals 1 for each k,
224 | $$(Mean[r_i(t)])_i = \sum_{k=0}^K \bar{\beta}_{i,k} \cdot P_k$$
225 | where $P_0$ is the risk free return
226 |
227 | * Portfolio exposure to each factor
228 | $$Portfolio_{it} = \beta_0 + \beta_k \cdot f_{kit}$$
229 |
230 |
231 |
232 | #### Three alternative calibration methods
233 | * **statistical techniques** such as factor analysis, principal component analysis
234 |     - **Goodness**: good for determining the number of relevant risk factors
235 | - **Undesirable**: hard to interpret
236 |
237 | * **portfolios**: K different well-diversified portfolios as substitutions
238 | - **Goodness**: lead to insights
239 | - **Fama-Macbeth regression**
240 |
241 | * **economic theory** (highly developed art)
242 | - **Goodness**: Intuitively appealing set of factors that admit economic interpretation of risk exposures
243 | - **Goodness**: Using economic information in addition to stock return. Avoid using stock return to explain stock return
244 | - **factors**:
245 | 1. confidence risk
246 | 2. time horizon risk
247 | 3. inflation risk
248 |         4. business cycle risk
249 | 5. market-timing risk
250 |
251 | #### Generalizations
252 | The simplicity of APT framework is a great virtue. It is helpful to understand the true sources of stock returns. The basic APT model can be enhanced in many ways.
253 |
254 | * Allow risk prices $P_k$ to vary over time
255 | * Allow risk exposures $\beta_{i,k}$ to vary over time
256 | * Use Bayesian methods to produce optimal out-of-sample forecasts for the risk exposures and hence for the expected returns
257 | * Introduce additional factor with zero-risk prices. Although do not contribute to expected return, help to explain the volatility.
258 |
259 | ### Multi-Index Models (Factor Analysis & PCA)
260 |
261 | #### Goal
262 | Using historical return extract the factors
263 |
264 | $$r_{it} = \alpha_i + \sum_k \beta_{ik}\cdot f_{kt}$$
265 | where
266 | $$E[\epsilon_{it} \epsilon_{jt}]=0$$
267 | $$E[\epsilon_{it} f_{kt}]=0$$
268 |
269 | $f_{kt}$: the return on index k in period t
270 |
271 | $\beta$: sensitivities
272 |
273 | #### Estimation
274 | Either exposure or factor return can be asserted on a priori grounds with the other identified empirically, or both can be identified empirically.
275 |
276 | #### Characteristics
277 | * Have f(indexes) represents separate influence
278 | * The structure must be parsimonious: the returns can be described in terms of limited indexes
279 |
280 | #### Statistical Solutions
281 | Let the data design the model
282 |
283 | * PCA
284 | * Factor Analysis: better in heteroscedastic series
285 |
286 | #### Design Issue
287 | * **The Choice of Data**: Individual stocks vs portfolio
288 | * **The number of Index**:
289 |     - Statistical techniques: Factor analysis, PCA
290 | - Common sense and economic significance play a major role in deciding on the number of factors
291 | * **The nonuniqueness of Factors**: The researcher should realize the resulting structure is not unique. Some researchers will examine alternative structures in an attempt to understand what influences are affecting security returns and to convince themselves the overall separation makes intuitive sense
292 | * **Computational Problems**:
293 | - Roll and Ross: Multisample approach
294 | - Chen: Portfolio approach
295 |
296 | #### Applications
297 | * **Identify the Indexes set**
298 | * **Determine the number of factors**: PCA / Factor Analysis
299 | - Single-group tests for each sample
300 | - Factor Analysis on return-generating process
301 | - Criteria: Chi2, AIC, **BIC**
302 | - Multiple-group tests for all stocks
303 | - Canonical Correlation (CCA):
304 |
305 | take two sets of variables and see what is common amongst the two sets (can be two noncorresponding variables either on index or dimension)
306 | $$X_{N \times K}, Y_{N \times K^{\prime}}$$
307 | $$\mbox{x_weights}_{K,n}$$
308 | $$\mbox{y_weights}_{K^{\prime},n}$$
309 | Use CCA / PLS:
310 | $$\mbox{X_score}_{N\times n} = \mbox{Normalized}[X]_{N \times K} \mbox{x_weights}_{K,n}$$
311 |
312 | $$\mbox{Y_score}_{N\times n} = \mbox{Normalized}[Y]_{N \times K^{\prime}} \mbox{y_weights}_{K^{\prime},n}$$
313 |         - Determine the number:
314 | - r-value for $n=10$
315 | - correlation matrix pattern for each number of components: $n \times n$ for $n=1,\cdots,10$
316 |
317 | * **Generate Factors**
318 |
319 | * **Calibrate sensitivities**:
320 |
321 | - Portfolio exposure to each factor
322 | - $Adjusted R^2$ (Should be stable)
323 | - Explanatory power: Compare these results with those for the single-index model (Should depend on the market cap)
324 |
325 | * **Explanatory Power** of the Model for Each Stock: R2>0.7 excellent
326 |
327 | #### Conclusions
328 | * Goodness: simultaneously estimate the indexes and sensitivities in a multi-index model
329 | * Defect: Data Mining: Using return to explain return
330 |
331 |
332 | ### Multi-Factor Models for Portfolio Risk (BARRA)
333 |
334 | $$r_{i,t} = a_{i,t} + X_{i,k,t} \cdot f_{k,t}$$
335 | where
336 | $X_{i,k,t}$: the exposure of asset i to factor k known at time t
337 | $f_{k,t}$: the factor return to factor k during the period from time $t$ to time $t+1$
338 | $a_{i,t}$: the stock i's specific return during period from time $t$ to time $t+1$
339 | $r_{i,t}$: the excess return (return above the risk-free return) on stock i during the period from time $t$ to time $t+1$
340 |
341 | The risk structure
342 | $$V_{i,j} = X_{i,k1} F_{k1,k2} X_{j,k2}^T + \Delta_{i,j}$$
343 | $$V = X^T F X + \Delta$$
344 | where
345 |
346 | $F_{k1,k2}$ is the K by K covariance matrix for factor returns
347 |
348 | $\Delta_{i,j}$ is the N by N diagonal matrix of specific variance
349 |
350 | A portfolio described by an N-element vector $h_i$
351 |
352 | * portfolio exposure: $x_p = X^T h_p$
353 | * portfolio variance: $\sigma_p^2 = x_p^T F x_p + h_p^T \Delta h_p = h_p^T V h_p$
354 | * Marginal Contribution for Total Risk
355 | $$MCTR = \frac{V h_p}{\sigma_p}$$
356 | * Risk-adjusted expected return:
357 | $$U = h_p^T r_p - \lambda\cdot h_p^T V h_p$$
358 |
359 |
360 | #### Choosing the Factors
361 | * External influences --> BARRA Model
362 | - Return in bond market (bond beta)
363 | - Unexpected changes in inflation
364 | - Change in oil price
365 | - Change in exchange rate
366 | * Cross-sectional comparisons
367 | - Fundamental
368 | - Market
369 | - volatility
370 | - price
371 | - share turnover
372 | * Purely internal or statistical factors
373 | - see multi-index model
374 |
375 | #### Exposures
376 | * Industry Exposures
377 | - 1/0 variable
378 | * Risk Index Exposures
379 | - Volatility: beta, daily return vol, option implied vol
380 | - Momentum
381 | - Size
382 | - Liquidity
383 | - Growth
384 | - Value(Fundamentals)
385 | - Earning volatility
386 | - Financial leverage: debt-to-equity ratios
387 |
388 | #### Applications
389 | * Rescale the Exposures
390 | * Regress the Factor Returns Against Exposures via Cross-sectional Regression
391 | $$f = (X^T W X)^{-1} (X^T W r)\\
392 | = \sum_{i=1}^N C_{k,i} r_i$$
393 | Here factor return can be interpreted as the return to a portfolio with weights $C_{k,i}$. So factor returns are the returns to factor portfolios. This portfolio has unit exposure to the particular factor
394 | * Factor Covariance and Specific
395 | - Stock returns
396 | - Factor exposures
397 | - Stock dividends, splits, and other adjustment
398 |
399 | #### Model Validation
400 | * Model Setting:
401 | - 50 factors
402 | - 1000 assets
403 | * Measures:
404 |
405 | - $R^2$: 30-40%. It can vary quite significantly from month to month. And depends on the market return level.
406 | - root mean square error: 6% roughly against 10% volatility
407 | - Portfolio Risk
408 | * Goal:
409 |     - Explain the portfolio risk
410 | - Forecast variances and covariances of factors and specific returns
411 | - Providing incisive, intuitive and interesting risk analysis
412 |
413 |
414 | You can think of this as slicing through the other direction from the APT analysis, as now the factor returns are unknowns to be solved for, whereas originally the coefficients b were the unknowns. Another way to think about it is that you're determining how predictive of returns the factor was on that day, and therefore how much return you could have squeezed out of that factor.
415 |
--------------------------------------------------------------------------------
/README.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jerryxyx/AlphaTrading/5e73923786297faeadb27c76f83ec81fad74af51/README.pdf
--------------------------------------------------------------------------------
/README_old.md:
--------------------------------------------------------------------------------
1 | # Multi-Factor Models
2 |
3 | Author: Jerry Xia
4 |
5 | Date: 2018/05/21
6 |
7 | *Note: The advanced Markdown features such as math expressions may not render on GitHub; please see README.pdf instead if you want more details*
8 |
9 |
10 |
11 | ## Project Introduction
12 | This is a research survey about alpha trading. In this project, I built up a pipeline of alpha trading including:
13 |
14 | * factor pretest
15 | * factor screening
16 | * factor combination (modeling)
17 |
18 | The models involved are APT models, Barra's risk models and a dynamic factor model using a Kalman filter.
19 |
20 | ### Files
21 |
22 | * rqdata_utils.py: Utils dealing with the rice quant platform data
23 |
24 | * Step1_FactorPretest.ipynb: Factor returns profile visualization
25 |
26 | * Step2_FactorsScreening.ipynb: Factor returns turnover visualization and correlation coefficients
27 |
28 | * Step3\_FactorCombination\_AdaBoost\_Quantopian.ipynb: A Quantopian notebook file to combine alpha factors using Adaboost
29 |
30 | * Step3\_FactorCombination\_BarraKalmanFilter.ipynb: Barra's risk model with three calibration schemes:
31 | * Scheme 1: Cross-sectional regression and weighted average
32 | * Scheme 2: Optimization problem: minimize the exponential weighted average of squared error
33 | * Scheme 3: Dynamic linear model using Kalman filter
34 |
35 | * KalmanFilterIntro.ipynb: An introduction to the dynamic multi-factor model
36 | * APT_FammaMacbeth.ipynb: Using Fama-Macbeth regression to calibrate the APT model.
37 |
38 | ### Dataset
39 | The dataset is not available in GitHub as it is too large. Except for Step3\_FactorCombination\_AdaBoost\_Quantopian.ipynb which we used US stock data in Quantopian, among other files, we used Chinese A-stocks data downloaded from RiceQuant instead (hard for free US equities' data).
40 |
41 | The data frame is multi-indexed similar to Quantopian's format(see both Alphalens github codes and rqdata_utils.py). However, feel free to cast and apply your own dataset.
42 |
43 |
44 | ### Goal
45 | * **Equity Return Forecasting**
46 |
47 | * **Portfolio Risk Estimation**
48 |
49 | - APT
50 | - Risk Exposure: $\beta_{i,k}$
51 | - Risk Premium: $P_k$
52 | - Contribution of Risk Factor to Long Term Excess Return:
53 | $$E[r_i] - TB = \sum_k \beta_{i,k}P_k$$
54 | - BARRA
55 | - Factor Return Covariance: V
56 | - Portfolio Risk: $\sigma_p$
57 | - Portfolio Risk Exposures: $$x_p=X^T h_p$$
58 | - Marginal Contribution for Total Risk: $$MCTR = \frac{V h_p}{\sigma_p}$$
59 | - Portfolio Risk-Adjusted Expected Return: $$U = h_p^T r - \lambda \cdot h_p^T V h_p$$
60 |
61 | ### Model Classification
62 | * CAPM
63 |     - a kind of single-factor model
64 | - usually, a validity benchmark for other models
65 |
66 | * APT
67 | - factor returns are assumed to be known
68 | - factor exposure can be regressed from factor returns
69 | - aimed at forecasting
70 | - how to fit: Fama-Macbeth Algorithm
71 |
72 | * Multi-Index Models
73 |     - statistical endogenous model using factor analysis
74 |     - useful for factor parsimony and decoupling
75 |
76 | * Multi-Factor Risk Models(BARRA)
77 | - factor exposures are assumed to be known (can be derived as the rescaled factor value)
78 | - factor return can be regressed from factor exposures
79 | - aimed at risk management
80 | - how to fit: cross-sectional regression
81 |
82 | ### Calibration Algorithms
83 | Here I used 2 traditional way add a novel Kalman filter technique (see KalmanFilter.ipynb or MultiFactorModel.ipynb)
84 |
85 | * Time-series regression (fix equity)
86 | * Cross-sectional regression (fix time-stamp)
87 | * Kalman filter (APT model allowing risk exposure and risk premium to vary over time. In other words, a dynamic model with Gaussian noise)
88 |
89 | ### Improvements
90 |
91 | * A percentage rank test is a good alternative to a z score
92 | * Beware of quarterly ratios (referring to ROA, ROE, gross margin, etc.)
93 | * Factor for quality: gross profitability a la Novy-Marx (2013). It's simply gross profits divided by total assets.
94 | * Substituting ROA/Gross Margin with gross profitability
95 | *
96 |
97 | ## Appendix: Notes on Factor Models
98 |
99 | ### CAPM
100 | * Author: Markovitz(1959)
101 | * single-factor:
102 | * explain: security returns
103 |
104 | ### APT
105 | * Author: Stephen A. Ross(1976)
106 | * multi-factor
107 | * explain: security returns
108 |
109 | #### Postulates:
110 | - The linear model
111 | $$r_i(t) - \alpha_i = \sum_{k=1}^K \beta_{ik} \cdot f_k(t) + \epsilon_i(t)$$
112 |
113 | where $f_k(t)$ is the realization(value) of risk factor at time t
114 |
115 | - No pure arbitrage profit
116 |
117 | #### Conclusion
118 | * Exposure of each security on each factor
119 | * Risk premium on each factor
120 | $$(Mean[r_i(t)])_i = P_0 + \sum_{k=1}^K \beta_{ik} \cdot P_k$$
121 | or make $\beta_{0,k}$ equals 1 for each k,
122 | $$(Mean[r_i(t)])_i = \sum_{k=0}^K \bar{\beta}_{i,k} \cdot P_k$$
123 | where $P_0$ is the risk free return
124 |
125 | * Portfolio exposure to each factor
126 | $$Portfolio_{it} = \beta_0 + \beta_k \cdot f_{kit}$$
127 |
128 |
129 |
130 | #### Three alternative calibration methods
131 | * **statistical techniques** such as factor analysis, principal component analysis
132 | - **Goodness**: good for determining the number of relevent risk factors
133 | - **Undesirable**: hard to interpret
134 |
135 | * **portfolios**: K different well-diversified portfolios as substitutions
136 | - **Goodness**: lead to insights
137 | - **Fama-Macbeth regression**
138 |
139 | * **economic theory** (highly developed art)
140 | - **Goodness**: Intuitively appealing set of factors that admit economic interpretation of risk exposures
141 | - **Goodness**: Using economic information in addition to stock return. Avoid using stock return to explain stock return
142 | - **factors**:
143 | 1. confidence risk
144 | 2. time horizon risk
145 | 3. inflation risk
146 |         4. business cycle risk
147 | 5. market-timing risk
148 |
149 | #### Generalizations
150 | The simplicity of APT framework is a great virtue. It is helpful to understand the true sources of stock returns. The basic APT model can be enhanced in many ways.
151 |
152 | * Allow risk prices $P_k$ to vary over time
153 | * Allow risk exposures $\beta_{i,k}$ to vary over time
154 | * Use Bayesian methods to produce optimal out-of-sample forecasts for the risk exposures and hence for the expected returns
155 | * Introduce additional factor with zero-risk prices. Although do not contribute to expected return, help to explain the volatility.
156 |
157 | ### Multi-Index Models (Factor Analysis & PCA)
158 |
159 | #### Goal
160 | Using historical return extract the factors
161 |
162 | $$r_{it} = \alpha_i + \sum_k \beta_{ik}\cdot f_{kt}$$
163 | where
164 | $$E[\epsilon_{it} \epsilon_{jt}]=0$$
165 | $$E[\epsilon_{it} f_{kt}]=0$$
166 |
167 | $f_{kt}$: the return on index k in period t
168 |
169 | $\beta$: sensitivities
170 |
171 | #### Estimation
172 | Either exposure or factor return can be asserted on a priori grounds with the other identified empirically, or both can be identified empirically.
173 |
174 | #### Characteristics
175 | * Have f(indexes) represents separate influence
176 | * The structure must be parsimonious: the returns can be described in terms of limited indexes
177 |
178 | #### Statistical Solutions
179 | Let the data design the model
180 |
181 | * PCA
182 | * Factor Analysis: better in heteroscedastic series
183 |
184 | #### Design Issue
185 | * **The Choice of Data**: Individual stocks vs portfolio
186 | * **The number of Index**:
187 |     - Statistical techniques: Factor analysis, PCA
188 | - Common sense and economic significance play a major role in deciding on the number of factors
189 | * **The nonuniqueness of Factors**: The researcher should realize the resulting structure is not unique. Some researchers will examine alternative structures in an attempt to understand what influences are affecting security returns and to convince themselves the overall separation makes intuitive sense
190 | * **Computational Problems**:
191 | - Roll and Ross: Multisample approach
192 | - Chen: Portfolio approach
193 |
194 | #### Applications
195 | * **Identify the Indexes set**
196 | * **Determine the number of factors**: PCA / Factor Analysis
197 | - Single-group tests for each sample
198 | - Factor Analysis on return-generating process
199 | - Criteria: Chi2, AIC, **BIC**
200 | - Multiple-group tests for all stocks
201 | - Canonical Correlation (CCA):
202 |
203 | take two sets of variables and see what is common amongst the two sets (can be two noncorresponding variables either on index or dimension)
204 | $$X_{N \times K}, Y_{N \times K^{\prime}}$$
205 | $$\mbox{x_weights}_{K,n}$$
206 | $$\mbox{y_weights}_{K^{\prime},n}$$
207 | Use CCA / PLS:
208 | $$\mbox{X_score}_{N\times n} = \mbox{Normalized}[X]_{N \times K} \mbox{x_weights}_{K,n}$$
209 |
210 | $$\mbox{Y_score}_{N\times n} = \mbox{Normalized}[Y]_{N \times K^{\prime}} \mbox{y_weights}_{K^{\prime},n}$$
211 |         - Determine the number:
212 | - r-value for $n=10$
213 | - correlation matrix pattern for each number of components: $n \times n$ for $n=1,\cdots,10$
214 |
215 | * **Generate Factors**
216 |
217 | * **Calibrate sensitivities**:
218 |
219 | - Portfolio exposure to each factor
220 | - $Adjusted R^2$ (Should be stable)
221 | - Explanatory power: Compare these results with those for the single-index model (Should depend on the market cap)
222 |
223 | * **Explanatory Power** of the Model for Each Stock: R2>0.7 excellent
224 |
225 | #### Conclusions
226 | * Goodness: simultaneously estimate the indexes and sensitivities in a multi-index model
227 | * Defect: Data Mining: Using return to explain return
228 |
229 |
230 | ### Multi-Factor Models for Portfolio Risk (BARRA)
231 |
232 | $$r_{i,t} = a_{i,t} + X_{i,k,t} \cdot f_{k,t}$$
233 | where
234 | $X_{i,k,t}$: the exposure of asset i to factor k known at time t
235 | $f_{k,t}$: the factor return to factor k during the period from time $t$ to time $t+1$
236 | $a_{i,t}$: the stock i's specific return during period from time $t$ to time $t+1$
237 | $r_{i,t}$: the excess return (return above the risk-free return) on stock i during the period from time $t$ to time $t+1$
238 |
239 | The risk structure
240 | $$V_{i,j} = X_{i,k1} F_{k1,k2} X_{j,k2}^T + \Delta_{i,j}$$
241 | $$V = X^T F X + \Delta$$
242 | where
243 |
244 | $F_{k1,k2}$ is the K by K covariance matrix for factor returns
245 |
246 | $\Delta_{i,j}$ is the N by N diagonal matrix of specific variance
247 |
248 | A portfolio described by an N-element vector $h_i$
249 |
250 | * portfolio exposure: $x_p = X^T h_p$
251 | * portfolio variance: $\sigma_p^2 = x_p^T F x_p + h_p^T \Delta h_p = h_p^T V h_p$
252 | * Marginal Contribution for Total Risk
253 | $$MCTR = \frac{V h_p}{\sigma_p}$$
254 | * Risk-adjusted expected return:
255 | $$U = h_p^T r_p - \lambda\cdot h_p^T V h_p$$
256 |
257 |
258 | #### Choosing the Factors
259 | * External influences --> BARRA Model
260 | - Return in bond market (bond beta)
261 | - Unexpected changes in inflation
262 | - Change in oil price
263 | - Change in exchange rate
264 | * Cross-sectional comparisons
265 | - Fundamental
266 | - Market
267 | - volatility
268 | - price
269 | - share turnover
270 | * Purely internal or statistical factors
271 | - see multi-index model
272 |
273 | #### Exposures
274 | * Industry Exposures
275 | - 1/0 variable
276 | * Risk Index Exposures
277 | - Volatility: beta, daily return vol, option implied vol
278 | - Momentum
279 | - Size
280 | - Liquidity
281 | - Growth
282 | - Value(Fundamentals)
283 | - Earning volatility
284 | - Financial leverage: debt-to-equity ratios
285 |
286 | #### Applications
287 | * Rescale the Exposures
288 | * Regress the Factor Returns Against Exposures via Cross-sectional Regression
289 | $$f = (X^T W X)^{-1} (X^T W r)\\
290 | = \sum_{i=1}^N C_{k,i} r_i$$
291 | Here factor return can be interpreted as the return to a portfolio with weights $C_{k,i}$. So factor returns are the returns to factor portfolios. This portfolio has unit exposure to the particular factor
292 | * Factor Covariance and Specific
293 | - Stock returns
294 | - Factor exposures
295 | - Stock dividends, splits, and other adjustment
296 |
297 | #### Model Validation
298 | * Model Setting:
299 | - 50 factors
300 | - 1000 assets
301 | * Measures:
302 |
303 | - $R^2$: 30-40%. It can vary quite significantly from month to month. And depends on the market return level.
304 | - root mean square error: 6% roughly against 10% volatility
305 | - Portfolio Risk
306 | * Goal:
307 |     - Explain the portfolio risk
308 | - Forecast variances and covariances of factors and specific returns
309 | - Providing incisive, intuitive and interesting risk analysis
310 |
311 |
312 | You can think of this as slicing through the other direction from the APT analysis, as now the factor returns are unknowns to be solved for, whereas originally the coefficients b were the unknowns. Another way to think about it is that you're determining how predictive of returns the factor was on that day, and therefore how much return you could have squeezed out of that factor.
313 |
--------------------------------------------------------------------------------
/output/factor_ic_analysis.csv:
--------------------------------------------------------------------------------
1 | factor,group,1D,27D,98D
2 | total_turnover,ConsumerDiscretionary,-0.052061319774850776,-0.11316983202573735,-0.1929657054345211
3 | total_turnover,ConsumerStaples,-0.06218330937378258,-0.12809022695465364,-0.23566073217524877
4 | total_turnover,Energy,-0.053310899809748104,-0.114291646238941,-0.1828861184315757
5 | total_turnover,Financials,-0.043483723699830806,-0.09250154138845446,-0.15258051713146398
6 | total_turnover,HealthCare,-0.03054709278550283,-0.07126898666786867,-0.1370994975862821
7 | total_turnover,Industrials,-0.06155338477832689,-0.12191628841788626,-0.1913136626300075
8 | total_turnover,InformationTechnology,-0.04331167468163195,-0.09994088680694369,-0.18793138112573995
9 | total_turnover,Materials,-0.06347263107873605,-0.1454435080579407,-0.23516052362957487
10 | total_turnover,TelecommunicationServices,-0.05623721881390593,-0.12195086829491936,-0.08384458077709611
11 | total_turnover,Utilities,-0.06710637414072558,-0.11822026772629673,-0.18005363953326778
12 | volume,ConsumerDiscretionary,-0.038821806113053185,-0.06517649803074897,-0.10564426190383719
13 | volume,ConsumerStaples,-0.04881955414007564,-0.07004912944675076,-0.10700285231017939
14 | volume,Energy,-0.05050533179526796,-0.05539159512201123,-0.04449211070962409
15 | volume,Financials,-0.037379891386358104,-0.06717310704498229,-0.11532946922437222
16 | volume,HealthCare,-0.018552307919222938,-0.031143940450130612,-0.06310080118478258
17 | volume,Industrials,-0.05369190239494951,-0.09056039930841121,-0.12912730395652844
18 | volume,InformationTechnology,-0.03270391959214507,-0.06303027646641893,-0.12181248844576387
19 | volume,Materials,-0.05213678473116451,-0.09941954650749407,-0.1433535572754988
20 | volume,TelecommunicationServices,-0.03401630796772099,-0.013292433537832311,-0.028629856850715747
21 | volume,Utilities,-0.053355632253003354,-0.07805499426744068,-0.09309882686634807
22 | market_cap,ConsumerDiscretionary,-0.018642170440300053,-0.08071467806160568,-0.15194577884185356
23 | market_cap,ConsumerStaples,-0.022631363847695745,-0.09367476053930583,-0.1991416188736341
24 | market_cap,Energy,-0.0215733019306775,-0.10099046052166827,-0.23276886886450332
25 | market_cap,Financials,-0.01783881472520913,-0.06584105506902525,-0.12906327329642123
26 | market_cap,HealthCare,-0.016506107254639694,-0.07322090125094058,-0.14809213265805843
27 | market_cap,Industrials,-0.024891064518646394,-0.08946681310150975,-0.1722836413867398
28 | market_cap,InformationTechnology,-0.018400836027949976,-0.06852777925101093,-0.16777140860823717
29 | market_cap,Materials,-0.028921427376547782,-0.12317603287175999,-0.2357473587754678
30 | market_cap,TelecommunicationServices,-0.024539877300613498,-0.12706334273254716,-0.18916155419222905
31 | market_cap,Utilities,-0.021505325470817266,-0.06974356783593436,-0.11709171910355881
32 | a_share_market_val_2,ConsumerDiscretionary,-0.018301595390333388,-0.0834047795877226,-0.1622640445487711
33 | a_share_market_val_2,ConsumerStaples,-0.021353636086466607,-0.08957475782545855,-0.19257272420530416
34 | a_share_market_val_2,Energy,-0.01589375064124426,-0.08269370057080777,-0.22624001661944343
35 | a_share_market_val_2,Financials,-0.013595140013004083,-0.05757239677747778,-0.11097273433879687
36 | a_share_market_val_2,HealthCare,-0.016442154211948196,-0.06823771497421101,-0.1313262609982461
37 | a_share_market_val_2,Industrials,-0.022288503369883857,-0.08422835061983074,-0.16421150228922107
38 | a_share_market_val_2,InformationTechnology,-0.01602024151930178,-0.06606228480096947,-0.16184394862584908
39 | a_share_market_val_2,Materials,-0.026673267375488252,-0.11840215884371369,-0.22834195826194792
40 | a_share_market_val_2,TelecommunicationServices,-0.019427402862985693,-0.050102249488752554,-0.006134969325153374
41 | a_share_market_val_2,Utilities,-0.01685842238760016,-0.07250987448493125,-0.12230908395788107
42 | cash_received_from_sales_of_goods,ConsumerDiscretionary,-0.00103083821123597,-0.03206445749325148,-0.06876501903118355
43 | cash_received_from_sales_of_goods,ConsumerStaples,0.0011848183502683814,-0.026253336689890584,-0.048118632633388406
44 | cash_received_from_sales_of_goods,Energy,-0.016016456712267646,-0.08156643860850851,-0.1719692794464853
45 | cash_received_from_sales_of_goods,Financials,0.002049322081554394,-0.01712216286244825,-0.04627365414791241
46 | cash_received_from_sales_of_goods,HealthCare,-0.007054075457333036,-0.04186122320226921,-0.09264530897763122
47 | cash_received_from_sales_of_goods,Industrials,-0.0033712276790714055,-0.026317880861985995,-0.07804687752350649
48 | cash_received_from_sales_of_goods,InformationTechnology,-0.003735428340115983,-0.03292082920277817,-0.09611089193235643
49 | cash_received_from_sales_of_goods,Materials,-0.009364496334334471,-0.062445293690082074,-0.13116837426913705
50 | cash_received_from_sales_of_goods,TelecommunicationServices,-0.0010224948875255625,0.013292433537832311,0.044989775051124746
51 | cash_received_from_sales_of_goods,Utilities,-0.004586561111603414,-0.020219037138779015,-0.02362352504750876
52 | pb_ratio,ConsumerDiscretionary,-0.02786139236091668,-0.04723618727888782,-0.06416866121080726
53 | pb_ratio,ConsumerStaples,-0.029475817225151503,-0.07143808423799812,-0.12540901349300188
54 | pb_ratio,Energy,-0.0036947086202947805,0.00801451621850763,0.050710527187911955
55 | pb_ratio,Financials,-0.02539575463258946,-0.05441037179139589,-0.08509951720797193
56 | pb_ratio,HealthCare,-0.01773232906393156,-0.005604430266716481,-0.0050057836964287755
57 | pb_ratio,Industrials,-0.02302510249518233,-0.04874561973483434,-0.0640169980335691
58 | pb_ratio,InformationTechnology,-0.02211961063266852,-0.05069932164964732,-0.0718467294959841
59 | pb_ratio,Materials,-0.01959489813139817,-0.039595205878501294,-0.06585604305979385
60 | pb_ratio,TelecommunicationServices,-0.002044989775051125,-0.016359918200409,-0.028629856850715747
61 | pb_ratio,Utilities,-0.026500309923510785,-0.07579696587327645,-0.1316871231270864
62 | net_profit,ConsumerDiscretionary,0.004534976198255834,-0.024587145489383917,-0.06697091876991088
63 | net_profit,ConsumerStaples,0.001762134488588463,-0.030557169605312656,-0.09508911979085215
64 | net_profit,Energy,-0.0037239591056138937,-0.05576234882359838,-0.14286158066900853
65 | net_profit,Financials,0.005619412446675389,-0.004552703253618796,-0.03282171908115013
66 | net_profit,HealthCare,0.004023627795232325,-0.0122560368659469,-0.054647179895811165
67 | net_profit,Industrials,0.005922822373344156,-0.004876234673987302,-0.037687191394743746
68 | net_profit,InformationTechnology,0.0034451143466464815,-0.01467883275591999,-0.07454590190755
69 | net_profit,Materials,0.007775417553720173,-0.017608951812301852,-0.097233933891964
70 | net_profit,TelecommunicationServices,-0.0010224948875255625,0.013292433537832311,0.044989775051124746
71 | net_profit,Utilities,0.003587155195185814,-0.009910187986564376,-0.04759442386949663
72 | ps_ratio,ConsumerDiscretionary,-0.012080771280731099,-0.018288440941963535,-0.014703891059025101
73 | ps_ratio,ConsumerStaples,-0.019200947719015923,-0.04964656897109551,-0.12391325426910196
74 | ps_ratio,Energy,0.009579091016352978,0.04035361462838712,0.07135150759411069
75 | ps_ratio,Financials,-0.014126989377467987,-0.014116943885514285,-0.0199666437383381
76 | ps_ratio,HealthCare,-0.0050853830262781955,-0.005927366792048748,0.0016129868796054226
77 | ps_ratio,Industrials,-0.008480756136386746,-0.010614464339011856,0.0075018939060060375
78 | ps_ratio,InformationTechnology,-0.01071146507594663,-0.016407405031958085,-0.013131536623179884
79 | ps_ratio,Materials,-0.00811664224948075,-0.003958926420953927,0.006930722032411271
80 | ps_ratio,TelecommunicationServices,-0.00408997955010225,-0.02556237218813906,-0.053169734151329244
81 | ps_ratio,Utilities,-0.011680459426909625,-0.03175334201449768,-0.08629345861643857
82 |
--------------------------------------------------------------------------------
/report/Alpha Trading Workflow.md:
--------------------------------------------------------------------------------
1 | # Alpha Trading Workflow
2 |
3 | Analyst: Yuxuan Xia
4 |
5 | Date: 2018/06/04
6 |
7 | ## TODO
8 |
9 | * Input more effective factors: take advice from people and industry reports
10 | * Should add technical analysis, because it matters! People care about them and then make it good sentimental indexes.
11 | * Find well-known metrics to express results
12 |
13 | ## Workflow
14 | $\checkmark$ stands for finished and $\vartriangle$ stands for TODO
15 |
16 | * Universe definition
17 | * Factors collection and preprocessing
18 | * $\vartriangle$ Factors collection
19 | - Sources
20 | - balance sheet
21 | - cash flow statement
22 | - income statement
23 | - earning report
24 | - Econometric Classifications
25 | - value
26 | - growth
27 | - profitability
28 | - market size
29 | - liquidity
30 | - volatility
31 |         - Momentum
32 | - Financial leverage (debt-to-equity ratio)
33 | * Factors preprocessing
34 |     - $\vartriangle$daily, quarterly, annually
35 | - continuous: rescale, outliers
36 | - $\checkmark$discrete: rank
37 | * Factors screening and combination
38 | * Factors screening
39 | - $\checkmark$Factors' correlation
40 |     - $\checkmark$Factors' foreseeability
41 | - Fama-Macbeth regression
42 | * $\vartriangle$Factors combination
43 | - PCA, FA
44 |     - Technical Analysis
45 | - Financial Modeling
46 | - Linear combination to maximize Sharpe ratio
47 | - Non-linear learning algorithms
48 | - $\checkmark$AdaBoost
49 | - Reinforcement learning
50 |
51 | * Portfolio allocation
52 |
53 |
54 | ## Factors' Correlations
55 | Here, I use correlation matrix as the measure. The difference from the second result is that the correlation matrix is calculated by the rank data rather than the raw data
56 | ### Two ICs comparison
57 | * Pearson's IC: measures linear relationship between components
58 |
59 | * Spearman's IC: measures monotonic relationship between components. Since we only care about monotonic relationships, Spearman's IC wins.
60 |
61 |
62 | ### Regular IC(Pearson's correlation coefficient) for each factors
63 | 
64 | ### Spearman's Rank correlation coefficient for each factors
65 | 
66 |
67 | ### How to rule out redundant factors and why Spearman's rank correlation coefficients?
68 | From the correlation coefficients below, we can again conclude that Spearman's rank IC is far more robust. Take ps_ratio and sales_yield as an example.
69 | $$ps\_ratio = \frac{\mbox{adjusted close price}}{\mbox{sales per share}}$$
70 | whereas
71 | $$sales\_yield = \frac{\mbox{sales per share}}{\mbox{price}}$$
72 | Although the price in the sales_yield formula is vague in our data source, roughly speaking these two variables should be inverses of each other. The Spearman's rank correlation coefficient is -0.98, which verifies this statement, so we should avoid using both of these factors, as doing so would exaggerate the impact of this particular factor. However, we cannot see such an identity in Pearson's regular correlation coefficients. It is quite misleading actually, and that's why we choose Spearman's rank IC.
73 |
74 | ## Factors' Foreseeability
75 |
76 | ### Methods
77 | * Spearman's rank correlation coefficients
78 | * Fama-Macbeth regression: Not only consider the foreseeability of factors itself but also consider the co-vary of different factors, which means rule out factors if the returns can be explained by the recent factors.
79 |
80 |
81 | ### Spearman's rank IC for factors vs. forward returns
82 |
83 | 
84 |
85 | ### Spearman's rank IC (absolute value) for factors vs. forward returns
86 | .png)
87 |
88 | ### Rank of the Spearman's rank IC (absolute value) for factors vs. forward returns
89 | .png)
90 |
91 | ## Factors Preprocessing
92 | * Get ranked data
93 | * Obtain the valid stocks set
94 | * Reshape the data: only valid stocks set
95 | * Fill null: using daily average
96 | * Rescale the data: MinMaxScaler
97 | * Dimensionality reduction: PCA analysis
98 | * Sanity check
99 |
100 | 
101 |
102 | Here, I use principal component analysis because it brings two benefits to our data - orthogonality and dimensionality reduction. Orthogonality makes the data more separable, and lower dimensionality makes the information more concentrated. Each of them is essential for machine learning algorithms.
103 |
104 | In the next part, I used this preprocessed data as the input to obtain a "mega alpha".
105 |
106 | ## Mega Alpha
107 | construct an aggregate alpha factor which has its return distribution profitable. The term "profitable" here means condense, little turnover, significant in the positive return.
108 | ### Methods
109 | #### linear methods
110 | * normalize factors and try a linear combination
111 | * rank each factor and then sum up
112 | * Financial modeling
113 | * linear combination to maximize Sharpe ratio
114 |
115 | #### Non-linear methods
116 | * AdaBoost
117 | * Reinforcement Learning
118 |
119 | ### AdaBoost
120 | #### Description
121 | The algorithm sequentially applies a weak classifier to modified versions of the data. By increasing the weights of the misclassified observations, each weak learner focuses on the errors of the previous one. The predictions are aggregated through a weighted majority vote.
122 |
123 | #### Algorithm
124 |
125 | 
126 |
127 | #### Train set
128 | The AdaBoost classifier was applied to our fundamental dataset. The objective is to train a classifier which gives a score for the bunch of factors - in other words, the mega alpha. Pink for the positive forward returns observations and blue for the negative forward returns observations. A good scoring system makes the two classes more separated.
129 | 
130 | We can see, in train set, AdaBoost classifier did so well! The next plot is the precision in each quantile of scores. In the top and bottom quantile, the predicted precision is nearly 100%!
131 | 
132 |
133 | #### Test set
134 | alpha values histogram
135 | 
136 | quantile precision bar plot
137 | 
138 | The precision in the top and bottom quantile is only slightly higher than 50%. Far from good once we consider transaction costs. Frankly, there is plenty of work to be done before we get satisfactory results. Anyway, this pipeline gives us a flexible routine and a judgement system. I'll continue to tweak the routine and factors to make sure it goes in the right direction.
139 |
140 | ## References
141 | * Jonathan Larkin, *A Professional Quant Equity Workflow*. August 31, 2016
142 | * *A Practitioner‘s Guide to Factor Models*. The Research Foundation of The Institute of Chartered Financial Analysts
143 | * Thomas Wiecki, Machine Learning on Quantopian
144 | * Inigo Fraser Jenkins, *Using factors with different alpha decay times: The case for non-linear combination*
145 | * PNC, *Factor Analysis: What Drives Performance?*
146 | * O’Shaughnessy, *Alpha or Assets? — Factor Alpha vs. Smart Beta*. April 2016
147 | * *O’Shaughnessy Quarterly Investor Letter Q1 2018*
148 | * Jiantao Zhu, Orient Securities, *Alpha Forecasting - Factor-Based Strategy Research Series 13*
149 | * Yang Song, Bohai Securities, *Multi-Factor Models Research: Single Factor Testing*, 2017/10/11
--------------------------------------------------------------------------------
/report/Corr_matrix_for_factor_ranks.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jerryxyx/AlphaTrading/5e73923786297faeadb27c76f83ec81fad74af51/report/Corr_matrix_for_factor_ranks.png
--------------------------------------------------------------------------------
/report/Corr_matrix_for_raw_factors.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jerryxyx/AlphaTrading/5e73923786297faeadb27c76f83ec81fad74af51/report/Corr_matrix_for_raw_factors.png
--------------------------------------------------------------------------------
/report/Quantitative Strategy Workflow.pptx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jerryxyx/AlphaTrading/5e73923786297faeadb27c76f83ec81fad74af51/report/Quantitative Strategy Workflow.pptx
--------------------------------------------------------------------------------
/report/adaboost_algorithm.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jerryxyx/AlphaTrading/5e73923786297faeadb27c76f83ec81fad74af51/report/adaboost_algorithm.png
--------------------------------------------------------------------------------
/report/corr_comparison_after_pca_analysis.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jerryxyx/AlphaTrading/5e73923786297faeadb27c76f83ec81fad74af51/report/corr_comparison_after_pca_analysis.png
--------------------------------------------------------------------------------
/report/mean_spearmans_rank_IC.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jerryxyx/AlphaTrading/5e73923786297faeadb27c76f83ec81fad74af51/report/mean_spearmans_rank_IC.png
--------------------------------------------------------------------------------
/report/mean_spearmans_rank_IC_absolute_value.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jerryxyx/AlphaTrading/5e73923786297faeadb27c76f83ec81fad74af51/report/mean_spearmans_rank_IC_absolute_value.png
--------------------------------------------------------------------------------
/report/rank_of_mean_spearmans_rank_IC_absolute_value.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jerryxyx/AlphaTrading/5e73923786297faeadb27c76f83ec81fad74af51/report/rank_of_mean_spearmans_rank_IC_absolute_value.png
--------------------------------------------------------------------------------
/report/test_accuracy_bar.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jerryxyx/AlphaTrading/5e73923786297faeadb27c76f83ec81fad74af51/report/test_accuracy_bar.png
--------------------------------------------------------------------------------
/report/test_score_dist.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jerryxyx/AlphaTrading/5e73923786297faeadb27c76f83ec81fad74af51/report/test_score_dist.png
--------------------------------------------------------------------------------
/report/train_accuracy_bar.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jerryxyx/AlphaTrading/5e73923786297faeadb27c76f83ec81fad74af51/report/train_accuracy_bar.png
--------------------------------------------------------------------------------
/report/train_score_dist.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jerryxyx/AlphaTrading/5e73923786297faeadb27c76f83ec81fad74af51/report/train_score_dist.png
--------------------------------------------------------------------------------
/report/train_score_dist2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jerryxyx/AlphaTrading/5e73923786297faeadb27c76f83ec81fad74af51/report/train_score_dist2.png
--------------------------------------------------------------------------------
/rqdata_utils.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import alphalens as al
3 | import matplotlib.pyplot as plt
4 | import numpy as np
5 |
def price_reader(price_path):
    """Read a wide price CSV (dates as rows, order_book_ids as columns).

    Parameters
    ----------
    price_path : str or path-like
        Path to a CSV whose first, unnamed column holds the dates
        (pandas reads it as "Unnamed: 0").

    Returns
    -------
    pd.DataFrame
        Date-indexed close prices with columns sorted by order_book_id.
    """
    price_df = pd.read_csv(price_path)
    # to_csv dumped the date index without a name, so it comes back
    # as "Unnamed: 0".
    price_df.rename(index=str, columns={"Unnamed: 0": "date"}, inplace=True)
    price_df.date = pd.to_datetime(price_df.date, format="%Y-%m-%d", errors='ignore')
    # price_df.date = price_df.date.apply(timezone.localize)
    price_df.set_index(['date'], drop=True, inplace=True)
    # DataFrame.sortlevel() was removed from pandas; sort_index(axis=1)
    # is the supported way to order the columns.
    price_df = price_df.sort_index(axis=1)
    return price_df
14 |
def instrument_reader(instrument_path):
    """Load the instrument-info CSV, indexed and sorted by bookId."""
    raw = pd.read_csv(instrument_path)
    # Drop the dumped positional index column before re-indexing.
    raw.drop(['Unnamed: 0'], axis=1, inplace=True)
    return raw.set_index(['bookId']).sort_index()
21 |
def equity_reader(equity_path):
    """Load the equity fundamentals CSV into a (date, order_book_id)-indexed frame."""
    frame = pd.read_csv(equity_path)
    # The dumped positional index column is not needed.
    frame = frame.drop(["Unnamed: 0"], axis=1)
    frame["date"] = pd.to_datetime(frame["date"], format="%Y-%m-%d", errors='ignore')
    return frame.set_index(['date', 'order_book_id'], drop=True)
28 |
def benchmark_reader(benchmark_path):
    """Read a headerless (date, value) benchmark CSV and add daily log returns.

    Parameters
    ----------
    benchmark_path : str or path-like
        CSV with two unlabeled columns: date and benchmark level.

    Returns
    -------
    pd.DataFrame
        Date-indexed frame with columns 'value' and 'return', where
        return_t = ln(value_t / value_{t-1}); the first row is filled with 0.
    """
    benchmark_df = pd.read_csv(benchmark_path, names=['date', 'value'])
    benchmark_df = benchmark_df.set_index('date', drop=True)
    # Work on the 'value' Series explicitly: the original assigned a
    # whole-DataFrame result to one column, which modern pandas rejects.
    # It also computed log(prev/cur), the negation of the conventional
    # log return; fixed to log(cur/prev).
    value = benchmark_df['value']
    benchmark_df['return'] = np.log(value / value.shift(1)).fillna(0)
    return benchmark_df
34 |
def equity_add_instrumentInfo(cn_df, instrument_df, instrument_column):
    """Attach one instrument attribute to every (date, order_book_id) row.

    Parameters
    ----------
    cn_df : pd.DataFrame
        Equity frame whose MultiIndex has a level named 'order_book_id'.
    instrument_df : pd.DataFrame
        Instrument info indexed by bookId.
    instrument_column : str
        Column of instrument_df to broadcast onto cn_df.

    Returns
    -------
    pd.DataFrame
        cn_df with the new column added (mutated in place and returned).
    """
    instrument_series = instrument_df[instrument_column]
    book_ids = cn_df.index.get_level_values('order_book_id')
    # Index.get_values() was removed from pandas; to_numpy() is the
    # supported replacement.  Look up each row's book id in the
    # instrument series and take the raw values (positional assignment).
    cn_df[instrument_column] = instrument_series[book_ids.to_numpy()].values
    return cn_df
42 |
def get_price_instrument_equity(price_path,instrument_path,equity_path,addInstrumentColumn=None):
    """Load price, instrument and equity frames from their CSV paths.

    When addInstrumentColumn is given, that instrument attribute is
    joined onto the equity frame as an extra column.
    Returns (price_df, instrument_df, equity_df).
    """
    prices = price_reader(price_path)
    instruments_df = instrument_reader(instrument_path)
    equities = equity_reader(equity_path)
    if addInstrumentColumn:
        equities = equity_add_instrumentInfo(equities, instruments_df, addInstrumentColumn)
    return prices, instruments_df, equities
50 |
def ic_analysis(equity_df, price_df, factor_columns, group_column, periods=(1,22,66), group_adjust=False):
    """Compute mean information coefficients for each factor column.

    For every factor in factor_columns, alphalens cleans the factor against
    forward returns from price_df, then the mean IC is computed both by
    group (sector) and by month; a monthly-IC heatmap is plotted per factor.

    Returns
    -------
    (pd.DataFrame, list)
        mean_ic_df indexed by (factor, group), and the list of
        per-factor monthly IC frames.
    """
    groupby = equity_df[group_column]
    group_ic_frames = []
    monthly_ic_frames = []
    for col in factor_columns:
        factor_series = equity_df[col]
        factor_data = al.utils.get_clean_factor_and_forward_returns(
            factor=factor_series,
            prices=price_df,
            groupby=groupby,
            periods=periods,
            max_loss=1)
        by_group_ic = al.performance.mean_information_coefficient(
            factor_data, group_adjust=group_adjust,
            by_group=True, by_time=None)
        by_month_ic = al.performance.mean_information_coefficient(
            factor_data, group_adjust=group_adjust,
            by_group=False, by_time='M')
        print("#######################################################")
        print("factor: {}".format(factor_series.name))
        print(by_group_ic)
        group_ic_frames.append(by_group_ic)
        monthly_ic_frames.append(by_month_ic)
        al.plotting.plot_monthly_ic_heatmap(by_month_ic)
        plt.show()

    mean_ic_df = pd.concat(group_ic_frames, keys=factor_columns)
    mean_ic_df.index = mean_ic_df.index.set_names(['factor', 'group'])
    return mean_ic_df, monthly_ic_frames
--------------------------------------------------------------------------------
/source/DownloadData.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {
7 | "collapsed": false
8 | },
9 | "outputs": [
10 | {
11 | "name": "stdout",
12 | "output_type": "stream",
13 | "text": [
14 | "Population Check - Initial #: 2320\n"
15 | ]
16 | }
17 | ],
18 | "source": [
19 | "# Constructs Time Series Data for All Stocks\n",
20 | "import pandas as pd\n",
21 | "import numpy as np\n",
22 | "from datetime import datetime\n",
23 | "from datetime import timedelta\n",
24 | "import tushare as ts\n",
25 | "\n",
26 | "from scipy.stats import rankdata\n",
27 | "\n",
28 | "import seaborn as sns\n",
29 | "\n",
30 | "# Pull All Trade Dates\n",
31 | "trade_dates = pd.Series(data=[pd.Timestamp(date) for date in get_trading_dates('2001-01-01', '2018-12-31')], name='trade_date')\n",
32 | "\n",
33 | "# year_start = 2001\n",
34 | "year_start = 2012\n",
35 | "year_end = 2018\n",
36 | "\n",
37 | "# date_end_last_dt = max(trade_dates[trade_dates.dt.year == year_start-1])\n",
38 | "date_start_dt = min(trade_dates[trade_dates.dt.year == year_start])\n",
39 | "date_end_dt = max(trade_dates[trade_dates.dt.year == year_end])\n",
40 | "# date_end_dt = date_start_dt+timedelta(days=1) # 2012-01-05\n",
41 | "# date_end_last = date_end_last_dt.strftime('%Y-%m-%d')\n",
42 | "date_start = date_start_dt.strftime('%Y-%m-%d')\n",
43 | "date_end = date_end_dt.strftime('%Y-%m-%d')\n",
44 | "\n",
45 | "# Construct Stock Population\n",
46 | "stock_all = all_instruments(type=\"CS\", country='cn', date=date_start_dt)\n",
47 | "stock_list = stock_all['order_book_id'].tolist()\n",
48 | "print(\"Population Check - Initial #: {}\".format(stock_all.shape[0]))"
49 | ]
50 | },
51 | {
52 | "cell_type": "code",
53 | "execution_count": 2,
54 | "metadata": {
55 | "collapsed": false
56 | },
57 | "outputs": [],
58 | "source": [
59 | "price_data = get_price(stock_list, start_date=date_start, end_date=date_end, frequency='1d', \n",
60 | " fields=['close'], \n",
61 | " adjust_type='pre', skip_suspended=False, country='cn')\n",
62 | "price_data.to_csv(\"cn_stock_price_{}_{}.csv\".format(year_start,year_end)) # Download price data"
63 | ]
64 | },
65 | {
66 | "cell_type": "code",
67 | "execution_count": 3,
68 | "metadata": {
69 | "collapsed": false
70 | },
71 | "outputs": [
72 | {
73 | "data": {
74 | "text/html": [
75 | "\n",
76 | "
\n",
77 | " \n",
78 | " \n",
79 | " | \n",
80 | " 300188.XSHE | \n",
81 | " 600337.XSHG | \n",
82 | " 600168.XSHG | \n",
83 | " 002337.XSHE | \n",
84 | " 600592.XSHG | \n",
85 | " 000950.XSHE | \n",
86 | " 600991.XSHG | \n",
87 | " 002473.XSHE | \n",
88 | " 600784.XSHG | \n",
89 | " 600736.XSHG | \n",
90 | " ... | \n",
91 | " 600345.XSHG | \n",
92 | " 600387.XSHG | \n",
93 | " 000063.XSHE | \n",
94 | " 002506.XSHE | \n",
95 | " 300151.XSHE | \n",
96 | " 002579.XSHE | \n",
97 | " 000563.XSHE | \n",
98 | " 000551.XSHE | \n",
99 | " 002578.XSHE | \n",
100 | " 000726.XSHE | \n",
101 | "
\n",
102 | " \n",
103 | " \n",
104 | " \n",
105 | " 2012-01-04 | \n",
106 | " 4.3401 | \n",
107 | " 3.1750 | \n",
108 | " 6.3693 | \n",
109 | " 2.8752 | \n",
110 | " 6.8314 | \n",
111 | " 4.0168 | \n",
112 | " 15.08 | \n",
113 | " 7.2898 | \n",
114 | " 5.0104 | \n",
115 | " 4.0135 | \n",
116 | " ... | \n",
117 | " 9.4373 | \n",
118 | " 8.7711 | \n",
119 | " 13.3691 | \n",
120 | " 2.6406 | \n",
121 | " 2.0055 | \n",
122 | " 3.4037 | \n",
123 | " 2.3129 | \n",
124 | " 5.7181 | \n",
125 | " 1.8931 | \n",
126 | " 6.0165 | \n",
127 | "
\n",
128 | " \n",
129 | " 2012-01-05 | \n",
130 | " 4.0743 | \n",
131 | " 2.8583 | \n",
132 | " 6.1697 | \n",
133 | " 2.6739 | \n",
134 | " 6.1737 | \n",
135 | " 3.9101 | \n",
136 | " 15.08 | \n",
137 | " 6.9505 | \n",
138 | " 4.5089 | \n",
139 | " 3.8534 | \n",
140 | " ... | \n",
141 | " 8.8456 | \n",
142 | " 8.5560 | \n",
143 | " 13.2895 | \n",
144 | " 2.3920 | \n",
145 | " 1.8872 | \n",
146 | " 3.1452 | \n",
147 | " 2.2942 | \n",
148 | " 5.2525 | \n",
149 | " 1.7877 | \n",
150 | " 6.0005 | \n",
151 | "
\n",
152 | " \n",
153 | " 2012-01-06 | \n",
154 | " 4.1173 | \n",
155 | " 2.6920 | \n",
156 | " 5.9511 | \n",
157 | " 2.7148 | \n",
158 | " 6.2207 | \n",
159 | " 4.1429 | \n",
160 | " 15.11 | \n",
161 | " 7.0765 | \n",
162 | " 4.6111 | \n",
163 | " 3.9570 | \n",
164 | " ... | \n",
165 | " 8.9315 | \n",
166 | " 8.4289 | \n",
167 | " 12.9473 | \n",
168 | " 2.4624 | \n",
169 | " 1.9159 | \n",
170 | " 3.1724 | \n",
171 | " 2.2475 | \n",
172 | " 5.2787 | \n",
173 | " 1.7897 | \n",
174 | " 6.2645 | \n",
175 | "
\n",
176 | " \n",
177 | " 2012-01-09 | \n",
178 | " 4.4041 | \n",
179 | " 2.7791 | \n",
180 | " 6.2172 | \n",
181 | " 2.8154 | \n",
182 | " 6.4744 | \n",
183 | " 4.2787 | \n",
184 | " 15.12 | \n",
185 | " 7.4449 | \n",
186 | " 4.8741 | \n",
187 | " 4.2114 | \n",
188 | " ... | \n",
189 | " 9.3705 | \n",
190 | " 8.9765 | \n",
191 | " 13.2975 | \n",
192 | " 2.5991 | \n",
193 | " 2.0094 | \n",
194 | " 3.3194 | \n",
195 | " 2.3362 | \n",
196 | " 5.7050 | \n",
197 | " 1.8814 | \n",
198 | " 6.4965 | \n",
199 | "
\n",
200 | " \n",
201 | " 2012-01-10 | \n",
202 | " 4.5124 | \n",
203 | " 2.9137 | \n",
204 | " 6.4549 | \n",
205 | " 2.9570 | \n",
206 | " 6.7845 | \n",
207 | " 4.4437 | \n",
208 | " 15.29 | \n",
209 | " 7.7357 | \n",
210 | " 5.1370 | \n",
211 | " 4.2962 | \n",
212 | " ... | \n",
213 | " 9.7712 | \n",
214 | " 9.3285 | \n",
215 | " 13.8864 | \n",
216 | " 2.7565 | \n",
217 | " 2.1143 | \n",
218 | " 3.4255 | \n",
219 | " 2.4087 | \n",
220 | " 5.9410 | \n",
221 | " 2.0024 | \n",
222 | " 6.6565 | \n",
223 | "
\n",
224 | " \n",
225 | "
\n",
226 | "
5 rows × 2320 columns
\n",
227 | "
"
228 | ],
229 | "text/plain": [
230 | " 300188.XSHE 600337.XSHG 600168.XSHG 002337.XSHE 600592.XSHG \\\n",
231 | "2012-01-04 4.3401 3.1750 6.3693 2.8752 6.8314 \n",
232 | "2012-01-05 4.0743 2.8583 6.1697 2.6739 6.1737 \n",
233 | "2012-01-06 4.1173 2.6920 5.9511 2.7148 6.2207 \n",
234 | "2012-01-09 4.4041 2.7791 6.2172 2.8154 6.4744 \n",
235 | "2012-01-10 4.5124 2.9137 6.4549 2.9570 6.7845 \n",
236 | "\n",
237 | " 000950.XSHE 600991.XSHG 002473.XSHE 600784.XSHG 600736.XSHG \\\n",
238 | "2012-01-04 4.0168 15.08 7.2898 5.0104 4.0135 \n",
239 | "2012-01-05 3.9101 15.08 6.9505 4.5089 3.8534 \n",
240 | "2012-01-06 4.1429 15.11 7.0765 4.6111 3.9570 \n",
241 | "2012-01-09 4.2787 15.12 7.4449 4.8741 4.2114 \n",
242 | "2012-01-10 4.4437 15.29 7.7357 5.1370 4.2962 \n",
243 | "\n",
244 | " ... 600345.XSHG 600387.XSHG 000063.XSHE 002506.XSHE \\\n",
245 | "2012-01-04 ... 9.4373 8.7711 13.3691 2.6406 \n",
246 | "2012-01-05 ... 8.8456 8.5560 13.2895 2.3920 \n",
247 | "2012-01-06 ... 8.9315 8.4289 12.9473 2.4624 \n",
248 | "2012-01-09 ... 9.3705 8.9765 13.2975 2.5991 \n",
249 | "2012-01-10 ... 9.7712 9.3285 13.8864 2.7565 \n",
250 | "\n",
251 | " 300151.XSHE 002579.XSHE 000563.XSHE 000551.XSHE 002578.XSHE \\\n",
252 | "2012-01-04 2.0055 3.4037 2.3129 5.7181 1.8931 \n",
253 | "2012-01-05 1.8872 3.1452 2.2942 5.2525 1.7877 \n",
254 | "2012-01-06 1.9159 3.1724 2.2475 5.2787 1.7897 \n",
255 | "2012-01-09 2.0094 3.3194 2.3362 5.7050 1.8814 \n",
256 | "2012-01-10 2.1143 3.4255 2.4087 5.9410 2.0024 \n",
257 | "\n",
258 | " 000726.XSHE \n",
259 | "2012-01-04 6.0165 \n",
260 | "2012-01-05 6.0005 \n",
261 | "2012-01-06 6.2645 \n",
262 | "2012-01-09 6.4965 \n",
263 | "2012-01-10 6.6565 \n",
264 | "\n",
265 | "[5 rows x 2320 columns]"
266 | ]
267 | },
268 | "execution_count": 3,
269 | "metadata": {},
270 | "output_type": "execute_result"
271 | }
272 | ],
273 | "source": [
274 | "price_data.head()"
275 | ]
276 | },
277 | {
278 | "cell_type": "code",
279 | "execution_count": 4,
280 | "metadata": {
281 | "collapsed": false
282 | },
283 | "outputs": [],
284 | "source": [
285 | "trade_data = get_price(stock_list, start_date=date_start, end_date=date_end, frequency='1d', \n",
286 | " fields=['close', 'total_turnover', 'volume'], \n",
287 | " adjust_type='pre', skip_suspended=False, country='cn')"
288 | ]
289 | },
290 | {
291 | "cell_type": "code",
292 | "execution_count": 5,
293 | "metadata": {
294 | "collapsed": true
295 | },
296 | "outputs": [],
297 | "source": [
298 | "return_data = get_price_change_rate(stock_list, start_date=date_start, end_date=date_end)"
299 | ]
300 | },
301 | {
302 | "cell_type": "code",
303 | "execution_count": 6,
304 | "metadata": {
305 | "collapsed": true
306 | },
307 | "outputs": [],
308 | "source": [
309 | "turnover_data = get_turnover_rate(stock_list, date_start, date_end, fields=['week', 'month'])"
310 | ]
311 | },
312 | {
313 | "cell_type": "code",
314 | "execution_count": 9,
315 | "metadata": {
316 | "collapsed": false
317 | },
318 | "outputs": [],
319 | "source": [
320 | "instrument_info = instruments(stock_list)"
321 | ]
322 | },
323 | {
324 | "cell_type": "code",
325 | "execution_count": 25,
326 | "metadata": {
327 | "collapsed": false
328 | },
329 | "outputs": [
330 | {
331 | "data": {
332 | "text/plain": [
333 | "Instrument(industry_name='软件和信息技术服务业', sector_code_name='信息技术', abbrev_symbol='MYBK', listed_date='2011-03-16', exchange='XSHE', symbol='美亚柏科', industry_code='I65', round_lot=100.0, order_book_id='300188.XSHE', special_type='Normal', shenwan_industry_name='计算机', de_listed_date='0000-00-00', type='CS', sector_code='InformationTechnology', board_type='GEM', shenwan_industry_code='801750.INDX', status='Active')"
334 | ]
335 | },
336 | "execution_count": 25,
337 | "metadata": {},
338 | "output_type": "execute_result"
339 | }
340 | ],
341 | "source": [
342 | "instrument_info[0]"
343 | ]
344 | },
345 | {
346 | "cell_type": "code",
347 | "execution_count": 27,
348 | "metadata": {
349 | "collapsed": false
350 | },
351 | "outputs": [],
352 | "source": [
353 | "# Download instrument information\n",
354 | "\n",
355 | "bookId_list = []\n",
356 | "exchange_list = []\n",
357 | "abbrevSymbol_list = []\n",
358 | "shenwanIndustryCode_list = []\n",
359 | "shenwanIndustryName_list = []\n",
360 | "industryCode_list = []\n",
361 | "industryName_list = []\n",
362 | "sectorCode_list = []\n",
363 | "sectorName_list = []\n",
364 | "for inst in instrument_info:\n",
365 | " bookId_list.append(inst.order_book_id)\n",
366 | " exchange_list.append(inst.exchange)\n",
367 | " abbrevSymbol_list.append(inst.abbrev_symbol)\n",
368 | " shenwanIndustryCode_list.append(inst.shenwan_industry_code)\n",
369 | " shenwanIndustryName_list.append(inst.shenwan_industry_name)\n",
370 | " industryCode_list.append(inst.industry_code)\n",
371 | " industryName_list.append(inst.industry_name)\n",
372 | " sectorCode_list.append(inst.sector_code)\n",
373 | " sectorName_list.append(inst.sector_code_name)\n",
374 | " \n",
375 | "instrument_df = pd.DataFrame({\"bookId\":bookId_list,\n",
376 | " \"exchange\":exchange_list,\n",
377 | " \"abbrevSymbol\":abbrevSymbol_list,\n",
378 | " \"shenwanIndustryCode\":shenwanIndustryCode_list,\n",
379 | " \"shenwanIndustryName\":shenwanIndustryName_list,\n",
380 | " \"industryCode\":industryCode_list,\n",
381 | " \"industryName\":industryName_list,\n",
382 | " \"sectorCode\":sectorCode_list,\n",
383 | " \"sectorName\":sectorName_list})"
384 | ]
385 | },
386 | {
387 | "cell_type": "code",
388 | "execution_count": 29,
389 | "metadata": {
390 | "collapsed": false
391 | },
392 | "outputs": [],
393 | "source": [
394 | "instrument_df.to_csv(\"cn_instrument_info_{}_{}.csv\".format(year_start,year_end))"
395 | ]
396 | },
397 | {
398 | "cell_type": "code",
399 | "execution_count": 7,
400 | "metadata": {
401 | "collapsed": false
402 | },
403 | "outputs": [
404 | {
405 | "name": "stdout",
406 | "output_type": "stream",
407 | "text": [
408 | "Date: 2012-01-04 00:00:00 | Progress: 0.05875440658049354%\n",
409 | "Date: 2012-01-05 00:00:00 | Progress: 0.11750881316098707%\n",
410 | "Date: 2012-01-06 00:00:00 | Progress: 0.17626321974148063%\n",
411 | "Date: 2012-01-09 00:00:00 | Progress: 0.23501762632197415%\n",
412 | "Date: 2012-01-10 00:00:00 | Progress: 0.2937720329024677%\n",
413 | "Date: 2012-01-11 00:00:00 | Progress: 0.35252643948296125%\n",
414 | "Date: 2012-01-12 00:00:00 | Progress: 0.4112808460634548%\n",
415 | "Date: 2012-01-13 00:00:00 | Progress: 0.4700352526439483%\n",
416 | "Date: 2012-01-16 00:00:00 | Progress: 0.5287896592244419%\n",
417 | "Date: 2012-01-17 00:00:00 | Progress: 0.5875440658049353%\n",
418 | "Date: 2012-01-18 00:00:00 | Progress: 0.6462984723854289%\n",
419 | "Date: 2012-01-19 00:00:00 | Progress: 0.7050528789659225%\n",
420 | "Date: 2012-01-20 00:00:00 | Progress: 0.763807285546416%\n",
421 | "Date: 2012-01-30 00:00:00 | Progress: 0.8225616921269095%\n",
422 | "Date: 2012-01-31 00:00:00 | Progress: 0.881316098707403%\n",
423 | "Date: 2012-02-01 00:00:00 | Progress: 0.9400705052878966%\n"
424 | ]
425 | },
426 | {
427 | "ename": "KeyboardInterrupt",
428 | "evalue": "",
429 | "traceback": [
430 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
431 | "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)",
432 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 15\u001b[0m \u001b[0;34m.\u001b[0m\u001b[0mfilter\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfundamentals\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mincome_statement\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstockcode\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0min_\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mstock_list\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 16\u001b[0m \u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 17\u001b[0;31m \u001b[0mentry_date\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdt\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0minterval\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'1q'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mreport_quarter\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 18\u001b[0m )\n\u001b[1;32m 19\u001b[0m \u001b[0m_fundamental_data\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_fundamental_data\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mto_frame\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
433 | "\u001b[0;32m/opt/conda/envs/ricequant/lib/python3.5/site-packages/rqcommons/facade.py\u001b[0m in \u001b[0;36mwrap\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 29\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'Not inited yet. Please call rqdatac.init() first.'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 30\u001b[0m \u001b[0;32mreturn\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 31\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 32\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mwrap\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 33\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
434 | "\u001b[0;32m/opt/conda/envs/ricequant/lib/python3.5/site-packages/rqcommons/facade.py\u001b[0m in \u001b[0;36mget_fundamentals\u001b[0;34m(query, entry_date, interval, report_quarter, country)\u001b[0m\n\u001b[1;32m 314\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mget_fundamentals\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mquery\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mentry_date\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0minterval\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mreport_quarter\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mFalse\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcountry\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'cn'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 315\u001b[0m \u001b[0;34m\"\"\"获取财务数据\"\"\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 316\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mimplmentation\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget_fundamentals\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mquery\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mentry_date\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0minterval\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mreport_quarter\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcountry\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 317\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 318\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
435 | "\u001b[0;32m/opt/conda/envs/ricequant/lib/python3.5/site-packages/rqdatac/implementation.py\u001b[0m in \u001b[0;36mget_fundamentals\u001b[0;34m(cls, query, entry_date, interval, report_quarter, country)\u001b[0m\n\u001b[1;32m 251\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 252\u001b[0m \u001b[0mquery\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_unsafe_apply_query_filter\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mquery\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtrading_dates\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 253\u001b[0;31m \u001b[0mrecords\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcls\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_run_fundamental_query\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0m_compile_query\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mquery\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcountry\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mcountry\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 254\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mException\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 255\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'ERROR: internal error, please contact public@ricequant.com. exception: {}'\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mformat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0me\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
436 | "\u001b[0;32m/opt/conda/envs/ricequant/lib/python3.5/site-packages/rqdatac/implementation.py\u001b[0m in \u001b[0;36m_compile_query\u001b[0;34m(query)\u001b[0m\n\u001b[1;32m 30\u001b[0m \u001b[0mparams\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 31\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mk\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mcomp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpositiontup\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 32\u001b[0;31m \u001b[0mv\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcomp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mparams\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mk\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 33\u001b[0m \u001b[0mparams\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mappend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mescape_item\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mv\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mconversions\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mencoders\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 34\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
437 | "\u001b[0;32m/opt/conda/envs/ricequant/lib/python3.5/site-packages/sqlalchemy/sql/compiler.py\u001b[0m in \u001b[0;36mparams\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 576\u001b[0m \"\"\"Return the bind param dictionary embedded into this\n\u001b[1;32m 577\u001b[0m compiled object, for those values that are present.\"\"\"\n\u001b[0;32m--> 578\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mconstruct_params\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0m_check\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mFalse\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 579\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 580\u001b[0m \u001b[0;34m@\u001b[0m\u001b[0mutil\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdependencies\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"sqlalchemy.engine.result\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
438 | "\u001b[0;32m/opt/conda/envs/ricequant/lib/python3.5/site-packages/sqlalchemy/sql/compiler.py\u001b[0m in \u001b[0;36mconstruct_params\u001b[0;34m(self, params, _group_number, _check)\u001b[0m\n\u001b[1;32m 569\u001b[0m \u001b[0mpd\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbind_names\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mbindparam\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mbindparam\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0meffective_value\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 570\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 571\u001b[0;31m \u001b[0mpd\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbind_names\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mbindparam\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mbindparam\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mvalue\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 572\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mpd\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 573\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
439 | "\u001b[0;31mKeyboardInterrupt\u001b[0m: "
440 | ],
441 | "output_type": "error"
442 | }
443 | ],
444 | "source": [
445 | "fundamental_data = {}\n",
446 | "query_dates = trade_dates[(trade_dates >= date_start_dt) & (trade_dates <= date_end_dt)]\n",
447 | "ndates = len(query_dates)\n",
448 | "for counter,dt in enumerate(query_dates):\n",
449 | " print(\"Date: {} | Progress: {}%\".format(dt,(counter+1)/ndates*100))\n",
450 | " _fundamental_data = get_fundamentals(\n",
451 | " query(\n",
452 | " fundamentals.eod_derivative_indicator.market_cap, #总市值\n",
453 | " fundamentals.eod_derivative_indicator.a_share_market_val_2, #流通市值\n",
454 | " fundamentals.cash_flow_statement.cash_received_from_sales_of_goods, #销售额 - 单季/同比\n",
455 | " fundamentals.eod_derivative_indicator.pb_ratio, #净资产/总市值=市净率\n",
456 | " fundamentals.income_statement.net_profit, #净利润\n",
457 | " fundamentals.eod_derivative_indicator.ps_ratio #市销率\n",
458 | " )\n",
459 | " .filter(fundamentals.income_statement.stockcode.in_(stock_list))\n",
460 | " , \n",
461 | " entry_date=dt, interval='1q', report_quarter=True\n",
462 | " )\n",
463 | " _fundamental_data = _fundamental_data.to_frame()\n",
464 | " _fundamental_data.index.names = ['date', 'order_book_id']\n",
465 | " fundamental_data[dt] = _fundamental_data\n",
466 | " \n",
467 | "fundamental_data = pd.concat(fundamental_data)\n",
468 | "fundamental_data.reset_index(level=0, drop=True, inplace=True)"
469 | ]
470 | },
471 | {
472 | "cell_type": "code",
473 | "execution_count": null,
474 | "metadata": {
475 | "collapsed": false
476 | },
477 | "outputs": [],
478 | "source": [
479 | "trade_ts = trade_data.to_frame()\n",
480 | "trade_ts.index.names = ['date', 'order_book_id']\n",
481 | "\n",
482 | "return_ts = pd.DataFrame(return_data.stack(), columns=['return'])\n",
483 | "return_ts.index.names = ['date', 'order_book_id']\n",
484 | "\n",
485 | "turnover_ts = turnover_data.to_frame()\n",
486 | "turnover_ts.index.names = ['date', 'order_book_id']\n",
487 | "\n",
488 | "data = return_ts.merge(trade_ts, how='left', left_index=True, right_index=True)\n",
489 | "data = data.merge(turnover_ts, how='left', left_index=True, right_index=True)\n",
490 | "data = data.merge(fundamental_data, how='left', left_index=True, right_index=True)"
491 | ]
492 | },
493 | {
494 | "cell_type": "code",
495 | "execution_count": null,
496 | "metadata": {
497 | "collapsed": false
498 | },
499 | "outputs": [],
500 | "source": [
501 | "data.head()"
502 | ]
503 | },
504 | {
505 | "cell_type": "code",
506 | "execution_count": null,
507 | "metadata": {
508 | "collapsed": true
509 | },
510 | "outputs": [],
511 | "source": [
512 | "# Save Data\n",
513 | "data.to_csv(\"cn_equity_daily_{}_{}.csv\".format(year_start,year_end))"
514 | ]
515 | },
516 | {
517 | "cell_type": "code",
518 | "execution_count": null,
519 | "metadata": {
520 | "collapsed": true
521 | },
522 | "outputs": [],
523 | "source": []
524 | }
525 | ],
526 | "metadata": {
527 | "kernelspec": {
528 | "display_name": "Python 3",
529 | "language": "python",
530 | "name": "python3"
531 | },
532 | "language_info": {
533 | "codemirror_mode": {
534 | "name": "ipython",
535 | "version": 3
536 | },
537 | "file_extension": ".py",
538 | "mimetype": "text/x-python",
539 | "name": "python",
540 | "nbconvert_exporter": "python",
541 | "pygments_lexer": "ipython3",
542 | "version": "3.5.5"
543 | }
544 | },
545 | "nbformat": 4,
546 | "nbformat_minor": 2
547 | }
548 |
--------------------------------------------------------------------------------
/source/DownloadData_bak.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 2,
6 | "metadata": {
7 | "collapsed": false
8 | },
9 | "outputs": [
10 | {
11 | "name": "stdout",
12 | "output_type": "stream",
13 | "text": [
14 | "Population Check - Initial #: 1059\n"
15 | ]
16 | }
17 | ],
18 | "source": [
19 | "# Constructs Time Series Data for All Stocks\n",
20 | "import pandas as pd\n",
21 | "import numpy as np\n",
22 | "from datetime import datetime\n",
23 | "import tushare as ts\n",
24 | "\n",
25 | "from scipy.stats import rankdata\n",
26 | "\n",
27 | "import seaborn as sns\n",
28 | "\n",
29 | "# Pull All Trade Dates\n",
30 | "trade_dates = pd.Series(data=[pd.Timestamp(date) for date in get_trading_dates('2001-01-01', '2018-12-31')], name='trade_date')\n",
31 | "\n",
32 | "year_start = 2001\n",
33 | "year_end = 2012\n",
34 | "\n",
35 | "# date_end_last_dt = max(trade_dates[trade_dates.dt.year == year_start-1])\n",
36 | "date_start_dt = min(trade_dates[trade_dates.dt.year == year_start])\n",
37 | "date_end_dt = max(trade_dates[trade_dates.dt.year == year_end])\n",
38 | "\n",
39 | "# date_end_last = date_end_last_dt.strftime('%Y-%m-%d')\n",
40 | "date_start = date_start_dt.strftime('%Y-%m-%d')\n",
41 | "date_end = date_end_dt.strftime('%Y-%m-%d')\n",
42 | "\n",
43 | "# Construct Stock Population\n",
44 | "stock_all = all_instruments(type=\"CS\", country='cn', date=date_start_dt)\n",
45 | "stock_list = stock_all['order_book_id'].tolist()\n",
46 | "print(\"Population Check - Initial #: {}\".format(stock_all.shape[0]))"
47 | ]
48 | },
49 | {
50 | "cell_type": "code",
51 | "execution_count": 3,
52 | "metadata": {
53 | "collapsed": false
54 | },
55 | "outputs": [
56 | {
57 | "name": "stdout",
58 | "output_type": "stream",
59 | "text": [
60 | "WARN: start_date is earlier than 2005-01-04, adjusted\n"
61 | ]
62 | }
63 | ],
64 | "source": [
65 | "trade_data = get_price(stock_list, start_date=date_start, end_date=date_end, frequency='1d', \n",
66 | " fields=['close', 'total_turnover', 'volume'], \n",
67 | " adjust_type='pre', skip_suspended=False, country='cn')\n",
68 | "\n",
69 | "return_data = get_price_change_rate(stock_list, start_date=date_start, end_date=date_end)\n",
70 | "\n",
71 | "turnover_data = get_turnover_rate(stock_list, date_start, date_end, fields=['week', 'month'])\n",
72 | "\n",
73 | "fundamental_data = {}\n",
74 | "for dt in trade_dates[(trade_dates.dt.year >= year_start) & (trade_dates.dt.year <= year_end)]:\n",
75 | " _fundamental_data = get_fundamentals(\n",
76 | " query(\n",
77 | " fundamentals.eod_derivative_indicator.market_cap, #总市值\n",
78 | " fundamentals.eod_derivative_indicator.a_share_market_val_2, #流通市值\n",
79 | " fundamentals.cash_flow_statement.cash_received_from_sales_of_goods, #销售额 - 单季/同比\n",
80 | " fundamentals.eod_derivative_indicator.pb_ratio, #净资产/总市值=市净率\n",
81 | " fundamentals.income_statement.net_profit, #净利润\n",
82 | " fundamentals.eod_derivative_indicator.ps_ratio #市销率\n",
83 | " ).filter(fundamentals.income_statement.stockcode.in_(stock_list)), \n",
84 | " entry_date=dt, interval='1q', report_quarter=True\n",
85 | " )\n",
86 | " _fundamental_data = _fundamental_data.to_frame()\n",
87 | " _fundamental_data.index.names = ['date', 'order_book_id']\n",
88 | " fundamental_data[dt] = _fundamental_data\n",
89 | " \n",
90 | "fundamental_data = pd.concat(fundamental_data)\n",
91 | "fundamental_data.reset_index(level=0, drop=True, inplace=True)\n",
92 | "\n",
93 | "# Aggregate Data\n",
94 | "\n",
95 | "trade_ts = trade_data.to_frame()\n",
96 | "trade_ts.index.names = ['date', 'order_book_id']\n",
97 | "\n",
98 | "return_ts = pd.DataFrame(return_data.stack(), columns=['return'])\n",
99 | "return_ts.index.names = ['date', 'order_book_id']\n",
100 | "\n",
101 | "turnover_ts = turnover_data.to_frame()\n",
102 | "turnover_ts.index.names = ['date', 'order_book_id']\n",
103 | "\n",
104 | "data = return_ts.merge(trade_ts, how='left', left_index=True, right_index=True)\n",
105 | "data = data.merge(turnover_ts, how='left', left_index=True, right_index=True)\n",
106 | "data = data.merge(fundamental_data, how='left', left_index=True, right_index=True)\n",
107 | "\n",
108 | "# Save Data\n",
109 | "data.to_csv(\"stock_data_all_{}_{}.csv\".format(year_start,year_end))\n"
110 | ]
111 | },
112 | {
113 | "cell_type": "code",
114 | "execution_count": 4,
115 | "metadata": {
116 | "collapsed": false
117 | },
118 | "outputs": [],
119 | "source": [
120 | "trade_ts = trade_data.to_frame()\n",
121 | "trade_ts.index.names = ['date', 'order_book_id']\n",
122 | "\n",
123 | "return_ts = pd.DataFrame(return_data.stack(), columns=['return'])\n",
124 | "return_ts.index.names = ['date', 'order_book_id']\n",
125 | "\n",
126 | "turnover_ts = turnover_data.to_frame()\n",
127 | "turnover_ts.index.names = ['date', 'order_book_id']\n",
128 | "\n",
129 | "data = return_ts.merge(trade_ts, how='left', left_index=True, right_index=True)\n",
130 | "data = data.merge(turnover_ts, how='left', left_index=True, right_index=True)\n",
131 | "data = data.merge(fundamental_data, how='left', left_index=True, right_index=True)\n"
132 | ]
133 | },
134 | {
135 | "cell_type": "code",
136 | "execution_count": 5,
137 | "metadata": {
138 | "collapsed": true
139 | },
140 | "outputs": [],
141 | "source": [
142 | "# data.to_csv(\"stock_data_all_2005_2012.csv\")\n",
143 | "\n",
144 | "# Break data into monthly chunks\n",
145 | "year = 2011\n",
146 | "for month in range(1,13):\n",
147 | "    data_tmp = data.loc[(data.index.get_level_values('date').year == year) & (data.index.get_level_values('date').month == month), :]\n",
148 | " data_tmp.to_csv(\"stock_data_all_\"+str(year)+\"{0:0=2d}\".format(month)+\".csv\")"
149 | ]
150 | },
151 | {
152 | "cell_type": "code",
153 | "execution_count": null,
154 | "metadata": {
155 | "collapsed": true
156 | },
157 | "outputs": [],
158 | "source": []
159 | }
160 | ],
161 | "metadata": {
162 | "kernelspec": {
163 | "display_name": "Python 3",
164 | "language": "python",
165 | "name": "python3"
166 | },
167 | "language_info": {
168 | "codemirror_mode": {
169 | "name": "ipython",
170 | "version": 3
171 | },
172 | "file_extension": ".py",
173 | "mimetype": "text/x-python",
174 | "name": "python",
175 | "nbconvert_exporter": "python",
176 | "pygments_lexer": "ipython3",
177 | "version": "3.5.2"
178 | }
179 | },
180 | "nbformat": 4,
181 | "nbformat_minor": 2
182 | }
183 |
--------------------------------------------------------------------------------
/source/FactorModeling.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "from rqdata_utils import *\n",
10 | "import pandas\n",
11 | "import numpy as np\n",
12 | "import scipy as sp\n",
13 | "import alphalens as al\n",
14 | "%matplotlib inline"
15 | ]
16 | },
17 | {
18 | "cell_type": "markdown",
19 | "metadata": {},
20 | "source": [
21 | "## Loading Data"
22 | ]
23 | },
24 | {
25 | "cell_type": "code",
26 | "execution_count": 2,
27 | "metadata": {},
28 | "outputs": [],
29 | "source": [
30 | "price_df,instrument_df,equity_df = get_price_instrument_equity(\"cn_stock_price_2012_2018.csv\",\"cn_instrument_info_2012_2018.csv\",\"cn_equity_daily_2012_2018.csv\",\"sectorCode\")"
31 | ]
32 | },
33 | {
34 | "cell_type": "code",
35 | "execution_count": 3,
36 | "metadata": {},
37 | "outputs": [
38 | {
39 | "data": {
40 | "text/html": [
41 | "\n",
42 | "
\n",
43 | " \n",
44 | " \n",
45 | " | \n",
46 | " | \n",
47 | " return | \n",
48 | " close | \n",
49 | " total_turnover | \n",
50 | " volume | \n",
51 | " week | \n",
52 | " month | \n",
53 | " report_quarter | \n",
54 | " market_cap | \n",
55 | " a_share_market_val_2 | \n",
56 | " cash_received_from_sales_of_goods | \n",
57 | " pb_ratio | \n",
58 | " net_profit | \n",
59 | " ps_ratio | \n",
60 | " sectorCode | \n",
61 | "
\n",
62 | " \n",
63 | " date | \n",
64 | " order_book_id | \n",
65 | " | \n",
66 | " | \n",
67 | " | \n",
68 | " | \n",
69 | " | \n",
70 | " | \n",
71 | " | \n",
72 | " | \n",
73 | " | \n",
74 | " | \n",
75 | " | \n",
76 | " | \n",
77 | " | \n",
78 | " | \n",
79 | "
\n",
80 | " \n",
81 | " \n",
82 | " \n",
83 | " 2012-01-04 | \n",
84 | " 000001.XSHE | \n",
85 | " -0.027582 | \n",
86 | " 5.1224 | \n",
87 | " 2.275637e+08 | \n",
88 | " 40894428.0 | \n",
89 | " 0.5775 | \n",
90 | " 0.4331 | \n",
91 | " NaN | \n",
92 | " NaN | \n",
93 | " NaN | \n",
94 | " NaN | \n",
95 | " NaN | \n",
96 | " NaN | \n",
97 | " NaN | \n",
98 | " Financials | \n",
99 | "
\n",
100 | " \n",
101 | " 000002.XSHE | \n",
102 | " -0.018742 | \n",
103 | " 6.0525 | \n",
104 | " 3.559891e+08 | \n",
105 | " 47432958.0 | \n",
106 | " 0.3711 | \n",
107 | " 0.4030 | \n",
108 | " 2011q3 | \n",
109 | " 8.059489e+10 | \n",
110 | " 7.082120e+10 | \n",
111 | " 7.516785e+10 | \n",
112 | " 1.5216 | \n",
113 | " 4.106349e+09 | \n",
114 | " 0.8679 | \n",
115 | " Financials | \n",
116 | "
\n",
117 | " \n",
118 | " 000004.XSHE | \n",
119 | " -0.022250 | \n",
120 | " 7.9100 | \n",
121 | " 3.763833e+06 | \n",
122 | " 465469.0 | \n",
123 | " 0.5720 | \n",
124 | " 0.7506 | \n",
125 | " 2011q3 | \n",
126 | " 6.642556e+08 | \n",
127 | " 6.634549e+08 | \n",
128 | " 5.949968e+07 | \n",
129 | " 8.8175 | \n",
130 | " 4.500363e+06 | \n",
131 | " 37.5796 | \n",
132 | " HealthCare | \n",
133 | "
\n",
134 | " \n",
135 | " 000005.XSHE | \n",
136 | " 0.000000 | \n",
137 | " 3.8600 | \n",
138 | " 0.000000e+00 | \n",
139 | " 0.0 | \n",
140 | " 0.0000 | \n",
141 | " 0.0000 | \n",
142 | " 2011q3 | \n",
143 | " 3.529328e+09 | \n",
144 | " 3.527048e+09 | \n",
145 | " 2.565851e+07 | \n",
146 | " 5.3480 | \n",
147 | " 1.365665e+07 | \n",
148 | " -347.2191 | \n",
149 | " Industrials | \n",
150 | "
\n",
151 | " \n",
152 | " 000006.XSHE | \n",
153 | " -0.009756 | \n",
154 | " 2.6766 | \n",
155 | " 7.619286e+06 | \n",
156 | " 2513811.0 | \n",
157 | " 0.1416 | \n",
158 | " 0.1667 | \n",
159 | " 2011q3 | \n",
160 | " 4.015370e+09 | \n",
161 | " 3.929464e+09 | \n",
162 | " 2.531436e+09 | \n",
163 | " 1.4348 | \n",
164 | " 2.763917e+08 | \n",
165 | " 1.4139 | \n",
166 | " Financials | \n",
167 | "
\n",
168 | " \n",
169 | "
\n",
170 | "
"
171 | ],
172 | "text/plain": [
173 | " return close total_turnover volume \\\n",
174 | "date order_book_id \n",
175 | "2012-01-04 000001.XSHE -0.027582 5.1224 2.275637e+08 40894428.0 \n",
176 | " 000002.XSHE -0.018742 6.0525 3.559891e+08 47432958.0 \n",
177 | " 000004.XSHE -0.022250 7.9100 3.763833e+06 465469.0 \n",
178 | " 000005.XSHE 0.000000 3.8600 0.000000e+00 0.0 \n",
179 | " 000006.XSHE -0.009756 2.6766 7.619286e+06 2513811.0 \n",
180 | "\n",
181 | " week month report_quarter market_cap \\\n",
182 | "date order_book_id \n",
183 | "2012-01-04 000001.XSHE 0.5775 0.4331 NaN NaN \n",
184 | " 000002.XSHE 0.3711 0.4030 2011q3 8.059489e+10 \n",
185 | " 000004.XSHE 0.5720 0.7506 2011q3 6.642556e+08 \n",
186 | " 000005.XSHE 0.0000 0.0000 2011q3 3.529328e+09 \n",
187 | " 000006.XSHE 0.1416 0.1667 2011q3 4.015370e+09 \n",
188 | "\n",
189 | " a_share_market_val_2 \\\n",
190 | "date order_book_id \n",
191 | "2012-01-04 000001.XSHE NaN \n",
192 | " 000002.XSHE 7.082120e+10 \n",
193 | " 000004.XSHE 6.634549e+08 \n",
194 | " 000005.XSHE 3.527048e+09 \n",
195 | " 000006.XSHE 3.929464e+09 \n",
196 | "\n",
197 | " cash_received_from_sales_of_goods pb_ratio \\\n",
198 | "date order_book_id \n",
199 | "2012-01-04 000001.XSHE NaN NaN \n",
200 | " 000002.XSHE 7.516785e+10 1.5216 \n",
201 | " 000004.XSHE 5.949968e+07 8.8175 \n",
202 | " 000005.XSHE 2.565851e+07 5.3480 \n",
203 | " 000006.XSHE 2.531436e+09 1.4348 \n",
204 | "\n",
205 | " net_profit ps_ratio sectorCode \n",
206 | "date order_book_id \n",
207 | "2012-01-04 000001.XSHE NaN NaN Financials \n",
208 | " 000002.XSHE 4.106349e+09 0.8679 Financials \n",
209 | " 000004.XSHE 4.500363e+06 37.5796 HealthCare \n",
210 | " 000005.XSHE 1.365665e+07 -347.2191 Industrials \n",
211 | " 000006.XSHE 2.763917e+08 1.4139 Financials "
212 | ]
213 | },
214 | "execution_count": 3,
215 | "metadata": {},
216 | "output_type": "execute_result"
217 | }
218 | ],
219 | "source": [
220 | "equity_df.head()"
221 | ]
222 | },
223 | {
224 | "cell_type": "code",
225 | "execution_count": 4,
226 | "metadata": {},
227 | "outputs": [
228 | {
229 | "data": {
230 | "text/plain": [
231 | "164"
232 | ]
233 | },
234 | "execution_count": 4,
235 | "metadata": {},
236 | "output_type": "execute_result"
237 | }
238 | ],
239 | "source": [
240 | "healthcareUniverse = instrument_df.index[instrument_df.sectorCode=='HealthCare'].values\n",
241 | "len(healthcareUniverse)"
242 | ]
243 | },
244 | {
245 | "cell_type": "code",
246 | "execution_count": 5,
247 | "metadata": {},
248 | "outputs": [],
249 | "source": [
250 | "def equity_universe_filtering(equity_df, universe):\n",
251 | " universeFilter = [book_id in set(universe) for book_id in equity_df.index.get_level_values(level=1).values]\n",
252 | " return equity_df[universeFilter]"
253 | ]
254 | },
255 | {
256 | "cell_type": "code",
257 | "execution_count": 6,
258 | "metadata": {},
259 | "outputs": [
260 | {
261 | "data": {
262 | "text/html": [
263 | "\n",
264 | "
\n",
265 | " \n",
266 | " \n",
267 | " | \n",
268 | " | \n",
269 | " return | \n",
270 | " close | \n",
271 | " total_turnover | \n",
272 | " volume | \n",
273 | " week | \n",
274 | " month | \n",
275 | " report_quarter | \n",
276 | " market_cap | \n",
277 | " a_share_market_val_2 | \n",
278 | " cash_received_from_sales_of_goods | \n",
279 | " pb_ratio | \n",
280 | " net_profit | \n",
281 | " ps_ratio | \n",
282 | " sectorCode | \n",
283 | "
\n",
284 | " \n",
285 | " date | \n",
286 | " order_book_id | \n",
287 | " | \n",
288 | " | \n",
289 | " | \n",
290 | " | \n",
291 | " | \n",
292 | " | \n",
293 | " | \n",
294 | " | \n",
295 | " | \n",
296 | " | \n",
297 | " | \n",
298 | " | \n",
299 | " | \n",
300 | " | \n",
301 | "
\n",
302 | " \n",
303 | " \n",
304 | " \n",
305 | " 2012-01-04 | \n",
306 | " 000004.XSHE | \n",
307 | " -0.022250 | \n",
308 | " 7.9100 | \n",
309 | " 3763832.88 | \n",
310 | " 465469.0 | \n",
311 | " 0.5720 | \n",
312 | " 0.7506 | \n",
313 | " 2011q3 | \n",
314 | " 6.642556e+08 | \n",
315 | " 6.634549e+08 | \n",
316 | " 5.949968e+07 | \n",
317 | " 8.8175 | \n",
318 | " 4.500363e+06 | \n",
319 | " 37.5796 | \n",
320 | " HealthCare | \n",
321 | "
\n",
322 | " \n",
323 | " 000028.XSHE | \n",
324 | " -0.045433 | \n",
325 | " 19.8422 | \n",
326 | " 9326924.28 | \n",
327 | " 450553.0 | \n",
328 | " 0.4201 | \n",
329 | " 0.2722 | \n",
330 | " 2011q3 | \n",
331 | " 5.872485e+09 | \n",
332 | " 4.753820e+09 | \n",
333 | " 1.053298e+10 | \n",
334 | " 4.3493 | \n",
335 | " 2.481834e+08 | \n",
336 | " 0.3414 | \n",
337 | " HealthCare | \n",
338 | "
\n",
339 | " \n",
340 | " 000150.XSHE | \n",
341 | " -0.030295 | \n",
342 | " 3.1737 | \n",
343 | " 3109304.50 | \n",
344 | " 952600.0 | \n",
345 | " 0.3460 | \n",
346 | " 0.3610 | \n",
347 | " 2011q3 | \n",
348 | " 1.036800e+09 | \n",
349 | " 1.036800e+09 | \n",
350 | " 4.913279e+07 | \n",
351 | " 1.4763 | \n",
352 | " 3.657858e+06 | \n",
353 | " 7.8956 | \n",
354 | " HealthCare | \n",
355 | "
\n",
356 | " \n",
357 | " 000153.XSHE | \n",
358 | " -0.028053 | \n",
359 | " 5.7700 | \n",
360 | " 9673054.49 | \n",
361 | " 1596020.0 | \n",
362 | " 0.6830 | \n",
363 | " 2.4594 | \n",
364 | " 2011q3 | \n",
365 | " 1.531454e+09 | \n",
366 | " 1.360856e+09 | \n",
367 | " 1.329425e+09 | \n",
368 | " 2.1169 | \n",
369 | " 1.560397e+07 | \n",
370 | " 0.7818 | \n",
371 | " HealthCare | \n",
372 | "
\n",
373 | " \n",
374 | " 000403.XSHE | \n",
375 | " 0.000000 | \n",
376 | " 3.1625 | \n",
377 | " 0.00 | \n",
378 | " 0.0 | \n",
379 | " 0.0000 | \n",
380 | " 0.0000 | \n",
381 | " NaN | \n",
382 | " NaN | \n",
383 | " NaN | \n",
384 | " NaN | \n",
385 | " NaN | \n",
386 | " NaN | \n",
387 | " NaN | \n",
388 | " HealthCare | \n",
389 | "
\n",
390 | " \n",
391 | "
\n",
392 | "
"
393 | ],
394 | "text/plain": [
395 | " return close total_turnover volume \\\n",
396 | "date order_book_id \n",
397 | "2012-01-04 000004.XSHE -0.022250 7.9100 3763832.88 465469.0 \n",
398 | " 000028.XSHE -0.045433 19.8422 9326924.28 450553.0 \n",
399 | " 000150.XSHE -0.030295 3.1737 3109304.50 952600.0 \n",
400 | " 000153.XSHE -0.028053 5.7700 9673054.49 1596020.0 \n",
401 | " 000403.XSHE 0.000000 3.1625 0.00 0.0 \n",
402 | "\n",
403 | " week month report_quarter market_cap \\\n",
404 | "date order_book_id \n",
405 | "2012-01-04 000004.XSHE 0.5720 0.7506 2011q3 6.642556e+08 \n",
406 | " 000028.XSHE 0.4201 0.2722 2011q3 5.872485e+09 \n",
407 | " 000150.XSHE 0.3460 0.3610 2011q3 1.036800e+09 \n",
408 | " 000153.XSHE 0.6830 2.4594 2011q3 1.531454e+09 \n",
409 | " 000403.XSHE 0.0000 0.0000 NaN NaN \n",
410 | "\n",
411 | " a_share_market_val_2 \\\n",
412 | "date order_book_id \n",
413 | "2012-01-04 000004.XSHE 6.634549e+08 \n",
414 | " 000028.XSHE 4.753820e+09 \n",
415 | " 000150.XSHE 1.036800e+09 \n",
416 | " 000153.XSHE 1.360856e+09 \n",
417 | " 000403.XSHE NaN \n",
418 | "\n",
419 | " cash_received_from_sales_of_goods pb_ratio \\\n",
420 | "date order_book_id \n",
421 | "2012-01-04 000004.XSHE 5.949968e+07 8.8175 \n",
422 | " 000028.XSHE 1.053298e+10 4.3493 \n",
423 | " 000150.XSHE 4.913279e+07 1.4763 \n",
424 | " 000153.XSHE 1.329425e+09 2.1169 \n",
425 | " 000403.XSHE NaN NaN \n",
426 | "\n",
427 | " net_profit ps_ratio sectorCode \n",
428 | "date order_book_id \n",
429 | "2012-01-04 000004.XSHE 4.500363e+06 37.5796 HealthCare \n",
430 | " 000028.XSHE 2.481834e+08 0.3414 HealthCare \n",
431 | " 000150.XSHE 3.657858e+06 7.8956 HealthCare \n",
432 | " 000153.XSHE 1.560397e+07 0.7818 HealthCare \n",
433 | " 000403.XSHE NaN NaN HealthCare "
434 | ]
435 | },
436 | "execution_count": 6,
437 | "metadata": {},
438 | "output_type": "execute_result"
439 | }
440 | ],
441 | "source": [
442 | "healthcare_equity_df = equity_universe_filtering(equity_df, healthcareUniverse)\n",
443 | "healthcare_equity_df.head()"
444 | ]
445 | },
446 | {
447 | "cell_type": "code",
448 | "execution_count": 7,
449 | "metadata": {},
450 | "outputs": [
451 | {
452 | "name": "stdout",
453 | "output_type": "stream",
454 | "text": [
455 | "universe ratio: 6.210331877919959%\n"
456 | ]
457 | }
458 | ],
459 | "source": [
460 | "print(\"universe ratio: {}%\".format(len(healthcare_equity_df)/len(equity_df)*100))"
461 | ]
462 | },
463 | {
464 | "cell_type": "markdown",
465 | "metadata": {},
466 | "source": [
467 | "### benchmark"
468 | ]
469 | },
470 | {
471 | "cell_type": "code",
472 | "execution_count": 27,
473 | "metadata": {},
474 | "outputs": [],
475 | "source": [
476 | "benchmark_df = pd.read_csv(\"cn_SH_healthcare_index_2012_2018.csv\",names=['date','value'])\n",
477 | "benchmark_df = benchmark_df.set_index('date',drop=True)"
478 | ]
479 | },
480 | {
481 | "cell_type": "code",
482 | "execution_count": 33,
483 | "metadata": {},
484 | "outputs": [
485 | {
486 | "data": {
487 | "text/html": [
488 | "\n",
489 | "
\n",
490 | " \n",
491 | " \n",
492 | " | \n",
493 | " value | \n",
494 | " return | \n",
495 | "
\n",
496 | " \n",
497 | " date | \n",
498 | " | \n",
499 | " | \n",
500 | "
\n",
501 | " \n",
502 | " \n",
503 | " \n",
504 | " 2012-01-04 | \n",
505 | " 2891.462 | \n",
506 | " 0.000000 | \n",
507 | "
\n",
508 | " \n",
509 | " 2012-01-05 | \n",
510 | " 2766.955 | \n",
511 | " 0.044015 | \n",
512 | "
\n",
513 | " \n",
514 | " 2012-01-06 | \n",
515 | " 2744.793 | \n",
516 | " 0.008042 | \n",
517 | "
\n",
518 | " \n",
519 | " 2012-01-09 | \n",
520 | " 2833.219 | \n",
521 | " -0.031708 | \n",
522 | "
\n",
523 | " \n",
524 | " 2012-01-10 | \n",
525 | " 2929.594 | \n",
526 | " -0.033450 | \n",
527 | "
\n",
528 | " \n",
529 | "
\n",
530 | "
"
531 | ],
532 | "text/plain": [
533 | " value return\n",
534 | "date \n",
535 | "2012-01-04 2891.462 0.000000\n",
536 | "2012-01-05 2766.955 0.044015\n",
537 | "2012-01-06 2744.793 0.008042\n",
538 | "2012-01-09 2833.219 -0.031708\n",
539 | "2012-01-10 2929.594 -0.033450"
540 | ]
541 | },
542 | "execution_count": 33,
543 | "metadata": {},
544 | "output_type": "execute_result"
545 | }
546 | ],
547 | "source": [
548 | "benchmark_df['return'] = np.log(benchmark_df.shift(1)/benchmark_df).fillna(0)\n",
549 | "benchmark_df.head()"
550 | ]
551 | },
552 | {
553 | "cell_type": "markdown",
554 | "metadata": {},
555 | "source": [
556 | "## Factor Returns"
557 | ]
558 | },
559 | {
560 | "cell_type": "code",
561 | "execution_count": 8,
562 | "metadata": {},
563 | "outputs": [],
564 | "source": [
565 | "def equity_factor_return(equity_df, factorColumn, nAllocations, longTop=True):\n",
566 | " equity_copy = equity_df.copy()\n",
567 | "# equity_copy[\"{}_rank\".format(factorColumn)] = equity_copy.groupby(level='date')[factorColumn].rank()\n",
568 | "# equity_copy[equity_copy.groupby(level='date')[factorColumn].nlargest(nAllocations).index][\"biggest_{}_{}\".format(nAllocations,factorColumn)]=True\n",
569 | " largest = equity_copy[factorColumn].groupby(level='date').nlargest(nAllocations).reset_index(level=0,drop=True)\n",
570 | " smallest = equity_copy[factorColumn].groupby(level='date').nsmallest(nAllocations).reset_index(level=0,drop=True)\n",
571 | " r_largest = equity_copy.loc[largest.index,'return'].groupby(level='date').mean()\n",
572 | " r_smallest = equity_copy.loc[smallest.index,'return'].groupby(level='date').mean()\n",
573 | " LMS = r_largest - r_smallest\n",
574 | " if(longTop):\n",
575 | " return LMS\n",
576 | " else:\n",
577 | " return -LMS"
578 | ]
579 | },
580 | {
581 | "cell_type": "code",
582 | "execution_count": 9,
583 | "metadata": {},
584 | "outputs": [
585 | {
586 | "data": {
587 | "text/plain": [
588 | "date\n",
589 | "2012-01-04 0.005983\n",
590 | "2012-01-05 -0.009098\n",
591 | "2012-01-06 -0.004155\n",
592 | "2012-01-09 0.014615\n",
593 | "2012-01-10 0.006728\n",
594 | "Name: return, dtype: float64"
595 | ]
596 | },
597 | "execution_count": 9,
598 | "metadata": {},
599 | "output_type": "execute_result"
600 | }
601 | ],
602 | "source": [
603 | "SMB = equity_factor_return(healthcare_equity_df, 'market_cap', 20,longTop=False)\n",
604 | "SMB.head()"
605 | ]
606 | },
607 | {
608 | "cell_type": "code",
609 | "execution_count": 10,
610 | "metadata": {},
611 | "outputs": [
612 | {
613 | "data": {
614 | "text/plain": [
615 | "date\n",
616 | "2012-01-04 0.005302\n",
617 | "2012-01-05 -0.007223\n",
618 | "2012-01-06 0.006031\n",
619 | "2012-01-09 -0.002597\n",
620 | "2012-01-10 -0.010780\n",
621 | "Name: return, dtype: float64"
622 | ]
623 | },
624 | "execution_count": 10,
625 | "metadata": {},
626 | "output_type": "execute_result"
627 | }
628 | ],
629 | "source": [
630 | "HML = equity_factor_return(healthcare_equity_df, 'pb_ratio', 20,longTop=True)\n",
631 | "HML.head()"
632 | ]
633 | },
634 | {
635 | "cell_type": "code",
636 | "execution_count": 11,
637 | "metadata": {},
638 | "outputs": [],
639 | "source": [
640 | "import itertools\n",
641 | "import statsmodels.api as sm\n",
642 | "from statsmodels import regression,stats\n",
643 | "import scipy\n",
644 | "\n",
645 | "data = healthcare_equity_df[['return']] # dataframe\n",
 646 |     "data = data.set_index(healthcare_equity_df.index) # eliminate redundant index (whole universe)\n",
647 | "asset_list_sizes = [group[1].size for group in data.groupby(level=0)]\n",
648 | "\n",
649 | "# Spreading the factor portfolio data across all assets for each day\n",
650 | "SMB_column = [[SMB.loc[group[0]]] * size for group, size \\\n",
651 | " in zip(data.groupby(level=0), asset_list_sizes)]\n",
652 | "data['SMB'] = list(itertools.chain(*SMB_column))\n",
653 | "\n",
654 | "HML_column = [[HML.loc[group[0]]] * size for group, size \\\n",
655 | " in zip(data.groupby(level=0), asset_list_sizes)]\n",
656 | "data['HML'] = list(itertools.chain(*HML_column))\n",
657 | "data = sm.add_constant(data.dropna())"
658 | ]
659 | },
660 | {
661 | "cell_type": "code",
662 | "execution_count": 12,
663 | "metadata": {},
664 | "outputs": [
665 | {
666 | "data": {
667 | "text/html": [
668 | "\n",
669 | "
\n",
670 | " \n",
671 | " \n",
672 | " | \n",
673 | " | \n",
674 | " const | \n",
675 | " return | \n",
676 | " SMB | \n",
677 | " HML | \n",
678 | "
\n",
679 | " \n",
680 | " date | \n",
681 | " order_book_id | \n",
682 | " | \n",
683 | " | \n",
684 | " | \n",
685 | " | \n",
686 | "
\n",
687 | " \n",
688 | " \n",
689 | " \n",
690 | " 2012-01-04 | \n",
691 | " 000004.XSHE | \n",
692 | " 1.0 | \n",
693 | " -0.022250 | \n",
694 | " 0.005983 | \n",
695 | " 0.005302 | \n",
696 | "
\n",
697 | " \n",
698 | " 000028.XSHE | \n",
699 | " 1.0 | \n",
700 | " -0.045433 | \n",
701 | " 0.005983 | \n",
702 | " 0.005302 | \n",
703 | "
\n",
704 | " \n",
705 | " 000150.XSHE | \n",
706 | " 1.0 | \n",
707 | " -0.030295 | \n",
708 | " 0.005983 | \n",
709 | " 0.005302 | \n",
710 | "
\n",
711 | " \n",
712 | " 000153.XSHE | \n",
713 | " 1.0 | \n",
714 | " -0.028053 | \n",
715 | " 0.005983 | \n",
716 | " 0.005302 | \n",
717 | "
\n",
718 | " \n",
719 | " 000403.XSHE | \n",
720 | " 1.0 | \n",
721 | " 0.000000 | \n",
722 | " 0.005983 | \n",
723 | " 0.005302 | \n",
724 | "
\n",
725 | " \n",
726 | "
\n",
727 | "
"
728 | ],
729 | "text/plain": [
730 | " const return SMB HML\n",
731 | "date order_book_id \n",
732 | "2012-01-04 000004.XSHE 1.0 -0.022250 0.005983 0.005302\n",
733 | " 000028.XSHE 1.0 -0.045433 0.005983 0.005302\n",
734 | " 000150.XSHE 1.0 -0.030295 0.005983 0.005302\n",
735 | " 000153.XSHE 1.0 -0.028053 0.005983 0.005302\n",
736 | " 000403.XSHE 1.0 0.000000 0.005983 0.005302"
737 | ]
738 | },
739 | "execution_count": 12,
740 | "metadata": {},
741 | "output_type": "execute_result"
742 | }
743 | ],
744 | "source": [
745 | "data.head()"
746 | ]
747 | },
748 | {
749 | "cell_type": "markdown",
750 | "metadata": {},
751 | "source": [
752 | "## Factor Exposures ($\\beta$)"
753 | ]
754 | },
755 | {
756 | "cell_type": "code",
757 | "execution_count": 13,
758 | "metadata": {},
759 | "outputs": [],
760 | "source": [
761 | "assets = data.index.levels[1].unique()\n",
762 | "Y = [data.xs(asset,level=1)['return'] for asset in assets]\n",
763 | "X = [data.xs(asset,level=1)[['SMB','HML','const']] for asset in assets]\n",
764 | "reg_results = [regression.linear_model.OLS(y,x).fit().params for y,x in zip(Y,X) if not(x.empty or y.empty)]\n",
765 | "indices = [asset for y, x, asset in zip(Y, X, assets) if not(x.empty or y.empty)]\n",
766 | "betas = pd.DataFrame(reg_results, index=indices)"
767 | ]
768 | },
769 | {
770 | "cell_type": "code",
771 | "execution_count": 15,
772 | "metadata": {},
773 | "outputs": [
774 | {
775 | "data": {
776 | "text/html": [
777 | "\n",
778 | "
\n",
779 | " \n",
780 | " \n",
781 | " | \n",
782 | " SMB | \n",
783 | " HML | \n",
784 | " const | \n",
785 | "
\n",
786 | " \n",
787 | " \n",
788 | " \n",
789 | " 000004.XSHE | \n",
790 | " 0.883906 | \n",
791 | " 0.048757 | \n",
792 | " 0.002002 | \n",
793 | "
\n",
794 | " \n",
795 | " 000028.XSHE | \n",
796 | " -0.003029 | \n",
797 | " -0.064295 | \n",
798 | " 0.001073 | \n",
799 | "
\n",
800 | " \n",
801 | " 000150.XSHE | \n",
802 | " 0.354122 | \n",
803 | " 0.066071 | \n",
804 | " 0.002031 | \n",
805 | "
\n",
806 | " \n",
807 | " 000153.XSHE | \n",
808 | " 0.620706 | \n",
809 | " -0.082229 | \n",
810 | " 0.001405 | \n",
811 | "
\n",
812 | " \n",
813 | " 000403.XSHE | \n",
814 | " 2.032192 | \n",
815 | " 11.457418 | \n",
816 | " -0.017412 | \n",
817 | "
\n",
818 | " \n",
819 | "
\n",
820 | "
"
821 | ],
822 | "text/plain": [
823 | " SMB HML const\n",
824 | "000004.XSHE 0.883906 0.048757 0.002002\n",
825 | "000028.XSHE -0.003029 -0.064295 0.001073\n",
826 | "000150.XSHE 0.354122 0.066071 0.002031\n",
827 | "000153.XSHE 0.620706 -0.082229 0.001405\n",
828 | "000403.XSHE 2.032192 11.457418 -0.017412"
829 | ]
830 | },
831 | "execution_count": 15,
832 | "metadata": {},
833 | "output_type": "execute_result"
834 | }
835 | ],
836 | "source": [
837 | "betas.head()"
838 | ]
839 | },
840 | {
841 | "cell_type": "markdown",
842 | "metadata": {},
843 | "source": [
844 | "## Factor Premium"
845 | ]
846 | },
847 | {
848 | "cell_type": "code",
849 | "execution_count": 36,
850 | "metadata": {},
851 | "outputs": [
852 | {
853 | "data": {
854 | "text/html": [
855 | "\n",
856 | "OLS Regression Results\n",
857 | "\n",
858 | " Dep. Variable: | return | R-squared: | 0.398 | \n",
859 | "
\n",
860 | "\n",
861 | " Model: | OLS | Adj. R-squared: | 0.391 | \n",
862 | "
\n",
863 | "\n",
864 | " Method: | Least Squares | F-statistic: | 53.26 | \n",
865 | "
\n",
866 | "\n",
867 | " Date: | Sat, 05 May 2018 | Prob (F-statistic): | 1.77e-18 | \n",
868 | "
\n",
869 | "\n",
870 | " Time: | 21:03:25 | Log-Likelihood: | 1012.1 | \n",
871 | "
\n",
872 | "\n",
873 | " No. Observations: | 164 | AIC: | -2018. | \n",
874 | "
\n",
875 | "\n",
876 | " Df Residuals: | 161 | BIC: | -2009. | \n",
877 | "
\n",
878 | "\n",
879 | " Df Model: | 2 | | | \n",
880 | "
\n",
881 | "\n",
882 | " Covariance Type: | nonrobust | | | \n",
883 | "
\n",
884 | "
\n",
885 | "\n",
886 | "\n",
887 | " | coef | std err | t | P>|t| | [0.025 | 0.975] | \n",
888 | "
\n",
889 | "\n",
890 | " const | 0.0017 | 6.72e-05 | 24.956 | 0.000 | 0.002 | 0.002 | \n",
891 | "
\n",
892 | "\n",
893 | " SMB | -7.597e-05 | 0.000 | -0.599 | 0.550 | -0.000 | 0.000 | \n",
894 | "
\n",
895 | "\n",
896 | " HML | 0.0005 | 4.81e-05 | 9.695 | 0.000 | 0.000 | 0.001 | \n",
897 | "
\n",
898 | "
\n",
899 | "\n",
900 | "\n",
901 | " Omnibus: | 39.154 | Durbin-Watson: | 1.906 | \n",
902 | "
\n",
903 | "\n",
904 | " Prob(Omnibus): | 0.000 | Jarque-Bera (JB): | 78.545 | \n",
905 | "
\n",
906 | "\n",
907 | " Skew: | 1.087 | Prob(JB): | 8.80e-18 | \n",
908 | "
\n",
909 | "\n",
910 | " Kurtosis: | 5.601 | Cond. No. | 3.92 | \n",
911 | "
\n",
912 | "
"
913 | ],
914 | "text/plain": [
915 | "\n",
916 | "\"\"\"\n",
917 | " OLS Regression Results \n",
918 | "==============================================================================\n",
919 | "Dep. Variable: return R-squared: 0.398\n",
920 | "Model: OLS Adj. R-squared: 0.391\n",
921 | "Method: Least Squares F-statistic: 53.26\n",
922 | "Date: Sat, 05 May 2018 Prob (F-statistic): 1.77e-18\n",
923 | "Time: 21:03:25 Log-Likelihood: 1012.1\n",
924 | "No. Observations: 164 AIC: -2018.\n",
925 | "Df Residuals: 161 BIC: -2009.\n",
926 | "Df Model: 2 \n",
927 | "Covariance Type: nonrobust \n",
928 | "==============================================================================\n",
929 | " coef std err t P>|t| [0.025 0.975]\n",
930 | "------------------------------------------------------------------------------\n",
931 | "const 0.0017 6.72e-05 24.956 0.000 0.002 0.002\n",
932 | "SMB -7.597e-05 0.000 -0.599 0.550 -0.000 0.000\n",
933 | "HML 0.0005 4.81e-05 9.695 0.000 0.000 0.001\n",
934 | "==============================================================================\n",
935 | "Omnibus: 39.154 Durbin-Watson: 1.906\n",
936 | "Prob(Omnibus): 0.000 Jarque-Bera (JB): 78.545\n",
937 | "Skew: 1.087 Prob(JB): 8.80e-18\n",
938 | "Kurtosis: 5.601 Cond. No. 3.92\n",
939 | "==============================================================================\n",
940 | "\n",
941 | "Warnings:\n",
942 | "[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.\n",
943 | "\"\"\""
944 | ]
945 | },
946 | "execution_count": 36,
947 | "metadata": {},
948 | "output_type": "execute_result"
949 | }
950 | ],
951 | "source": [
952 | "betas = sm.add_constant(betas.drop('const', axis=1))\n",
953 | "\n",
954 | "R = data['return'].mean(axis=0, level=1)\n",
955 | "\n",
956 | "# Second regression step: estimating the risk premia\n",
957 | "risk_free_rate = benchmark_df['return'].mean()\n",
958 | "\n",
959 | "final_results = regression.linear_model.OLS(R - risk_free_rate, betas).fit()\n",
960 | "\n",
961 | "final_results.summary()"
962 | ]
963 | },
964 | {
965 | "cell_type": "markdown",
966 | "metadata": {},
967 | "source": [
968 | "## Fama-Macbeth Test Conclusion: \n",
 969 |     "although some of our factors (HML here, but not SMB) are individually significant, we have a fairly low $R^2$. What this may suggest is that there is a real link between our factors and the returns of our assets, but that there still remains a lot of unexplained noise!"
970 | ]
971 | },
972 | {
973 | "cell_type": "code",
974 | "execution_count": null,
975 | "metadata": {},
976 | "outputs": [],
977 | "source": []
978 | }
979 | ],
980 | "metadata": {
981 | "kernelspec": {
982 | "display_name": "Python 3",
983 | "language": "python",
984 | "name": "python3"
985 | },
986 | "language_info": {
987 | "codemirror_mode": {
988 | "name": "ipython",
989 | "version": 3
990 | },
991 | "file_extension": ".py",
992 | "mimetype": "text/x-python",
993 | "name": "python",
994 | "nbconvert_exporter": "python",
995 | "pygments_lexer": "ipython3",
996 | "version": "3.5.4"
997 | }
998 | },
999 | "nbformat": 4,
1000 | "nbformat_minor": 2
1001 | }
1002 |
--------------------------------------------------------------------------------
/source/rqdata_utils.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import alphalens as al
3 | import matplotlib.pyplot as plt
4 |
def price_reader(price_path):
    """Load a wide price CSV into a date-indexed DataFrame.

    The CSV is expected to have dates in an unnamed first column and one
    price column per instrument.

    Parameters
    ----------
    price_path : str
        Path to the CSV file produced by the data-download step.

    Returns
    -------
    pd.DataFrame
        Prices indexed by 'date' (parsed as datetimes), with the
        instrument columns sorted alphabetically.
    """
    price_df = pd.read_csv(price_path)
    # The index column is written without a header, so read_csv names it
    # "Unnamed: 0"; give it a proper name before indexing on it.
    price_df.rename(index=str,columns={"Unnamed: 0":"date"},inplace=True)
    # NOTE(review): errors='ignore' is deprecated in recent pandas; it is a
    # no-op for well-formed %Y-%m-%d dates, kept for backward compatibility.
    price_df.date = pd.to_datetime(price_df.date,format="%Y-%m-%d",errors='ignore')
    # price_df.date = price_df.date.apply(timezone.localize)
    price_df.set_index(['date'],drop=True,inplace=True)
    # Fix: DataFrame.sortlevel() was removed in pandas 1.0;
    # sort_index(axis=1) is the supported equivalent for ordering columns.
    price_df = price_df.sort_index(axis=1)
    return price_df
13 |
def instrument_reader(instrument_path):
    """Read the instrument metadata CSV and return it indexed by bookId.

    Drops the CSV's leftover positional index column ("Unnamed: 0") and
    sorts the resulting frame by instrument id.
    """
    raw = pd.read_csv(instrument_path)
    raw.drop(['Unnamed: 0'], axis=1, inplace=True)
    return raw.set_index(['bookId']).sort_index()
20 |
def equity_reader(equity_path):
    """Read the per-equity panel CSV into a (date, order_book_id) indexed frame.

    Dates are parsed as datetimes and the CSV's leftover positional index
    column ("Unnamed: 0") is discarded.
    """
    frame = pd.read_csv(equity_path)
    frame['date'] = pd.to_datetime(frame['date'], format="%Y-%m-%d", errors='ignore')
    frame = frame.set_index(['date', 'order_book_id']).drop(columns=["Unnamed: 0"])
    return frame
27 |
def equity_add_instrumentInfo(cn_df,instrument_df,instrument_column):
    """Attach one instrument-level attribute to every row of the equity panel.

    For each row of `cn_df`, the row's order_book_id is looked up in
    `instrument_df` (indexed by bookId) and the value of
    `instrument_column` is written onto `cn_df` under the same name.

    Parameters
    ----------
    cn_df : pd.DataFrame
        Equity panel indexed by (date, order_book_id); mutated in place.
    instrument_df : pd.DataFrame
        Per-instrument metadata indexed by bookId.
    instrument_column : str
        Column of `instrument_df` to broadcast onto `cn_df`.

    Returns
    -------
    pd.DataFrame
        The same `cn_df` object, with the new column added.
    """
    instrumentInfoSeries = instrument_df[instrument_column]
    bookIdIdx = cn_df.index.get_level_values('order_book_id')
    # Fix: Index.get_values() was removed in pandas 1.0; to_numpy() is the
    # supported replacement. .loc makes the label-based lookup explicit.
    instrumentInfo = instrumentInfoSeries.loc[bookIdIdx.to_numpy()].values
    cn_df[instrument_column] = instrumentInfo
    return cn_df
35 |
def get_price_instrument_equity(price_path,instrument_path,equity_path,addInstrumentColumn=None):
    """Load prices, instrument metadata and the equity panel in one call.

    When `addInstrumentColumn` is truthy, that instrument attribute is
    merged onto the equity panel before returning.

    Returns the (price_df, instrument_df, equity_df) triple.
    """
    prices = price_reader(price_path)
    instruments = instrument_reader(instrument_path)
    equities = equity_reader(equity_path)
    if addInstrumentColumn:
        equities = equity_add_instrumentInfo(equities, instruments, addInstrumentColumn)
    return prices, instruments, equities
43 |
def ic_analysis(equity_df, price_df, factor_columns, group_column, periods=(1,22,66), group_adjust=False):
    """Run an alphalens information-coefficient analysis per factor column.

    For every factor in `factor_columns`, the mean Spearman rank IC is
    computed by group (using `group_column` of `equity_df` as the grouping)
    and by month; each monthly IC is plotted as a heatmap. Prints a
    per-factor summary as a side effect.

    Returns a (mean_ic_df, monthly_ic_list) pair where mean_ic_df is
    indexed by ('factor', 'group').
    """
    sector_labels = equity_df[group_column]
    per_group_ics = []
    per_month_ics = []

    for factor_series in (equity_df[name] for name in factor_columns):
        clean_data = al.utils.get_clean_factor_and_forward_returns(factor=factor_series,
                                                                   prices=price_df,
                                                                   groupby=sector_labels,
                                                                   periods=periods,
                                                                   max_loss=1)
        ic_by_group = al.performance.mean_information_coefficient(clean_data,
                                                                  group_adjust=group_adjust,
                                                                  by_group=True,
                                                                  by_time=None)
        ic_by_month = al.performance.mean_information_coefficient(clean_data,
                                                                  group_adjust=group_adjust,
                                                                  by_group=False,
                                                                  by_time='M')
        print("#######################################################")
        print("factor: {}".format(factor_series.name))
        print(ic_by_group)
        # print(ic_by_month)
        per_group_ics.append(ic_by_group)
        per_month_ics.append(ic_by_month)
        al.plotting.plot_monthly_ic_heatmap(ic_by_month)
        plt.show()

    combined = pd.concat(per_group_ics, keys=factor_columns)
    combined.index = combined.index.set_names(['factor', 'group'])
    return combined, per_month_ics
--------------------------------------------------------------------------------