├── .gitignore ├── APT_FammaMacbeth.ipynb ├── Alpha Trading Workflow.pdf ├── KalmanFilterIntro.ipynb ├── README.md ├── README.pdf ├── README_old.md ├── Step1_FactorPretest.ipynb ├── Step2_FactorsScreening-Copy1.ipynb ├── Step2_FactorsScreening.ipynb ├── Step3_FactorCombination_AdaBoost_Quantopian.ipynb ├── Step3_FactorCombination_AdaBoost_Quantopian_old.ipynb ├── Step3_FactorCombination_BarraKalmanFilter.ipynb ├── output └── factor_ic_analysis.csv ├── report ├── Alpha Trading Workflow.md ├── Corr_matrix_for_factor_ranks.png ├── Corr_matrix_for_raw_factors.png ├── Quantitative Strategy Workflow.pptx ├── Step3_FactorCombination_AdaBoost_Quantopian.html ├── adaboost_algorithm.png ├── corr_comparison_after_pca_analysis.png ├── mean_spearmans_rank_IC.png ├── mean_spearmans_rank_IC_absolute_value.png ├── rank_of_mean_spearmans_rank_IC_absolute_value.png ├── test_accuracy_bar.png ├── test_score_dist.png ├── train_accuracy_bar.png ├── train_score_dist.png └── train_score_dist2.png ├── rqdata_utils.py └── source ├── DownloadData.ipynb ├── DownloadData_bak.ipynb ├── FactorAnalysis.ipynb ├── FactorModeling.ipynb ├── FactorsScreening.ipynb ├── KalmanFilter.ipynb ├── MultiFactorModel.ipynb └── rqdata_utils.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Spesific files # 2 | ################### 3 | cn_*.csv 4 | *-checkpoint.ipynb 5 | 6 | # Spesific directors # 7 | # ################# 8 | .idea/ 9 | .ipython_checkpoints/ 10 | __pycache__/ 11 | 12 | # Backup # 13 | ################### 14 | *.pyo 15 | *.pyc 16 | *~ 17 | *.bak 18 | *.swp 19 | *# 20 | 21 | # Images # 22 | ################### 23 | #*.jpg 24 | *.gif 25 | #*.png 26 | *.svg 27 | *.ico 28 | 29 | # Compiled source # 30 | ################### 31 | *.com 32 | *.class 33 | *.dll 34 | *.exe 35 | *.o 36 | *.so 37 | 38 | # Packages # 39 | ############ 40 | # it's better to unpack these files and commit the raw source 41 | # git has its own built in compression methods 42 | 
*.7z 43 | *.dmg 44 | *.gz 45 | *.iso 46 | *.jar 47 | *.rar 48 | *.tar 49 | *.zip 50 | 51 | # Logs and databases # 52 | ###################### 53 | *.log 54 | *.sql 55 | *.sqlite 56 | 57 | # OS generated files # 58 | ###################### 59 | .DS_Store 60 | .DS_Store? 61 | ._* 62 | .Spotlight-V100 63 | .Trashes 64 | ehthumbs.db 65 | Thumbs.db 66 | -------------------------------------------------------------------------------- /APT_FammaMacbeth.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# APT model: Famma-Macbeth Regression" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": { 14 | "collapsed": true 15 | }, 16 | "outputs": [], 17 | "source": [ 18 | "from rqdata_utils import *\n", 19 | "import pandas\n", 20 | "import numpy as np\n", 21 | "import scipy as sp\n", 22 | "import alphalens as al\n", 23 | "%matplotlib inline" 24 | ] 25 | }, 26 | { 27 | "cell_type": "markdown", 28 | "metadata": {}, 29 | "source": [ 30 | "## Loading Data" 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": 2, 36 | "metadata": { 37 | "collapsed": true 38 | }, 39 | "outputs": [], 40 | "source": [ 41 | "price_df,instrument_df,equity_df = get_price_instrument_equity(\"cn_stock_price_2012_2018.csv\",\"cn_instrument_info_2012_2018.csv\",\"cn_equity_daily_2012_2018.csv\",\"sectorCode\")" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": 3, 47 | "metadata": { 48 | "collapsed": false 49 | }, 50 | "outputs": [ 51 | { 52 | "data": { 53 | "text/html": [ 54 | "
\n", 55 | "\n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | "
returnclosetotal_turnovervolumeweekmonthreport_quartermarket_capa_share_market_val_2cash_received_from_sales_of_goodspb_rationet_profitps_ratiosectorCode
dateorder_book_id
2012-01-04000001.XSHE-0.0275825.12242.275637e+0840894428.00.57750.4331NaNNaNNaNNaNNaNNaNNaNFinancials
000002.XSHE-0.0187426.05253.559891e+0847432958.00.37110.40302011q38.059489e+107.082120e+107.516785e+101.52164.106349e+090.8679Financials
000004.XSHE-0.0222507.91003.763833e+06465469.00.57200.75062011q36.642556e+086.634549e+085.949968e+078.81754.500363e+0637.5796HealthCare
000005.XSHE0.0000003.86000.000000e+000.00.00000.00002011q33.529328e+093.527048e+092.565851e+075.34801.365665e+07-347.2191Industrials
000006.XSHE-0.0097562.67667.619286e+062513811.00.14160.16672011q34.015370e+093.929464e+092.531436e+091.43482.763917e+081.4139Financials
\n", 183 | "
" 184 | ], 185 | "text/plain": [ 186 | " return close total_turnover volume \\\n", 187 | "date order_book_id \n", 188 | "2012-01-04 000001.XSHE -0.027582 5.1224 2.275637e+08 40894428.0 \n", 189 | " 000002.XSHE -0.018742 6.0525 3.559891e+08 47432958.0 \n", 190 | " 000004.XSHE -0.022250 7.9100 3.763833e+06 465469.0 \n", 191 | " 000005.XSHE 0.000000 3.8600 0.000000e+00 0.0 \n", 192 | " 000006.XSHE -0.009756 2.6766 7.619286e+06 2513811.0 \n", 193 | "\n", 194 | " week month report_quarter market_cap \\\n", 195 | "date order_book_id \n", 196 | "2012-01-04 000001.XSHE 0.5775 0.4331 NaN NaN \n", 197 | " 000002.XSHE 0.3711 0.4030 2011q3 8.059489e+10 \n", 198 | " 000004.XSHE 0.5720 0.7506 2011q3 6.642556e+08 \n", 199 | " 000005.XSHE 0.0000 0.0000 2011q3 3.529328e+09 \n", 200 | " 000006.XSHE 0.1416 0.1667 2011q3 4.015370e+09 \n", 201 | "\n", 202 | " a_share_market_val_2 \\\n", 203 | "date order_book_id \n", 204 | "2012-01-04 000001.XSHE NaN \n", 205 | " 000002.XSHE 7.082120e+10 \n", 206 | " 000004.XSHE 6.634549e+08 \n", 207 | " 000005.XSHE 3.527048e+09 \n", 208 | " 000006.XSHE 3.929464e+09 \n", 209 | "\n", 210 | " cash_received_from_sales_of_goods pb_ratio \\\n", 211 | "date order_book_id \n", 212 | "2012-01-04 000001.XSHE NaN NaN \n", 213 | " 000002.XSHE 7.516785e+10 1.5216 \n", 214 | " 000004.XSHE 5.949968e+07 8.8175 \n", 215 | " 000005.XSHE 2.565851e+07 5.3480 \n", 216 | " 000006.XSHE 2.531436e+09 1.4348 \n", 217 | "\n", 218 | " net_profit ps_ratio sectorCode \n", 219 | "date order_book_id \n", 220 | "2012-01-04 000001.XSHE NaN NaN Financials \n", 221 | " 000002.XSHE 4.106349e+09 0.8679 Financials \n", 222 | " 000004.XSHE 4.500363e+06 37.5796 HealthCare \n", 223 | " 000005.XSHE 1.365665e+07 -347.2191 Industrials \n", 224 | " 000006.XSHE 2.763917e+08 1.4139 Financials " 225 | ] 226 | }, 227 | "execution_count": 3, 228 | "metadata": {}, 229 | "output_type": "execute_result" 230 | } 231 | ], 232 | "source": [ 233 | "equity_df.head()" 234 | ] 235 | }, 236 | { 237 | 
"cell_type": "code", 238 | "execution_count": 4, 239 | "metadata": { 240 | "collapsed": false 241 | }, 242 | "outputs": [ 243 | { 244 | "data": { 245 | "text/plain": [ 246 | "164" 247 | ] 248 | }, 249 | "execution_count": 4, 250 | "metadata": {}, 251 | "output_type": "execute_result" 252 | } 253 | ], 254 | "source": [ 255 | "healthcareUniverse = instrument_df.index[instrument_df.sectorCode=='HealthCare'].values\n", 256 | "len(healthcareUniverse)" 257 | ] 258 | }, 259 | { 260 | "cell_type": "code", 261 | "execution_count": 5, 262 | "metadata": { 263 | "collapsed": true 264 | }, 265 | "outputs": [], 266 | "source": [ 267 | "def equity_universe_filtering(equity_df, universe):\n", 268 | " universeFilter = [book_id in set(universe) for book_id in equity_df.index.get_level_values(level=1).values]\n", 269 | " return equity_df[universeFilter]" 270 | ] 271 | }, 272 | { 273 | "cell_type": "code", 274 | "execution_count": 6, 275 | "metadata": { 276 | "collapsed": false 277 | }, 278 | "outputs": [ 279 | { 280 | "data": { 281 | "text/html": [ 282 | "
\n", 283 | "\n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | "
returnclosetotal_turnovervolumeweekmonthreport_quartermarket_capa_share_market_val_2cash_received_from_sales_of_goodspb_rationet_profitps_ratiosectorCode
dateorder_book_id
2012-01-04000004.XSHE-0.0222507.91003763832.88465469.00.57200.75062011q36.642556e+086.634549e+085.949968e+078.81754.500363e+0637.5796HealthCare
000028.XSHE-0.04543319.84229326924.28450553.00.42010.27222011q35.872485e+094.753820e+091.053298e+104.34932.481834e+080.3414HealthCare
000150.XSHE-0.0302953.17373109304.50952600.00.34600.36102011q31.036800e+091.036800e+094.913279e+071.47633.657858e+067.8956HealthCare
000153.XSHE-0.0280535.77009673054.491596020.00.68302.45942011q31.531454e+091.360856e+091.329425e+092.11691.560397e+070.7818HealthCare
000403.XSHE0.0000003.16250.000.00.00000.0000NaNNaNNaNNaNNaNNaNNaNHealthCare
\n", 411 | "
" 412 | ], 413 | "text/plain": [ 414 | " return close total_turnover volume \\\n", 415 | "date order_book_id \n", 416 | "2012-01-04 000004.XSHE -0.022250 7.9100 3763832.88 465469.0 \n", 417 | " 000028.XSHE -0.045433 19.8422 9326924.28 450553.0 \n", 418 | " 000150.XSHE -0.030295 3.1737 3109304.50 952600.0 \n", 419 | " 000153.XSHE -0.028053 5.7700 9673054.49 1596020.0 \n", 420 | " 000403.XSHE 0.000000 3.1625 0.00 0.0 \n", 421 | "\n", 422 | " week month report_quarter market_cap \\\n", 423 | "date order_book_id \n", 424 | "2012-01-04 000004.XSHE 0.5720 0.7506 2011q3 6.642556e+08 \n", 425 | " 000028.XSHE 0.4201 0.2722 2011q3 5.872485e+09 \n", 426 | " 000150.XSHE 0.3460 0.3610 2011q3 1.036800e+09 \n", 427 | " 000153.XSHE 0.6830 2.4594 2011q3 1.531454e+09 \n", 428 | " 000403.XSHE 0.0000 0.0000 NaN NaN \n", 429 | "\n", 430 | " a_share_market_val_2 \\\n", 431 | "date order_book_id \n", 432 | "2012-01-04 000004.XSHE 6.634549e+08 \n", 433 | " 000028.XSHE 4.753820e+09 \n", 434 | " 000150.XSHE 1.036800e+09 \n", 435 | " 000153.XSHE 1.360856e+09 \n", 436 | " 000403.XSHE NaN \n", 437 | "\n", 438 | " cash_received_from_sales_of_goods pb_ratio \\\n", 439 | "date order_book_id \n", 440 | "2012-01-04 000004.XSHE 5.949968e+07 8.8175 \n", 441 | " 000028.XSHE 1.053298e+10 4.3493 \n", 442 | " 000150.XSHE 4.913279e+07 1.4763 \n", 443 | " 000153.XSHE 1.329425e+09 2.1169 \n", 444 | " 000403.XSHE NaN NaN \n", 445 | "\n", 446 | " net_profit ps_ratio sectorCode \n", 447 | "date order_book_id \n", 448 | "2012-01-04 000004.XSHE 4.500363e+06 37.5796 HealthCare \n", 449 | " 000028.XSHE 2.481834e+08 0.3414 HealthCare \n", 450 | " 000150.XSHE 3.657858e+06 7.8956 HealthCare \n", 451 | " 000153.XSHE 1.560397e+07 0.7818 HealthCare \n", 452 | " 000403.XSHE NaN NaN HealthCare " 453 | ] 454 | }, 455 | "execution_count": 6, 456 | "metadata": {}, 457 | "output_type": "execute_result" 458 | } 459 | ], 460 | "source": [ 461 | "healthcare_equity_df = equity_universe_filtering(equity_df, healthcareUniverse)\n", 
462 | "healthcare_equity_df.head()" 463 | ] 464 | }, 465 | { 466 | "cell_type": "code", 467 | "execution_count": 7, 468 | "metadata": { 469 | "collapsed": false 470 | }, 471 | "outputs": [ 472 | { 473 | "name": "stdout", 474 | "output_type": "stream", 475 | "text": [ 476 | "universe ratio: 6.210331877919959%\n" 477 | ] 478 | } 479 | ], 480 | "source": [ 481 | "print(\"universe ratio: {}%\".format(len(healthcare_equity_df)/len(equity_df)*100))" 482 | ] 483 | }, 484 | { 485 | "cell_type": "markdown", 486 | "metadata": {}, 487 | "source": [ 488 | "### benchmark" 489 | ] 490 | }, 491 | { 492 | "cell_type": "code", 493 | "execution_count": 27, 494 | "metadata": { 495 | "collapsed": true 496 | }, 497 | "outputs": [], 498 | "source": [ 499 | "benchmark_df = pd.read_csv(\"cn_SH_healthcare_index_2012_2018.csv\",names=['date','value'])\n", 500 | "benchmark_df = benchmark_df.set_index('date',drop=True)" 501 | ] 502 | }, 503 | { 504 | "cell_type": "code", 505 | "execution_count": 33, 506 | "metadata": { 507 | "collapsed": false 508 | }, 509 | "outputs": [ 510 | { 511 | "data": { 512 | "text/html": [ 513 | "
\n", 514 | "\n", 515 | " \n", 516 | " \n", 517 | " \n", 518 | " \n", 519 | " \n", 520 | " \n", 521 | " \n", 522 | " \n", 523 | " \n", 524 | " \n", 525 | " \n", 526 | " \n", 527 | " \n", 528 | " \n", 529 | " \n", 530 | " \n", 531 | " \n", 532 | " \n", 533 | " \n", 534 | " \n", 535 | " \n", 536 | " \n", 537 | " \n", 538 | " \n", 539 | " \n", 540 | " \n", 541 | " \n", 542 | " \n", 543 | " \n", 544 | " \n", 545 | " \n", 546 | " \n", 547 | " \n", 548 | " \n", 549 | " \n", 550 | " \n", 551 | " \n", 552 | " \n", 553 | " \n", 554 | "
valuereturn
date
2012-01-042891.4620.000000
2012-01-052766.9550.044015
2012-01-062744.7930.008042
2012-01-092833.219-0.031708
2012-01-102929.594-0.033450
\n", 555 | "
" 556 | ], 557 | "text/plain": [ 558 | " value return\n", 559 | "date \n", 560 | "2012-01-04 2891.462 0.000000\n", 561 | "2012-01-05 2766.955 0.044015\n", 562 | "2012-01-06 2744.793 0.008042\n", 563 | "2012-01-09 2833.219 -0.031708\n", 564 | "2012-01-10 2929.594 -0.033450" 565 | ] 566 | }, 567 | "execution_count": 33, 568 | "metadata": {}, 569 | "output_type": "execute_result" 570 | } 571 | ], 572 | "source": [ 573 | "benchmark_df['return'] = np.log(benchmark_df.shift(1)/benchmark_df).fillna(0)\n", 574 | "benchmark_df.head()" 575 | ] 576 | }, 577 | { 578 | "cell_type": "markdown", 579 | "metadata": {}, 580 | "source": [ 581 | "## Factor Returns" 582 | ] 583 | }, 584 | { 585 | "cell_type": "code", 586 | "execution_count": 8, 587 | "metadata": { 588 | "collapsed": true 589 | }, 590 | "outputs": [], 591 | "source": [ 592 | "def equity_factor_return(equity_df, factorColumn, nAllocations, longTop=True):\n", 593 | " equity_copy = equity_df.copy()\n", 594 | "# equity_copy[\"{}_rank\".format(factorColumn)] = equity_copy.groupby(level='date')[factorColumn].rank()\n", 595 | "# equity_copy[equity_copy.groupby(level='date')[factorColumn].nlargest(nAllocations).index][\"biggest_{}_{}\".format(nAllocations,factorColumn)]=True\n", 596 | " largest = equity_copy[factorColumn].groupby(level='date').nlargest(nAllocations).reset_index(level=0,drop=True)\n", 597 | " smallest = equity_copy[factorColumn].groupby(level='date').nsmallest(nAllocations).reset_index(level=0,drop=True)\n", 598 | " r_largest = equity_copy.loc[largest.index,'return'].groupby(level='date').mean()\n", 599 | " r_smallest = equity_copy.loc[smallest.index,'return'].groupby(level='date').mean()\n", 600 | " LMS = r_largest - r_smallest\n", 601 | " if(longTop):\n", 602 | " return LMS\n", 603 | " else:\n", 604 | " return -LMS" 605 | ] 606 | }, 607 | { 608 | "cell_type": "code", 609 | "execution_count": 9, 610 | "metadata": { 611 | "collapsed": false 612 | }, 613 | "outputs": [ 614 | { 615 | "data": { 616 | "text/plain": 
[ 617 | "date\n", 618 | "2012-01-04 0.005983\n", 619 | "2012-01-05 -0.009098\n", 620 | "2012-01-06 -0.004155\n", 621 | "2012-01-09 0.014615\n", 622 | "2012-01-10 0.006728\n", 623 | "Name: return, dtype: float64" 624 | ] 625 | }, 626 | "execution_count": 9, 627 | "metadata": {}, 628 | "output_type": "execute_result" 629 | } 630 | ], 631 | "source": [ 632 | "SMB = equity_factor_return(healthcare_equity_df, 'market_cap', 20,longTop=False)\n", 633 | "SMB.head()" 634 | ] 635 | }, 636 | { 637 | "cell_type": "code", 638 | "execution_count": 10, 639 | "metadata": { 640 | "collapsed": false 641 | }, 642 | "outputs": [ 643 | { 644 | "data": { 645 | "text/plain": [ 646 | "date\n", 647 | "2012-01-04 0.005302\n", 648 | "2012-01-05 -0.007223\n", 649 | "2012-01-06 0.006031\n", 650 | "2012-01-09 -0.002597\n", 651 | "2012-01-10 -0.010780\n", 652 | "Name: return, dtype: float64" 653 | ] 654 | }, 655 | "execution_count": 10, 656 | "metadata": {}, 657 | "output_type": "execute_result" 658 | } 659 | ], 660 | "source": [ 661 | "HML = equity_factor_return(healthcare_equity_df, 'pb_ratio', 20,longTop=True)\n", 662 | "HML.head()" 663 | ] 664 | }, 665 | { 666 | "cell_type": "code", 667 | "execution_count": 11, 668 | "metadata": { 669 | "collapsed": true 670 | }, 671 | "outputs": [], 672 | "source": [ 673 | "import itertools\n", 674 | "import statsmodels.api as sm\n", 675 | "from statsmodels import regression,stats\n", 676 | "import scipy\n", 677 | "\n", 678 | "data = healthcare_equity_df[['return']] # dataframe\n", 679 | "data = data.set_index(healthcare_equity_df.index) # elimilate redundant index (whole universe)\n", 680 | "asset_list_sizes = [group[1].size for group in data.groupby(level=0)]\n", 681 | "\n", 682 | "# Spreading the factor portfolio data across all assets for each day\n", 683 | "SMB_column = [[SMB.loc[group[0]]] * size for group, size \\\n", 684 | " in zip(data.groupby(level=0), asset_list_sizes)]\n", 685 | "data['SMB'] = list(itertools.chain(*SMB_column))\n", 686 | "\n", 
687 | "HML_column = [[HML.loc[group[0]]] * size for group, size \\\n", 688 | " in zip(data.groupby(level=0), asset_list_sizes)]\n", 689 | "data['HML'] = list(itertools.chain(*HML_column))\n", 690 | "data = sm.add_constant(data.dropna())" 691 | ] 692 | }, 693 | { 694 | "cell_type": "code", 695 | "execution_count": 12, 696 | "metadata": { 697 | "collapsed": false 698 | }, 699 | "outputs": [ 700 | { 701 | "data": { 702 | "text/html": [ 703 | "
\n", 704 | "\n", 705 | " \n", 706 | " \n", 707 | " \n", 708 | " \n", 709 | " \n", 710 | " \n", 711 | " \n", 712 | " \n", 713 | " \n", 714 | " \n", 715 | " \n", 716 | " \n", 717 | " \n", 718 | " \n", 719 | " \n", 720 | " \n", 721 | " \n", 722 | " \n", 723 | " \n", 724 | " \n", 725 | " \n", 726 | " \n", 727 | " \n", 728 | " \n", 729 | " \n", 730 | " \n", 731 | " \n", 732 | " \n", 733 | " \n", 734 | " \n", 735 | " \n", 736 | " \n", 737 | " \n", 738 | " \n", 739 | " \n", 740 | " \n", 741 | " \n", 742 | " \n", 743 | " \n", 744 | " \n", 745 | " \n", 746 | " \n", 747 | " \n", 748 | " \n", 749 | " \n", 750 | " \n", 751 | " \n", 752 | " \n", 753 | " \n", 754 | " \n", 755 | " \n", 756 | " \n", 757 | " \n", 758 | " \n", 759 | " \n", 760 | " \n", 761 | "
constreturnSMBHML
dateorder_book_id
2012-01-04000004.XSHE1.0-0.0222500.0059830.005302
000028.XSHE1.0-0.0454330.0059830.005302
000150.XSHE1.0-0.0302950.0059830.005302
000153.XSHE1.0-0.0280530.0059830.005302
000403.XSHE1.00.0000000.0059830.005302
\n", 762 | "
" 763 | ], 764 | "text/plain": [ 765 | " const return SMB HML\n", 766 | "date order_book_id \n", 767 | "2012-01-04 000004.XSHE 1.0 -0.022250 0.005983 0.005302\n", 768 | " 000028.XSHE 1.0 -0.045433 0.005983 0.005302\n", 769 | " 000150.XSHE 1.0 -0.030295 0.005983 0.005302\n", 770 | " 000153.XSHE 1.0 -0.028053 0.005983 0.005302\n", 771 | " 000403.XSHE 1.0 0.000000 0.005983 0.005302" 772 | ] 773 | }, 774 | "execution_count": 12, 775 | "metadata": {}, 776 | "output_type": "execute_result" 777 | } 778 | ], 779 | "source": [ 780 | "data.head()" 781 | ] 782 | }, 783 | { 784 | "cell_type": "markdown", 785 | "metadata": {}, 786 | "source": [ 787 | "## Factor Exposures ($\\beta$)" 788 | ] 789 | }, 790 | { 791 | "cell_type": "code", 792 | "execution_count": 13, 793 | "metadata": { 794 | "collapsed": true 795 | }, 796 | "outputs": [], 797 | "source": [ 798 | "assets = data.index.levels[1].unique()\n", 799 | "Y = [data.xs(asset,level=1)['return'] for asset in assets]\n", 800 | "X = [data.xs(asset,level=1)[['SMB','HML','const']] for asset in assets]\n", 801 | "reg_results = [regression.linear_model.OLS(y,x).fit().params for y,x in zip(Y,X) if not(x.empty or y.empty)]\n", 802 | "indices = [asset for y, x, asset in zip(Y, X, assets) if not(x.empty or y.empty)]\n", 803 | "betas = pd.DataFrame(reg_results, index=indices)" 804 | ] 805 | }, 806 | { 807 | "cell_type": "code", 808 | "execution_count": 15, 809 | "metadata": { 810 | "collapsed": false 811 | }, 812 | "outputs": [ 813 | { 814 | "data": { 815 | "text/html": [ 816 | "
\n", 817 | "\n", 818 | " \n", 819 | " \n", 820 | " \n", 821 | " \n", 822 | " \n", 823 | " \n", 824 | " \n", 825 | " \n", 826 | " \n", 827 | " \n", 828 | " \n", 829 | " \n", 830 | " \n", 831 | " \n", 832 | " \n", 833 | " \n", 834 | " \n", 835 | " \n", 836 | " \n", 837 | " \n", 838 | " \n", 839 | " \n", 840 | " \n", 841 | " \n", 842 | " \n", 843 | " \n", 844 | " \n", 845 | " \n", 846 | " \n", 847 | " \n", 848 | " \n", 849 | " \n", 850 | " \n", 851 | " \n", 852 | " \n", 853 | " \n", 854 | " \n", 855 | " \n", 856 | " \n", 857 | " \n", 858 | "
SMBHMLconst
000004.XSHE0.8839060.0487570.002002
000028.XSHE-0.003029-0.0642950.001073
000150.XSHE0.3541220.0660710.002031
000153.XSHE0.620706-0.0822290.001405
000403.XSHE2.03219211.457418-0.017412
\n", 859 | "
" 860 | ], 861 | "text/plain": [ 862 | " SMB HML const\n", 863 | "000004.XSHE 0.883906 0.048757 0.002002\n", 864 | "000028.XSHE -0.003029 -0.064295 0.001073\n", 865 | "000150.XSHE 0.354122 0.066071 0.002031\n", 866 | "000153.XSHE 0.620706 -0.082229 0.001405\n", 867 | "000403.XSHE 2.032192 11.457418 -0.017412" 868 | ] 869 | }, 870 | "execution_count": 15, 871 | "metadata": {}, 872 | "output_type": "execute_result" 873 | } 874 | ], 875 | "source": [ 876 | "betas.head()" 877 | ] 878 | }, 879 | { 880 | "cell_type": "markdown", 881 | "metadata": {}, 882 | "source": [ 883 | "## Factor Premium" 884 | ] 885 | }, 886 | { 887 | "cell_type": "code", 888 | "execution_count": 36, 889 | "metadata": { 890 | "collapsed": false 891 | }, 892 | "outputs": [ 893 | { 894 | "data": { 895 | "text/html": [ 896 | "\n", 897 | "\n", 898 | "\n", 899 | " \n", 900 | "\n", 901 | "\n", 902 | " \n", 903 | "\n", 904 | "\n", 905 | " \n", 906 | "\n", 907 | "\n", 908 | " \n", 909 | "\n", 910 | "\n", 911 | " \n", 912 | "\n", 913 | "\n", 914 | " \n", 915 | "\n", 916 | "\n", 917 | " \n", 918 | "\n", 919 | "\n", 920 | " \n", 921 | "\n", 922 | "\n", 923 | " \n", 924 | "\n", 925 | "
OLS Regression Results
Dep. Variable: return R-squared: 0.398
Model: OLS Adj. R-squared: 0.391
Method: Least Squares F-statistic: 53.26
Date: Sat, 05 May 2018 Prob (F-statistic): 1.77e-18
Time: 21:03:25 Log-Likelihood: 1012.1
No. Observations: 164 AIC: -2018.
Df Residuals: 161 BIC: -2009.
Df Model: 2
Covariance Type: nonrobust
\n", 926 | "\n", 927 | "\n", 928 | " \n", 929 | "\n", 930 | "\n", 931 | " \n", 932 | "\n", 933 | "\n", 934 | " \n", 935 | "\n", 936 | "\n", 937 | " \n", 938 | "\n", 939 | "
coef std err t P>|t| [0.025 0.975]
const 0.0017 6.72e-05 24.956 0.000 0.002 0.002
SMB -7.597e-05 0.000 -0.599 0.550 -0.000 0.000
HML 0.0005 4.81e-05 9.695 0.000 0.000 0.001
\n", 940 | "\n", 941 | "\n", 942 | " \n", 943 | "\n", 944 | "\n", 945 | " \n", 946 | "\n", 947 | "\n", 948 | " \n", 949 | "\n", 950 | "\n", 951 | " \n", 952 | "\n", 953 | "
Omnibus: 39.154 Durbin-Watson: 1.906
Prob(Omnibus): 0.000 Jarque-Bera (JB): 78.545
Skew: 1.087 Prob(JB): 8.80e-18
Kurtosis: 5.601 Cond. No. 3.92
" 954 | ], 955 | "text/plain": [ 956 | "\n", 957 | "\"\"\"\n", 958 | " OLS Regression Results \n", 959 | "==============================================================================\n", 960 | "Dep. Variable: return R-squared: 0.398\n", 961 | "Model: OLS Adj. R-squared: 0.391\n", 962 | "Method: Least Squares F-statistic: 53.26\n", 963 | "Date: Sat, 05 May 2018 Prob (F-statistic): 1.77e-18\n", 964 | "Time: 21:03:25 Log-Likelihood: 1012.1\n", 965 | "No. Observations: 164 AIC: -2018.\n", 966 | "Df Residuals: 161 BIC: -2009.\n", 967 | "Df Model: 2 \n", 968 | "Covariance Type: nonrobust \n", 969 | "==============================================================================\n", 970 | " coef std err t P>|t| [0.025 0.975]\n", 971 | "------------------------------------------------------------------------------\n", 972 | "const 0.0017 6.72e-05 24.956 0.000 0.002 0.002\n", 973 | "SMB -7.597e-05 0.000 -0.599 0.550 -0.000 0.000\n", 974 | "HML 0.0005 4.81e-05 9.695 0.000 0.000 0.001\n", 975 | "==============================================================================\n", 976 | "Omnibus: 39.154 Durbin-Watson: 1.906\n", 977 | "Prob(Omnibus): 0.000 Jarque-Bera (JB): 78.545\n", 978 | "Skew: 1.087 Prob(JB): 8.80e-18\n", 979 | "Kurtosis: 5.601 Cond. No. 
3.92\n", 980 | "==============================================================================\n", 981 | "\n", 982 | "Warnings:\n", 983 | "[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.\n", 984 | "\"\"\"" 985 | ] 986 | }, 987 | "execution_count": 36, 988 | "metadata": {}, 989 | "output_type": "execute_result" 990 | } 991 | ], 992 | "source": [ 993 | "betas = sm.add_constant(betas.drop('const', axis=1))\n", 994 | "\n", 995 | "R = data['return'].mean(axis=0, level=1)\n", 996 | "\n", 997 | "# Second regression step: estimating the risk premia\n", 998 | "risk_free_rate = benchmark_df['return'].mean()\n", 999 | "\n", 1000 | "final_results = regression.linear_model.OLS(R - risk_free_rate, betas).fit()\n", 1001 | "\n", 1002 | "final_results.summary()" 1003 | ] 1004 | }, 1005 | { 1006 | "cell_type": "markdown", 1007 | "metadata": {}, 1008 | "source": [ 1009 | "## Fama-Macbeth Test Conclusion: \n", 1010 | "although our individual factors are significant, we have a very low $R^2$ . What this may suggest is that there is a real link between our factors and the returns of our assets, but that there still remains a lot of unexplained noise!" 
1011 | ] 1012 | }, 1013 | { 1014 | "cell_type": "code", 1015 | "execution_count": null, 1016 | "metadata": { 1017 | "collapsed": true 1018 | }, 1019 | "outputs": [], 1020 | "source": [] 1021 | } 1022 | ], 1023 | "metadata": { 1024 | "kernelspec": { 1025 | "display_name": "Python 3", 1026 | "language": "python", 1027 | "name": "python3" 1028 | }, 1029 | "language_info": { 1030 | "codemirror_mode": { 1031 | "name": "ipython", 1032 | "version": 3 1033 | }, 1034 | "file_extension": ".py", 1035 | "mimetype": "text/x-python", 1036 | "name": "python", 1037 | "nbconvert_exporter": "python", 1038 | "pygments_lexer": "ipython3", 1039 | "version": "3.5.2" 1040 | } 1041 | }, 1042 | "nbformat": 4, 1043 | "nbformat_minor": 2 1044 | } 1045 | -------------------------------------------------------------------------------- /Alpha Trading Workflow.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jerryxyx/AlphaTrading/5e73923786297faeadb27c76f83ec81fad74af51/Alpha Trading Workflow.pdf -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Multi-Factor Models 2 | 3 | Author: Jerry Xia 4 | 5 | Date: 2018/07/27 6 | 7 | *Note: The advanced Marckdown features such as math expression may not be compatible in GitHub, please see README.pdf instead if you want more details* 8 | 9 | 10 | 11 | ## Project Introduction 12 | This is a research survey about alpha trading. In this project, I built up a pipeline of alpha trading including: 13 | 14 | * factor pretest 15 | * factor screening 16 | * factor combination (modeling) 17 | 18 | The models involed are APT models, Barra's risk models and dynamic factors model using Kalman filter. 
19 | 20 | ### Files 21 | 22 | * rqdata_utils.py: Utils dealing with the RiceQuant platform data 23 | 24 | * Step1_FactorPretest.ipynb: Factor returns profile visualization 25 | 26 | * Step2_FactorsScreening.ipynb: Factor returns turnover visualization and correlation coefficients 27 | 28 | * Step3\_FactorCombination\_AdaBoost\_Quantopian.ipynb: A Quantopian notebook file to combine alpha factors using AdaBoost 29 | 30 | * Step3\_FactorCombination\_BarraKalmanFilter.ipynb: Barra's risk model with three calibration schemes: 31 | * Scheme 1: Cross-sectional regression and weighted average 32 | * Scheme 2: Optimization problem: minimize the exponentially weighted average of squared error 33 | * Scheme 3: Dynamic linear model using Kalman filter 34 | 35 | * KalmanFilterIntro.ipynb: An introduction to the dynamic multi-factor model 36 | * APT_FammaMacbeth.ipynb: Using Fama-MacBeth regression to calibrate the APT model. 37 | 38 | ### Dataset 39 | The dataset is not available in GitHub as it is too large. Except for Step3\_FactorCombination\_AdaBoost\_Quantopian.ipynb, which uses US stock data in Quantopian, the other files use Chinese A-share data downloaded from RiceQuant instead (free US equity data is hard to obtain). 40 | 41 | The data frame is multi-indexed, similar to Quantopian's format (see both the Alphalens GitHub code and rqdata_utils.py). However, feel free to cast and apply your own dataset. 42 | 43 | 44 | ## TODO 45 | 46 | * Input more effective factors: take advice from people and industry reports 47 | * Should add technical analysis, because it matters! People care about it, which makes it a good sentiment index. 
48 | * Find well-known metrics to express results 49 | 50 | ## Workflow 51 | $\checkmark$ stands for finished and $\vartriangle$ stands for TODO 52 | 53 | * Universe definition 54 | * Factors collection and preprocessing 55 | * $\vartriangle$Factors collection 56 | - Sources 57 | - balance sheet 58 | - cash flow statement 59 | - income statement 60 | - earning report 61 | - Econometric Classifications 62 | - value 63 | - growth 64 | - profitability 65 | - market size 66 | - liquidity 67 | - volatility 68 | - Momentum 69 | - Financial leverage (debt-to-equity ratio) 70 | * Factors preprocessing 71 | - $\vartriangle$daily, quarterly, annually 72 | - continuous: rescale, outliers 73 | - $\checkmark$discrete: rank 74 | * Factors screening and combination 75 | * Factors screening 76 | - $\checkmark$Factors' correlation 77 | - $\checkmark$Factors' foreseeability 78 | - Fama-MacBeth regression 79 | * $\vartriangle$Factors combination 80 | - PCA, FA 81 | - Technical Analysis 82 | - Financial Modeling 83 | - $\checkmark$APT model 84 | - $\checkmark$Barra's risk model 85 | - $\checkmark$Dynamic multi-factors model 86 | - Linear combination to maximize Sharpe ratio 87 | - Non-linear learning algorithms 88 | - $\checkmark$AdaBoost 89 | - Reinforcement learning 90 | 91 | * Portfolio allocation 92 | 93 | 94 | ## Factors' Correlations 95 | Here, I use the correlation matrix as the measure. The difference from the second result is that the correlation matrix is calculated from the rank data rather than the raw data 96 | ### Two ICs comparison 97 | * Pearson's IC: measures linear relationship between components 98 | 99 | * Spearman's IC: measures monotonic relationship between components. Since we only care about monotonic relationships, Spearman's IC wins. 
100 | 101 | 102 | ### Regular IC(Pearson's correlation coefficient) for each factors 103 | ![](report/Corr_matrix_for_raw_factors.png) 104 | ### Spearman's Rank correlation coefficient for each factors 105 | ![](report/Corr_matrix_for_factor_ranks.png) 106 | 107 | ### How to rule out redundant factors and why Spearman's rank correlation coefficients? 108 | From the correlation coefficients below, we can again conclude that Spearman's rank IC is far more robust. Take ps_ratio and sales_yield as an example. 109 | $$ps\_ratio = \frac{\mbox{adjusted close price}}{\mbox{sales per share}}$$ 110 | whereas 111 | $$sales\_yield = \frac{\mbox{sales per share}}{\mbox{price}}$$ 112 | Although the price in the sales_yield formula is vague in our data source, we can see that, roughly speaking, these two variables should be inverses of each other. The Spearman's rank correlation coefficient is -0.98, which verifies this statement, and we should avoid using both of these factors, which would exaggerate the impact of this particular factor. However, we cannot see such identity in Pearson's regular correlation coefficients. It's quite misleading actually, and that's why we choose Spearman's rank IC. 113 | 114 | ## Factors' Foreseeability 115 | 116 | ### Methods 117 | * Spearman's rank correlation coefficients 118 | * Fama-Macbeth regression: Considers not only the foreseeability of each factor itself but also the co-variation of different factors, which means ruling out factors if their returns can be explained by the recent factors. 119 | 120 | 121 | ### Spearman's rank IC for factors vs. forward returns 122 | 123 | ![](report/mean_spearmans_rank_IC.png) 124 | 125 | ### Spearman's rank IC (absolute value) for factors vs. forward returns 126 | ![](report/mean_spearmans_rank_IC_absolute_value.png) 127 | 128 | ### Rank of the Spearman's rank IC (absolute value) for factors vs. 
forward returns 129 | ![](report/rank_of_mean_spearmans_rank_IC_absolute_value.png) 130 | 131 | ## Factors Preprocessing 132 | * Get ranked data 133 | * Obtain the valid stocks set 134 | * Reshape the data: only valid stocks set 135 | * Fill null: using daily average 136 | * Rescale the data: MinMaxScaler 137 | * Variable reduction: PCA analysis 138 | * Sanity check 139 | 140 | ![](report/corr_comparison_after_pca_analysis.png) 141 | 142 | Here, I use principal component analysis because it can bring two benefits to our data - orthogonality and dimensionality reduction. Orthogonality makes the data more separable; lower dimensionality makes the information more concentrated. Either of them is essential for machine learning algorithms. 143 | 144 | In the next part, I used this preprocessed data as the input to obtain a "mega alpha". 145 | 146 | ## Mega Alpha 147 | Construct an aggregate alpha factor whose return distribution is profitable. The term "profitable" here means condensed, little turnover, and significantly positive returns. 148 | ### Methods 149 | #### linear methods 150 | * normalize factors and try a linear combination 151 | * rank each factor and then sum up 152 | * Financial modeling: **See the appendix and Step3\_FactorCombination\_BarraKalmanFilter.ipynb** 153 | * linear combination to maximize Sharpe ratio 154 | 155 | #### Non-linear methods 156 | * AdaBoost: **See Step3\_FactorCombination\_AdaBoost\_Quantopian.ipynb** 157 | * Reinforcement Learning 158 | 159 | 160 | Here we only introduce the AdaBoost algorithm in this documentation. For more details about the linear models, please see the appendix and Step3\_FactorCombination\_BarraKalmanFilter.ipynb. 161 | 162 | ### AdaBoost 163 | #### Description 164 | The algorithm sequentially applies a weak classifier to modified versions of the data. By increasing the weights of the misclassified observations, each weak learner focuses on the error of the previous one. 
The predictions are aggregated through a weighted majority vote. 165 | 166 | #### Algorithm 167 | 168 | ![](report/adaboost_algorithm.png) 169 | 170 | #### Train set 171 | The adaboost classifier was applied to our fundamental dataset. The objective is to train a classifier which give a score for the bunch of factors. Or in other word, the mega alpha. Pink for the positive forward returns observations and blue for the negative forward returns observations. A good score system is to make the two classes more separated. 172 | ![](report/train_score_dist.png) 173 | We can see, in train set, AdaBoost classifier did so well! The next plot is the precision in each quantile of scores. In the top and bottom quantile, the predicted precision is nearly 100%! 174 | ![](report/train_accuracy_bar.png) 175 | 176 | #### Test set 177 | alpha values histogram 178 | ![](report/test_score_dist.png) 179 | quantile precision bar plot 180 | ![](report/test_accuracy_bar.png) 181 | The precision in the top and bottom quantile is only slightly higher than 50%. Far from good if we considered transaction cost. 182 | 183 | So, I added some technical analysis factors to see if we can tackle this problem. 184 | ![](report/train_score_dist2.png) 185 | Surprisingly, even the average accuracy in test set is about 67%. What if we only trade the extreme quantile? That is around 80% accuracy! It literally shows that technical factors are really important in US stock market and can be used to find arbitrage opportunity. 186 | 187 | ## References 188 | * Jonathan Larkin, *A Professional Quant Equity Workflow*. August 31, 2016 189 | * *A Practitioner‘s Guide to Factor Models*. 
The Research Foundation of The Institute of Chartered Financial Analysts 190 | * Thomas Wiecki, Machine Learning on Quantopian 191 | * Inigo Fraser Jenkins, *Using factors with different alpha decay times: The case for non-linear combination*  192 | * PNC, *Factor Analysis: What Drives Performance?* 193 | * O’Shaughnessy, *Alpha or Assets? — Factor Alpha vs. Smart Beta*. April 2016 194 | * *O’Shaughnessy Quarterly Investor Letter Q1 2018*  195 | * Jiantao Zhu, Orient Securities, *Alpha Forecasting - Factor-Based Strategy Research Series 13* 196 | * Yang Song, Bohai Securities, *Multi-Factor Models Research: Single Factor Testing*, 2017/10/11 197 | 198 | 199 | ## Appendix: Notes on Factor Models 200 | 201 | ### CAPM 202 | * Author: Markovitz(1959) 203 | * single-factor: 204 | * explain: security returns 205 | 206 | ### APT 207 | * Author: Stephen A. Ross(1976) 208 | * multi-factor 209 | * explain: security returns 210 | 211 | #### Postulates: 212 | - The linear model 213 | $$r_i(t) - \alpha_i = \sum_{k=1}^K \beta_{ik} \cdot f_k(t) + \epsilon_i(t)$$ 214 | 215 | where $f_k(t)$ is the realization(value) of risk factor at time t 216 | 217 | - No pure arbitrage profit 218 | 219 | #### Conclusion 220 | * Exposure of each security on each factor 221 | * Risk premium on each factor 222 | $$(Mean[r_i(t)])_i = P_0 + \sum_{k=1}^K \beta_{ik} \cdot P_k$$ 223 | or make $\beta_{0,k}$ equals 1 for each k, 224 | $$(Mean[r_i(t)])_i = \sum_{k=0}^K \bar{\beta}_{i,k} \cdot P_k$$ 225 | where $P_0$ is the risk free return 226 | 227 | * Portfolio exposure to each factor 228 | $$Portfolio_{it} = \beta_0 + \beta_k \cdot f_{kit}$$ 229 | 230 | 231 | 232 | #### Three alternative calibration methods 233 | * **statistical techniques** such as factor analysis, principle analysis 234 | - **Goodness**: good for determining the number of relevent risk factors 235 | - **Undesirable**: hard to interpret 236 | 237 | * **portfolios**: K different well-diversified portfolios as substitutions 238 | - 
**Goodness**: lead to insights 239 | - **Fama-Macbeth regression** 240 | 241 | * **economic theory** (highly developed art) 242 | - **Goodness**: Intuitively appealing set of factors that admit economic interpretation of risk exposures 243 | - **Goodness**: Using economic information in addition to stock return. Avoid using stock return to explain stock return 244 | - **factors**: 245 | 1. confidence risk 246 | 2. time horizon risk 247 | 3. inflation risk 248 | 4. bussiness cycle risk 249 | 5. market-timing risk 250 | 251 | #### Generalizations 252 | The simplicity of APT framework is a great virtue. It is helpful to understand the true sources of stock returns. The basic APT model can be enhanced in many ways. 253 | 254 | * Allow risk prices $P_k$ to vary over time 255 | * Allow risk exposures $\beta_{i,k}$ to vary over time 256 | * Use Bayesian mothods to produce optimal out-of-sample forcasts for the risk exposures and hence for the expected returns 257 | * Introduce additional factor with zero-risk prices. Although do not contribute to expected return, help to explain the volatility. 258 | 259 | ### Multi-Index Models (Factor Analysis & PCA) 260 | 261 | #### Goal 262 | Using historical return extract the factors 263 | 264 | $$r_{it} = \alpha_i + \sum_k \beta_{ik}\cdot f_{kt}$$ 265 | where 266 | $$E[\epsilon_{it} \epsilon_{jt}]=0$$ 267 | $$E[\epsilon_{it} f_{kt}]=0$$ 268 | 269 | $f_{kt}$: the return on index k inperiod t 270 | 271 | $\beta$: sensitivities 272 | 273 | #### Estimation 274 | Either exposure or factor return can be asserted on a priori grounds with the other identified empirically, or both can be identified empirically. 
275 | 276 | #### Characteristics 277 | * Have f(indexes) represents separate influence 278 | * The structure must be parsimonious: the returns can be described in terms of limited indexes 279 | 280 | #### Statistical Solutions 281 | Let the data design the model 282 | 283 | * PCA 284 | * Factor Analysis: better in heteroscedastic series 285 | 286 | #### Design Issue 287 | * **The Choice of Data**: Individul stocks vs portfolio 288 | * **The number of Index**: 289 | - Stactical techniques: Factor analysis, PCA 290 | - Common sense and economic significance play a major role in deciding on the number of factors 291 | * **The nonuniqueness of Factors**: The researcher should realize the resulting structure is not unique. Some researchers will examine alternative structures in an atempt to understand what influences are affecting security returns and to convince themself the overall separation make an intuitive sense 292 | * **Computational Problems**: 293 | - Roll and Ross: Multisample approach 294 | - Chen: Portfolio approach 295 | 296 | #### Applications 297 | * **Identify the Indexes set** 298 | * **Determine the number of factors**: PCA / Factor Analysis 299 | - Single-group tests for each sample 300 | - Factor Analysis on return-generating process 301 | - Criteria: Chi2, AIC, **BIC** 302 | - Multiple-group tests for all stocks 303 | - Canonical Correlation (CCA): 304 | 305 | take two sets of variables and see what is common amongst the two sets (can be two noncorresponding variables either on index or dimension) 306 | $$X_{N \times K}, Y_{N \times K^{\prime}}$$ 307 | $$\mbox{x_weights}_{K,n}$$ 308 | $$\mbox{y_weights}_{K^{\prime},n}$$ 309 | Use CCA / PLS: 310 | $$\mbox{X_score}_{N\times n} = \mbox{Normalized}[X]_{N \times K} \mbox{x_weights}_{K,n}$$ 311 | 312 | $$\mbox{Y_score}_{N\times n} = \mbox{Normalized}[Y]_{N \times K^{\prime}} \mbox{y_weights}_{K^{\prime},n}$$ 313 | - Determin the number: 314 | - r-value for $n=10$ 315 | - correlation matrix pattern for 
each number of components: $n \times n$ for $n=1,\cdots,10$ 316 | 317 | * **Generate Factors** 318 | 319 | * **Calibrate sensitivities**: 320 | 321 | - Portfolio exposure to each factor 322 | - $Adjusted R^2$ (Should be stable) 323 | - Explanatory power: Compare these results with those for the single-index model (Should depend on the market cap) 324 | 325 | * **Explanatory Power** of the Model for Each Stock: R2>0.7 excellent 326 | 327 | #### Conclusions 328 | * Goodness: simultaneously estimate the indexes and sensitivities in a multi-index model 329 | * Defect: Data Minning: Using return to explain return 330 | 331 | 332 | ### Multi-Factor Models for Portfolio Risk (BARRA) 333 | 334 | $$r_{i,t} = a_{i,t} + X_{i,k,t} \cdot f_{k,t}$$ 335 | where 336 | $X_{i,k,t}$: the exposure of asset i to factor k known at time t 337 | $f_{k,t}$: the factor return to factor k during the period from time $t$ to time $t+1$ 338 | $a_{i,t}$: the stock i's specific return during period from time $t$ to time $t+1$ 339 | $r_{i,t}$: the excess return (return above the risk-free return) on stock i during the period from time $t$ to time $t+1$ 340 | 341 | The risk structure 342 | $$V_{i,j} = X_{i,k1} F_{k1,k2} X_{j,k2}^T + \Delta_{i,j}$$ 343 | $$V = X^T F X + \Delta$$ 344 | where 345 | 346 | $F_{k1,k2}$ is the K by K covariance matrix for factor returns 347 | 348 | $\Delta_{i,j}$ is the N by N diagonal matrix of specific variance 349 | 350 | A portfolio described by an N-element vector $h_i$ 351 | 352 | * portfolio exposure: $x_p = X^T h_p$ 353 | * portfolio variance: $\sigma_p^2 = x_p^T F x_p + h_p^T \Delta h_p = h_p^T V h_p$ 354 | * Marginal Contribution for Total Risk 355 | $$MCTR = \frac{V h_p}{\sigma_p}$$ 356 | * Risk-adjusted expected return: 357 | $$U = h_p^T r_p - \lambda\cdot h_p^T V h_p$$ 358 | 359 | 360 | #### Choosing the Factors 361 | * External influences --> BARRA Model 362 | - Return in bond market (bond beta) 363 | - Unexpected changes in inflation 364 | - Change in oil 
price 365 | - Change in exchange rate 366 | * Cross-sectional comparisons 367 | - Fundamental 368 | - Market 369 | - volatility 370 | - price 371 | - share turnover 372 | * Purely internal or statistical factors 373 | - see multi-index model 374 | 375 | #### Exposures 376 | * Industry Exposures 377 | - 1/0 variable 378 | * Risk Index Exposures 379 | - Volatility: beta, daily return vol, option implied vol 380 | - Momentum 381 | - Size 382 | - Liquidity 383 | - Growth 384 | - Value(Fundamentals) 385 | - Earning volatility 386 | - Financial leverage: debt-to-equity ratios 387 | 388 | #### Applications 389 | * Rescale the Exposures 390 | * Regress the Factor Returns Against Exposures via Cross-sectional Regression 391 | $$f = (X^T W X)^{-1} (X^T W r)\\ 392 | = \sum_{i=1}^N C_{k,i} r_i$$ 393 | Here factor return can be interpreted as the return to a portfolio with weights $C_{k,i}$. So factor returns are the returns to factor portfolios. This portfolio has unit exposure to the particular factor 394 | * Factor Covariance and Specific 395 | - Stock returns 396 | - Factor exposures 397 | - Stock dividends, splits, and other adjustment 398 | 399 | #### Model Validation 400 | * Model Setting: 401 | - 50 factors 402 | - 1000 assets 403 | * Measures: 404 | 405 | - $R^2$: 30-40%. It can vary quite significantly from month to month. And depends on the market return level. 406 | - root mean square error: 6% roughly against 10% volatility 407 | - Portfolio Risk 408 | * Goal: 409 | - Expain the portfolio risk 410 | - Forecast variances and covariances of factors and specific returns 411 | - Providing incisive, intuitive and interesting risk analysis 412 | 413 | 414 | You can think of this as slicing through the other direction from the APT analysis, as now the factor returns are unknowns to be solved for, whereas originally the coefficients b were the unknowns. 
Another way to think about it is that you're determining how predictive of returns the factor was on that day, and therefore how much return you could have squeezed out of that factor. 415 | -------------------------------------------------------------------------------- /README.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jerryxyx/AlphaTrading/5e73923786297faeadb27c76f83ec81fad74af51/README.pdf -------------------------------------------------------------------------------- /README_old.md: -------------------------------------------------------------------------------- 1 | # Multi-Factor Models 2 | 3 | Author: Jerry Xia 4 | 5 | Date: 2018/05/21 6 | 7 | *Note: The advanced Marckdown features such as math expression may not be compatible in GitHub, please see README.pdf instead if you want more details* 8 | 9 | 10 | 11 | ## Project Introduction 12 | This is a research survey about alpha trading. In this project, I built up a pipeline of alpha trading including: 13 | 14 | * factor pretest 15 | * factor screening 16 | * factor combination (modeling) 17 | 18 | The models involed are APT models, Barra's risk models and dynamic factors model using Kalman filter. 
19 | 20 | ### Files 21 | 22 | * rqdata_utils.py: Utils dealing with the rice quant platform data 23 | 24 | * Step1_FactorPretest.ipynb: Factor returns profile visulization 25 | 26 | * Step2_FactorsScreening.ipynb: Factor returns turnover visulization and correlation coefficients 27 | 28 | * Step3\_FactorCombination\_AdaBoost\_Quantopian.ipynb: A Quantopian notebook file to combine alpha factors using Adaboost 29 | 30 | * Step3\_FactorCombination\_BarraKalmanFilter.ipynb: Barra's risk model with three calibration schemes: 31 | * Scheme 1: Cross-sectional regression and weighted average 32 | * Scheme 2: Optimization problem: minimize the exponential weighted average of squared error 33 | * Scheme 3: Dynamic linear model using Kalman filter 34 | 35 | * KalmanFilterIntro.ipynb: An introduction to the dynamic multi-factor model 36 | * APT_FammaBeth.ipynb: Using Famma-Macbeth regression to calibrate APT model. 37 | 38 | ### Dataset 39 | The dataset is not available in GitHub as it is too large. Except for Step3\_FactorCombination\_AdaBoost\_Quantopian.ipynb which we used US stock data in Quantopian, among other files, we used Chinese A-stocks data downloaded from RiceQuant instead (hard for free US equities' data). 40 | 41 | The data frame is multi-indexed similar to Quantopian's format(see both Alphalens github codes and rqdata_utils.py). However, feel free to cast and apply your own dataset. 
42 | 43 | 44 | ### Goal 45 | * **Equity Return Forecasting** 46 | 47 | * **Portfolio Risk Estimation** 48 | 49 | - APT 50 | - Risk Exposure: $\beta_{i,k}$ 51 | - Risk Premium: $P_k$ 52 | - Contribution of Risk Factor to Long Term Excess Return: 53 | $$E[r_i] - TB = \sum_k \beta_{i,k}P_k$$ 54 | - BARRA 55 | - Factor Return Covariance: V 56 | - Portfolio Risk: $\sigma_p$ 57 | - Portfolio Risk Exposures: $$x_p=X^T h_p$$ 58 | - Marginal Contribution for Total Risk: $$MCTR = \frac{V h_p}{\sigma_p}$$ 59 | - Portfolio Risk-Adjusted Expected Return: $$U = h_p^T r - \lambda \cdot h_p^T V h_p$$ 60 | 61 | ### Model Classification 62 | * CAPM 63 | - a kind of sigle-factor model 64 | - usually, a validity benchmark for other models 65 | 66 | * APT 67 | - factor returns are assumed to be known 68 | - factor exposure can be regressed from factor returns 69 | - aimed at forecasting 70 | - how to fit: Fama-Macbeth Algorithm 71 | 72 | * Multi-Index Models 73 | - statistical indogeneous model using factor analysis 74 | - useful at factors parsimouny and decouple 75 | 76 | * Multi-Factor Risk Models(BARRA) 77 | - factor exposures are assumed to be known (can be derived as the rescaled factor value) 78 | - factor return can be regressed from factor exposures 79 | - aimed at risk management 80 | - how to fit: cross-sectional regression 81 | 82 | ### Calibration Algorithms 83 | Here I used 2 traditional way add a novel Kalman filter technique (see KalmanFilter.ipynb or MultiFactorModel.ipynb) 84 | 85 | * Time-series regression (fix equity) 86 | * Cross-sectional regression (fix time-stamp) 87 | * Kalmn filter (APT model allowing risk exposure and risk premium to vary over time. In another word, a dynamic model with gaussian noise) 88 | 89 | ### Improvements 90 | 91 | * A percentage rank test is a good alternative to a z score 92 | * Beware of quarterly ratios (referring to ROA, ROE, gross margin, etc.) 93 | * Factor for quality: gross profitability a la Novy-Marx (2013). 
It's simply gross profits divided by total assets. 94 | * Substituting ROA/Gross Margin with gross profitability 95 | * 96 | 97 | ## Appendix: Notes on Factor Models 98 | 99 | ### CAPM 100 | * Author: Markovitz(1959) 101 | * single-factor: 102 | * explain: security returns 103 | 104 | ### APT 105 | * Author: Stephen A. Ross(1976) 106 | * multi-factor 107 | * explain: security returns 108 | 109 | #### Postulates: 110 | - The linear model 111 | $$r_i(t) - \alpha_i = \sum_{k=1}^K \beta_{ik} \cdot f_k(t) + \epsilon_i(t)$$ 112 | 113 | where $f_k(t)$ is the realization(value) of risk factor at time t 114 | 115 | - No pure arbitrage profit 116 | 117 | #### Conclusion 118 | * Exposure of each security on each factor 119 | * Risk premium on each factor 120 | $$(Mean[r_i(t)])_i = P_0 + \sum_{k=1}^K \beta_{ik} \cdot P_k$$ 121 | or make $\beta_{0,k}$ equals 1 for each k, 122 | $$(Mean[r_i(t)])_i = \sum_{k=0}^K \bar{\beta}_{i,k} \cdot P_k$$ 123 | where $P_0$ is the risk free return 124 | 125 | * Portfolio exposure to each factor 126 | $$Portfolio_{it} = \beta_0 + \beta_k \cdot f_{kit}$$ 127 | 128 | 129 | 130 | #### Three alternative calibration methods 131 | * **statistical techniques** such as factor analysis, principle analysis 132 | - **Goodness**: good for determining the number of relevent risk factors 133 | - **Undesirable**: hard to interpret 134 | 135 | * **portfolios**: K different well-diversified portfolios as substitutions 136 | - **Goodness**: lead to insights 137 | - **Fama-Macbeth regression** 138 | 139 | * **economic theory** (highly developed art) 140 | - **Goodness**: Intuitively appealing set of factors that admit economic interpretation of risk exposures 141 | - **Goodness**: Using economic information in addition to stock return. Avoid using stock return to explain stock return 142 | - **factors**: 143 | 1. confidence risk 144 | 2. time horizon risk 145 | 3. inflation risk 146 | 4. bussiness cycle risk 147 | 5. 
market-timing risk 148 | 149 | #### Generalizations 150 | The simplicity of APT framework is a great virtue. It is helpful to understand the true sources of stock returns. The basic APT model can be enhanced in many ways. 151 | 152 | * Allow risk prices $P_k$ to vary over time 153 | * Allow risk exposures $\beta_{i,k}$ to vary over time 154 | * Use Bayesian mothods to produce optimal out-of-sample forcasts for the risk exposures and hence for the expected returns 155 | * Introduce additional factor with zero-risk prices. Although do not contribute to expected return, help to explain the volatility. 156 | 157 | ### Multi-Index Models (Factor Analysis & PCA) 158 | 159 | #### Goal 160 | Using historical return extract the factors 161 | 162 | $$r_{it} = \alpha_i + \sum_k \beta_{ik}\cdot f_{kt}$$ 163 | where 164 | $$E[\epsilon_{it} \epsilon_{jt}]=0$$ 165 | $$E[\epsilon_{it} f_{kt}]=0$$ 166 | 167 | $f_{kt}$: the return on index k inperiod t 168 | 169 | $\beta$: sensitivities 170 | 171 | #### Estimation 172 | Either exposure or factor return can be asserted on a priori grounds with the other identified empirically, or both can be identified empirically. 173 | 174 | #### Characteristics 175 | * Have f(indexes) represents separate influence 176 | * The structure must be parsimonious: the returns can be described in terms of limited indexes 177 | 178 | #### Statistical Solutions 179 | Let the data design the model 180 | 181 | * PCA 182 | * Factor Analysis: better in heteroscedastic series 183 | 184 | #### Design Issue 185 | * **The Choice of Data**: Individul stocks vs portfolio 186 | * **The number of Index**: 187 | - Stactical techniques: Factor analysis, PCA 188 | - Common sense and economic significance play a major role in deciding on the number of factors 189 | * **The nonuniqueness of Factors**: The researcher should realize the resulting structure is not unique. 
Some researchers will examine alternative structures in an atempt to understand what influences are affecting security returns and to convince themself the overall separation make an intuitive sense 190 | * **Computational Problems**: 191 | - Roll and Ross: Multisample approach 192 | - Chen: Portfolio approach 193 | 194 | #### Applications 195 | * **Identify the Indexes set** 196 | * **Determine the number of factors**: PCA / Factor Analysis 197 | - Single-group tests for each sample 198 | - Factor Analysis on return-generating process 199 | - Criteria: Chi2, AIC, **BIC** 200 | - Multiple-group tests for all stocks 201 | - Canonical Correlation (CCA): 202 | 203 | take two sets of variables and see what is common amongst the two sets (can be two noncorresponding variables either on index or dimension) 204 | $$X_{N \times K}, Y_{N \times K^{\prime}}$$ 205 | $$\mbox{x_weights}_{K,n}$$ 206 | $$\mbox{y_weights}_{K^{\prime},n}$$ 207 | Use CCA / PLS: 208 | $$\mbox{X_score}_{N\times n} = \mbox{Normalized}[X]_{N \times K} \mbox{x_weights}_{K,n}$$ 209 | 210 | $$\mbox{Y_score}_{N\times n} = \mbox{Normalized}[Y]_{N \times K^{\prime}} \mbox{y_weights}_{K^{\prime},n}$$ 211 | - Determin the number: 212 | - r-value for $n=10$ 213 | - correlation matrix pattern for each number of components: $n \times n$ for $n=1,\cdots,10$ 214 | 215 | * **Generate Factors** 216 | 217 | * **Calibrate sensitivities**: 218 | 219 | - Portfolio exposure to each factor 220 | - $Adjusted R^2$ (Should be stable) 221 | - Explanatory power: Compare these results with those for the single-index model (Should depend on the market cap) 222 | 223 | * **Explanatory Power** of the Model for Each Stock: R2>0.7 excellent 224 | 225 | #### Conclusions 226 | * Goodness: simultaneously estimate the indexes and sensitivities in a multi-index model 227 | * Defect: Data Minning: Using return to explain return 228 | 229 | 230 | ### Multi-Factor Models for Portfolio Risk (BARRA) 231 | 232 | $$r_{i,t} = a_{i,t} + X_{i,k,t} 
\cdot f_{k,t}$$ 233 | where 234 | $X_{i,k,t}$: the exposure of asset i to factor k known at time t 235 | $f_{k,t}$: the factor return to factor k during the period from time $t$ to time $t+1$ 236 | $a_{i,t}$: the stock i's specific return during period from time $t$ to time $t+1$ 237 | $r_{i,t}$: the excess return (return above the risk-free return) on stock i during the period from time $t$ to time $t+1$ 238 | 239 | The risk structure 240 | $$V_{i,j} = X_{i,k1} F_{k1,k2} X_{j,k2}^T + \Delta_{i,j}$$ 241 | $$V = X^T F X + \Delta$$ 242 | where 243 | 244 | $F_{k1,k2}$ is the K by K covariance matrix for factor returns 245 | 246 | $\Delta_{i,j}$ is the N by N diagonal matrix of specific variance 247 | 248 | A portfolio described by an N-element vector $h_i$ 249 | 250 | * portfolio exposure: $x_p = X^T h_p$ 251 | * portfolio variance: $\sigma_p^2 = x_p^T F x_p + h_p^T \Delta h_p = h_p^T V h_p$ 252 | * Marginal Contribution for Total Risk 253 | $$MCTR = \frac{V h_p}{\sigma_p}$$ 254 | * Risk-adjusted expected return: 255 | $$U = h_p^T r_p - \lambda\cdot h_p^T V h_p$$ 256 | 257 | 258 | #### Choosing the Factors 259 | * External influences --> BARRA Model 260 | - Return in bond market (bond beta) 261 | - Unexpected changes in inflation 262 | - Change in oil price 263 | - Change in exchange rate 264 | * Cross-sectional comparisons 265 | - Fundamental 266 | - Market 267 | - volatility 268 | - price 269 | - share turnover 270 | * Purely internal or statistical factors 271 | - see multi-index model 272 | 273 | #### Exposures 274 | * Industry Exposures 275 | - 1/0 variable 276 | * Risk Index Exposures 277 | - Volatility: beta, daily return vol, option implied vol 278 | - Momentum 279 | - Size 280 | - Liquidity 281 | - Growth 282 | - Value(Fundamentals) 283 | - Earning volatility 284 | - Financial leverage: debt-to-equity ratios 285 | 286 | #### Applications 287 | * Rescale the Exposures 288 | * Regress the Factor Returns Against Exposures via Cross-sectional Regression 289 | $$f 
= (X^T W X)^{-1} (X^T W r)\\ 290 | = \sum_{i=1}^N C_{k,i} r_i$$ 291 | Here factor return can be interpreted as the return to a portfolio with weights $C_{k,i}$. So factor returns are the returns to factor portfolios. This portfolio has unit exposure to the particular factor 292 | * Factor Covariance and Specific 293 | - Stock returns 294 | - Factor exposures 295 | - Stock dividends, splits, and other adjustment 296 | 297 | #### Model Validation 298 | * Model Setting: 299 | - 50 factors 300 | - 1000 assets 301 | * Measures: 302 | 303 | - $R^2$: 30-40%. It can vary quite significantly from month to month. And depends on the market return level. 304 | - root mean square error: 6% roughly against 10% volatility 305 | - Portfolio Risk 306 | * Goal: 307 | - Expain the portfolio risk 308 | - Forecast variances and covariances of factors and specific returns 309 | - Providing incisive, intuitive and interesting risk analysis 310 | 311 | 312 | You can think of this as slicing through the other direction from the APT analysis, as now the factor returns are unknowns to be solved for, whereas originally the coefficients b were the unknowns. Another way to think about it is that you're determining how predictive of returns the factor was on that day, and therefore how much return you could have squeezed out of that factor. 
313 | -------------------------------------------------------------------------------- /output/factor_ic_analysis.csv: -------------------------------------------------------------------------------- 1 | factor,group,1D,27D,98D 2 | total_turnover,ConsumerDiscretionary,-0.052061319774850776,-0.11316983202573735,-0.1929657054345211 3 | total_turnover,ConsumerStaples,-0.06218330937378258,-0.12809022695465364,-0.23566073217524877 4 | total_turnover,Energy,-0.053310899809748104,-0.114291646238941,-0.1828861184315757 5 | total_turnover,Financials,-0.043483723699830806,-0.09250154138845446,-0.15258051713146398 6 | total_turnover,HealthCare,-0.03054709278550283,-0.07126898666786867,-0.1370994975862821 7 | total_turnover,Industrials,-0.06155338477832689,-0.12191628841788626,-0.1913136626300075 8 | total_turnover,InformationTechnology,-0.04331167468163195,-0.09994088680694369,-0.18793138112573995 9 | total_turnover,Materials,-0.06347263107873605,-0.1454435080579407,-0.23516052362957487 10 | total_turnover,TelecommunicationServices,-0.05623721881390593,-0.12195086829491936,-0.08384458077709611 11 | total_turnover,Utilities,-0.06710637414072558,-0.11822026772629673,-0.18005363953326778 12 | volume,ConsumerDiscretionary,-0.038821806113053185,-0.06517649803074897,-0.10564426190383719 13 | volume,ConsumerStaples,-0.04881955414007564,-0.07004912944675076,-0.10700285231017939 14 | volume,Energy,-0.05050533179526796,-0.05539159512201123,-0.04449211070962409 15 | volume,Financials,-0.037379891386358104,-0.06717310704498229,-0.11532946922437222 16 | volume,HealthCare,-0.018552307919222938,-0.031143940450130612,-0.06310080118478258 17 | volume,Industrials,-0.05369190239494951,-0.09056039930841121,-0.12912730395652844 18 | volume,InformationTechnology,-0.03270391959214507,-0.06303027646641893,-0.12181248844576387 19 | volume,Materials,-0.05213678473116451,-0.09941954650749407,-0.1433535572754988 20 | 
volume,TelecommunicationServices,-0.03401630796772099,-0.013292433537832311,-0.028629856850715747 21 | volume,Utilities,-0.053355632253003354,-0.07805499426744068,-0.09309882686634807 22 | market_cap,ConsumerDiscretionary,-0.018642170440300053,-0.08071467806160568,-0.15194577884185356 23 | market_cap,ConsumerStaples,-0.022631363847695745,-0.09367476053930583,-0.1991416188736341 24 | market_cap,Energy,-0.0215733019306775,-0.10099046052166827,-0.23276886886450332 25 | market_cap,Financials,-0.01783881472520913,-0.06584105506902525,-0.12906327329642123 26 | market_cap,HealthCare,-0.016506107254639694,-0.07322090125094058,-0.14809213265805843 27 | market_cap,Industrials,-0.024891064518646394,-0.08946681310150975,-0.1722836413867398 28 | market_cap,InformationTechnology,-0.018400836027949976,-0.06852777925101093,-0.16777140860823717 29 | market_cap,Materials,-0.028921427376547782,-0.12317603287175999,-0.2357473587754678 30 | market_cap,TelecommunicationServices,-0.024539877300613498,-0.12706334273254716,-0.18916155419222905 31 | market_cap,Utilities,-0.021505325470817266,-0.06974356783593436,-0.11709171910355881 32 | a_share_market_val_2,ConsumerDiscretionary,-0.018301595390333388,-0.0834047795877226,-0.1622640445487711 33 | a_share_market_val_2,ConsumerStaples,-0.021353636086466607,-0.08957475782545855,-0.19257272420530416 34 | a_share_market_val_2,Energy,-0.01589375064124426,-0.08269370057080777,-0.22624001661944343 35 | a_share_market_val_2,Financials,-0.013595140013004083,-0.05757239677747778,-0.11097273433879687 36 | a_share_market_val_2,HealthCare,-0.016442154211948196,-0.06823771497421101,-0.1313262609982461 37 | a_share_market_val_2,Industrials,-0.022288503369883857,-0.08422835061983074,-0.16421150228922107 38 | a_share_market_val_2,InformationTechnology,-0.01602024151930178,-0.06606228480096947,-0.16184394862584908 39 | a_share_market_val_2,Materials,-0.026673267375488252,-0.11840215884371369,-0.22834195826194792 40 | 
a_share_market_val_2,TelecommunicationServices,-0.019427402862985693,-0.050102249488752554,-0.006134969325153374 41 | a_share_market_val_2,Utilities,-0.01685842238760016,-0.07250987448493125,-0.12230908395788107 42 | cash_received_from_sales_of_goods,ConsumerDiscretionary,-0.00103083821123597,-0.03206445749325148,-0.06876501903118355 43 | cash_received_from_sales_of_goods,ConsumerStaples,0.0011848183502683814,-0.026253336689890584,-0.048118632633388406 44 | cash_received_from_sales_of_goods,Energy,-0.016016456712267646,-0.08156643860850851,-0.1719692794464853 45 | cash_received_from_sales_of_goods,Financials,0.002049322081554394,-0.01712216286244825,-0.04627365414791241 46 | cash_received_from_sales_of_goods,HealthCare,-0.007054075457333036,-0.04186122320226921,-0.09264530897763122 47 | cash_received_from_sales_of_goods,Industrials,-0.0033712276790714055,-0.026317880861985995,-0.07804687752350649 48 | cash_received_from_sales_of_goods,InformationTechnology,-0.003735428340115983,-0.03292082920277817,-0.09611089193235643 49 | cash_received_from_sales_of_goods,Materials,-0.009364496334334471,-0.062445293690082074,-0.13116837426913705 50 | cash_received_from_sales_of_goods,TelecommunicationServices,-0.0010224948875255625,0.013292433537832311,0.044989775051124746 51 | cash_received_from_sales_of_goods,Utilities,-0.004586561111603414,-0.020219037138779015,-0.02362352504750876 52 | pb_ratio,ConsumerDiscretionary,-0.02786139236091668,-0.04723618727888782,-0.06416866121080726 53 | pb_ratio,ConsumerStaples,-0.029475817225151503,-0.07143808423799812,-0.12540901349300188 54 | pb_ratio,Energy,-0.0036947086202947805,0.00801451621850763,0.050710527187911955 55 | pb_ratio,Financials,-0.02539575463258946,-0.05441037179139589,-0.08509951720797193 56 | pb_ratio,HealthCare,-0.01773232906393156,-0.005604430266716481,-0.0050057836964287755 57 | pb_ratio,Industrials,-0.02302510249518233,-0.04874561973483434,-0.0640169980335691 58 | 
pb_ratio,InformationTechnology,-0.02211961063266852,-0.05069932164964732,-0.0718467294959841 59 | pb_ratio,Materials,-0.01959489813139817,-0.039595205878501294,-0.06585604305979385 60 | pb_ratio,TelecommunicationServices,-0.002044989775051125,-0.016359918200409,-0.028629856850715747 61 | pb_ratio,Utilities,-0.026500309923510785,-0.07579696587327645,-0.1316871231270864 62 | net_profit,ConsumerDiscretionary,0.004534976198255834,-0.024587145489383917,-0.06697091876991088 63 | net_profit,ConsumerStaples,0.001762134488588463,-0.030557169605312656,-0.09508911979085215 64 | net_profit,Energy,-0.0037239591056138937,-0.05576234882359838,-0.14286158066900853 65 | net_profit,Financials,0.005619412446675389,-0.004552703253618796,-0.03282171908115013 66 | net_profit,HealthCare,0.004023627795232325,-0.0122560368659469,-0.054647179895811165 67 | net_profit,Industrials,0.005922822373344156,-0.004876234673987302,-0.037687191394743746 68 | net_profit,InformationTechnology,0.0034451143466464815,-0.01467883275591999,-0.07454590190755 69 | net_profit,Materials,0.007775417553720173,-0.017608951812301852,-0.097233933891964 70 | net_profit,TelecommunicationServices,-0.0010224948875255625,0.013292433537832311,0.044989775051124746 71 | net_profit,Utilities,0.003587155195185814,-0.009910187986564376,-0.04759442386949663 72 | ps_ratio,ConsumerDiscretionary,-0.012080771280731099,-0.018288440941963535,-0.014703891059025101 73 | ps_ratio,ConsumerStaples,-0.019200947719015923,-0.04964656897109551,-0.12391325426910196 74 | ps_ratio,Energy,0.009579091016352978,0.04035361462838712,0.07135150759411069 75 | ps_ratio,Financials,-0.014126989377467987,-0.014116943885514285,-0.0199666437383381 76 | ps_ratio,HealthCare,-0.0050853830262781955,-0.005927366792048748,0.0016129868796054226 77 | ps_ratio,Industrials,-0.008480756136386746,-0.010614464339011856,0.0075018939060060375 78 | ps_ratio,InformationTechnology,-0.01071146507594663,-0.016407405031958085,-0.013131536623179884 79 | 
ps_ratio,Materials,-0.00811664224948075,-0.003958926420953927,0.006930722032411271 80 | ps_ratio,TelecommunicationServices,-0.00408997955010225,-0.02556237218813906,-0.053169734151329244 81 | ps_ratio,Utilities,-0.011680459426909625,-0.03175334201449768,-0.08629345861643857 82 | -------------------------------------------------------------------------------- /report/Alpha Trading Workflow.md: -------------------------------------------------------------------------------- 1 | # Alpha Trading Workflow 2 | 3 | Analyst: Yuxuan Xia 4 | 5 | Date: 2018/06/04 6 | 7 | ## TODO 8 | 9 | * Input more effective factors: take advice from people and industry reports 10 | * Should add technical analysis, because it matters! People care about them and then make it good sentimental indexes. 11 | * Find well-known metrics to express results 12 | 13 | ## Workflow 14 | \checkmark stands for finished and \vartriangle stands for TODO 15 | 16 | * Universe definition 17 | * Factors collection and preprocessing 18 | * $\vartriangle$ Factors collection 19 | - Sources 20 | - balance sheet 21 | - cash flow statement 22 | - income statement 23 | - earning report 24 | - Econometric Classifications 25 | - value 26 | - growth 27 | - profitability 28 | - market size 29 | - liquidity 30 | - volatility 31 | - Momentom 32 | - Financial leverage (debt-to-equity ratio) 33 | * Factors preprocessing 34 | - $\vartriangle$daily, quaterly, annually 35 | - continuous: rescale, outliers 36 | - $\checkmark$discrete: rank 37 | * Factors screening and combination 38 | * Factors screening 39 | - $\checkmark$Factors' correlation 40 | - $\checkmark$Factors' foreseeablity 41 | - Fama-Macbeth regression 42 | * $\vartriangle$Factors combination 43 | - PCA, FA 44 | - Techniqual Analaysis 45 | - Financial Modeling 46 | - Linear combination to maximize Sharpe ratio 47 | - Non-linear learning algorithms 48 | - $\checkmark$AdaBoost 49 | - Reinforcement learning 50 | 51 | * Portfolio allocation 52 | 53 | 54 | ## Factors' 
Correlations 55 | Here, I use correlation matrix as the measure. The difference from the second result is that the correlation matrix is calculated by the rank data rather than the raw data 56 | ### Two ICs comparison 57 | * Pearson's IC: measures linear relationship between components 58 | 59 | * Spearman's IC: measures monotonic relationship between components. Since We only care about the monotonic relationships. Spearman's IC wins. 60 | 61 | 62 | ### Regular IC(Pearson's correlation coefficient) for each factors 63 | ![](Corr matrix for raw factors.png) 64 | ### Spearman's Rank correlation coefficient for each factors 65 | ![](Corr matrix for factor ranks.png) 66 | 67 | ### How to rule out redundant factors and why Spearman's rank correlation coefficients? 68 | From the correlation coefficients below, we can again conclude that Spearman's rank IC is far more robust. Take ps_ratio and sales_yield as a example. 69 | $$ps\_ratio = \frac{\mbox{adjusted close price}}{\mbox{sales per share}}$$ 70 | whereas 71 | $$sales\_yield = \frac{\mbox{sales per share}}{\mbox{price}}$$ 72 | Ahthogh the price in sales_yield formula is vague in our data source we can see roughly speaking, these two variable should be inverse of each other. The Spearman's rank correlation coefficient is -0.98 which verifies this statement, and we should avoid using both of these factors, which would exeggarate the impact of this peticular factor. However, we can not see such identity in the Pearson's regular correlation coefficients. It's quite misleading actually and that's why we choose Spearman's rank IC. 73 | 74 | ## Factors' Foreseeability 75 | 76 | ### Mehods 77 | * Spearman's rank correlation coefficients 78 | * Fama-Macbeth regression: Not only consider the foreseeability of factors itself but also consider the co-vary of different factors, which means rule out factors if the returns can be explained by the recent factors. 79 | 80 | 81 | ### Spearman's rank IC for factors vs. 
forward returns 82 | 83 | ![](mean spearmans rank IC.png) 84 | 85 | ### Spearman's rank IC (absolute value) for factors vs. forward returns 86 | ![](mean spearmans rank IC (absolute value).png) 87 | 88 | ### Rank of the Spearman's rank IC (absolute value) for factors vs. forward returns 89 | ![](rank of mean spearmans rank IC (absolute value).png) 90 | 91 | ## Factors Preprocessing 92 | * Get ranked data 93 | * Obtain the valid stocks set 94 | * Reshape the data: only valid stocks set 95 | * Fill null: using daily average 96 | * Rescale the data: MinMaxScaler 97 | * Variet reduction: PCA analysis 98 | * Sanity check 99 | 100 | ![](corr comparison after pca analysis.png) 101 | 102 | Here, I use principle component analysis because it can brings two benefits to our data - orthogonality and dimensionality reduction. Orthogonality makes data more separate, less dimensionality makes information more concentrated. Either of them is essential for machine learning algorithms. 103 | 104 | In the next part, I used this preprocessed data as the input to obtain a "mega alpha". 105 | 106 | ## Mega Alpha 107 | construct an aggregate alpha factor which has its return distribution profitable. The term "profitable" here means condense, little turnover, significant in the positive return. 108 | ### Methods 109 | #### linear methods 110 | * normalize factors and try a linear combination 111 | * rank each factor and then sum up 112 | * Financial modeling 113 | * linear combination to maximize Sharpe ratio 114 | 115 | #### Non-linear methods 116 | * AdaBoost 117 | * Reinforement Learning 118 | 119 | ### AdaBoost 120 | #### Description 121 | The algorithm sequentially applies a weak classification to modified versions of the data. By increasing the weights of the missclassified observations, each weak learner focuses on the error of the previous one. The predictions are aggregated through a weighted majority vote. 
122 | 123 | #### Algorithm 124 | 125 | ![](adaboost_algorithm.png) 126 | 127 | #### Train set 128 | The adaboost classifier was applied to our fundamental dataset. The objective is to train a classifier which give a score for the bunch of factors. Or in other word, the mega alpha. Pink for the positive forward returns observations and blue for the negative forward returns observations. A good score system is to make the two classes more separated. 129 | ![](train_score_dist.png) 130 | We can see, in train set, AdaBoost classifier did so well! The next plot is the precision in each quantile of scores. In the top and bottom quantile, the predicted precision is nearly 100%! 131 | ![](train_accuracy_bar.png) 132 | 133 | #### Test set 134 | alpha values histogram 135 | ![](test_score_dist.png) 136 | quantile precision bar plot 137 | ![](test_accuracy_bar.png) 138 | The precision in the top and bottom quantile is only slightly higher than 50%. Far from good if we considered transaction cost. Frankly, there are plenty of works should be done before we get some satisfied results. Anyway, this pipeline gives us a flexible routine and a judgement system. I'll continue to tweak the routine and factors to make sure it goes on the right direction. 139 | 140 | ## References 141 | * Jonathan Larkin, *A Professional Quant Equity Workflow*. August 31, 2016 142 | * *A Practitioner‘s Guide to Factor Models*. The Research Foundation of The Institute of Chartered Financial Analysts 143 | * Thomas Wiecki, Machine Learning on Quantopian 144 | * Inigo Fraser Jenkins, *Using factors with different alpha decay times: The case for non-linear combination*  145 | * PNC, *Factor Analysis: What Drives Performance?* 146 | * O’Shaughnessy, *Alpha or Assets? — Factor Alpha vs. Smart Beta*. 
April 2016 147 | * *O’Shaughnessy Quarterly Investor Letter Q1 2018*  148 | * Jiantao Zhu, Orient Securities, *Alpha Forecasting - Factor-Based Strategy Research Series 13* 149 | * Yang Song, Bohai Securities, *Multi-Factor Models Research: Single Factor Testing*, 2017/10/11 -------------------------------------------------------------------------------- /report/Corr_matrix_for_factor_ranks.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jerryxyx/AlphaTrading/5e73923786297faeadb27c76f83ec81fad74af51/report/Corr_matrix_for_factor_ranks.png -------------------------------------------------------------------------------- /report/Corr_matrix_for_raw_factors.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jerryxyx/AlphaTrading/5e73923786297faeadb27c76f83ec81fad74af51/report/Corr_matrix_for_raw_factors.png -------------------------------------------------------------------------------- /report/Quantitative Strategy Workflow.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jerryxyx/AlphaTrading/5e73923786297faeadb27c76f83ec81fad74af51/report/Quantitative Strategy Workflow.pptx -------------------------------------------------------------------------------- /report/adaboost_algorithm.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jerryxyx/AlphaTrading/5e73923786297faeadb27c76f83ec81fad74af51/report/adaboost_algorithm.png -------------------------------------------------------------------------------- /report/corr_comparison_after_pca_analysis.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jerryxyx/AlphaTrading/5e73923786297faeadb27c76f83ec81fad74af51/report/corr_comparison_after_pca_analysis.png 
-------------------------------------------------------------------------------- /report/mean_spearmans_rank_IC.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jerryxyx/AlphaTrading/5e73923786297faeadb27c76f83ec81fad74af51/report/mean_spearmans_rank_IC.png -------------------------------------------------------------------------------- /report/mean_spearmans_rank_IC_absolute_value.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jerryxyx/AlphaTrading/5e73923786297faeadb27c76f83ec81fad74af51/report/mean_spearmans_rank_IC_absolute_value.png -------------------------------------------------------------------------------- /report/rank_of_mean_spearmans_rank_IC_absolute_value.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jerryxyx/AlphaTrading/5e73923786297faeadb27c76f83ec81fad74af51/report/rank_of_mean_spearmans_rank_IC_absolute_value.png -------------------------------------------------------------------------------- /report/test_accuracy_bar.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jerryxyx/AlphaTrading/5e73923786297faeadb27c76f83ec81fad74af51/report/test_accuracy_bar.png -------------------------------------------------------------------------------- /report/test_score_dist.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jerryxyx/AlphaTrading/5e73923786297faeadb27c76f83ec81fad74af51/report/test_score_dist.png -------------------------------------------------------------------------------- /report/train_accuracy_bar.png: -------------------------------------------------------------------------------- 
import pandas as pd
import alphalens as al
import matplotlib.pyplot as plt
import numpy as np


def price_reader(price_path):
    """Load the wide close-price CSV into a date-indexed DataFrame.

    Parameters
    ----------
    price_path : str
        CSV whose first (unnamed) column holds trade dates and whose
        remaining columns are per-instrument close prices.

    Returns
    -------
    pd.DataFrame
        Close prices indexed by ``date``, instrument columns sorted by name.
    """
    price_df = pd.read_csv(price_path)
    price_df.rename(index=str, columns={"Unnamed: 0": "date"}, inplace=True)
    price_df.date = pd.to_datetime(price_df.date, format="%Y-%m-%d", errors='ignore')
    # price_df.date = price_df.date.apply(timezone.localize)
    price_df.set_index(['date'], drop=True, inplace=True)
    # DataFrame.sortlevel() was deprecated in pandas 0.20 and removed in
    # 0.25; sort_index(axis=1) is the documented replacement with the same
    # behavior (sorts the instrument columns lexicographically).
    price_df = price_df.sort_index(axis=1)
    return price_df


def instrument_reader(instrument_path):
    """Load instrument metadata, indexed and sorted by ``bookId``.

    Parameters
    ----------
    instrument_path : str
        CSV with a throwaway ``Unnamed: 0`` column and a ``bookId`` column.

    Returns
    -------
    pd.DataFrame
        One row per instrument, indexed by ``bookId``, sorted by index.
    """
    instrument_df = pd.read_csv(instrument_path)
    instrument_df.drop(['Unnamed: 0'], axis=1, inplace=True)
    instrument_df = instrument_df.set_index(['bookId'])
    instrument_df = instrument_df.sort_index()
    return instrument_df


def equity_reader(equity_path):
    """Load daily equity fundamentals with a (date, order_book_id) MultiIndex.

    Parameters
    ----------
    equity_path : str
        CSV containing at least ``date`` and ``order_book_id`` columns plus
        a throwaway ``Unnamed: 0`` column.

    Returns
    -------
    pd.DataFrame
        Fundamentals indexed by ``(date, order_book_id)``.
    """
    cn_df = pd.read_csv(equity_path)
    cn_df.date = pd.to_datetime(cn_df.date, format="%Y-%m-%d", errors='ignore')
    cn_df.set_index(['date', 'order_book_id'], drop=True, inplace=True)
    cn_df.drop(["Unnamed: 0"], axis=1, inplace=True)
    return cn_df


def benchmark_reader(benchmark_path):
    """Load a headerless two-column (date, value) benchmark CSV.

    Adds a ``return`` column of log returns, with the first (undefined)
    entry filled with 0.

    NOTE(review): the formula log(value[t-1] / value[t]) is the *negative*
    of the conventional log return log(value[t] / value[t-1]).  It is
    preserved as-is because downstream code may rely on the sign -- TODO
    confirm against callers.
    """
    benchmark_df = pd.read_csv(benchmark_path, names=['date', 'value'])
    benchmark_df = benchmark_df.set_index('date', drop=True)
    # Operate on the single 'value' Series explicitly (the original divided
    # the whole one-column frame); the resulting values are identical but
    # the column assignment is now a plain, unambiguous Series assignment.
    value = benchmark_df['value']
    benchmark_df['return'] = np.log(value.shift(1) / value).fillna(0)
    return benchmark_df


def equity_add_instrumentInfo(cn_df, instrument_df, instrument_column):
    """Append one instrument-level attribute to every equity row.

    Looks up ``instrument_column`` in ``instrument_df`` (indexed by bookId)
    for each row's ``order_book_id`` and stores the result as a new column
    on ``cn_df``.  ``cn_df`` is mutated in place and also returned.
    """
    instrumentInfoSeries = instrument_df[instrument_column]
    bookIdIdx = cn_df.index.get_level_values('order_book_id')
    # Index.get_values() was deprecated in pandas 0.25; ``.values`` is the
    # long-standing equivalent and returns the same ndarray.
    bookIdArray = bookIdIdx.values
    instrumentInfo = instrumentInfoSeries[bookIdArray].values
    cn_df[instrument_column] = instrumentInfo
    return cn_df


def get_price_instrument_equity(price_path, instrument_path, equity_path, addInstrumentColumn=None):
    """Convenience loader for the three core datasets.

    Parameters
    ----------
    price_path, instrument_path, equity_path : str
        Paths fed to :func:`price_reader`, :func:`instrument_reader` and
        :func:`equity_reader` respectively.
    addInstrumentColumn : str, optional
        Name of an instrument attribute (e.g. ``'sectorCode'``) to join
        onto the equity frame as an extra column.

    Returns
    -------
    (pd.DataFrame, pd.DataFrame, pd.DataFrame)
        ``(price_df, instrument_df, equity_df)``.
    """
    price_df = price_reader(price_path)
    instrument_df = instrument_reader(instrument_path)
    equity_df = equity_reader(equity_path)
    if addInstrumentColumn:
        equity_df = equity_add_instrumentInfo(equity_df, instrument_df, addInstrumentColumn)
    return price_df, instrument_df, equity_df


def ic_analysis(equity_df, price_df, factor_columns, group_column, periods=(1, 22, 66), group_adjust=False):
    """Run an alphalens information-coefficient study for each factor column.

    For every column in ``factor_columns``, computes the mean IC by group
    (printed and collected) and the mean monthly IC (plotted as a heatmap).

    Parameters
    ----------
    equity_df : pd.DataFrame
        Fundamentals indexed by ``(date, order_book_id)``; must contain
        ``factor_columns`` and ``group_column``.
    price_df : pd.DataFrame
        Wide close-price frame as returned by :func:`price_reader`.
    factor_columns : iterable of str
        Factor columns of ``equity_df`` to analyze.
    group_column : str
        Column of ``equity_df`` used as the alphalens grouping key.
    periods : tuple of int, optional
        Forward-return horizons in trading days.
    group_adjust : bool, optional
        Passed through to alphalens (demean returns within each group).

    Returns
    -------
    (pd.DataFrame, list)
        ``mean_ic_df`` indexed by ``(factor, group)`` and the list of
        mean-monthly-IC frames, one per factor.
    """
    factor_list = []
    ic_list = []
    monthly_ic_list = []
    groupby = equity_df[group_column]
    for col in factor_columns:
        factor_list.append(equity_df[col])

    for my_factor in factor_list:
        # max_loss=1 tolerates dropping up to 100% of observations during
        # alphalens' cleaning step instead of raising MaxLossExceededError.
        factor_data = al.utils.get_clean_factor_and_forward_returns(factor=my_factor,
                                                                    prices=price_df,
                                                                    groupby=groupby,
                                                                    periods=periods,
                                                                    max_loss=1)
        mean_ic = al.performance.mean_information_coefficient(factor_data, group_adjust=group_adjust,
                                                              by_group=True,
                                                              by_time=None)
        mean_monthly_ic = al.performance.mean_information_coefficient(factor_data, group_adjust=group_adjust,
                                                                      by_group=False,
                                                                      by_time='M')
        print("#######################################################")
        print("factor: {}".format(my_factor.name))
        print(mean_ic)
        # print(mean_monthly_ic)
        ic_list.append(mean_ic)
        monthly_ic_list.append(mean_monthly_ic)
        al.plotting.plot_monthly_ic_heatmap(mean_monthly_ic)
        plt.show()

    mean_ic_df = pd.concat(ic_list, keys=factor_columns)
    mean_ic_df.index = mean_ic_df.index.set_names(['factor', 'group'])
    return mean_ic_df, monthly_ic_list
date_end_dt.strftime('%Y-%m-%d')\n", 44 | "\n", 45 | "# Construct Stock Population\n", 46 | "stock_all = all_instruments(type=\"CS\", country='cn', date=date_start_dt)\n", 47 | "stock_list = stock_all['order_book_id'].tolist()\n", 48 | "print(\"Population Check - Initial #: {}\".format(stock_all.shape[0]))" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": 2, 54 | "metadata": { 55 | "collapsed": false 56 | }, 57 | "outputs": [], 58 | "source": [ 59 | "price_data = get_price(stock_list, start_date=date_start, end_date=date_end, frequency='1d', \n", 60 | " fields=['close'], \n", 61 | " adjust_type='pre', skip_suspended=False, country='cn')\n", 62 | "price_data.to_csv(\"cn_stock_price_{}_{}.csv\".format(year_start,year_end)) # Download price data" 63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": 3, 68 | "metadata": { 69 | "collapsed": false 70 | }, 71 | "outputs": [ 72 | { 73 | "data": { 74 | "text/html": [ 75 | "
\n", 76 | "\n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | "
300188.XSHE600337.XSHG600168.XSHG002337.XSHE600592.XSHG000950.XSHE600991.XSHG002473.XSHE600784.XSHG600736.XSHG...600345.XSHG600387.XSHG000063.XSHE002506.XSHE300151.XSHE002579.XSHE000563.XSHE000551.XSHE002578.XSHE000726.XSHE
2012-01-044.34013.17506.36932.87526.83144.016815.087.28985.01044.0135...9.43738.771113.36912.64062.00553.40372.31295.71811.89316.0165
2012-01-054.07432.85836.16972.67396.17373.910115.086.95054.50893.8534...8.84568.556013.28952.39201.88723.14522.29425.25251.78776.0005
2012-01-064.11732.69205.95112.71486.22074.142915.117.07654.61113.9570...8.93158.428912.94732.46241.91593.17242.24755.27871.78976.2645
2012-01-094.40412.77916.21722.81546.47444.278715.127.44494.87414.2114...9.37058.976513.29752.59912.00943.31942.33625.70501.88146.4965
2012-01-104.51242.91376.45492.95706.78454.443715.297.73575.13704.2962...9.77129.328513.88642.75652.11433.42552.40875.94102.00246.6565
\n", 226 | "

5 rows × 2320 columns

\n", 227 | "
" 228 | ], 229 | "text/plain": [ 230 | " 300188.XSHE 600337.XSHG 600168.XSHG 002337.XSHE 600592.XSHG \\\n", 231 | "2012-01-04 4.3401 3.1750 6.3693 2.8752 6.8314 \n", 232 | "2012-01-05 4.0743 2.8583 6.1697 2.6739 6.1737 \n", 233 | "2012-01-06 4.1173 2.6920 5.9511 2.7148 6.2207 \n", 234 | "2012-01-09 4.4041 2.7791 6.2172 2.8154 6.4744 \n", 235 | "2012-01-10 4.5124 2.9137 6.4549 2.9570 6.7845 \n", 236 | "\n", 237 | " 000950.XSHE 600991.XSHG 002473.XSHE 600784.XSHG 600736.XSHG \\\n", 238 | "2012-01-04 4.0168 15.08 7.2898 5.0104 4.0135 \n", 239 | "2012-01-05 3.9101 15.08 6.9505 4.5089 3.8534 \n", 240 | "2012-01-06 4.1429 15.11 7.0765 4.6111 3.9570 \n", 241 | "2012-01-09 4.2787 15.12 7.4449 4.8741 4.2114 \n", 242 | "2012-01-10 4.4437 15.29 7.7357 5.1370 4.2962 \n", 243 | "\n", 244 | " ... 600345.XSHG 600387.XSHG 000063.XSHE 002506.XSHE \\\n", 245 | "2012-01-04 ... 9.4373 8.7711 13.3691 2.6406 \n", 246 | "2012-01-05 ... 8.8456 8.5560 13.2895 2.3920 \n", 247 | "2012-01-06 ... 8.9315 8.4289 12.9473 2.4624 \n", 248 | "2012-01-09 ... 9.3705 8.9765 13.2975 2.5991 \n", 249 | "2012-01-10 ... 
9.7712 9.3285 13.8864 2.7565 \n", 250 | "\n", 251 | " 300151.XSHE 002579.XSHE 000563.XSHE 000551.XSHE 002578.XSHE \\\n", 252 | "2012-01-04 2.0055 3.4037 2.3129 5.7181 1.8931 \n", 253 | "2012-01-05 1.8872 3.1452 2.2942 5.2525 1.7877 \n", 254 | "2012-01-06 1.9159 3.1724 2.2475 5.2787 1.7897 \n", 255 | "2012-01-09 2.0094 3.3194 2.3362 5.7050 1.8814 \n", 256 | "2012-01-10 2.1143 3.4255 2.4087 5.9410 2.0024 \n", 257 | "\n", 258 | " 000726.XSHE \n", 259 | "2012-01-04 6.0165 \n", 260 | "2012-01-05 6.0005 \n", 261 | "2012-01-06 6.2645 \n", 262 | "2012-01-09 6.4965 \n", 263 | "2012-01-10 6.6565 \n", 264 | "\n", 265 | "[5 rows x 2320 columns]" 266 | ] 267 | }, 268 | "execution_count": 3, 269 | "metadata": {}, 270 | "output_type": "execute_result" 271 | } 272 | ], 273 | "source": [ 274 | "price_data.head()" 275 | ] 276 | }, 277 | { 278 | "cell_type": "code", 279 | "execution_count": 4, 280 | "metadata": { 281 | "collapsed": false 282 | }, 283 | "outputs": [], 284 | "source": [ 285 | "trade_data = get_price(stock_list, start_date=date_start, end_date=date_end, frequency='1d', \n", 286 | " fields=['close', 'total_turnover', 'volume'], \n", 287 | " adjust_type='pre', skip_suspended=False, country='cn')" 288 | ] 289 | }, 290 | { 291 | "cell_type": "code", 292 | "execution_count": 5, 293 | "metadata": { 294 | "collapsed": true 295 | }, 296 | "outputs": [], 297 | "source": [ 298 | "return_data = get_price_change_rate(stock_list, start_date=date_start, end_date=date_end)" 299 | ] 300 | }, 301 | { 302 | "cell_type": "code", 303 | "execution_count": 6, 304 | "metadata": { 305 | "collapsed": true 306 | }, 307 | "outputs": [], 308 | "source": [ 309 | "turnover_data = get_turnover_rate(stock_list, date_start, date_end, fields=['week', 'month'])" 310 | ] 311 | }, 312 | { 313 | "cell_type": "code", 314 | "execution_count": 9, 315 | "metadata": { 316 | "collapsed": false 317 | }, 318 | "outputs": [], 319 | "source": [ 320 | "instrument_info = instruments(stock_list)" 321 | ] 322 | }, 323 | 
{ 324 | "cell_type": "code", 325 | "execution_count": 25, 326 | "metadata": { 327 | "collapsed": false 328 | }, 329 | "outputs": [ 330 | { 331 | "data": { 332 | "text/plain": [ 333 | "Instrument(industry_name='软件和信息技术服务业', sector_code_name='信息技术', abbrev_symbol='MYBK', listed_date='2011-03-16', exchange='XSHE', symbol='美亚柏科', industry_code='I65', round_lot=100.0, order_book_id='300188.XSHE', special_type='Normal', shenwan_industry_name='计算机', de_listed_date='0000-00-00', type='CS', sector_code='InformationTechnology', board_type='GEM', shenwan_industry_code='801750.INDX', status='Active')" 334 | ] 335 | }, 336 | "execution_count": 25, 337 | "metadata": {}, 338 | "output_type": "execute_result" 339 | } 340 | ], 341 | "source": [ 342 | "instrument_info[0]" 343 | ] 344 | }, 345 | { 346 | "cell_type": "code", 347 | "execution_count": 27, 348 | "metadata": { 349 | "collapsed": false 350 | }, 351 | "outputs": [], 352 | "source": [ 353 | "# Download instrument information\n", 354 | "\n", 355 | "bookId_list = []\n", 356 | "exchange_list = []\n", 357 | "abbrevSymbol_list = []\n", 358 | "shenwanIndustryCode_list = []\n", 359 | "shenwanIndustryName_list = []\n", 360 | "industryCode_list = []\n", 361 | "industryName_list = []\n", 362 | "sectorCode_list = []\n", 363 | "sectorName_list = []\n", 364 | "for inst in instrument_info:\n", 365 | " bookId_list.append(inst.order_book_id)\n", 366 | " exchange_list.append(inst.exchange)\n", 367 | " abbrevSymbol_list.append(inst.abbrev_symbol)\n", 368 | " shenwanIndustryCode_list.append(inst.shenwan_industry_code)\n", 369 | " shenwanIndustryName_list.append(inst.shenwan_industry_name)\n", 370 | " industryCode_list.append(inst.industry_code)\n", 371 | " industryName_list.append(inst.industry_name)\n", 372 | " sectorCode_list.append(inst.sector_code)\n", 373 | " sectorName_list.append(inst.sector_code_name)\n", 374 | " \n", 375 | "instrument_df = pd.DataFrame({\"bookId\":bookId_list,\n", 376 | " \"exchange\":exchange_list,\n", 377 | " 
\"abbrevSymbol\":abbrevSymbol_list,\n", 378 | " \"shenwanIndustryCode\":shenwanIndustryCode_list,\n", 379 | " \"shenwanIndustryName\":shenwanIndustryName_list,\n", 380 | " \"industryCode\":industryCode_list,\n", 381 | " \"industryName\":industryName_list,\n", 382 | " \"sectorCode\":sectorCode_list,\n", 383 | " \"sectorName\":sectorName_list})" 384 | ] 385 | }, 386 | { 387 | "cell_type": "code", 388 | "execution_count": 29, 389 | "metadata": { 390 | "collapsed": false 391 | }, 392 | "outputs": [], 393 | "source": [ 394 | "instrument_df.to_csv(\"cn_instrument_info_{}_{}.csv\".format(year_start,year_end))" 395 | ] 396 | }, 397 | { 398 | "cell_type": "code", 399 | "execution_count": 7, 400 | "metadata": { 401 | "collapsed": false 402 | }, 403 | "outputs": [ 404 | { 405 | "name": "stdout", 406 | "output_type": "stream", 407 | "text": [ 408 | "Date: 2012-01-04 00:00:00 | Progress: 0.05875440658049354%\n", 409 | "Date: 2012-01-05 00:00:00 | Progress: 0.11750881316098707%\n", 410 | "Date: 2012-01-06 00:00:00 | Progress: 0.17626321974148063%\n", 411 | "Date: 2012-01-09 00:00:00 | Progress: 0.23501762632197415%\n", 412 | "Date: 2012-01-10 00:00:00 | Progress: 0.2937720329024677%\n", 413 | "Date: 2012-01-11 00:00:00 | Progress: 0.35252643948296125%\n", 414 | "Date: 2012-01-12 00:00:00 | Progress: 0.4112808460634548%\n", 415 | "Date: 2012-01-13 00:00:00 | Progress: 0.4700352526439483%\n", 416 | "Date: 2012-01-16 00:00:00 | Progress: 0.5287896592244419%\n", 417 | "Date: 2012-01-17 00:00:00 | Progress: 0.5875440658049353%\n", 418 | "Date: 2012-01-18 00:00:00 | Progress: 0.6462984723854289%\n", 419 | "Date: 2012-01-19 00:00:00 | Progress: 0.7050528789659225%\n", 420 | "Date: 2012-01-20 00:00:00 | Progress: 0.763807285546416%\n", 421 | "Date: 2012-01-30 00:00:00 | Progress: 0.8225616921269095%\n", 422 | "Date: 2012-01-31 00:00:00 | Progress: 0.881316098707403%\n", 423 | "Date: 2012-02-01 00:00:00 | Progress: 0.9400705052878966%\n" 424 | ] 425 | }, 426 | { 427 | "ename": 
"KeyboardInterrupt", 428 | "evalue": "", 429 | "traceback": [ 430 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 431 | "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", 432 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 15\u001b[0m \u001b[0;34m.\u001b[0m\u001b[0mfilter\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfundamentals\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mincome_statement\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstockcode\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0min_\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mstock_list\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 16\u001b[0m \u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 17\u001b[0;31m \u001b[0mentry_date\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdt\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0minterval\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'1q'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mreport_quarter\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 18\u001b[0m )\n\u001b[1;32m 19\u001b[0m \u001b[0m_fundamental_data\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_fundamental_data\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mto_frame\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 433 | "\u001b[0;32m/opt/conda/envs/ricequant/lib/python3.5/site-packages/rqcommons/facade.py\u001b[0m in \u001b[0;36mwrap\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 29\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'Not inited yet. 
Please call rqdatac.init() first.'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 30\u001b[0m \u001b[0;32mreturn\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 31\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 32\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mwrap\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 33\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", 434 | "\u001b[0;32m/opt/conda/envs/ricequant/lib/python3.5/site-packages/rqcommons/facade.py\u001b[0m in \u001b[0;36mget_fundamentals\u001b[0;34m(query, entry_date, interval, report_quarter, country)\u001b[0m\n\u001b[1;32m 314\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mget_fundamentals\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mquery\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mentry_date\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0minterval\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mreport_quarter\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mFalse\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcountry\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'cn'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 315\u001b[0m \u001b[0;34m\"\"\"获取财务数据\"\"\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 316\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mimplmentation\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget_fundamentals\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mquery\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mentry_date\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0minterval\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mreport_quarter\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcountry\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 317\u001b[0m 
\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 318\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", 435 | "\u001b[0;32m/opt/conda/envs/ricequant/lib/python3.5/site-packages/rqdatac/implementation.py\u001b[0m in \u001b[0;36mget_fundamentals\u001b[0;34m(cls, query, entry_date, interval, report_quarter, country)\u001b[0m\n\u001b[1;32m 251\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 252\u001b[0m \u001b[0mquery\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_unsafe_apply_query_filter\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mquery\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtrading_dates\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 253\u001b[0;31m \u001b[0mrecords\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcls\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_run_fundamental_query\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0m_compile_query\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mquery\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcountry\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mcountry\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 254\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mException\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 255\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'ERROR: internal error, please contact public@ricequant.com. 
exception: {}'\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mformat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0me\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 436 | "\u001b[0;32m/opt/conda/envs/ricequant/lib/python3.5/site-packages/rqdatac/implementation.py\u001b[0m in \u001b[0;36m_compile_query\u001b[0;34m(query)\u001b[0m\n\u001b[1;32m 30\u001b[0m \u001b[0mparams\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 31\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mk\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mcomp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpositiontup\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 32\u001b[0;31m \u001b[0mv\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcomp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mparams\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mk\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 33\u001b[0m \u001b[0mparams\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mappend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mescape_item\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mv\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mconversions\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mencoders\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 34\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", 437 | "\u001b[0;32m/opt/conda/envs/ricequant/lib/python3.5/site-packages/sqlalchemy/sql/compiler.py\u001b[0m in \u001b[0;36mparams\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 576\u001b[0m \"\"\"Return the bind param dictionary embedded into this\n\u001b[1;32m 577\u001b[0m compiled object, for those values that are present.\"\"\"\n\u001b[0;32m--> 578\u001b[0;31m \u001b[0;32mreturn\u001b[0m 
\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mconstruct_params\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0m_check\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mFalse\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 579\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 580\u001b[0m \u001b[0;34m@\u001b[0m\u001b[0mutil\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdependencies\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"sqlalchemy.engine.result\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 438 | "\u001b[0;32m/opt/conda/envs/ricequant/lib/python3.5/site-packages/sqlalchemy/sql/compiler.py\u001b[0m in \u001b[0;36mconstruct_params\u001b[0;34m(self, params, _group_number, _check)\u001b[0m\n\u001b[1;32m 569\u001b[0m \u001b[0mpd\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbind_names\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mbindparam\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mbindparam\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0meffective_value\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 570\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 571\u001b[0;31m \u001b[0mpd\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbind_names\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mbindparam\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mbindparam\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mvalue\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 572\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mpd\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 573\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", 439 | "\u001b[0;31mKeyboardInterrupt\u001b[0m: " 440 | ], 441 | "output_type": "error" 442 | } 443 | ], 444 | "source": [ 445 | "fundamental_data = {}\n", 446 | "query_dates = trade_dates[(trade_dates >= date_start_dt) & (trade_dates <= 
date_end_dt)]\n", 447 | "ndates = len(query_dates)\n", 448 | "for counter,dt in enumerate(query_dates):\n", 449 | " print(\"Date: {} | Progress: {}%\".format(dt,(counter+1)/ndates*100))\n", 450 | " _fundamental_data = get_fundamentals(\n", 451 | " query(\n", 452 | " fundamentals.eod_derivative_indicator.market_cap, #总市值\n", 453 | " fundamentals.eod_derivative_indicator.a_share_market_val_2, #流通市值\n", 454 | " fundamentals.cash_flow_statement.cash_received_from_sales_of_goods, #销售额 - 单季/同比\n", 455 | " fundamentals.eod_derivative_indicator.pb_ratio, #净资产/总市值=市净率\n", 456 | " fundamentals.income_statement.net_profit, #净利润\n", 457 | " fundamentals.eod_derivative_indicator.ps_ratio #市销率\n", 458 | " )\n", 459 | " .filter(fundamentals.income_statement.stockcode.in_(stock_list))\n", 460 | " , \n", 461 | " entry_date=dt, interval='1q', report_quarter=True\n", 462 | " )\n", 463 | " _fundamental_data = _fundamental_data.to_frame()\n", 464 | " _fundamental_data.index.names = ['date', 'order_book_id']\n", 465 | " fundamental_data[dt] = _fundamental_data\n", 466 | " \n", 467 | "fundamental_data = pd.concat(fundamental_data)\n", 468 | "fundamental_data.reset_index(level=0, drop=True, inplace=True)" 469 | ] 470 | }, 471 | { 472 | "cell_type": "code", 473 | "execution_count": null, 474 | "metadata": { 475 | "collapsed": false 476 | }, 477 | "outputs": [], 478 | "source": [ 479 | "trade_ts = trade_data.to_frame()\n", 480 | "trade_ts.index.names = ['date', 'order_book_id']\n", 481 | "\n", 482 | "return_ts = pd.DataFrame(return_data.stack(), columns=['return'])\n", 483 | "return_ts.index.names = ['date', 'order_book_id']\n", 484 | "\n", 485 | "turnover_ts = turnover_data.to_frame()\n", 486 | "turnover_ts.index.names = ['date', 'order_book_id']\n", 487 | "\n", 488 | "data = return_ts.merge(trade_ts, how='left', left_index=True, right_index=True)\n", 489 | "data = data.merge(turnover_ts, how='left', left_index=True, right_index=True)\n", 490 | "data = data.merge(fundamental_data, 
how='left', left_index=True, right_index=True)" 491 | ] 492 | }, 493 | { 494 | "cell_type": "code", 495 | "execution_count": null, 496 | "metadata": { 497 | "collapsed": false 498 | }, 499 | "outputs": [], 500 | "source": [ 501 | "data.head()" 502 | ] 503 | }, 504 | { 505 | "cell_type": "code", 506 | "execution_count": null, 507 | "metadata": { 508 | "collapsed": true 509 | }, 510 | "outputs": [], 511 | "source": [ 512 | "# Save Data\n", 513 | "data.to_csv(\"cn_equity_daily_{}_{}.csv\".format(year_start,year_end))" 514 | ] 515 | }, 516 | { 517 | "cell_type": "code", 518 | "execution_count": null, 519 | "metadata": { 520 | "collapsed": true 521 | }, 522 | "outputs": [], 523 | "source": [] 524 | } 525 | ], 526 | "metadata": { 527 | "kernelspec": { 528 | "display_name": "Python 3", 529 | "language": "python", 530 | "name": "python3" 531 | }, 532 | "language_info": { 533 | "codemirror_mode": { 534 | "name": "ipython", 535 | "version": 3 536 | }, 537 | "file_extension": ".py", 538 | "mimetype": "text/x-python", 539 | "name": "python", 540 | "nbconvert_exporter": "python", 541 | "pygments_lexer": "ipython3", 542 | "version": "3.5.5" 543 | } 544 | }, 545 | "nbformat": 4, 546 | "nbformat_minor": 2 547 | } 548 | -------------------------------------------------------------------------------- /source/DownloadData_bak.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 2, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [ 10 | { 11 | "name": "stdout", 12 | "output_type": "stream", 13 | "text": [ 14 | "Population Check - Initial #: 1059\n" 15 | ] 16 | } 17 | ], 18 | "source": [ 19 | "# Constructs Time Series Data for All Stocks\n", 20 | "import pandas as pd\n", 21 | "import numpy as np\n", 22 | "from datetime import datetime\n", 23 | "import tushare as ts\n", 24 | "\n", 25 | "from scipy.stats import rankdata\n", 26 | "\n", 27 | "import seaborn as sns\n", 
28 | "\n", 29 | "# Pull All Trade Dates\n", 30 | "trade_dates = pd.Series(data=[pd.Timestamp(date) for date in get_trading_dates('2001-01-01', '2018-12-31')], name='trade_date')\n", 31 | "\n", 32 | "year_start = 2001\n", 33 | "year_end = 2012\n", 34 | "\n", 35 | "# date_end_last_dt = max(trade_dates[trade_dates.dt.year == year_start-1])\n", 36 | "date_start_dt = min(trade_dates[trade_dates.dt.year == year_start])\n", 37 | "date_end_dt = max(trade_dates[trade_dates.dt.year == year_end])\n", 38 | "\n", 39 | "# date_end_last = date_end_last_dt.strftime('%Y-%m-%d')\n", 40 | "date_start = date_start_dt.strftime('%Y-%m-%d')\n", 41 | "date_end = date_end_dt.strftime('%Y-%m-%d')\n", 42 | "\n", 43 | "# Construct Stock Population\n", 44 | "stock_all = all_instruments(type=\"CS\", country='cn', date=date_start_dt)\n", 45 | "stock_list = stock_all['order_book_id'].tolist()\n", 46 | "print(\"Population Check - Initial #: {}\".format(stock_all.shape[0]))" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": 3, 52 | "metadata": { 53 | "collapsed": false 54 | }, 55 | "outputs": [ 56 | { 57 | "name": "stdout", 58 | "output_type": "stream", 59 | "text": [ 60 | "WARN: start_date is earlier than 2005-01-04, adjusted\n" 61 | ] 62 | } 63 | ], 64 | "source": [ 65 | "trade_data = get_price(stock_list, start_date=date_start, end_date=date_end, frequency='1d', \n", 66 | " fields=['close', 'total_turnover', 'volume'], \n", 67 | " adjust_type='pre', skip_suspended=False, country='cn')\n", 68 | "\n", 69 | "return_data = get_price_change_rate(stock_list, start_date=date_start, end_date=date_end)\n", 70 | "\n", 71 | "turnover_data = get_turnover_rate(stock_list, date_start, date_end, fields=['week', 'month'])\n", 72 | "\n", 73 | "fundamental_data = {}\n", 74 | "for dt in trade_dates[(trade_dates.dt.year >= year_start) & (trade_dates.dt.year <= year_end)]:\n", 75 | " _fundamental_data = get_fundamentals(\n", 76 | " query(\n", 77 | " 
fundamentals.eod_derivative_indicator.market_cap, #总市值\n", 78 | " fundamentals.eod_derivative_indicator.a_share_market_val_2, #流通市值\n", 79 | " fundamentals.cash_flow_statement.cash_received_from_sales_of_goods, #销售额 - 单季/同比\n", 80 | " fundamentals.eod_derivative_indicator.pb_ratio, #净资产/总市值=市净率\n", 81 | " fundamentals.income_statement.net_profit, #净利润\n", 82 | " fundamentals.eod_derivative_indicator.ps_ratio #市销率\n", 83 | " ).filter(fundamentals.income_statement.stockcode.in_(stock_list)), \n", 84 | " entry_date=dt, interval='1q', report_quarter=True\n", 85 | " )\n", 86 | " _fundamental_data = _fundamental_data.to_frame()\n", 87 | " _fundamental_data.index.names = ['date', 'order_book_id']\n", 88 | " fundamental_data[dt] = _fundamental_data\n", 89 | " \n", 90 | "fundamental_data = pd.concat(fundamental_data)\n", 91 | "fundamental_data.reset_index(level=0, drop=True, inplace=True)\n", 92 | "\n", 93 | "# Aggregate Data\n", 94 | "\n", 95 | "trade_ts = trade_data.to_frame()\n", 96 | "trade_ts.index.names = ['date', 'order_book_id']\n", 97 | "\n", 98 | "return_ts = pd.DataFrame(return_data.stack(), columns=['return'])\n", 99 | "return_ts.index.names = ['date', 'order_book_id']\n", 100 | "\n", 101 | "turnover_ts = turnover_data.to_frame()\n", 102 | "turnover_ts.index.names = ['date', 'order_book_id']\n", 103 | "\n", 104 | "data = return_ts.merge(trade_ts, how='left', left_index=True, right_index=True)\n", 105 | "data = data.merge(turnover_ts, how='left', left_index=True, right_index=True)\n", 106 | "data = data.merge(fundamental_data, how='left', left_index=True, right_index=True)\n", 107 | "\n", 108 | "# Save Data\n", 109 | "data.to_csv(\"stock_data_all_2016_2018.csv\")\n" 110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "execution_count": 4, 115 | "metadata": { 116 | "collapsed": false 117 | }, 118 | "outputs": [], 119 | "source": [ 120 | "trade_ts = trade_data.to_frame()\n", 121 | "trade_ts.index.names = ['date', 'order_book_id']\n", 122 | "\n", 123 | 
"return_ts = pd.DataFrame(return_data.stack(), columns=['return'])\n", 124 | "return_ts.index.names = ['date', 'order_book_id']\n", 125 | "\n", 126 | "turnover_ts = turnover_data.to_frame()\n", 127 | "turnover_ts.index.names = ['date', 'order_book_id']\n", 128 | "\n", 129 | "data = return_ts.merge(trade_ts, how='left', left_index=True, right_index=True)\n", 130 | "data = data.merge(turnover_ts, how='left', left_index=True, right_index=True)\n", 131 | "data = data.merge(fundamental_data, how='left', left_index=True, right_index=True)\n" 132 | ] 133 | }, 134 | { 135 | "cell_type": "code", 136 | "execution_count": 5, 137 | "metadata": { 138 | "collapsed": true 139 | }, 140 | "outputs": [], 141 | "source": [ 142 | "# data.to_csv(\"stock_data_all_2005_2012.csv\")\n", 143 | "\n", 144 | "# Break data into monthly chunks\n", 145 | "year = 2011\n", 146 | "for month in range(1,13):\n", 147 | " data_tmp = data.loc[(data['date'].dt.year == year) & (data['date'].dt.month == month), :]\n", 148 | " data_tmp.to_csv(\"stock_data_all_\"+str(year)+\"{0:0=2d}\".format(month)+\".csv\")" 149 | ] 150 | }, 151 | { 152 | "cell_type": "code", 153 | "execution_count": null, 154 | "metadata": { 155 | "collapsed": true 156 | }, 157 | "outputs": [], 158 | "source": [] 159 | } 160 | ], 161 | "metadata": { 162 | "kernelspec": { 163 | "display_name": "Python 3", 164 | "language": "python", 165 | "name": "python3" 166 | }, 167 | "language_info": { 168 | "codemirror_mode": { 169 | "name": "ipython", 170 | "version": 3 171 | }, 172 | "file_extension": ".py", 173 | "mimetype": "text/x-python", 174 | "name": "python", 175 | "nbconvert_exporter": "python", 176 | "pygments_lexer": "ipython3", 177 | "version": "3.5.2" 178 | } 179 | }, 180 | "nbformat": 4, 181 | "nbformat_minor": 2 182 | } 183 | -------------------------------------------------------------------------------- /source/FactorModeling.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 
| { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "from rqdata_utils import *\n", 10 | "import pandas\n", 11 | "import numpy as np\n", 12 | "import scipy as sp\n", 13 | "import alphalens as al\n", 14 | "%matplotlib inline" 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": {}, 20 | "source": [ 21 | "## Loading Data" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": 2, 27 | "metadata": {}, 28 | "outputs": [], 29 | "source": [ 30 | "price_df,instrument_df,equity_df = get_price_instrument_equity(\"cn_stock_price_2012_2018.csv\",\"cn_instrument_info_2012_2018.csv\",\"cn_equity_daily_2012_2018.csv\",\"sectorCode\")" 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": 3, 36 | "metadata": {}, 37 | "outputs": [ 38 | { 39 | "data": { 40 | "text/html": [ 41 | "
\n", 42 | "\n", 43 | " \n", 44 | " \n", 45 | " \n", 46 | " \n", 47 | " \n", 48 | " \n", 49 | " \n", 50 | " \n", 51 | " \n", 52 | " \n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | "
returnclosetotal_turnovervolumeweekmonthreport_quartermarket_capa_share_market_val_2cash_received_from_sales_of_goodspb_rationet_profitps_ratiosectorCode
dateorder_book_id
2012-01-04000001.XSHE-0.0275825.12242.275637e+0840894428.00.57750.4331NaNNaNNaNNaNNaNNaNNaNFinancials
000002.XSHE-0.0187426.05253.559891e+0847432958.00.37110.40302011q38.059489e+107.082120e+107.516785e+101.52164.106349e+090.8679Financials
000004.XSHE-0.0222507.91003.763833e+06465469.00.57200.75062011q36.642556e+086.634549e+085.949968e+078.81754.500363e+0637.5796HealthCare
000005.XSHE0.0000003.86000.000000e+000.00.00000.00002011q33.529328e+093.527048e+092.565851e+075.34801.365665e+07-347.2191Industrials
000006.XSHE-0.0097562.67667.619286e+062513811.00.14160.16672011q34.015370e+093.929464e+092.531436e+091.43482.763917e+081.4139Financials
\n", 170 | "
" 171 | ], 172 | "text/plain": [ 173 | " return close total_turnover volume \\\n", 174 | "date order_book_id \n", 175 | "2012-01-04 000001.XSHE -0.027582 5.1224 2.275637e+08 40894428.0 \n", 176 | " 000002.XSHE -0.018742 6.0525 3.559891e+08 47432958.0 \n", 177 | " 000004.XSHE -0.022250 7.9100 3.763833e+06 465469.0 \n", 178 | " 000005.XSHE 0.000000 3.8600 0.000000e+00 0.0 \n", 179 | " 000006.XSHE -0.009756 2.6766 7.619286e+06 2513811.0 \n", 180 | "\n", 181 | " week month report_quarter market_cap \\\n", 182 | "date order_book_id \n", 183 | "2012-01-04 000001.XSHE 0.5775 0.4331 NaN NaN \n", 184 | " 000002.XSHE 0.3711 0.4030 2011q3 8.059489e+10 \n", 185 | " 000004.XSHE 0.5720 0.7506 2011q3 6.642556e+08 \n", 186 | " 000005.XSHE 0.0000 0.0000 2011q3 3.529328e+09 \n", 187 | " 000006.XSHE 0.1416 0.1667 2011q3 4.015370e+09 \n", 188 | "\n", 189 | " a_share_market_val_2 \\\n", 190 | "date order_book_id \n", 191 | "2012-01-04 000001.XSHE NaN \n", 192 | " 000002.XSHE 7.082120e+10 \n", 193 | " 000004.XSHE 6.634549e+08 \n", 194 | " 000005.XSHE 3.527048e+09 \n", 195 | " 000006.XSHE 3.929464e+09 \n", 196 | "\n", 197 | " cash_received_from_sales_of_goods pb_ratio \\\n", 198 | "date order_book_id \n", 199 | "2012-01-04 000001.XSHE NaN NaN \n", 200 | " 000002.XSHE 7.516785e+10 1.5216 \n", 201 | " 000004.XSHE 5.949968e+07 8.8175 \n", 202 | " 000005.XSHE 2.565851e+07 5.3480 \n", 203 | " 000006.XSHE 2.531436e+09 1.4348 \n", 204 | "\n", 205 | " net_profit ps_ratio sectorCode \n", 206 | "date order_book_id \n", 207 | "2012-01-04 000001.XSHE NaN NaN Financials \n", 208 | " 000002.XSHE 4.106349e+09 0.8679 Financials \n", 209 | " 000004.XSHE 4.500363e+06 37.5796 HealthCare \n", 210 | " 000005.XSHE 1.365665e+07 -347.2191 Industrials \n", 211 | " 000006.XSHE 2.763917e+08 1.4139 Financials " 212 | ] 213 | }, 214 | "execution_count": 3, 215 | "metadata": {}, 216 | "output_type": "execute_result" 217 | } 218 | ], 219 | "source": [ 220 | "equity_df.head()" 221 | ] 222 | }, 223 | { 224 | 
"cell_type": "code", 225 | "execution_count": 4, 226 | "metadata": {}, 227 | "outputs": [ 228 | { 229 | "data": { 230 | "text/plain": [ 231 | "164" 232 | ] 233 | }, 234 | "execution_count": 4, 235 | "metadata": {}, 236 | "output_type": "execute_result" 237 | } 238 | ], 239 | "source": [ 240 | "healthcareUniverse = instrument_df.index[instrument_df.sectorCode=='HealthCare'].values\n", 241 | "len(healthcareUniverse)" 242 | ] 243 | }, 244 | { 245 | "cell_type": "code", 246 | "execution_count": 5, 247 | "metadata": {}, 248 | "outputs": [], 249 | "source": [ 250 | "def equity_universe_filtering(equity_df, universe):\n", 251 | " universeFilter = [book_id in set(universe) for book_id in equity_df.index.get_level_values(level=1).values]\n", 252 | " return equity_df[universeFilter]" 253 | ] 254 | }, 255 | { 256 | "cell_type": "code", 257 | "execution_count": 6, 258 | "metadata": {}, 259 | "outputs": [ 260 | { 261 | "data": { 262 | "text/html": [ 263 | "
\n", 264 | "\n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | "
returnclosetotal_turnovervolumeweekmonthreport_quartermarket_capa_share_market_val_2cash_received_from_sales_of_goodspb_rationet_profitps_ratiosectorCode
dateorder_book_id
2012-01-04000004.XSHE-0.0222507.91003763832.88465469.00.57200.75062011q36.642556e+086.634549e+085.949968e+078.81754.500363e+0637.5796HealthCare
000028.XSHE-0.04543319.84229326924.28450553.00.42010.27222011q35.872485e+094.753820e+091.053298e+104.34932.481834e+080.3414HealthCare
000150.XSHE-0.0302953.17373109304.50952600.00.34600.36102011q31.036800e+091.036800e+094.913279e+071.47633.657858e+067.8956HealthCare
000153.XSHE-0.0280535.77009673054.491596020.00.68302.45942011q31.531454e+091.360856e+091.329425e+092.11691.560397e+070.7818HealthCare
000403.XSHE0.0000003.16250.000.00.00000.0000NaNNaNNaNNaNNaNNaNNaNHealthCare
\n", 392 | "
" 393 | ], 394 | "text/plain": [ 395 | " return close total_turnover volume \\\n", 396 | "date order_book_id \n", 397 | "2012-01-04 000004.XSHE -0.022250 7.9100 3763832.88 465469.0 \n", 398 | " 000028.XSHE -0.045433 19.8422 9326924.28 450553.0 \n", 399 | " 000150.XSHE -0.030295 3.1737 3109304.50 952600.0 \n", 400 | " 000153.XSHE -0.028053 5.7700 9673054.49 1596020.0 \n", 401 | " 000403.XSHE 0.000000 3.1625 0.00 0.0 \n", 402 | "\n", 403 | " week month report_quarter market_cap \\\n", 404 | "date order_book_id \n", 405 | "2012-01-04 000004.XSHE 0.5720 0.7506 2011q3 6.642556e+08 \n", 406 | " 000028.XSHE 0.4201 0.2722 2011q3 5.872485e+09 \n", 407 | " 000150.XSHE 0.3460 0.3610 2011q3 1.036800e+09 \n", 408 | " 000153.XSHE 0.6830 2.4594 2011q3 1.531454e+09 \n", 409 | " 000403.XSHE 0.0000 0.0000 NaN NaN \n", 410 | "\n", 411 | " a_share_market_val_2 \\\n", 412 | "date order_book_id \n", 413 | "2012-01-04 000004.XSHE 6.634549e+08 \n", 414 | " 000028.XSHE 4.753820e+09 \n", 415 | " 000150.XSHE 1.036800e+09 \n", 416 | " 000153.XSHE 1.360856e+09 \n", 417 | " 000403.XSHE NaN \n", 418 | "\n", 419 | " cash_received_from_sales_of_goods pb_ratio \\\n", 420 | "date order_book_id \n", 421 | "2012-01-04 000004.XSHE 5.949968e+07 8.8175 \n", 422 | " 000028.XSHE 1.053298e+10 4.3493 \n", 423 | " 000150.XSHE 4.913279e+07 1.4763 \n", 424 | " 000153.XSHE 1.329425e+09 2.1169 \n", 425 | " 000403.XSHE NaN NaN \n", 426 | "\n", 427 | " net_profit ps_ratio sectorCode \n", 428 | "date order_book_id \n", 429 | "2012-01-04 000004.XSHE 4.500363e+06 37.5796 HealthCare \n", 430 | " 000028.XSHE 2.481834e+08 0.3414 HealthCare \n", 431 | " 000150.XSHE 3.657858e+06 7.8956 HealthCare \n", 432 | " 000153.XSHE 1.560397e+07 0.7818 HealthCare \n", 433 | " 000403.XSHE NaN NaN HealthCare " 434 | ] 435 | }, 436 | "execution_count": 6, 437 | "metadata": {}, 438 | "output_type": "execute_result" 439 | } 440 | ], 441 | "source": [ 442 | "healthcare_equity_df = equity_universe_filtering(equity_df, healthcareUniverse)\n", 
443 | "healthcare_equity_df.head()" 444 | ] 445 | }, 446 | { 447 | "cell_type": "code", 448 | "execution_count": 7, 449 | "metadata": {}, 450 | "outputs": [ 451 | { 452 | "name": "stdout", 453 | "output_type": "stream", 454 | "text": [ 455 | "universe ratio: 6.210331877919959%\n" 456 | ] 457 | } 458 | ], 459 | "source": [ 460 | "print(\"universe ratio: {}%\".format(len(healthcare_equity_df)/len(equity_df)*100))" 461 | ] 462 | }, 463 | { 464 | "cell_type": "markdown", 465 | "metadata": {}, 466 | "source": [ 467 | "### benchmark" 468 | ] 469 | }, 470 | { 471 | "cell_type": "code", 472 | "execution_count": 27, 473 | "metadata": {}, 474 | "outputs": [], 475 | "source": [ 476 | "benchmark_df = pd.read_csv(\"cn_SH_healthcare_index_2012_2018.csv\",names=['date','value'])\n", 477 | "benchmark_df = benchmark_df.set_index('date',drop=True)" 478 | ] 479 | }, 480 | { 481 | "cell_type": "code", 482 | "execution_count": 33, 483 | "metadata": {}, 484 | "outputs": [ 485 | { 486 | "data": { 487 | "text/html": [ 488 | "
\n", 489 | "\n", 490 | " \n", 491 | " \n", 492 | " \n", 493 | " \n", 494 | " \n", 495 | " \n", 496 | " \n", 497 | " \n", 498 | " \n", 499 | " \n", 500 | " \n", 501 | " \n", 502 | " \n", 503 | " \n", 504 | " \n", 505 | " \n", 506 | " \n", 507 | " \n", 508 | " \n", 509 | " \n", 510 | " \n", 511 | " \n", 512 | " \n", 513 | " \n", 514 | " \n", 515 | " \n", 516 | " \n", 517 | " \n", 518 | " \n", 519 | " \n", 520 | " \n", 521 | " \n", 522 | " \n", 523 | " \n", 524 | " \n", 525 | " \n", 526 | " \n", 527 | " \n", 528 | " \n", 529 | "
valuereturn
date
2012-01-042891.4620.000000
2012-01-052766.9550.044015
2012-01-062744.7930.008042
2012-01-092833.219-0.031708
2012-01-102929.594-0.033450
\n", 530 | "
" 531 | ], 532 | "text/plain": [ 533 | " value return\n", 534 | "date \n", 535 | "2012-01-04 2891.462 0.000000\n", 536 | "2012-01-05 2766.955 0.044015\n", 537 | "2012-01-06 2744.793 0.008042\n", 538 | "2012-01-09 2833.219 -0.031708\n", 539 | "2012-01-10 2929.594 -0.033450" 540 | ] 541 | }, 542 | "execution_count": 33, 543 | "metadata": {}, 544 | "output_type": "execute_result" 545 | } 546 | ], 547 | "source": [ 548 | "benchmark_df['return'] = np.log(benchmark_df.shift(1)/benchmark_df).fillna(0)\n", 549 | "benchmark_df.head()" 550 | ] 551 | }, 552 | { 553 | "cell_type": "markdown", 554 | "metadata": {}, 555 | "source": [ 556 | "## Factor Returns" 557 | ] 558 | }, 559 | { 560 | "cell_type": "code", 561 | "execution_count": 8, 562 | "metadata": {}, 563 | "outputs": [], 564 | "source": [ 565 | "def equity_factor_return(equity_df, factorColumn, nAllocations, longTop=True):\n", 566 | " equity_copy = equity_df.copy()\n", 567 | "# equity_copy[\"{}_rank\".format(factorColumn)] = equity_copy.groupby(level='date')[factorColumn].rank()\n", 568 | "# equity_copy[equity_copy.groupby(level='date')[factorColumn].nlargest(nAllocations).index][\"biggest_{}_{}\".format(nAllocations,factorColumn)]=True\n", 569 | " largest = equity_copy[factorColumn].groupby(level='date').nlargest(nAllocations).reset_index(level=0,drop=True)\n", 570 | " smallest = equity_copy[factorColumn].groupby(level='date').nsmallest(nAllocations).reset_index(level=0,drop=True)\n", 571 | " r_largest = equity_copy.loc[largest.index,'return'].groupby(level='date').mean()\n", 572 | " r_smallest = equity_copy.loc[smallest.index,'return'].groupby(level='date').mean()\n", 573 | " LMS = r_largest - r_smallest\n", 574 | " if(longTop):\n", 575 | " return LMS\n", 576 | " else:\n", 577 | " return -LMS" 578 | ] 579 | }, 580 | { 581 | "cell_type": "code", 582 | "execution_count": 9, 583 | "metadata": {}, 584 | "outputs": [ 585 | { 586 | "data": { 587 | "text/plain": [ 588 | "date\n", 589 | "2012-01-04 0.005983\n", 590 | 
"2012-01-05 -0.009098\n", 591 | "2012-01-06 -0.004155\n", 592 | "2012-01-09 0.014615\n", 593 | "2012-01-10 0.006728\n", 594 | "Name: return, dtype: float64" 595 | ] 596 | }, 597 | "execution_count": 9, 598 | "metadata": {}, 599 | "output_type": "execute_result" 600 | } 601 | ], 602 | "source": [ 603 | "SMB = equity_factor_return(healthcare_equity_df, 'market_cap', 20,longTop=False)\n", 604 | "SMB.head()" 605 | ] 606 | }, 607 | { 608 | "cell_type": "code", 609 | "execution_count": 10, 610 | "metadata": {}, 611 | "outputs": [ 612 | { 613 | "data": { 614 | "text/plain": [ 615 | "date\n", 616 | "2012-01-04 0.005302\n", 617 | "2012-01-05 -0.007223\n", 618 | "2012-01-06 0.006031\n", 619 | "2012-01-09 -0.002597\n", 620 | "2012-01-10 -0.010780\n", 621 | "Name: return, dtype: float64" 622 | ] 623 | }, 624 | "execution_count": 10, 625 | "metadata": {}, 626 | "output_type": "execute_result" 627 | } 628 | ], 629 | "source": [ 630 | "HML = equity_factor_return(healthcare_equity_df, 'pb_ratio', 20,longTop=True)\n", 631 | "HML.head()" 632 | ] 633 | }, 634 | { 635 | "cell_type": "code", 636 | "execution_count": 11, 637 | "metadata": {}, 638 | "outputs": [], 639 | "source": [ 640 | "import itertools\n", 641 | "import statsmodels.api as sm\n", 642 | "from statsmodels import regression,stats\n", 643 | "import scipy\n", 644 | "\n", 645 | "data = healthcare_equity_df[['return']] # dataframe\n", 646 | "data = data.set_index(healthcare_equity_df.index) # elimilate redundant index (whole universe)\n", 647 | "asset_list_sizes = [group[1].size for group in data.groupby(level=0)]\n", 648 | "\n", 649 | "# Spreading the factor portfolio data across all assets for each day\n", 650 | "SMB_column = [[SMB.loc[group[0]]] * size for group, size \\\n", 651 | " in zip(data.groupby(level=0), asset_list_sizes)]\n", 652 | "data['SMB'] = list(itertools.chain(*SMB_column))\n", 653 | "\n", 654 | "HML_column = [[HML.loc[group[0]]] * size for group, size \\\n", 655 | " in zip(data.groupby(level=0), 
asset_list_sizes)]\n", 656 | "data['HML'] = list(itertools.chain(*HML_column))\n", 657 | "data = sm.add_constant(data.dropna())" 658 | ] 659 | }, 660 | { 661 | "cell_type": "code", 662 | "execution_count": 12, 663 | "metadata": {}, 664 | "outputs": [ 665 | { 666 | "data": { 667 | "text/html": [ 668 | "
\n", 669 | "\n", 670 | " \n", 671 | " \n", 672 | " \n", 673 | " \n", 674 | " \n", 675 | " \n", 676 | " \n", 677 | " \n", 678 | " \n", 679 | " \n", 680 | " \n", 681 | " \n", 682 | " \n", 683 | " \n", 684 | " \n", 685 | " \n", 686 | " \n", 687 | " \n", 688 | " \n", 689 | " \n", 690 | " \n", 691 | " \n", 692 | " \n", 693 | " \n", 694 | " \n", 695 | " \n", 696 | " \n", 697 | " \n", 698 | " \n", 699 | " \n", 700 | " \n", 701 | " \n", 702 | " \n", 703 | " \n", 704 | " \n", 705 | " \n", 706 | " \n", 707 | " \n", 708 | " \n", 709 | " \n", 710 | " \n", 711 | " \n", 712 | " \n", 713 | " \n", 714 | " \n", 715 | " \n", 716 | " \n", 717 | " \n", 718 | " \n", 719 | " \n", 720 | " \n", 721 | " \n", 722 | " \n", 723 | " \n", 724 | " \n", 725 | " \n", 726 | "
constreturnSMBHML
dateorder_book_id
2012-01-04000004.XSHE1.0-0.0222500.0059830.005302
000028.XSHE1.0-0.0454330.0059830.005302
000150.XSHE1.0-0.0302950.0059830.005302
000153.XSHE1.0-0.0280530.0059830.005302
000403.XSHE1.00.0000000.0059830.005302
\n", 727 | "
" 728 | ], 729 | "text/plain": [ 730 | " const return SMB HML\n", 731 | "date order_book_id \n", 732 | "2012-01-04 000004.XSHE 1.0 -0.022250 0.005983 0.005302\n", 733 | " 000028.XSHE 1.0 -0.045433 0.005983 0.005302\n", 734 | " 000150.XSHE 1.0 -0.030295 0.005983 0.005302\n", 735 | " 000153.XSHE 1.0 -0.028053 0.005983 0.005302\n", 736 | " 000403.XSHE 1.0 0.000000 0.005983 0.005302" 737 | ] 738 | }, 739 | "execution_count": 12, 740 | "metadata": {}, 741 | "output_type": "execute_result" 742 | } 743 | ], 744 | "source": [ 745 | "data.head()" 746 | ] 747 | }, 748 | { 749 | "cell_type": "markdown", 750 | "metadata": {}, 751 | "source": [ 752 | "## Factor Exposures ($\\beta$)" 753 | ] 754 | }, 755 | { 756 | "cell_type": "code", 757 | "execution_count": 13, 758 | "metadata": {}, 759 | "outputs": [], 760 | "source": [ 761 | "assets = data.index.levels[1].unique()\n", 762 | "Y = [data.xs(asset,level=1)['return'] for asset in assets]\n", 763 | "X = [data.xs(asset,level=1)[['SMB','HML','const']] for asset in assets]\n", 764 | "reg_results = [regression.linear_model.OLS(y,x).fit().params for y,x in zip(Y,X) if not(x.empty or y.empty)]\n", 765 | "indices = [asset for y, x, asset in zip(Y, X, assets) if not(x.empty or y.empty)]\n", 766 | "betas = pd.DataFrame(reg_results, index=indices)" 767 | ] 768 | }, 769 | { 770 | "cell_type": "code", 771 | "execution_count": 15, 772 | "metadata": {}, 773 | "outputs": [ 774 | { 775 | "data": { 776 | "text/html": [ 777 | "
\n", 778 | "\n", 779 | " \n", 780 | " \n", 781 | " \n", 782 | " \n", 783 | " \n", 784 | " \n", 785 | " \n", 786 | " \n", 787 | " \n", 788 | " \n", 789 | " \n", 790 | " \n", 791 | " \n", 792 | " \n", 793 | " \n", 794 | " \n", 795 | " \n", 796 | " \n", 797 | " \n", 798 | " \n", 799 | " \n", 800 | " \n", 801 | " \n", 802 | " \n", 803 | " \n", 804 | " \n", 805 | " \n", 806 | " \n", 807 | " \n", 808 | " \n", 809 | " \n", 810 | " \n", 811 | " \n", 812 | " \n", 813 | " \n", 814 | " \n", 815 | " \n", 816 | " \n", 817 | " \n", 818 | " \n", 819 | "
SMBHMLconst
000004.XSHE0.8839060.0487570.002002
000028.XSHE-0.003029-0.0642950.001073
000150.XSHE0.3541220.0660710.002031
000153.XSHE0.620706-0.0822290.001405
000403.XSHE2.03219211.457418-0.017412
\n", 820 | "
" 821 | ], 822 | "text/plain": [ 823 | " SMB HML const\n", 824 | "000004.XSHE 0.883906 0.048757 0.002002\n", 825 | "000028.XSHE -0.003029 -0.064295 0.001073\n", 826 | "000150.XSHE 0.354122 0.066071 0.002031\n", 827 | "000153.XSHE 0.620706 -0.082229 0.001405\n", 828 | "000403.XSHE 2.032192 11.457418 -0.017412" 829 | ] 830 | }, 831 | "execution_count": 15, 832 | "metadata": {}, 833 | "output_type": "execute_result" 834 | } 835 | ], 836 | "source": [ 837 | "betas.head()" 838 | ] 839 | }, 840 | { 841 | "cell_type": "markdown", 842 | "metadata": {}, 843 | "source": [ 844 | "## Factor Premium" 845 | ] 846 | }, 847 | { 848 | "cell_type": "code", 849 | "execution_count": 36, 850 | "metadata": {}, 851 | "outputs": [ 852 | { 853 | "data": { 854 | "text/html": [ 855 | "\n", 856 | "\n", 857 | "\n", 858 | " \n", 859 | "\n", 860 | "\n", 861 | " \n", 862 | "\n", 863 | "\n", 864 | " \n", 865 | "\n", 866 | "\n", 867 | " \n", 868 | "\n", 869 | "\n", 870 | " \n", 871 | "\n", 872 | "\n", 873 | " \n", 874 | "\n", 875 | "\n", 876 | " \n", 877 | "\n", 878 | "\n", 879 | " \n", 880 | "\n", 881 | "\n", 882 | " \n", 883 | "\n", 884 | "
OLS Regression Results
Dep. Variable: return R-squared: 0.398
Model: OLS Adj. R-squared: 0.391
Method: Least Squares F-statistic: 53.26
Date: Sat, 05 May 2018 Prob (F-statistic): 1.77e-18
Time: 21:03:25 Log-Likelihood: 1012.1
No. Observations: 164 AIC: -2018.
Df Residuals: 161 BIC: -2009.
Df Model: 2
Covariance Type: nonrobust
\n", 885 | "\n", 886 | "\n", 887 | " \n", 888 | "\n", 889 | "\n", 890 | " \n", 891 | "\n", 892 | "\n", 893 | " \n", 894 | "\n", 895 | "\n", 896 | " \n", 897 | "\n", 898 | "
coef std err t P>|t| [0.025 0.975]
const 0.0017 6.72e-05 24.956 0.000 0.002 0.002
SMB -7.597e-05 0.000 -0.599 0.550 -0.000 0.000
HML 0.0005 4.81e-05 9.695 0.000 0.000 0.001
\n", 899 | "\n", 900 | "\n", 901 | " \n", 902 | "\n", 903 | "\n", 904 | " \n", 905 | "\n", 906 | "\n", 907 | " \n", 908 | "\n", 909 | "\n", 910 | " \n", 911 | "\n", 912 | "
Omnibus: 39.154 Durbin-Watson: 1.906
Prob(Omnibus): 0.000 Jarque-Bera (JB): 78.545
Skew: 1.087 Prob(JB): 8.80e-18
Kurtosis: 5.601 Cond. No. 3.92
" 913 | ], 914 | "text/plain": [ 915 | "\n", 916 | "\"\"\"\n", 917 | " OLS Regression Results \n", 918 | "==============================================================================\n", 919 | "Dep. Variable: return R-squared: 0.398\n", 920 | "Model: OLS Adj. R-squared: 0.391\n", 921 | "Method: Least Squares F-statistic: 53.26\n", 922 | "Date: Sat, 05 May 2018 Prob (F-statistic): 1.77e-18\n", 923 | "Time: 21:03:25 Log-Likelihood: 1012.1\n", 924 | "No. Observations: 164 AIC: -2018.\n", 925 | "Df Residuals: 161 BIC: -2009.\n", 926 | "Df Model: 2 \n", 927 | "Covariance Type: nonrobust \n", 928 | "==============================================================================\n", 929 | " coef std err t P>|t| [0.025 0.975]\n", 930 | "------------------------------------------------------------------------------\n", 931 | "const 0.0017 6.72e-05 24.956 0.000 0.002 0.002\n", 932 | "SMB -7.597e-05 0.000 -0.599 0.550 -0.000 0.000\n", 933 | "HML 0.0005 4.81e-05 9.695 0.000 0.000 0.001\n", 934 | "==============================================================================\n", 935 | "Omnibus: 39.154 Durbin-Watson: 1.906\n", 936 | "Prob(Omnibus): 0.000 Jarque-Bera (JB): 78.545\n", 937 | "Skew: 1.087 Prob(JB): 8.80e-18\n", 938 | "Kurtosis: 5.601 Cond. No. 
3.92\n", 939 | "==============================================================================\n", 940 | "\n", 941 | "Warnings:\n", 942 | "[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.\n", 943 | "\"\"\"" 944 | ] 945 | }, 946 | "execution_count": 36, 947 | "metadata": {}, 948 | "output_type": "execute_result" 949 | } 950 | ], 951 | "source": [ 952 | "betas = sm.add_constant(betas.drop('const', axis=1))\n", 953 | "\n", 954 | "R = data['return'].mean(axis=0, level=1)\n", 955 | "\n", 956 | "# Second regression step: estimating the risk premia\n", 957 | "risk_free_rate = benchmark_df['return'].mean()\n", 958 | "\n", 959 | "final_results = regression.linear_model.OLS(R - risk_free_rate, betas).fit()\n", 960 | "\n", 961 | "final_results.summary()" 962 | ] 963 | }, 964 | { 965 | "cell_type": "markdown", 966 | "metadata": {}, 967 | "source": [ 968 | "## Fama-Macbeth Test Conclusion: \n", 969 | "although our individual factors are significant, we have a very low $R^2$ . What this may suggest is that there is a real link between our factors and the returns of our assets, but that there still remains a lot of unexplained noise!" 
970 | ] 971 | }, 972 | { 973 | "cell_type": "code", 974 | "execution_count": null, 975 | "metadata": {}, 976 | "outputs": [], 977 | "source": [] 978 | } 979 | ], 980 | "metadata": { 981 | "kernelspec": { 982 | "display_name": "Python 3", 983 | "language": "python", 984 | "name": "python3" 985 | }, 986 | "language_info": { 987 | "codemirror_mode": { 988 | "name": "ipython", 989 | "version": 3 990 | }, 991 | "file_extension": ".py", 992 | "mimetype": "text/x-python", 993 | "name": "python", 994 | "nbconvert_exporter": "python", 995 | "pygments_lexer": "ipython3", 996 | "version": "3.5.4" 997 | } 998 | }, 999 | "nbformat": 4, 1000 | "nbformat_minor": 2 1001 | } 1002 | -------------------------------------------------------------------------------- /source/rqdata_utils.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import alphalens as al 3 | import matplotlib.pyplot as plt 4 | 5 | def price_reader(price_path): 6 | price_df = pd.read_csv(price_path) 7 | price_df.rename(index=str,columns={"Unnamed: 0":"date"},inplace=True) 8 | price_df.date = pd.to_datetime(price_df.date,format="%Y-%m-%d",errors='ignore') 9 | # price_df.date = price_df.date.apply(timezone.localize) 10 | price_df.set_index(['date'],drop=True,inplace=True) 11 | price_df = price_df.sortlevel(axis=1) 12 | return price_df 13 | 14 | def instrument_reader(instrument_path): 15 | instrument_df = pd.read_csv(instrument_path) 16 | instrument_df.drop(['Unnamed: 0'],axis=1,inplace=True) 17 | instrument_df = instrument_df.set_index(['bookId']) 18 | instrument_df = instrument_df.sort_index() 19 | return instrument_df 20 | 21 | def equity_reader(equity_path): 22 | cn_df = pd.read_csv(equity_path) 23 | cn_df.date = pd.to_datetime(cn_df.date,format="%Y-%m-%d",errors='ignore') 24 | cn_df.set_index(['date','order_book_id'],drop=True,inplace=True) 25 | cn_df.drop(["Unnamed: 0"],axis=1,inplace=True) 26 | return cn_df 27 | 28 | def 
def equity_add_instrumentInfo(cn_df, instrument_df, instrument_column):
    """Broadcast one static per-instrument attribute onto an equity panel.

    Parameters
    ----------
    cn_df : pd.DataFrame
        Equity panel indexed by a ('date', 'order_book_id') MultiIndex.
        Mutated in place: the new column is added to it.
    instrument_df : pd.DataFrame
        Per-instrument table indexed by book id.
    instrument_column : str
        Column of ``instrument_df`` to copy onto every matching row of
        ``cn_df``.

    Returns
    -------
    pd.DataFrame
        ``cn_df`` with ``instrument_column`` added.
    """
    instrument_series = instrument_df[instrument_column]
    book_ids = cn_df.index.get_level_values('order_book_id')
    # Label-based lookup repeats the static attribute for every
    # (date, order_book_id) row.  The original used Index.get_values(),
    # which was deprecated in pandas 0.25 and removed in 1.0; ``.values``
    # is the portable equivalent, and the ``[:]`` copy was redundant.
    cn_df[instrument_column] = instrument_series.loc[book_ids.values].values
    return cn_df

def get_price_instrument_equity(price_path, instrument_path, equity_path, addInstrumentColumn=None):
    """Load the price, instrument-info and equity CSV files in one call.

    Parameters
    ----------
    price_path, instrument_path, equity_path : str
        Paths to the three CSV exports consumed by ``price_reader``,
        ``instrument_reader`` and ``equity_reader`` respectively.
    addInstrumentColumn : str, optional
        If given (and truthy), this instrument-info column is broadcast
        onto the equity panel via ``equity_add_instrumentInfo``.

    Returns
    -------
    tuple of pd.DataFrame
        ``(price_df, instrument_df, equity_df)``.
    """
    price_df = price_reader(price_path)
    instrument_df = instrument_reader(instrument_path)
    equity_df = equity_reader(equity_path)
    if addInstrumentColumn:
        equity_df = equity_add_instrumentInfo(equity_df, instrument_df, addInstrumentColumn)
    return price_df, instrument_df, equity_df

def ic_analysis(equity_df, price_df, factor_columns, group_column, periods=(1,22,66), group_adjust=False):
    """Compute mean Spearman rank ICs for several factors via alphalens.

    For each column in ``factor_columns``, builds alphalens "clean factor"
    data against ``price_df``, computes (a) the mean information
    coefficient by group and (b) the mean monthly IC, prints the per-group
    table, and plots the monthly IC heatmap (side effects: stdout and a
    matplotlib figure per factor).

    Parameters
    ----------
    equity_df : pd.DataFrame
        Panel indexed by ('date', 'order_book_id') containing the factor
        columns and ``group_column``.
    price_df : pd.DataFrame
        Wide price frame (dates x assets) for forward-return computation.
    factor_columns : sequence of str
        Factor columns of ``equity_df`` to analyse.
    group_column : str
        Column of ``equity_df`` used as the alphalens grouping key.
    periods : tuple of int, optional
        Forward-return horizons in trading days (default daily, monthly,
        quarterly).
    group_adjust : bool, optional
        Passed through to alphalens' IC computation.

    Returns
    -------
    (pd.DataFrame, list of pd.DataFrame)
        The per-group mean ICs concatenated across factors (MultiIndex
        levels ['factor', 'group']), and the list of monthly-IC frames in
        ``factor_columns`` order.
    """
    ic_list = []
    monthly_ic_list = []
    groupby = equity_df[group_column]
    for col in factor_columns:
        my_factor = equity_df[col]
        # max_loss=1 tolerates dropping up to 100% of the factor data when
        # aligning with prices, so sparse factors never raise MaxLossError.
        factor_data = al.utils.get_clean_factor_and_forward_returns(factor=my_factor,
                                                                    prices=price_df,
                                                                    groupby=groupby,
                                                                    periods=periods,
                                                                    max_loss=1)
        mean_ic = al.performance.mean_information_coefficient(factor_data, group_adjust=group_adjust,
                                                              by_group=True,
                                                              by_time=None)
        mean_monthly_ic = al.performance.mean_information_coefficient(factor_data, group_adjust=group_adjust,
                                                                      by_group=False,
                                                                      by_time='M')
        print("#######################################################")
        print("factor: {}".format(my_factor.name))
        print(mean_ic)
        ic_list.append(mean_ic)
        monthly_ic_list.append(mean_monthly_ic)
        al.plotting.plot_monthly_ic_heatmap(mean_monthly_ic)
        plt.show()

    mean_ic_df = pd.concat(ic_list, keys=factor_columns)
    mean_ic_df.index = mean_ic_df.index.set_names(['factor', 'group'])
    return mean_ic_df, monthly_ic_list