├── .gitignore ├── FP1.1_Preprocess_Data.ipynb ├── FP1.2_Preprocess_Factors.ipynb ├── FP1.3_Factor_Modeling.ipynb ├── FP1.4_Backtesting.ipynb ├── Final Project.pptx ├── README.md ├── backtesting.png ├── backtesting_noncumulative.png ├── comparison1.png ├── comparison2.png ├── csi_500_constituent_info.gz ├── csi_500_data_preprocessed.csv ├── csi_500_data_preprocessed.gz ├── csi_500_data_raw.gz ├── factor_data.csv ├── factor_data.gz ├── factors_corr.png ├── final_project.py ├── industry_data.gz ├── pvalues.png ├── test_return.gz ├── winsorized_factors.gz └── workflow.md /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | -------------------------------------------------------------------------------- /FP1.1_Preprocess_Data.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Preprocess Data" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 2, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "%matplotlib inline\n", 17 | "\n", 18 | "import matplotlib\n", 19 | "import matplotlib.pyplot as plt\n", 20 | "import numpy as np\n", 21 | "import pandas as pd\n", 22 | "import tushare as ts\n", 23 | "from WindPy import *\n", 24 | "import datetime\n", 25 | "import time\n", 26 | "\n", 27 | "%run final_project.py\n", 28 | "\n", 29 | "matplotlib.rcParams[\"figure.figsize\"] = (16, 9)" 30 | ] 31 | }, 32 | { 33 | 
"cell_type": "code", 34 | "execution_count": 2, 35 | "metadata": {}, 36 | "outputs": [ 37 | { 38 | "name": "stdout", 39 | "output_type": "stream", 40 | "text": [ 41 | "Welcome to use Wind Quant API for Python (WindPy)!\n", 42 | "\n", 43 | "COPYRIGHT (C) 2017 WIND INFORMATION CO., LTD. ALL RIGHTS RESERVED.\n", 44 | "IN NO CIRCUMSTANCE SHALL WIND BE RESPONSIBLE FOR ANY DAMAGES OR LOSSES CAUSED BY USING WIND QUANT API FOR Python.\n" 45 | ] 46 | }, 47 | { 48 | "data": { 49 | "text/plain": [ 50 | ".ErrorCode=0\n", 51 | ".Data=[OK!]" 52 | ] 53 | }, 54 | "execution_count": 2, 55 | "metadata": {}, 56 | "output_type": "execute_result" 57 | } 58 | ], 59 | "source": [ 60 | "w.start()" 61 | ] 62 | }, 63 | { 64 | "cell_type": "markdown", 65 | "metadata": {}, 66 | "source": [ 67 | "### 1. Get CSI 500 constituent stocks and weights" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": 3, 73 | "metadata": {}, 74 | "outputs": [], 75 | "source": [ 76 | "# get current constituent stocks and weights of CSI 500 Index\n", 77 | "csi_500_constituent = w.wset(\"indexconstituent\", \"date=2019-04-26;windcode=000905.SH\")\n", 78 | "csi_500_constituent = pd.DataFrame(csi_500_constituent.Data, index=csi_500_constituent.Fields)\n", 79 | "csi_500_constituent = csi_500_constituent.T\n", 80 | "csi_500_constituent = csi_500_constituent.drop(columns=[\"date\"])" 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": 4, 86 | "metadata": {}, 87 | "outputs": [ 88 | { 89 | "data": { 90 | "text/html": [ 91 | "
\n", 92 | "\n", 105 | "\n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | "
wind_codesec_namei_weight
0000006.SZ深振业A0.159
1000008.SZ神州高铁0.26
2000009.SZ中国宝安0.36
3000012.SZ南玻A0.154
4000021.SZ深科技0.172
\n", 147 | "
" 148 | ], 149 | "text/plain": [ 150 | " wind_code sec_name i_weight\n", 151 | "0 000006.SZ 深振业A 0.159\n", 152 | "1 000008.SZ 神州高铁 0.26\n", 153 | "2 000009.SZ 中国宝安 0.36\n", 154 | "3 000012.SZ 南玻A 0.154\n", 155 | "4 000021.SZ 深科技 0.172" 156 | ] 157 | }, 158 | "execution_count": 4, 159 | "metadata": {}, 160 | "output_type": "execute_result" 161 | } 162 | ], 163 | "source": [ 164 | "csi_500_constituent.head()" 165 | ] 166 | }, 167 | { 168 | "cell_type": "code", 169 | "execution_count": 5, 170 | "metadata": {}, 171 | "outputs": [], 172 | "source": [ 173 | "csi_500_constituent.to_pickle(\"csi_500_constituent_info.gz\")" 174 | ] 175 | }, 176 | { 177 | "cell_type": "markdown", 178 | "metadata": {}, 179 | "source": [ 180 | "### 2. Get monthly data of 500 constituent stocks from 2015-01-01 to 2019-04-26" 181 | ] 182 | }, 183 | { 184 | "cell_type": "code", 185 | "execution_count": null, 186 | "metadata": {}, 187 | "outputs": [], 188 | "source": [ 189 | "stock_data = pd.DataFrame(columns=[\"WINDCODE\", \"SEC_NAME\", \"INDEXCODE_SW\", \"INDUSTRY_SW\", \n", 190 | " \"EV\", \"PE_TTM\", \"PB_MRQ\", \"PS_TTM\", \"PCF_OCF_TTM\", \"EV2_TO_EBITDA\", \n", 191 | " \"ROE\", \"ROIC\", \"PROFITTOGR\", \"YOYPROFIT\", \"YOY_TR\", \"TURN\", \"CLOSE\"])\n", 192 | "\n", 193 | "show_time(\"start loop\")\n", 194 | "\n", 195 | "for code in csi_500_constituent.wind_code:\n", 196 | " apidata = w.wsd(code, \n", 197 | " \"windcode,sec_name,indexcode_sw,industry_sw,ev,pe_ttm,pb_mrq,ps_ttm,pcf_ocf_ttm,ev2_to_ebitda,roe,roic,profittogr,yoyprofit,yoy_tr,turn,close\", \n", 198 | " \"2015-01-01\", \"2019-4-26\", \n", 199 | " \"industryType=1;unit=1;Period=M;Fill=Previous,PriceAdj=F\")\n", 200 | " df = apidata_to_df(apidata)\n", 201 | " stock_data = pd.concat([stock_data, df], axis=0, join=\"outer\")\n", 202 | " \n", 203 | "show_time(\"end loop\")" 204 | ] 205 | }, 206 | { 207 | "cell_type": "code", 208 | "execution_count": 7, 209 | "metadata": {}, 210 | "outputs": [], 211 | "source": [ 212 | "# convert 
index into datetime object\n", 213 | "stock_data[\"date\"] = pd.to_datetime(stock_data.index, format=\"%Y-%m-%d\")\n", 214 | "stock_data.set_index(\"date\", inplace=True)\n", 215 | "# save as pickle file for convenience\n", 216 | "stock_data.to_pickle(\"csi_500_data_raw.gz\")" 217 | ] 218 | }, 219 | { 220 | "cell_type": "markdown", 221 | "metadata": {}, 222 | "source": [ 223 | "### 3. Preprocess data" 224 | ] 225 | }, 226 | { 227 | "cell_type": "code", 228 | "execution_count": 3, 229 | "metadata": {}, 230 | "outputs": [], 231 | "source": [ 232 | "data = pd.read_pickle(\"csi_500_data_raw.gz\")" 233 | ] 234 | }, 235 | { 236 | "cell_type": "code", 237 | "execution_count": 4, 238 | "metadata": {}, 239 | "outputs": [ 240 | { 241 | "data": { 242 | "text/html": [ 243 | "
\n", 244 | "\n", 257 | "\n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | "
WINDCODESEC_NAMEINDEXCODE_SWINDUSTRY_SWEVPE_TTMPB_MRQPS_TTMPCF_OCF_TTMEV2_TO_EBITDAROEROICPROFITTOGRYOYPROFITYOY_TRTURNCLOSE
date
2015-01-30000006.SZ深振业A801180.SI房地产8.77497e+0917.44012.239573.78038-9.3429413.0551NoneNoneNoneNoneNone50.85446.5
2015-02-27000006.SZ深振业A801180.SI房地产8.86947e+0917.62792.263693.82109-9.4435513.1681NoneNoneNoneNoneNone16.56376.57
2015-03-31000006.SZ深振业A801180.SI房地产1.188e+1023.61123.032045.11805-12.648917.78064.76033.03115.1535597.159428.38791.68818.8
2015-04-30000006.SZ深振业A801180.SI房地产1.57679e+1023.22463.636014.56375-9.0296722.43374.76033.03115.1535597.159428.387124.39911.68
2015-05-29000006.SZ深振业A801180.SI房地产2.10599e+1031.01924.856326.09542-12.060228.76714.76033.03115.1535597.159428.387102.96515.6
\n", 403 | "
" 404 | ], 405 | "text/plain": [ 406 | " WINDCODE SEC_NAME INDEXCODE_SW INDUSTRY_SW EV PE_TTM \\\n", 407 | "date \n", 408 | "2015-01-30 000006.SZ 深振业A 801180.SI 房地产 8.77497e+09 17.4401 \n", 409 | "2015-02-27 000006.SZ 深振业A 801180.SI 房地产 8.86947e+09 17.6279 \n", 410 | "2015-03-31 000006.SZ 深振业A 801180.SI 房地产 1.188e+10 23.6112 \n", 411 | "2015-04-30 000006.SZ 深振业A 801180.SI 房地产 1.57679e+10 23.2246 \n", 412 | "2015-05-29 000006.SZ 深振业A 801180.SI 房地产 2.10599e+10 31.0192 \n", 413 | "\n", 414 | " PB_MRQ PS_TTM PCF_OCF_TTM EV2_TO_EBITDA ROE ROIC \\\n", 415 | "date \n", 416 | "2015-01-30 2.23957 3.78038 -9.34294 13.0551 None None \n", 417 | "2015-02-27 2.26369 3.82109 -9.44355 13.1681 None None \n", 418 | "2015-03-31 3.03204 5.11805 -12.6489 17.7806 4.7603 3.031 \n", 419 | "2015-04-30 3.63601 4.56375 -9.02967 22.4337 4.7603 3.031 \n", 420 | "2015-05-29 4.85632 6.09542 -12.0602 28.7671 4.7603 3.031 \n", 421 | "\n", 422 | " PROFITTOGR YOYPROFIT YOY_TR TURN CLOSE \n", 423 | "date \n", 424 | "2015-01-30 None None None 50.8544 6.5 \n", 425 | "2015-02-27 None None None 16.5637 6.57 \n", 426 | "2015-03-31 15.1535 597.159 428.387 91.6881 8.8 \n", 427 | "2015-04-30 15.1535 597.159 428.387 124.399 11.68 \n", 428 | "2015-05-29 15.1535 597.159 428.387 102.965 15.6 " 429 | ] 430 | }, 431 | "execution_count": 4, 432 | "metadata": {}, 433 | "output_type": "execute_result" 434 | } 435 | ], 436 | "source": [ 437 | "data.head()" 438 | ] 439 | }, 440 | { 441 | "cell_type": "code", 442 | "execution_count": 123, 443 | "metadata": {}, 444 | "outputs": [ 445 | { 446 | "data": { 447 | "text/plain": [ 448 | "SEC_NAME False\n", 449 | "INDEXCODE_SW False\n", 450 | "INDUSTRY_SW False\n", 451 | "EV False\n", 452 | "PE_TTM False\n", 453 | "PB_MRQ False\n", 454 | "PS_TTM False\n", 455 | "PCF_OCF_TTM False\n", 456 | "EV2_TO_EBITDA True\n", 457 | "ROE False\n", 458 | "ROIC True\n", 459 | "PROFITTOGR False\n", 460 | "YOYPROFIT False\n", 461 | "YOY_TR False\n", 462 | "TURN False\n", 463 | "CLOSE False\n", 
464 | "dtype: bool" 465 | ] 466 | }, 467 | "execution_count": 123, 468 | "metadata": {}, 469 | "output_type": "execute_result" 470 | } 471 | ], 472 | "source": [ 473 | "# fill nan with data of the nearest month\n", 474 | "for code in csi_500_constituent.wind_code:\n", 475 | " data[data[\"WINDCODE\"] == code] = data[data[\"WINDCODE\"] == code].fillna(method=\"ffill\", axis=0)\n", 476 | " data[data[\"WINDCODE\"] == code] = data[data[\"WINDCODE\"] == code].fillna(method=\"bfill\", axis=0)\n", 477 | "# check if there're still nans left\n", 478 | "pd.isnull(full_data).any()" 479 | ] 480 | }, 481 | { 482 | "cell_type": "code", 483 | "execution_count": 127, 484 | "metadata": {}, 485 | "outputs": [ 486 | { 487 | "data": { 488 | "text/plain": [ 489 | "True" 490 | ] 491 | }, 492 | "execution_count": 127, 493 | "metadata": {}, 494 | "output_type": "execute_result" 495 | } 496 | ], 497 | "source": [ 498 | "# locate nans\n", 499 | "bools_1 = pd.isnull(data[\"EV2_TO_EBITDA\"])\n", 500 | "bools_2 = pd.isnull(data[\"ROIC\"])\n", 501 | "ind_1 = []\n", 502 | "for i in range(len(bools_1)):\n", 503 | " if bools_1[i]:\n", 504 | " ind_1.append(i)\n", 505 | "ind_2 = []\n", 506 | "for j in range(len(bools_2)):\n", 507 | " if bools_2[j]:\n", 508 | " ind_2.append(j)\n", 509 | "ind_1 == ind_2 # nans in EV/EBITDA and ROIC appearing from same companies" 510 | ] 511 | }, 512 | { 513 | "cell_type": "code", 514 | "execution_count": 128, 515 | "metadata": {}, 516 | "outputs": [ 517 | { 518 | "name": "stdout", 519 | "output_type": "stream", 520 | "text": [ 521 | "{'非银金融', '银行'}\n" 522 | ] 523 | } 524 | ], 525 | "source": [ 526 | "# figure out the corresponding industries\n", 527 | "ind_code = []\n", 528 | "for ind in ind_1:\n", 529 | " ind_code.append(data[\"INDUSTRY_SW\"][ind])\n", 530 | "print(set(ind_code)) # 10 companies in banking and non-bank financial industries have missing data" 531 | ] 532 | }, 533 | { 534 | "cell_type": "code", 535 | "execution_count": 143, 536 | "metadata": {}, 537 | 
"outputs": [ 538 | { 539 | "name": "stdout", 540 | "output_type": "stream", 541 | "text": [ 542 | "start loading: 2019-04-27 23:16:56:863190\n", 543 | "end loading: 2019-04-27 23:16:58:359194\n" 544 | ] 545 | } 546 | ], 547 | "source": [ 548 | "# get related industries' data from Wind to calculate the industries's average EV/EBITDA and ROIC\n", 549 | "\n", 550 | "show_time(\"start loading\")\n", 551 | "\n", 552 | "industry_mktcap = w.wses(\"1000012612000000,1000012613000000\", \"sec_mkt_cap_today_sum_chn\", \"2015-01-01\", \"2019-04-26\", \"Period=M;Fill=Previous\")\n", 553 | "industry_ebitdatosales = w.wses(\"1000012612000000,1000012613000000\", \"sec_ebitdatosales_overall_glb\", \"2015-01-01\", \"2019-04-26\", \"Period=M;Fill=Previous\")\n", 554 | "industry_gr = w.wses(\"1000012612000000,1000012613000000\", \"sec_gr_sum_chn\", \"2015-01-01\", \"2019-04-26\", \"Period=M;Fill=Previous\")\n", 555 | "industry_roic = w.wses(\"1000012612000000,1000012613000000\", \"sec_roic_avg_glb\", \"2015-01-01\", \"2019-04-26\", \"Period=M;Fill=Previous\")\n", 556 | "\n", 557 | "show_time(\"end loading\")" 558 | ] 559 | }, 560 | { 561 | "cell_type": "code", 562 | "execution_count": 144, 563 | "metadata": {}, 564 | "outputs": [], 565 | "source": [ 566 | "# convert the data into dataframe\n", 567 | "df1 = to_industry_df(industry_mktcap)\n", 568 | "df2 = to_industry_df(industry_ebitdatosales)\n", 569 | "df3 = to_industry_df(industry_gr)\n", 570 | "df4 = to_industry_df(industry_roic)\n", 571 | "# concat dataframes together\n", 572 | "industry_data = pd.concat([df1, df2[\"SEC_EBITDATOSALES_OVERALL_GLB\"], df3[\"SEC_GR_SUM_CHN\"], df4[\"SEC_ROIC_AVG_GLB\"]], \n", 573 | " axis=1, join=\"outer\")\n", 574 | "# convert index into datetime object\n", 575 | "industry_data[\"date\"] = pd.to_datetime(industry_data.index, format=\"%Y-%m-%d\")\n", 576 | "industry_data.set_index(\"date\", inplace=True)\n", 577 | "# fill nan with data of the nearest month\n", 578 | 
"industry_data[industry_data[\"INDUSTRY_SW\"] == \"银行\"] = industry_data[industry_data[\"INDUSTRY_SW\"] == \"银行\"].fillna(method=\"ffill\", axis=0)\n", 579 | "industry_data[industry_data[\"INDUSTRY_SW\"] == \"银行\"] = industry_data[industry_data[\"INDUSTRY_SW\"] == \"银行\"].fillna(method=\"bfill\", axis=0)\n", 580 | "industry_data[industry_data[\"INDUSTRY_SW\"] == \"非银金融\"] = industry_data[industry_data[\"INDUSTRY_SW\"] == \"非银金融\"].fillna(method=\"ffill\", axis=0)\n", 581 | "industry_data[industry_data[\"INDUSTRY_SW\"] == \"非银金融\"] = industry_data[industry_data[\"INDUSTRY_SW\"] == \"非银金融\"].fillna(method=\"bfill\", axis=0)" 582 | ] 583 | }, 584 | { 585 | "cell_type": "code", 586 | "execution_count": 152, 587 | "metadata": {}, 588 | "outputs": [], 589 | "source": [ 590 | "# calculate industries' EV/EBITDA \n", 591 | "industry_data[\"EV2_TO_EBITDA\"] = industry_data[\"SEC_MKT_CAP_TODAY_SUM_CHN\"] / (industry_data[\"SEC_EBITDATOSALES_OVERALL_GLB\"] * industry_data[\"SEC_GR_SUM_CHN\"])\n", 592 | "# deal with the abnormal values\n", 593 | "for i in range(len(industry_data)):\n", 594 | " if np.isinf(industry_data[\"EV2_TO_EBITDA\"][i]):\n", 595 | " industry_data[\"EV2_TO_EBITDA\"][i] = np.nan\n", 596 | "industry_data[industry_data[\"INDUSTRY_SW\"] == \"银行\"] = industry_data[industry_data[\"INDUSTRY_SW\"] == \"银行\"].fillna(method=\"ffill\", axis=0)\n", 597 | "industry_data[industry_data[\"INDUSTRY_SW\"] == \"银行\"] = industry_data[industry_data[\"INDUSTRY_SW\"] == \"银行\"].fillna(method=\"bfill\", axis=0)\n", 598 | "industry_data[industry_data[\"INDUSTRY_SW\"] == \"非银金融\"] = industry_data[industry_data[\"INDUSTRY_SW\"] == \"非银金融\"].fillna(method=\"ffill\", axis=0)\n", 599 | "industry_data[industry_data[\"INDUSTRY_SW\"] == \"非银金融\"] = industry_data[industry_data[\"INDUSTRY_SW\"] == \"非银金融\"].fillna(method=\"bfill\", axis=0)\n", 600 | "# drop needless columns\n", 601 | "industry_data = industry_data.drop(columns=[\"SEC_MKT_CAP_TODAY_SUM_CHN\", 
\"SEC_EBITDATOSALES_OVERALL_GLB\", \"SEC_GR_SUM_CHN\"])" 602 | ] 603 | }, 604 | { 605 | "cell_type": "code", 606 | "execution_count": 164, 607 | "metadata": {}, 608 | "outputs": [], 609 | "source": [ 610 | "industry_data.to_pickle(\"industry_data.gz\")" 611 | ] 612 | }, 613 | { 614 | "cell_type": "code", 615 | "execution_count": 168, 616 | "metadata": {}, 617 | "outputs": [], 618 | "source": [ 619 | "# fill the nans with industry's average\n", 620 | "for ind in ind_1: # loop through the nans\n", 621 | " if data[\"INDUSTRY_SW\"][ind] == \"银行\": # see which industry it belongs to\n", 622 | " date = data.index[ind]\n", 623 | " for index, row in industry_data[industry_data[\"INDUSTRY_SW\"] == \"银行\"].iterrows():\n", 624 | " if index == date:\n", 625 | " data[\"EV2_TO_EBITDA\"][ind] = row[\"EV2_TO_EBITDA\"]\n", 626 | " data[\"ROIC\"][ind] = row[\"SEC_ROIC_AVG_GLB\"]\n", 627 | " if data[\"INDUSTRY_SW\"][ind] == \"非银金融\":\n", 628 | " date = data.index[ind]\n", 629 | " for index, row in industry_data[industry_data[\"INDUSTRY_SW\"] == \"非银金融\"].iterrows():\n", 630 | " if index == date:\n", 631 | " data[\"EV2_TO_EBITDA\"][ind] = row[\"EV2_TO_EBITDA\"]\n", 632 | " data[\"ROIC\"][ind] = row[\"SEC_ROIC_AVG_GLB\"]" 633 | ] 634 | }, 635 | { 636 | "cell_type": "code", 637 | "execution_count": 170, 638 | "metadata": {}, 639 | "outputs": [ 640 | { 641 | "data": { 642 | "text/plain": [ 643 | "WINDCODE False\n", 644 | "SEC_NAME False\n", 645 | "INDEXCODE_SW False\n", 646 | "INDUSTRY_SW False\n", 647 | "EV False\n", 648 | "PE_TTM False\n", 649 | "PB_MRQ False\n", 650 | "PS_TTM False\n", 651 | "PCF_OCF_TTM False\n", 652 | "EV2_TO_EBITDA False\n", 653 | "ROE False\n", 654 | "ROIC False\n", 655 | "PROFITTOGR False\n", 656 | "YOYPROFIT False\n", 657 | "YOY_TR False\n", 658 | "TURN False\n", 659 | "CLOSE False\n", 660 | "dtype: bool" 661 | ] 662 | }, 663 | "execution_count": 170, 664 | "metadata": {}, 665 | "output_type": "execute_result" 666 | } 667 | ], 668 | "source": [ 669 | 
"pd.isnull(data).any() # check if there're still nans" 670 | ] 671 | }, 672 | { 673 | "cell_type": "code", 674 | "execution_count": 173, 675 | "metadata": {}, 676 | "outputs": [], 677 | "source": [ 678 | "data.to_pickle(\"csi_500_data_preprocessed.gz\")" 679 | ] 680 | } 681 | ], 682 | "metadata": { 683 | "kernelspec": { 684 | "display_name": "Python 3", 685 | "language": "python", 686 | "name": "python3" 687 | }, 688 | "language_info": { 689 | "codemirror_mode": { 690 | "name": "ipython", 691 | "version": 3 692 | }, 693 | "file_extension": ".py", 694 | "mimetype": "text/x-python", 695 | "name": "python", 696 | "nbconvert_exporter": "python", 697 | "pygments_lexer": "ipython3", 698 | "version": "3.7.3" 699 | } 700 | }, 701 | "nbformat": 4, 702 | "nbformat_minor": 2 703 | } 704 | -------------------------------------------------------------------------------- /FP1.2_Preprocess_Factors.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Preprocess Factors" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 2, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "%matplotlib inline\n", 17 | "\n", 18 | "import matplotlib\n", 19 | "import matplotlib.pyplot as plt\n", 20 | "import numpy as np\n", 21 | "import pandas as pd\n", 22 | "import tushare as ts\n", 23 | "from WindPy import *\n", 24 | "import datetime\n", 25 | "import time\n", 26 | "import math \n", 27 | "from statsmodels import regression, stats\n", 28 | "import statsmodels.api as sm\n", 29 | "\n", 30 | "%run final_project.py\n", 31 | "\n", 32 | "matplotlib.rcParams[\"figure.figsize\"] = (16, 9)" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": 3, 38 | "metadata": {}, 39 | "outputs": [], 40 | "source": [ 41 | "csi_500_data = pd.read_pickle(\"csi_500_data_preprocessed.gz\")" 42 | ] 43 | }, 44 | { 45 | "cell_type": "markdown", 46 | 
"metadata": {}, 47 | "source": [ 48 | "### 1. Calculate next month's return" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": 4, 54 | "metadata": {}, 55 | "outputs": [], 56 | "source": [ 57 | "# calculate monthly return\n", 58 | "# increase the scale to make factor betas easier to read\n", 59 | "csi_500_data[\"NEXT_RETURN\"] = (csi_500_data[\"CLOSE\"] / csi_500_data[\"CLOSE\"].shift(1) - 1) * 100\n", 60 | "csi_500_data[\"NEXT_RETURN\"] = csi_500_data[\"NEXT_RETURN\"].shift(-1)\n", 61 | "# trim the last month with no next month's return\n", 62 | "csi_500_data = csi_500_data[\"2015-01-01\":\"2019-03-31\"]\n", 63 | "# sort the data by index for cross-sectional regression\n", 64 | "csi_500_data = csi_500_data.sort_index()" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": 5, 70 | "metadata": {}, 71 | "outputs": [ 72 | { 73 | "data": { 74 | "text/html": [ 75 | "
\n", 76 | "\n", 89 | "\n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | "
WINDCODESEC_NAMEINDEXCODE_SWINDUSTRY_SWEVPE_TTMPB_MRQPS_TTMPCF_OCF_TTMEV2_TO_EBITDAROEROICPROFITTOGRYOYPROFITYOY_TRTURNCLOSENEXT_RETURN
date
2015-01-30000006.SZ深振业A801180.SI房地产8.77497e+0917.44012.239573.78038-9.3429413.05514.76033.03115.1535597.159428.38750.85446.51.07692
2015-01-30000656.SZ金科股份801180.SI房地产2.17947e+1030.36532.712921.25632-2.4980851.01881.0530.23686.2844-62.3157-13.703398.802215.811.45478
2015-01-30002517.SZ恺英网络801130.SI传媒2.54769e+09-122.0024.083977.134812939.07-154.1250.70140.71925.0912145.8313.16492.0515314.410
2015-01-30600872.SH中炬高新801120.SI食品饮料9.66321e+0933.71974.24413.7250347.204421.87792.23581.98038.9345-25.7219-2.862952.802812.139.15087
2015-01-30002544.SZ杰赛科技801770.SI通信1.4529e+10151.60513.35917.95045-124.90486.26761.0651.09673.0641-3.873510.910430.554828.173.26589
\n", 242 | "
" 243 | ], 244 | "text/plain": [ 245 | " WINDCODE SEC_NAME INDEXCODE_SW INDUSTRY_SW EV PE_TTM \\\n", 246 | "date \n", 247 | "2015-01-30 000006.SZ 深振业A 801180.SI 房地产 8.77497e+09 17.4401 \n", 248 | "2015-01-30 000656.SZ 金科股份 801180.SI 房地产 2.17947e+10 30.3653 \n", 249 | "2015-01-30 002517.SZ 恺英网络 801130.SI 传媒 2.54769e+09 -122.002 \n", 250 | "2015-01-30 600872.SH 中炬高新 801120.SI 食品饮料 9.66321e+09 33.7197 \n", 251 | "2015-01-30 002544.SZ 杰赛科技 801770.SI 通信 1.4529e+10 151.605 \n", 252 | "\n", 253 | " PB_MRQ PS_TTM PCF_OCF_TTM EV2_TO_EBITDA ROE ROIC \\\n", 254 | "date \n", 255 | "2015-01-30 2.23957 3.78038 -9.34294 13.0551 4.7603 3.031 \n", 256 | "2015-01-30 2.71292 1.25632 -2.49808 51.0188 1.053 0.2368 \n", 257 | "2015-01-30 4.08397 7.13481 2939.07 -154.125 0.7014 0.7192 \n", 258 | "2015-01-30 4.2441 3.72503 47.2044 21.8779 2.2358 1.9803 \n", 259 | "2015-01-30 13.3591 7.95045 -124.904 86.2676 1.065 1.0967 \n", 260 | "\n", 261 | " PROFITTOGR YOYPROFIT YOY_TR TURN CLOSE NEXT_RETURN \n", 262 | "date \n", 263 | "2015-01-30 15.1535 597.159 428.387 50.8544 6.5 1.07692 \n", 264 | "2015-01-30 6.2844 -62.3157 -13.7033 98.8022 15.81 1.45478 \n", 265 | "2015-01-30 5.0912 145.83 13.1649 2.05153 14.41 0 \n", 266 | "2015-01-30 8.9345 -25.7219 -2.8629 52.8028 12.13 9.15087 \n", 267 | "2015-01-30 3.0641 -3.8735 10.9104 30.5548 28.17 3.26589 " 268 | ] 269 | }, 270 | "execution_count": 5, 271 | "metadata": {}, 272 | "output_type": "execute_result" 273 | } 274 | ], 275 | "source": [ 276 | "csi_500_data.head()" 277 | ] 278 | }, 279 | { 280 | "cell_type": "code", 281 | "execution_count": 6, 282 | "metadata": {}, 283 | "outputs": [ 284 | { 285 | "name": "stdout", 286 | "output_type": "stream", 287 | "text": [ 288 | "53347050139.36038\n", 289 | "1611123200.0\n", 290 | "52.0827\n", 291 | "-49.3532\n" 292 | ] 293 | } 294 | ], 295 | "source": [ 296 | "# print(csi_500_data.loc[\"2015-02-27\", \"EV\"].max())\n", 297 | "# print(csi_500_data.loc[\"2015-02-27\", \"EV\"].min())\n", 298 | "# 
print(csi_500_data.loc[\"2015-02-27\", \"ROE\"].max())\n", 299 | "# print(csi_500_data.loc[\"2015-02-27\", \"ROE\"].min())" 300 | ] 301 | }, 302 | { 303 | "cell_type": "markdown", 304 | "metadata": {}, 305 | "source": [ 306 | "### 2. Winsorize, standardize and neutralize factor values" 307 | ] 308 | }, 309 | { 310 | "cell_type": "code", 311 | "execution_count": 58, 312 | "metadata": {}, 313 | "outputs": [], 314 | "source": [ 315 | "factor_cols = [\"EV\", \"PE_TTM\", \"PB_MRQ\", \"PS_TTM\", \"PCF_OCF_TTM\", \"EV2_TO_EBITDA\", \n", 316 | " \"ROE\", \"ROIC\", \"PROFITTOGR\", \"YOYPROFIT\", \"YOY_TR\", \"TURN\"]" 317 | ] 318 | }, 319 | { 320 | "cell_type": "markdown", 321 | "metadata": {}, 322 | "source": [ 323 | "#### 2.1 Winsorize\n", 324 | "Clip the outliers in both tails (here we use the 2.5th and 97.5th percentiles as limits)." 325 | ] 326 | }, 327 | { 328 | "cell_type": "code", 329 | "execution_count": 7, 330 | "metadata": {}, 331 | "outputs": [], 332 | "source": [ 333 | "def winsorize(df, factor, min=0.025, max=0.975):\n", 334 | " \"\"\" Quantile Method \"\"\"\n", 335 | " sort = df[factor].sort_values()\n", 336 | " q = sort.quantile([min, max])\n", 337 | " return np.clip(df[factor], q.iloc[0], q.iloc[1])\n", 338 | "\n", 339 | "# def winsorize(series, n=3):\n", 340 | "# \"\"\" Median Absolute Deviation Method \"\"\"\n", 341 | "# median = series.quantile(n)\n", 342 | "# new_median = ((series - median).abs()).quantile(n)\n", 343 | "# max_range = median + new_median * n\n", 344 | "# min_range = median - new_median * n\n", 345 | "# return np.clip(series, min_range, max_range)" 346 | ] 347 | }, 348 | { 349 | "cell_type": "code", 350 | "execution_count": 8, 351 | "metadata": {}, 352 | "outputs": [ 353 | { 354 | "name": "stdout", 355 | "output_type": "stream", 356 | "text": [ 357 | "start loop: 2019-04-30 10:09:55:675264\n", 358 | "end loop: 2019-04-30 10:27:38:808778\n" 359 | ] 360 | } 361 | ], 362 | "source": [ 363 | "show_time(\"start loop\")\n", 364 | "\n", 365 | "for date in 
csi_500_data.index:\n", 366 | " for factor in factor_cols:\n", 367 | " csi_500_data.loc[date, factor] = winsorize(csi_500_data.loc[date], factor)\n", 368 | " \n", 369 | "show_time(\"end loop\") # runs for about 20 mins; this inefficient code needs to be improved" 370 | ] 371 | }, 372 | { 373 | "cell_type": "code", 374 | "execution_count": 9, 375 | "metadata": {}, 376 | "outputs": [ 377 | { 378 | "name": "stdout", 379 | "output_type": "stream", 380 | "text": [ 381 | "35778658816.96001\n", 382 | "3276000000.0\n", 383 | "26.54500000000001\n", 384 | "-3.4263000000000003\n" 385 | ] 386 | } 387 | ], 388 | "source": [ 389 | "print(csi_500_data.loc[\"2015-02-27\", \"EV\"].max())\n", 390 | "print(csi_500_data.loc[\"2015-02-27\", \"EV\"].min())\n", 391 | "print(csi_500_data.loc[\"2015-02-27\", \"ROE\"].max())\n", 392 | "print(csi_500_data.loc[\"2015-02-27\", \"ROE\"].min())" 393 | ] 394 | }, 395 | { 396 | "cell_type": "markdown", 397 | "metadata": {}, 398 | "source": [ 399 | "#### 2.2 Standardize\n", 400 | "Convert factor values into z-scores." 401 | ] 402 | }, 403 | { 404 | "cell_type": "code", 405 | "execution_count": 46, 406 | "metadata": {}, 407 | "outputs": [], 408 | "source": [ 409 | "# standardize factor values\n", 410 | "def standardize(df, factor_cols):\n", 411 | " df[factor_cols] = (df[factor_cols] - df[factor_cols].groupby(\"date\").sum() / 500) / df[factor_cols].groupby(\"date\").std()\n", 412 | " return df" 413 | ] 414 | }, 415 | { 416 | "cell_type": "code", 417 | "execution_count": 47, 418 | "metadata": {}, 419 | "outputs": [], 420 | "source": [ 421 | "csi_500_factors = standardize(csi_500_data, factor_cols)" 422 | ] 423 | }, 424 | { 425 | "cell_type": "code", 426 | "execution_count": 48, 427 | "metadata": {}, 428 | "outputs": [ 429 | { 430 | "data": { 431 | "text/html": [ 432 | "
\n", 433 | "\n", 446 | "\n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | " \n", 451 | " \n", 452 | " \n", 453 | " \n", 454 | " \n", 455 | " \n", 456 | " \n", 457 | " \n", 458 | " \n", 459 | " \n", 460 | " \n", 461 | " \n", 462 | " \n", 463 | " \n", 464 | " \n", 465 | " \n", 466 | " \n", 467 | " \n", 468 | " \n", 469 | " \n", 470 | " \n", 471 | " \n", 472 | " \n", 473 | " \n", 474 | " \n", 475 | " \n", 476 | " \n", 477 | " \n", 478 | " \n", 479 | " \n", 480 | " \n", 481 | " \n", 482 | " \n", 483 | " \n", 484 | " \n", 485 | " \n", 486 | " \n", 487 | " \n", 488 | " \n", 489 | " \n", 490 | " \n", 491 | " \n", 492 | " \n", 493 | " \n", 494 | " \n", 495 | " \n", 496 | " \n", 497 | " \n", 498 | " \n", 499 | " \n", 500 | " \n", 501 | " \n", 502 | " \n", 503 | " \n", 504 | " \n", 505 | " \n", 506 | " \n", 507 | " \n", 508 | " \n", 509 | " \n", 510 | " \n", 511 | " \n", 512 | " \n", 513 | " \n", 514 | " \n", 515 | " \n", 516 | " \n", 517 | " \n", 518 | " \n", 519 | " \n", 520 | " \n", 521 | " \n", 522 | " \n", 523 | " \n", 524 | " \n", 525 | " \n", 526 | " \n", 527 | " \n", 528 | " \n", 529 | " \n", 530 | " \n", 531 | " \n", 532 | " \n", 533 | " \n", 534 | " \n", 535 | " \n", 536 | " \n", 537 | " \n", 538 | " \n", 539 | " \n", 540 | " \n", 541 | " \n", 542 | " \n", 543 | " \n", 544 | " \n", 545 | " \n", 546 | " \n", 547 | " \n", 548 | " \n", 549 | " \n", 550 | " \n", 551 | " \n", 552 | " \n", 553 | " \n", 554 | " \n", 555 | " \n", 556 | " \n", 557 | " \n", 558 | " \n", 559 | " \n", 560 | " \n", 561 | " \n", 562 | " \n", 563 | " \n", 564 | " \n", 565 | " \n", 566 | " \n", 567 | " \n", 568 | " \n", 569 | " \n", 570 | " \n", 571 | " \n", 572 | " \n", 573 | " \n", 574 | " \n", 575 | " \n", 576 | " \n", 577 | " \n", 578 | " \n", 579 | " \n", 580 | " \n", 581 | " \n", 582 | " \n", 583 | " \n", 584 | " \n", 585 | " \n", 586 | " \n", 587 | " \n", 588 | " \n", 589 | " \n", 590 | " \n", 591 | " \n", 592 | " \n", 593 | " \n", 594 | " \n", 595 | " \n", 596 | " \n", 597 | " \n", 598 | "
WINDCODESEC_NAMEINDEXCODE_SWINDUSTRY_SWEVPE_TTMPB_MRQPS_TTMPCF_OCF_TTMEV2_TO_EBITDAROEROICPROFITTOGRYOYPROFITYOY_TRTURNCLOSENEXT_RETURN
date
2015-01-30000006.SZ深振业A801180.SI房地产-0.788196-0.384012-0.69018-0.359667-0.364825-0.6361060.2788370.08234150.3752123.584474.06479-0.1114526.51.07692
2015-01-30000656.SZ金科股份801180.SI房地产0.909253-0.24109-0.591124-0.709688-0.3164790.816005-0.386612-0.540042-0.265779-0.556602-0.6095870.97258715.811.45478
2015-01-30002517.SZ恺英网络801130.SI传媒-1.5242-1.92591-0.3042070.1055054.10622-1.13014-0.449723-0.432591-0.3520140.848102-0.0838537-1.2148214.410
2015-01-30600872.SH中炬高新801120.SI食品饮料-0.672391-0.203998-0.270697-0.3673430.0345743-0.298632-0.174303-0.151693-0.0742496-0.309643-0.397472-0.067399812.139.15087
2015-01-30002544.SZ杰赛科技801770.SI通信-0.03801951.099531.636770.218613-1.181042.16427-0.384458-0.348507-0.498517-0.162196-0.127968-0.570428.173.26589
\n", 599 | "
" 600 | ], 601 | "text/plain": [ 602 | " WINDCODE SEC_NAME INDEXCODE_SW INDUSTRY_SW EV PE_TTM \\\n", 603 | "date \n", 604 | "2015-01-30 000006.SZ 深振业A 801180.SI 房地产 -0.788196 -0.384012 \n", 605 | "2015-01-30 000656.SZ 金科股份 801180.SI 房地产 0.909253 -0.24109 \n", 606 | "2015-01-30 002517.SZ 恺英网络 801130.SI 传媒 -1.5242 -1.92591 \n", 607 | "2015-01-30 600872.SH 中炬高新 801120.SI 食品饮料 -0.672391 -0.203998 \n", 608 | "2015-01-30 002544.SZ 杰赛科技 801770.SI 通信 -0.0380195 1.09953 \n", 609 | "\n", 610 | " PB_MRQ PS_TTM PCF_OCF_TTM EV2_TO_EBITDA ROE ROIC \\\n", 611 | "date \n", 612 | "2015-01-30 -0.69018 -0.359667 -0.364825 -0.636106 0.278837 0.0823415 \n", 613 | "2015-01-30 -0.591124 -0.709688 -0.316479 0.816005 -0.386612 -0.540042 \n", 614 | "2015-01-30 -0.304207 0.105505 4.10622 -1.13014 -0.449723 -0.432591 \n", 615 | "2015-01-30 -0.270697 -0.367343 0.0345743 -0.298632 -0.174303 -0.151693 \n", 616 | "2015-01-30 1.63677 0.218613 -1.18104 2.16427 -0.384458 -0.348507 \n", 617 | "\n", 618 | " PROFITTOGR YOYPROFIT YOY_TR TURN CLOSE NEXT_RETURN \n", 619 | "date \n", 620 | "2015-01-30 0.375212 3.58447 4.06479 -0.111452 6.5 1.07692 \n", 621 | "2015-01-30 -0.265779 -0.556602 -0.609587 0.972587 15.81 1.45478 \n", 622 | "2015-01-30 -0.352014 0.848102 -0.0838537 -1.21482 14.41 0 \n", 623 | "2015-01-30 -0.0742496 -0.309643 -0.397472 -0.0673998 12.13 9.15087 \n", 624 | "2015-01-30 -0.498517 -0.162196 -0.127968 -0.5704 28.17 3.26589 " 625 | ] 626 | }, 627 | "execution_count": 48, 628 | "metadata": {}, 629 | "output_type": "execute_result" 630 | } 631 | ], 632 | "source": [ 633 | "csi_500_factors.head()" 634 | ] 635 | }, 636 | { 637 | "cell_type": "markdown", 638 | "metadata": {}, 639 | "source": [ 640 | "#### 2.3 Neutralize\n", 641 | "For instance, P/B in banking industry is much lower than that in TMT. 
Therefore, we should eliminate the impact of industry betas to avoid industry concentration in our stock selection model, by extracting the residuals of a multiple linear regression of each factor value on industry dummy variables." 642 | ] 643 | }, 644 | { 645 | "cell_type": "code", 646 | "execution_count": 27, 647 | "metadata": {}, 648 | "outputs": [ 649 | { 650 | "name": "stdout", 651 | "output_type": "stream", 652 | "text": [ 653 | "start loop: 2019-04-30 10:49:37:841766\n", 654 | "end loop: 2019-04-30 10:54:59:545887\n" 655 | ] 656 | } 657 | ], 658 | "source": [ 659 | "show_time(\"start loop\")\n", 660 | "\n", 661 | "# add dummy variables to each row on the basis of corresponding industry\n", 662 | "industry_list = list(set(csi_500_factors[\"INDUSTRY_SW\"]))\n", 663 | "industry_df = pd.DataFrame(columns=industry_list, index=csi_500_factors.index)\n", 664 | "industry_df = pd.concat([csi_500_factors, industry_df], axis=1, join=\"outer\")\n", 665 | "# set the corresponding industry column to 1\n", 666 | "for i in range(len(industry_df)):\n", 667 | " industry_name = industry_df.iloc[i, 3]\n", 668 | " cols = list(industry_df.columns)\n", 669 | " for j in range(len(cols)):\n", 670 | " if cols[j] == industry_name:\n", 671 | " col_num = j\n", 672 | " industry_df.iloc[i, col_num] = 1\n", 673 | "# set the value of dummy variables of other industries to 0\n", 674 | "industry_df = industry_df.fillna(0)\n", 675 | "\n", 676 | "show_time(\"end loop\")" 677 | ] 678 | }, 679 | { 680 | "cell_type": "code", 681 | "execution_count": 64, 682 | "metadata": {}, 683 | "outputs": [ 684 | { 685 | "data": { 686 | "text/html": [ 687 | "
\n", 688 | "\n", 701 | "\n", 702 | " \n", 703 | " \n", 704 | " \n", 705 | " \n", 706 | " \n", 707 | " \n", 708 | " \n", 709 | " \n", 710 | " \n", 711 | " \n", 712 | " \n", 713 | " \n", 714 | " \n", 715 | " \n", 716 | " \n", 717 | " \n", 718 | " \n", 719 | " \n", 720 | " \n", 721 | " \n", 722 | " \n", 723 | " \n", 724 | " \n", 725 | " \n", 726 | " \n", 727 | " \n", 728 | " \n", 729 | " \n", 730 | " \n", 731 | " \n", 732 | " \n", 733 | " \n", 734 | " \n", 735 | " \n", 736 | " \n", 737 | " \n", 738 | " \n", 739 | " \n", 740 | " \n", 741 | " \n", 742 | " \n", 743 | " \n", 744 | " \n", 745 | " \n", 746 | " \n", 747 | " \n", 748 | " \n", 749 | " \n", 750 | " \n", 751 | " \n", 752 | " \n", 753 | " \n", 754 | " \n", 755 | " \n", 756 | " \n", 757 | " \n", 758 | " \n", 759 | " \n", 760 | " \n", 761 | " \n", 762 | " \n", 763 | " \n", 764 | " \n", 765 | " \n", 766 | " \n", 767 | " \n", 768 | " \n", 769 | " \n", 770 | " \n", 771 | " \n", 772 | " \n", 773 | " \n", 774 | " \n", 775 | " \n", 776 | " \n", 777 | " \n", 778 | " \n", 779 | " \n", 780 | " \n", 781 | " \n", 782 | " \n", 783 | " \n", 784 | " \n", 785 | " \n", 786 | " \n", 787 | " \n", 788 | " \n", 789 | " \n", 790 | " \n", 791 | " \n", 792 | " \n", 793 | " \n", 794 | " \n", 795 | " \n", 796 | " \n", 797 | " \n", 798 | " \n", 799 | " \n", 800 | " \n", 801 | " \n", 802 | " \n", 803 | " \n", 804 | " \n", 805 | " \n", 806 | " \n", 807 | " \n", 808 | " \n", 809 | " \n", 810 | " \n", 811 | " \n", 812 | " \n", 813 | " \n", 814 | " \n", 815 | " \n", 816 | " \n", 817 | " \n", 818 | " \n", 819 | " \n", 820 | " \n", 821 | " \n", 822 | " \n", 823 | " \n", 824 | " \n", 825 | " \n", 826 | " \n", 827 | " \n", 828 | " \n", 829 | " \n", 830 | " \n", 831 | " \n", 832 | " \n", 833 | " \n", 834 | " \n", 835 | " \n", 836 | " \n", 837 | " \n", 838 | " \n", 839 | " \n", 840 | " \n", 841 | " \n", 842 | " \n", 843 | " \n", 844 | " \n", 845 | " \n", 846 | " \n", 847 | " \n", 848 | " \n", 849 | " \n", 850 | " \n", 851 | " \n", 852 | " \n", 853 | " 
\n", 854 | " \n", 855 | " \n", 856 | " \n", 857 | " \n", 858 | " \n", 859 | " \n", 860 | " \n", 861 | " \n", 862 | " \n", 863 | " \n", 864 | " \n", 865 | " \n", 866 | " \n", 867 | " \n", 868 | " \n", 869 | " \n", 870 | " \n", 871 | " \n", 872 | " \n", 873 | " \n", 874 | "
WINDCODESEC_NAMEINDEXCODE_SWINDUSTRY_SWEVPE_TTMPB_MRQPS_TTMPCF_OCF_TTMEV2_TO_EBITDA...休闲服务交通运输建筑装饰医药生物汽车综合钢铁房地产公用事业传媒
date
2015-01-30000006.SZ深振业A801180.SI房地产-0.788196-0.383454-0.249332-0.112060-0.138478-0.761099...0000000100
2015-01-30000656.SZ金科股份801180.SI房地产0.909253-0.142172-0.290587-0.549539-0.1213180.798007...0000000100
2015-01-30002517.SZ恺英网络801130.SI传媒-1.524204-2.132754-0.758481-0.5169063.636857-1.456548...0000000001
2015-01-30600872.SH中炬高新801120.SI食品饮料-0.672391-0.021666-0.169467-0.1434360.1328260.052961...0000000000
2015-01-30002544.SZ杰赛科技801770.SI通信-0.0380201.2629481.6219210.062658-1.3607392.008192...0000000000
\n", 875 | "

5 rows × 46 columns

\n", 876 | "
" 877 | ], 878 | "text/plain": [ 879 | " WINDCODE SEC_NAME INDEXCODE_SW INDUSTRY_SW EV PE_TTM \\\n", 880 | "date \n", 881 | "2015-01-30 000006.SZ 深振业A 801180.SI 房地产 -0.788196 -0.383454 \n", 882 | "2015-01-30 000656.SZ 金科股份 801180.SI 房地产 0.909253 -0.142172 \n", 883 | "2015-01-30 002517.SZ 恺英网络 801130.SI 传媒 -1.524204 -2.132754 \n", 884 | "2015-01-30 600872.SH 中炬高新 801120.SI 食品饮料 -0.672391 -0.021666 \n", 885 | "2015-01-30 002544.SZ 杰赛科技 801770.SI 通信 -0.038020 1.262948 \n", 886 | "\n", 887 | " PB_MRQ PS_TTM PCF_OCF_TTM EV2_TO_EBITDA ... 休闲服务 交通运输 \\\n", 888 | "date ... \n", 889 | "2015-01-30 -0.249332 -0.112060 -0.138478 -0.761099 ... 0 0 \n", 890 | "2015-01-30 -0.290587 -0.549539 -0.121318 0.798007 ... 0 0 \n", 891 | "2015-01-30 -0.758481 -0.516906 3.636857 -1.456548 ... 0 0 \n", 892 | "2015-01-30 -0.169467 -0.143436 0.132826 0.052961 ... 0 0 \n", 893 | "2015-01-30 1.621921 0.062658 -1.360739 2.008192 ... 0 0 \n", 894 | "\n", 895 | " 建筑装饰 医药生物 汽车 综合 钢铁 房地产 公用事业 传媒 \n", 896 | "date \n", 897 | "2015-01-30 0 0 0 0 0 1 0 0 \n", 898 | "2015-01-30 0 0 0 0 0 1 0 0 \n", 899 | "2015-01-30 0 0 0 0 0 0 0 1 \n", 900 | "2015-01-30 0 0 0 0 0 0 0 0 \n", 901 | "2015-01-30 0 0 0 0 0 0 0 0 \n", 902 | "\n", 903 | "[5 rows x 46 columns]" 904 | ] 905 | }, 906 | "execution_count": 64, 907 | "metadata": {}, 908 | "output_type": "execute_result" 909 | } 910 | ], 911 | "source": [ 912 | "industry_df.head()" 913 | ] 914 | }, 915 | { 916 | "cell_type": "code", 917 | "execution_count": 33, 918 | "metadata": {}, 919 | "outputs": [], 920 | "source": [ 921 | "def neutralize(df, factor):\n", 922 | " y = df[factor]\n", 923 | " x = df.iloc[:, 18:46]\n", 924 | " result = sm.OLS(y, x).fit()\n", 925 | " return result.resid # return residual" 926 | ] 927 | }, 928 | { 929 | "cell_type": "code", 930 | "execution_count": 35, 931 | "metadata": {}, 932 | "outputs": [ 933 | { 934 | "name": "stdout", 935 | "output_type": "stream", 936 | "text": [ 937 | "start loop: 2019-04-30 11:00:07:874836\n", 938 | "end loop: 
2019-04-30 11:41:56:515849\n" 939 | ] 940 | } 941 | ], 942 | "source": [ 943 | "show_time(\"start loop\")\n", 944 | "\n", 945 | "# replace factor values with residuals\n", 946 | "for date in industry_df.index:\n", 947 | " for factor in factor_cols:\n", 948 | " industry_df.loc[date, factor] = neutralize(industry_df.loc[date], factor)\n", 949 | " \n", 950 | "show_time(\"end loop\")" 951 | ] 952 | }, 953 | { 954 | "cell_type": "code", 955 | "execution_count": 37, 956 | "metadata": {}, 957 | "outputs": [], 958 | "source": [ 959 | "factor_data = industry_df.drop(columns=industry_list)" 960 | ] 961 | }, 962 | { 963 | "cell_type": "code", 964 | "execution_count": 38, 965 | "metadata": {}, 966 | "outputs": [], 967 | "source": [ 968 | "factor_data.to_pickle(\"factor_data.gz\")" 969 | ] 970 | } 971 | ], 972 | "metadata": { 973 | "kernelspec": { 974 | "display_name": "Python 3", 975 | "language": "python", 976 | "name": "python3" 977 | }, 978 | "language_info": { 979 | "codemirror_mode": { 980 | "name": "ipython", 981 | "version": 3 982 | }, 983 | "file_extension": ".py", 984 | "mimetype": "text/x-python", 985 | "name": "python", 986 | "nbconvert_exporter": "python", 987 | "pygments_lexer": "ipython3", 988 | "version": "3.7.3" 989 | } 990 | }, 991 | "nbformat": 4, 992 | "nbformat_minor": 2 993 | } 994 | -------------------------------------------------------------------------------- /FP1.4_Backtesting.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Backtesting" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 39, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "%matplotlib inline\n", 17 | "\n", 18 | "import seaborn as sns\n", 19 | "import matplotlib\n", 20 | "import matplotlib.pyplot as plt\n", 21 | "import numpy as np\n", 22 | "import pandas as pd\n", 23 | "import tushare as ts\n", 24 | "from WindPy 
import *\n", 25 | "import datetime\n", 26 | "import time\n", 27 | "import math \n", 28 | "from statsmodels import regression, stats\n", 29 | "import statsmodels.api as sm\n", 30 | "from pandas.plotting import register_matplotlib_converters\n", 31 | "\n", 32 | "register_matplotlib_converters()\n", 33 | "\n", 34 | "%run final_project.py\n", 35 | "\n", 36 | "matplotlib.rcParams[\"figure.figsize\"] = (16, 9)" 37 | ] 38 | }, 39 | { 40 | "cell_type": "markdown", 41 | "metadata": {}, 42 | "source": [ 43 | "### 1. Get CSI 500 Index price information" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": 2, 49 | "metadata": {}, 50 | "outputs": [], 51 | "source": [ 52 | "tsapi = \"4f025f8dd1a96fd251f95c75d65a1023db3a27f8c9f3bc964ebd51d9\"\n", 53 | "pro = ts.pro_api(tsapi)" 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": 3, 59 | "metadata": {}, 60 | "outputs": [], 61 | "source": [ 62 | "df = pro.index_daily(ts_code='000905.SH', start_date='20180101', end_date='20190426')" 63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": 4, 68 | "metadata": {}, 69 | "outputs": [], 70 | "source": [ 71 | "converted_timestamp = pd.to_datetime(df[\"trade_date\"], format=\"%Y-%m-%d\")\n", 72 | "df.index = converted_timestamp\n", 73 | "df = df.drop(columns=[\"trade_date\"])\n", 74 | "df = df.sort_index()" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": 5, 80 | "metadata": {}, 81 | "outputs": [ 82 | { 83 | "data": { 84 | "text/html": [ 85 | "
\n", 86 | "\n", 99 | "\n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | "
ts_codecloseopenhighlowpre_closechangepct_chgvolamount
trade_date
2019-04-22000905.SH5723.04175821.02295824.52075704.97695810.1798-87.1381-1.4997158161026.0143410616.8
2019-04-23000905.SH5628.06245710.73345710.73345610.73535723.0417-94.9793-1.6596149871461.0133768248.9
2019-04-24000905.SH5679.72565636.40165682.47885568.40805628.062451.66320.9180134784412.0121970724.6
2019-04-25000905.SH5458.41945652.00565662.78985457.82545679.7256-221.3062-3.8964151401718.0132098714.9
2019-04-26000905.SH5408.03065427.62285496.03535407.19425458.4194-50.3888-0.9231124279880.0108970816.5
\n", 196 | "
" 197 | ], 198 | "text/plain": [ 199 | " ts_code close open high low pre_close \\\n", 200 | "trade_date \n", 201 | "2019-04-22 000905.SH 5723.0417 5821.0229 5824.5207 5704.9769 5810.1798 \n", 202 | "2019-04-23 000905.SH 5628.0624 5710.7334 5710.7334 5610.7353 5723.0417 \n", 203 | "2019-04-24 000905.SH 5679.7256 5636.4016 5682.4788 5568.4080 5628.0624 \n", 204 | "2019-04-25 000905.SH 5458.4194 5652.0056 5662.7898 5457.8254 5679.7256 \n", 205 | "2019-04-26 000905.SH 5408.0306 5427.6228 5496.0353 5407.1942 5458.4194 \n", 206 | "\n", 207 | " change pct_chg vol amount \n", 208 | "trade_date \n", 209 | "2019-04-22 -87.1381 -1.4997 158161026.0 143410616.8 \n", 210 | "2019-04-23 -94.9793 -1.6596 149871461.0 133768248.9 \n", 211 | "2019-04-24 51.6632 0.9180 134784412.0 121970724.6 \n", 212 | "2019-04-25 -221.3062 -3.8964 151401718.0 132098714.9 \n", 213 | "2019-04-26 -50.3888 -0.9231 124279880.0 108970816.5 " 214 | ] 215 | }, 216 | "execution_count": 5, 217 | "metadata": {}, 218 | "output_type": "execute_result" 219 | } 220 | ], 221 | "source": [ 222 | "df.tail()" 223 | ] 224 | }, 225 | { 226 | "cell_type": "markdown", 227 | "metadata": {}, 228 | "source": [ 229 | "### 2. 
Compare the return of the index and our enhanced indexing strategy" 230 | ] 231 | }, 232 | { 233 | "cell_type": "markdown", 234 | "metadata": {}, 235 | "source": [ 236 | "2.1 Cumulative monthly return" 237 | ] 238 | }, 239 | { 240 | "cell_type": "code", 241 | "execution_count": 21, 242 | "metadata": {}, 243 | "outputs": [], 244 | "source": [ 245 | "test_return = pd.read_pickle(\"test_return.gz\")" 246 | ] 247 | }, 248 | { 249 | "cell_type": "code", 250 | "execution_count": 7, 251 | "metadata": {}, 252 | "outputs": [], 253 | "source": [ 254 | "test_return[\"NEXT_RETURN\"] = test_return[\"NEXT_RETURN\"].cumsum() # strategy's cumulative return" 255 | ] 256 | }, 257 | { 258 | "cell_type": "code", 259 | "execution_count": 8, 260 | "metadata": {}, 261 | "outputs": [ 262 | { 263 | "data": { 264 | "text/html": [ 265 | "
\n", 266 | "\n", 279 | "\n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | "
NEXT_RETURN
2018-01-31-4.74539
2018-02-2828.5398
2018-03-3024.1962
2018-04-2726.1281
2018-05-3113.8019
2018-06-2916.6309
2018-07-318.91487
2018-08-318.03425
2018-09-28-3.5057
2018-10-316.82129
2018-11-304.23708
2018-12-284.83581
2019-01-3131.6054
2019-02-2840.82
2019-03-2942.3805
\n", 349 | "
" 350 | ], 351 | "text/plain": [ 352 | " NEXT_RETURN\n", 353 | "2018-01-31 -4.74539\n", 354 | "2018-02-28 28.5398\n", 355 | "2018-03-30 24.1962\n", 356 | "2018-04-27 26.1281\n", 357 | "2018-05-31 13.8019\n", 358 | "2018-06-29 16.6309\n", 359 | "2018-07-31 8.91487\n", 360 | "2018-08-31 8.03425\n", 361 | "2018-09-28 -3.5057\n", 362 | "2018-10-31 6.82129\n", 363 | "2018-11-30 4.23708\n", 364 | "2018-12-28 4.83581\n", 365 | "2019-01-31 31.6054\n", 366 | "2019-02-28 40.82\n", 367 | "2019-03-29 42.3805" 368 | ] 369 | }, 370 | "execution_count": 8, 371 | "metadata": {}, 372 | "output_type": "execute_result" 373 | } 374 | ], 375 | "source": [ 376 | "test_return" 377 | ] 378 | }, 379 | { 380 | "cell_type": "code", 381 | "execution_count": 23, 382 | "metadata": {}, 383 | "outputs": [], 384 | "source": [ 385 | "# calculate next month's returns of the index\n", 386 | "dates = test_return.index\n", 387 | "df_dates = df.loc[dates]\n", 388 | "df_dates[\"NEXT_RETURN\"] = (df_dates[\"close\"] / df_dates[\"close\"].shift(1) - 1) * 100\n", 389 | "df_dates[\"NEXT_RETURN\"] = df_dates[\"NEXT_RETURN\"].shift(-1)\n", 390 | "# add next month's return for the last month here\n", 391 | "df_dates.iloc[-1, -1] = (df_dates.iloc[-1, 1] / df.iloc[-1, 1] - 1) * 100" 392 | ] 393 | }, 394 | { 395 | "cell_type": "code", 396 | "execution_count": 10, 397 | "metadata": {}, 398 | "outputs": [], 399 | "source": [ 400 | "df_dates[\"NEXT_RETURN\"] = df_dates[\"NEXT_RETURN\"].cumsum() # index's cumulative return" 401 | ] 402 | }, 403 | { 404 | "cell_type": "code", 405 | "execution_count": 11, 406 | "metadata": {}, 407 | "outputs": [ 408 | { 409 | "data": { 410 | "text/plain": [ 411 | "2018-01-31 -2.676606\n", 412 | "2018-02-28 -1.164885\n", 413 | "2018-03-30 -5.314770\n", 414 | "2018-04-27 -7.132025\n", 415 | "2018-05-31 -16.458859\n", 416 | "2018-06-29 -17.015567\n", 417 | "2018-07-31 -24.221451\n", 418 | "2018-08-31 -24.514192\n", 419 | "2018-09-28 -35.516208\n", 420 | "2018-10-31 -33.079741\n", 421 | 
"2018-11-30 -37.846280\n", 422 | "2018-12-28 -37.643845\n", 423 | "2019-01-31 -17.320064\n", 424 | "2019-02-28 -6.925347\n", 425 | "2019-03-29 -4.343529\n", 426 | "Name: NEXT_RETURN, dtype: float64" 427 | ] 428 | }, 429 | "execution_count": 11, 430 | "metadata": {}, 431 | "output_type": "execute_result" 432 | } 433 | ], 434 | "source": [ 435 | "df_dates[\"NEXT_RETURN\"]" 436 | ] 437 | }, 438 | { 439 | "cell_type": "code", 440 | "execution_count": 12, 441 | "metadata": {}, 442 | "outputs": [], 443 | "source": [ 444 | "diff = test_return[\"NEXT_RETURN\"] - df_dates[\"NEXT_RETURN\"] # difference between strategy's return and index's return" 445 | ] 446 | }, 447 | { 448 | "cell_type": "code", 449 | "execution_count": 13, 450 | "metadata": {}, 451 | "outputs": [ 452 | { 453 | "data": { 454 | "text/plain": [ 455 | "2018-01-31 -2.06879\n", 456 | "2018-02-28 29.7047\n", 457 | "2018-03-30 29.5109\n", 458 | "2018-04-27 33.2601\n", 459 | "2018-05-31 30.2608\n", 460 | "2018-06-29 33.6464\n", 461 | "2018-07-31 33.1363\n", 462 | "2018-08-31 32.5484\n", 463 | "2018-09-28 32.0105\n", 464 | "2018-10-31 39.901\n", 465 | "2018-11-30 42.0834\n", 466 | "2018-12-28 42.4797\n", 467 | "2019-01-31 48.9255\n", 468 | "2019-02-28 47.7454\n", 469 | "2019-03-29 46.7241\n", 470 | "Name: NEXT_RETURN, dtype: object" 471 | ] 472 | }, 473 | "execution_count": 13, 474 | "metadata": {}, 475 | "output_type": "execute_result" 476 | } 477 | ], 478 | "source": [ 479 | "diff" 480 | ] 481 | }, 482 | { 483 | "cell_type": "code", 484 | "execution_count": 14, 485 | "metadata": {}, 486 | "outputs": [ 487 | { 488 | "data": { 489 | "image/png": "\n", 490 | "text/plain": [ 491 | "
" 492 | ] 493 | }, 494 | "metadata": { 495 | "needs_background": "light" 496 | }, 497 | "output_type": "display_data" 498 | } 499 | ], 500 | "source": [ 501 | "fig, ax = plt.subplots(figsize=(16, 9))\n", 502 | "plt.plot(test_return[\"NEXT_RETURN\"], \"r-\", label=\"Strategy's Return\")\n", 503 | "plt.plot(df_dates[\"NEXT_RETURN\"], \"g-\", label=\"CSI 500 Index's Return\")\n", 504 | "plt.bar(diff.index, diff, width=10, label=\"Active Return\")\n", 505 | "plt.legend(loc=\"lower right\")\n", 506 | "plt.grid(axis=\"y\")\n", 507 | "plt.ylabel(\"Return(%)\")\n", 508 | "plt.savefig(\"backtesting.png\")" 509 | ] 510 | }, 511 | { 512 | "cell_type": "markdown", 513 | "metadata": {}, 514 | "source": [ 515 | "2.2 Noncumulative monthly return" 516 | ] 517 | }, 518 | { 519 | "cell_type": "code", 520 | "execution_count": 48, 521 | "metadata": {}, 522 | "outputs": [ 523 | { 524 | "data": { 525 | "text/html": [ 526 | "
\n", 527 | "\n", 540 | "\n", 541 | " \n", 542 | " \n", 543 | " \n", 544 | " \n", 545 | " \n", 546 | " \n", 547 | " \n", 548 | " \n", 549 | " \n", 550 | " \n", 551 | " \n", 552 | " \n", 553 | " \n", 554 | " \n", 555 | " \n", 556 | " \n", 557 | " \n", 558 | " \n", 559 | " \n", 560 | " \n", 561 | " \n", 562 | " \n", 563 | " \n", 564 | " \n", 565 | " \n", 566 | " \n", 567 | " \n", 568 | " \n", 569 | " \n", 570 | " \n", 571 | " \n", 572 | " \n", 573 | " \n", 574 | " \n", 575 | " \n", 576 | " \n", 577 | " \n", 578 | " \n", 579 | " \n", 580 | " \n", 581 | " \n", 582 | " \n", 583 | " \n", 584 | " \n", 585 | " \n", 586 | " \n", 587 | " \n", 588 | " \n", 589 | " \n", 590 | " \n", 591 | " \n", 592 | " \n", 593 | " \n", 594 | " \n", 595 | " \n", 596 | " \n", 597 | " \n", 598 | " \n", 599 | " \n", 600 | " \n", 601 | " \n", 602 | " \n", 603 | " \n", 604 | " \n", 605 | " \n", 606 | " \n", 607 | " \n", 608 | " \n", 609 | "
strategy return
2018-01-31 00:00:00-4.74539
2018-02-28 00:00:0033.2852
2018-03-30 00:00:00-4.34367
2018-04-27 00:00:001.93191
2018-05-31 00:00:00-12.3262
2018-06-29 00:00:002.82897
2018-07-31 00:00:00-7.71601
2018-08-31 00:00:00-0.880618
2018-09-28 00:00:00-11.5399
2018-10-31 00:00:0010.327
2018-11-30 00:00:00-2.58421
2018-12-28 00:00:000.598739
2019-01-31 00:00:0026.7696
2019-02-28 00:00:009.21462
2019-03-29 00:00:001.56049
\n", 610 | "
" 611 | ], 612 | "text/plain": [ 613 | " strategy return\n", 614 | "2018-01-31 00:00:00 -4.74539\n", 615 | "2018-02-28 00:00:00 33.2852\n", 616 | "2018-03-30 00:00:00 -4.34367\n", 617 | "2018-04-27 00:00:00 1.93191\n", 618 | "2018-05-31 00:00:00 -12.3262\n", 619 | "2018-06-29 00:00:00 2.82897\n", 620 | "2018-07-31 00:00:00 -7.71601\n", 621 | "2018-08-31 00:00:00 -0.880618\n", 622 | "2018-09-28 00:00:00 -11.5399\n", 623 | "2018-10-31 00:00:00 10.327\n", 624 | "2018-11-30 00:00:00 -2.58421\n", 625 | "2018-12-28 00:00:00 0.598739\n", 626 | "2019-01-31 00:00:00 26.7696\n", 627 | "2019-02-28 00:00:00 9.21462\n", 628 | "2019-03-29 00:00:00 1.56049" 629 | ] 630 | }, 631 | "execution_count": 48, 632 | "metadata": {}, 633 | "output_type": "execute_result" 634 | } 635 | ], 636 | "source": [ 637 | "test_return" 638 | ] 639 | }, 640 | { 641 | "cell_type": "code", 642 | "execution_count": 49, 643 | "metadata": {}, 644 | "outputs": [ 645 | { 646 | "data": { 647 | "text/plain": [ 648 | "2018-01-31 -2.676606\n", 649 | "2018-02-28 1.511721\n", 650 | "2018-03-30 -4.149885\n", 651 | "2018-04-27 -1.817255\n", 652 | "2018-05-31 -9.326834\n", 653 | "2018-06-29 -0.556708\n", 654 | "2018-07-31 -7.205884\n", 655 | "2018-08-31 -0.292742\n", 656 | "2018-09-28 -11.002015\n", 657 | "2018-10-31 2.436467\n", 658 | "2018-11-30 -4.766539\n", 659 | "2018-12-28 0.202436\n", 660 | "2019-01-31 20.323781\n", 661 | "2019-02-28 10.394717\n", 662 | "2019-03-29 2.581818\n", 663 | "Name: NEXT_RETURN, dtype: float64" 664 | ] 665 | }, 666 | "execution_count": 49, 667 | "metadata": {}, 668 | "output_type": "execute_result" 669 | } 670 | ], 671 | "source": [ 672 | "df_dates[\"NEXT_RETURN\"]" 673 | ] 674 | }, 675 | { 676 | "cell_type": "code", 677 | "execution_count": 31, 678 | "metadata": {}, 679 | "outputs": [], 680 | "source": [ 681 | "test_return = test_return.rename(index=str, columns={\"NEXT_RETURN\": \"strategy return\"})\n", 682 | "comp = pd.concat([test_return, df_dates[\"NEXT_RETURN\"]], axis=1, 
join=\"outer\")\n", 683 | "comp = comp.rename(index=str, columns={\"NEXT_RETURN\": \"index return\"})\n", 684 | "comp[\"diff\"] = comp[\"strategy return\"] - comp[\"index return\"]" 685 | ] 686 | }, 687 | { 688 | "cell_type": "code", 689 | "execution_count": 47, 690 | "metadata": {}, 691 | "outputs": [ 692 | { 693 | "data": { 694 | "text/html": [ 695 | "
\n", 696 | "\n", 709 | "\n", 710 | " \n", 711 | " \n", 712 | " \n", 713 | " \n", 714 | " \n", 715 | " \n", 716 | " \n", 717 | " \n", 718 | " \n", 719 | " \n", 720 | " \n", 721 | " \n", 722 | " \n", 723 | " \n", 724 | " \n", 725 | " \n", 726 | " \n", 727 | " \n", 728 | " \n", 729 | " \n", 730 | " \n", 731 | " \n", 732 | " \n", 733 | " \n", 734 | " \n", 735 | " \n", 736 | " \n", 737 | " \n", 738 | " \n", 739 | " \n", 740 | " \n", 741 | " \n", 742 | " \n", 743 | " \n", 744 | " \n", 745 | " \n", 746 | " \n", 747 | " \n", 748 | " \n", 749 | " \n", 750 | " \n", 751 | " \n", 752 | " \n", 753 | " \n", 754 | " \n", 755 | " \n", 756 | " \n", 757 | " \n", 758 | " \n", 759 | " \n", 760 | " \n", 761 | " \n", 762 | " \n", 763 | " \n", 764 | " \n", 765 | " \n", 766 | " \n", 767 | " \n", 768 | " \n", 769 | " \n", 770 | " \n", 771 | " \n", 772 | " \n", 773 | " \n", 774 | " \n", 775 | " \n", 776 | " \n", 777 | " \n", 778 | " \n", 779 | " \n", 780 | " \n", 781 | " \n", 782 | " \n", 783 | " \n", 784 | " \n", 785 | " \n", 786 | " \n", 787 | " \n", 788 | " \n", 789 | " \n", 790 | " \n", 791 | " \n", 792 | " \n", 793 | " \n", 794 | " \n", 795 | " \n", 796 | " \n", 797 | " \n", 798 | " \n", 799 | " \n", 800 | " \n", 801 | " \n", 802 | " \n", 803 | " \n", 804 | " \n", 805 | " \n", 806 | " \n", 807 | " \n", 808 | " \n", 809 | " \n", 810 | "
strategy returnindex returndiff
2018-01-31 00:00:00-4.74539-2.676606-2.06879
2018-02-28 00:00:0033.28521.51172131.7735
2018-03-30 00:00:00-4.34367-4.149885-0.193786
2018-04-27 00:00:001.93191-1.8172553.74917
2018-05-31 00:00:00-12.3262-9.326834-2.99934
2018-06-29 00:00:002.82897-0.5567083.38568
2018-07-31 00:00:00-7.71601-7.205884-0.510125
2018-08-31 00:00:00-0.880618-0.292742-0.587877
2018-09-28 00:00:00-11.5399-11.002015-0.537933
2018-10-31 00:00:0010.3272.4364677.89052
2018-11-30 00:00:00-2.58421-4.7665392.18233
2018-12-28 00:00:000.5987390.2024360.396303
2019-01-31 00:00:0026.769620.3237816.44582
2019-02-28 00:00:009.2146210.394717-1.1801
2019-03-29 00:00:001.560492.581818-1.02133
\n", 811 | "
" 812 | ], 813 | "text/plain": [ 814 | " strategy return index return diff\n", 815 | "2018-01-31 00:00:00 -4.74539 -2.676606 -2.06879\n", 816 | "2018-02-28 00:00:00 33.2852 1.511721 31.7735\n", 817 | "2018-03-30 00:00:00 -4.34367 -4.149885 -0.193786\n", 818 | "2018-04-27 00:00:00 1.93191 -1.817255 3.74917\n", 819 | "2018-05-31 00:00:00 -12.3262 -9.326834 -2.99934\n", 820 | "2018-06-29 00:00:00 2.82897 -0.556708 3.38568\n", 821 | "2018-07-31 00:00:00 -7.71601 -7.205884 -0.510125\n", 822 | "2018-08-31 00:00:00 -0.880618 -0.292742 -0.587877\n", 823 | "2018-09-28 00:00:00 -11.5399 -11.002015 -0.537933\n", 824 | "2018-10-31 00:00:00 10.327 2.436467 7.89052\n", 825 | "2018-11-30 00:00:00 -2.58421 -4.766539 2.18233\n", 826 | "2018-12-28 00:00:00 0.598739 0.202436 0.396303\n", 827 | "2019-01-31 00:00:00 26.7696 20.323781 6.44582\n", 828 | "2019-02-28 00:00:00 9.21462 10.394717 -1.1801\n", 829 | "2019-03-29 00:00:00 1.56049 2.581818 -1.02133" 830 | ] 831 | }, 832 | "execution_count": 47, 833 | "metadata": {}, 834 | "output_type": "execute_result" 835 | } 836 | ], 837 | "source": [ 838 | "comp" 839 | ] 840 | }, 841 | { 842 | "cell_type": "code", 843 | "execution_count": 46, 844 | "metadata": {}, 845 | "outputs": [ 846 | { 847 | "data": { 848 | "image/png": "\n", 849 | "text/plain": [ 850 | "
" 851 | ] 852 | }, 853 | "metadata": { 854 | "needs_background": "light" 855 | }, 856 | "output_type": "display_data" 857 | } 858 | ], 859 | "source": [ 860 | "fig, ax = plt.subplots(figsize=(16, 9))\n", 861 | "plt.plot(comp[\"strategy return\"], \"r-\", label=\"Strategy's Return\")\n", 862 | "plt.plot(comp[\"index return\"], \"g-\", label=\"CSI 500 Index's Return\")\n", 863 | "plt.bar(comp.index, comp[\"diff\"], width=0.8, label=\"Active Return\")\n", 864 | "plt.legend(loc=\"lower right\")\n", 865 | "plt.grid(axis=\"y\")\n", 866 | "plt.ylabel(\"Return(%)\")\n", 867 | "plt.xticks(rotation=45)\n", 868 | "plt.savefig(\"backtesting_noncumulative.png\")" 869 | ] 870 | } 871 | ], 872 | "metadata": { 873 | "kernelspec": { 874 | "display_name": "Python 3", 875 | "language": "python", 876 | "name": "python3" 877 | }, 878 | "language_info": { 879 | "codemirror_mode": { 880 | "name": "ipython", 881 | "version": 3 882 | }, 883 | "file_extension": ".py", 884 | "mimetype": "text/x-python", 885 | "name": "python", 886 | "nbconvert_exporter": "python", 887 | "pygments_lexer": "ipython3", 888 | "version": "3.7.3" 889 | } 890 | }, 891 | "nbformat": 4, 892 | "nbformat_minor": 2 893 | } 894 | -------------------------------------------------------------------------------- /Final Project.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chichihua/Multi-Factor-Model/f5cbcf0f7d65b11810e1d71523506a395fc44c2c/Final Project.pptx -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Final Project: Multi-Factor Model (Regression Method) 2 | 3 | ### Objectives: 4 | Use Multi-Factor Model to build an enhanced indexing strategy for CSI 500 Index; 5 | Capture a relatively high active return. 6 | 7 | ### Outlines: 8 | 1. 
Get data of CSI 500 constituent stocks from Wind, a Chinese financial data provider, and preprocess the data; 9 | 2. Select factors such as size (market cap), value (P/E, P/B, P/S, P/CF, EV/EBITDA), profitability (net profit margin, ROE, ROIC), growth (year-on-year revenue growth rate, year-on-year net income growth rate), trade (turnover rate); 10 | 3. Use OLS (or LASSO, which might be a better choice) to estimate each factor's sensitivity, test each factor’s effectiveness and eliminate redundant factors with multicollinearity and low R-squared; 11 | 4. Build an algorithm to allocate active weights to different constituent stocks on the basis of the expected next month's return predicted by the regression model (give more weight to stocks with relatively higher expected return and vice versa); 12 | 5. Backtesting: test the effectiveness of the model and check the portfolio’s alpha over the benchmark. 13 | -------------------------------------------------------------------------------- /backtesting.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chichihua/Multi-Factor-Model/f5cbcf0f7d65b11810e1d71523506a395fc44c2c/backtesting.png -------------------------------------------------------------------------------- /backtesting_noncumulative.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chichihua/Multi-Factor-Model/f5cbcf0f7d65b11810e1d71523506a395fc44c2c/backtesting_noncumulative.png -------------------------------------------------------------------------------- /comparison1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chichihua/Multi-Factor-Model/f5cbcf0f7d65b11810e1d71523506a395fc44c2c/comparison1.png -------------------------------------------------------------------------------- /comparison2.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/chichihua/Multi-Factor-Model/f5cbcf0f7d65b11810e1d71523506a395fc44c2c/comparison2.png -------------------------------------------------------------------------------- /csi_500_constituent_info.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chichihua/Multi-Factor-Model/f5cbcf0f7d65b11810e1d71523506a395fc44c2c/csi_500_constituent_info.gz -------------------------------------------------------------------------------- /csi_500_data_preprocessed.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chichihua/Multi-Factor-Model/f5cbcf0f7d65b11810e1d71523506a395fc44c2c/csi_500_data_preprocessed.gz -------------------------------------------------------------------------------- /csi_500_data_raw.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chichihua/Multi-Factor-Model/f5cbcf0f7d65b11810e1d71523506a395fc44c2c/csi_500_data_raw.gz -------------------------------------------------------------------------------- /factor_data.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chichihua/Multi-Factor-Model/f5cbcf0f7d65b11810e1d71523506a395fc44c2c/factor_data.gz -------------------------------------------------------------------------------- /factors_corr.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chichihua/Multi-Factor-Model/f5cbcf0f7d65b11810e1d71523506a395fc44c2c/factors_corr.png -------------------------------------------------------------------------------- /final_project.py: -------------------------------------------------------------------------------- 1 | import matplotlib 2 | import matplotlib.pyplot as plt 3 
import datetime

import pandas as pd


def show_time(label_string):
    """Print ``label_string`` followed by the current local time.

    The line has the form ``<label_string>: YYYY-MM-DD HH:MM:SS:ffffff``,
    where the last field is the microsecond part of the timestamp.

    Args:
        label_string: Text placed in front of the timestamp.
    """
    # datetime.now() yields the same naive local time the previous
    # time.time() -> fromtimestamp() round trip produced.
    stamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S:%f")
    print(f"{label_string}: {stamp}")


def apidata_to_df(apidata):
    """Convert an API result object into a DataFrame indexed by time.

    Args:
        apidata: Object exposing ``Data``, ``Fields`` and ``Times``
            attributes (presumably a WindPy query result -- confirm against
            the callers). ``Data`` must hold one sequence per entry of
            ``Fields``, each with one value per entry of ``Times``.

    Returns:
        A DataFrame whose rows are labelled by ``apidata.Times`` and whose
        columns are labelled by ``apidata.Fields``.
    """
    # The data arrives field-major (one row per field), so build it that way
    # and transpose to get one row per timestamp.
    field_major = pd.DataFrame(
        apidata.Data, index=apidata.Fields, columns=apidata.Times
    )
    return field_major.T


def to_industry_df(apidata, industries=("银行", "非银金融")):
    """Stack per-industry series into one DataFrame tagged with the industry.

    For each position ``i`` in ``industries``, ``apidata.Data[i]`` becomes a
    frame indexed by ``apidata.Times`` with columns ``apidata.Fields``, plus
    an ``INDUSTRY_SW`` column holding ``industries[i]``. The frames are then
    concatenated row-wise in the given order.

    Args:
        apidata: Object exposing ``Data``, ``Fields`` and ``Times``
            attributes; ``Data`` needs at least ``len(industries)`` entries.
        industries: Industry labels, in the same order as the entries of
            ``apidata.Data``. Defaults to the two labels the function
            originally hard-coded (banking and non-bank financials).

    Returns:
        A DataFrame with ``len(industries) * len(apidata.Times)`` rows.

    Raises:
        IndexError: If ``apidata.Data`` has fewer entries than ``industries``.
        ValueError: If ``industries`` is empty (nothing to concatenate).
    """
    frames = []
    for position, industry in enumerate(industries):
        frame = pd.DataFrame(
            apidata.Data[position], index=apidata.Times, columns=apidata.Fields
        )
        frame["INDUSTRY_SW"] = industry
        frames.append(frame)
    return pd.concat(frames, axis=0, join="outer")
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/chichihua/Multi-Factor-Model/f5cbcf0f7d65b11810e1d71523506a395fc44c2c/winsorized_factors.gz -------------------------------------------------------------------------------- /workflow.md: -------------------------------------------------------------------------------- 1 | # Final Project: Enhanced Indexing Strategy with Multi-Factor Model 2 | 3 | Group: 4 4 | 5 | Members: Jiahua Jiang, Bo Sun, Baowen Cao 6 | 7 | Date: 2019/04/30 8 | 9 | ## Project Introduction 10 | 11 | In this project, we built an Enhanced Indexing Strategy for the China Securities Index (CSI) 500 on the basis of Arbitrage Pricing Theory (APT), applying a Multi-Factor Model with the regression method to predict the expected returns of CSI 500 constituent stocks and determining the active weights in accordance with our predictions. 12 | 13 | ## Dataset 14 | 15 | The stock price and financial data were retrieved from the Chinese financial data providers Wind and Tushare. 
16 | 17 | ## Workflow 18 | 19 | * Collect and Preprocess Data 20 | - Get current CSI 500 constituent stocks and weights from Wind (csi_500_constituent_info.gz) 21 | - Get monthly price and factor information of 500 constituent stocks from 2015-01-01 to 2019-04-26 (csi_500_data_raw.gz) 22 | - Size 23 | - Market Capitalization: "EV" 24 | - Value 25 | - Trailing Twelve Months P/E Ratio: "PE_TTM" 26 | - Most Recent Quarter P/B Ratio: "PB_MRQ" 27 | - Trailing Twelve Months P/S Ratio: "PS_TTM" 28 | - Trailing Twelve Months P/CF(Operating Cash Flow) Ratio: "PCF_OCF_TTM" 29 | - EV/EBITDA Ratio: "EV2_TO_EBITDA" 30 | - Profitability 31 | - Return on Equity: "ROE" 32 | - Return on Invested Capital: "ROIC" 33 | - Net Profit Margin: "PROFITTOGR" 34 | - Growth 35 | - Year-over-Year Net Income Growth Rate: "YOYPROFIT" 36 | - Year-over-Year Revenue Growth Rate: "YOY_TR" 37 | - Trading 38 | - Turnover Rate: "TURN" 39 | - Preprocess data (csi_500_data_preprocessed.gz) 40 | - Fill nans with data of the nearest month 41 | - Check if there're any nans left and locate them 42 | - EV/EBITDA and ROIC of 10 companies in banking and non-bank financial industries are missing 43 | - Fill nans remaining with industry's average 44 | * Preprocess Factors 45 | - Calculate next month's return as dependent variable for future regression 46 | - Winsorize, standardize and neutralize factor values (factor_data.gz) 47 | - Winsorize: Trim the outliers at the tail (2.5 percentile) to avoid anomalies 48 | - Standardize: Convert factor values into z-scores to eliminate discrepancies in number scale 49 | - Neutralize: Eliminate the impact of industrial betas to avoid concentration of our stock selection model, by extracting residual in multiple linear regression between factor value and dummy variables of industries 50 | * Factor Modeling and Strategy Construction 51 | - Separate the data into training and testing 52 | - Training data: 2015-01-01 to 2017-12-31 53 | - Testing Data: 2018-01-01 to 2019-04-26 
54 | - Multiple linear regression (OLS) between next month's return and factor values 55 | - Improve the regression model 56 | - Sort the p-values to check the significance level of each factor's exposure (slope coefficient) 57 | - 58 | - Plot a heatmap of the correlation matrix of factors to tell if there's multicollinearity 59 | - 60 | - Remove redundant factors of high correlation with other factors and high p-values (low significance level) 61 | - Factors remaining: "EV", "PB_MRQ", "EV2_TO_EBITDA", "ROE", "YOYPROFIT", "YOY_TR", "TURN" 62 | - Rerun multiple linear regression and compare the results 63 | - 64 | - 65 | - Build up an enhanced indexing strategy on the basis of model prediction 66 | - Clearing all positions and opening new positions at the end of each month 67 | - Using the multi-factor model to generate the score (prediction of expected return) of each constituent stock 68 | - Sort the constituent stocks by their scores and separate them into 10 groups 69 | - Set a multiplier for each group to put more weight on high-score stocks and vice versa 70 | - Clean up if there's still weight remaining 71 | - Evenly separated into 10 parts to buy top 10 stocks 72 | - Record the weighted return 73 | * Backtesting 74 | - Get CSI 500 price information from Tushare 75 | - Check the active return: compare the return of the index and our enhanced indexing strategy 76 | - Cumulative Monthly Return 77 | 78 | - Noncumulative Monthly Return 79 | 80 | - Performance evaluation 81 | - Cumulative return of the testing period: 42.38% 82 | - Cumulative active return: 46.72% 83 | - Information ratio: 0.36 84 | 85 | ## Summary 86 | * Benefits: 87 | - The enhanced indexing strategy outperforms the benchmark index to a considerable extent in 7 of the 15 months, and trails the benchmark only slightly in the months when it underperforms 88 | - The model is meaningful on both statistical and economic aspects given the predictions are made on the basis of different categories of factors 89 
| - Since CSI 500 has a large sample size and the testing period is long, the multi-factor model tends to be more effective 90 | - An algorithmic model can be better than subjective judgment when dealing with stock selection in a large stock pool such as CSI 500 91 | * Limitations: 92 | - The model ignores trading costs, which will narrow the active return 93 | - The model ignores free-float market capitalization, which may lead to problems in stock trading 94 | - The model ignores the specific investment amount when using 1 as total weight, not taking liquidity needs and market impact into consideration 95 | - The OLS method might not be optimal given it has a low r-squared and the process of removing redundant factors is subjective, even though the slope coefficients are significant; using LASSO (Least Absolute Shrinkage and Selection Operator) instead might be better 96 | - The effectiveness of the model may change over time --------------------------------------------------------------------------------