├── .gitignore ├── APT_FammaMacbeth.ipynb ├── Alpha Trading Workflow.pdf ├── KalmanFilterIntro.ipynb ├── README.md ├── README.pdf ├── README_old.md ├── Step1_FactorPretest.ipynb ├── Step2_FactorsScreening-Copy1.ipynb ├── Step2_FactorsScreening.ipynb ├── Step3_FactorCombination_AdaBoost_Quantopian.ipynb ├── Step3_FactorCombination_AdaBoost_Quantopian_old.ipynb ├── Step3_FactorCombination_BarraKalmanFilter.ipynb ├── output └── factor_ic_analysis.csv ├── report ├── Alpha Trading Workflow.md ├── Corr_matrix_for_factor_ranks.png ├── Corr_matrix_for_raw_factors.png ├── Quantitative Strategy Workflow.pptx ├── Step3_FactorCombination_AdaBoost_Quantopian.html ├── adaboost_algorithm.png ├── corr_comparison_after_pca_analysis.png ├── mean_spearmans_rank_IC.png ├── mean_spearmans_rank_IC_absolute_value.png ├── rank_of_mean_spearmans_rank_IC_absolute_value.png ├── test_accuracy_bar.png ├── test_score_dist.png ├── train_accuracy_bar.png ├── train_score_dist.png └── train_score_dist2.png ├── rqdata_utils.py └── source ├── DownloadData.ipynb ├── DownloadData_bak.ipynb ├── FactorAnalysis.ipynb ├── FactorModeling.ipynb ├── FactorsScreening.ipynb ├── KalmanFilter.ipynb ├── MultiFactorModel.ipynb └── rqdata_utils.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Spesific files # 2 | ################### 3 | cn_*.csv 4 | *-checkpoint.ipynb 5 | 6 | # Spesific directors # 7 | # ################# 8 | .idea/ 9 | .ipython_checkpoints/ 10 | __pycache__/ 11 | 12 | # Backup # 13 | ################### 14 | *.pyo 15 | *.pyc 16 | *~ 17 | *.bak 18 | *.swp 19 | *# 20 | 21 | # Images # 22 | ################### 23 | #*.jpg 24 | *.gif 25 | #*.png 26 | *.svg 27 | *.ico 28 | 29 | # Compiled source # 30 | ################### 31 | *.com 32 | *.class 33 | *.dll 34 | *.exe 35 | *.o 36 | *.so 37 | 38 | # Packages # 39 | ############ 40 | # it's better to unpack these files and commit the raw source 41 | # git has its own built in compression methods 42 | 
*.7z 43 | *.dmg 44 | *.gz 45 | *.iso 46 | *.jar 47 | *.rar 48 | *.tar 49 | *.zip 50 | 51 | # Logs and databases # 52 | ###################### 53 | *.log 54 | *.sql 55 | *.sqlite 56 | 57 | # OS generated files # 58 | ###################### 59 | .DS_Store 60 | .DS_Store? 61 | ._* 62 | .Spotlight-V100 63 | .Trashes 64 | ehthumbs.db 65 | Thumbs.db 66 | -------------------------------------------------------------------------------- /APT_FammaMacbeth.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# APT model: Famma-Macbeth Regression" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": { 14 | "collapsed": true 15 | }, 16 | "outputs": [], 17 | "source": [ 18 | "from rqdata_utils import *\n", 19 | "import pandas\n", 20 | "import numpy as np\n", 21 | "import scipy as sp\n", 22 | "import alphalens as al\n", 23 | "%matplotlib inline" 24 | ] 25 | }, 26 | { 27 | "cell_type": "markdown", 28 | "metadata": {}, 29 | "source": [ 30 | "## Loading Data" 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": 2, 36 | "metadata": { 37 | "collapsed": true 38 | }, 39 | "outputs": [], 40 | "source": [ 41 | "price_df,instrument_df,equity_df = get_price_instrument_equity(\"cn_stock_price_2012_2018.csv\",\"cn_instrument_info_2012_2018.csv\",\"cn_equity_daily_2012_2018.csv\",\"sectorCode\")" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": 3, 47 | "metadata": { 48 | "collapsed": false 49 | }, 50 | "outputs": [ 51 | { 52 | "data": { 53 | "text/html": [ 54 | "
\n", 55 | "\n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | "
returnclosetotal_turnovervolumeweekmonthreport_quartermarket_capa_share_market_val_2cash_received_from_sales_of_goodspb_rationet_profitps_ratiosectorCode
dateorder_book_id
2012-01-04000001.XSHE-0.0275825.12242.275637e+0840894428.00.57750.4331NaNNaNNaNNaNNaNNaNNaNFinancials
000002.XSHE-0.0187426.05253.559891e+0847432958.00.37110.40302011q38.059489e+107.082120e+107.516785e+101.52164.106349e+090.8679Financials
000004.XSHE-0.0222507.91003.763833e+06465469.00.57200.75062011q36.642556e+086.634549e+085.949968e+078.81754.500363e+0637.5796HealthCare
000005.XSHE0.0000003.86000.000000e+000.00.00000.00002011q33.529328e+093.527048e+092.565851e+075.34801.365665e+07-347.2191Industrials
000006.XSHE-0.0097562.67667.619286e+062513811.00.14160.16672011q34.015370e+093.929464e+092.531436e+091.43482.763917e+081.4139Financials
\n", 183 | "
" 184 | ], 185 | "text/plain": [ 186 | " return close total_turnover volume \\\n", 187 | "date order_book_id \n", 188 | "2012-01-04 000001.XSHE -0.027582 5.1224 2.275637e+08 40894428.0 \n", 189 | " 000002.XSHE -0.018742 6.0525 3.559891e+08 47432958.0 \n", 190 | " 000004.XSHE -0.022250 7.9100 3.763833e+06 465469.0 \n", 191 | " 000005.XSHE 0.000000 3.8600 0.000000e+00 0.0 \n", 192 | " 000006.XSHE -0.009756 2.6766 7.619286e+06 2513811.0 \n", 193 | "\n", 194 | " week month report_quarter market_cap \\\n", 195 | "date order_book_id \n", 196 | "2012-01-04 000001.XSHE 0.5775 0.4331 NaN NaN \n", 197 | " 000002.XSHE 0.3711 0.4030 2011q3 8.059489e+10 \n", 198 | " 000004.XSHE 0.5720 0.7506 2011q3 6.642556e+08 \n", 199 | " 000005.XSHE 0.0000 0.0000 2011q3 3.529328e+09 \n", 200 | " 000006.XSHE 0.1416 0.1667 2011q3 4.015370e+09 \n", 201 | "\n", 202 | " a_share_market_val_2 \\\n", 203 | "date order_book_id \n", 204 | "2012-01-04 000001.XSHE NaN \n", 205 | " 000002.XSHE 7.082120e+10 \n", 206 | " 000004.XSHE 6.634549e+08 \n", 207 | " 000005.XSHE 3.527048e+09 \n", 208 | " 000006.XSHE 3.929464e+09 \n", 209 | "\n", 210 | " cash_received_from_sales_of_goods pb_ratio \\\n", 211 | "date order_book_id \n", 212 | "2012-01-04 000001.XSHE NaN NaN \n", 213 | " 000002.XSHE 7.516785e+10 1.5216 \n", 214 | " 000004.XSHE 5.949968e+07 8.8175 \n", 215 | " 000005.XSHE 2.565851e+07 5.3480 \n", 216 | " 000006.XSHE 2.531436e+09 1.4348 \n", 217 | "\n", 218 | " net_profit ps_ratio sectorCode \n", 219 | "date order_book_id \n", 220 | "2012-01-04 000001.XSHE NaN NaN Financials \n", 221 | " 000002.XSHE 4.106349e+09 0.8679 Financials \n", 222 | " 000004.XSHE 4.500363e+06 37.5796 HealthCare \n", 223 | " 000005.XSHE 1.365665e+07 -347.2191 Industrials \n", 224 | " 000006.XSHE 2.763917e+08 1.4139 Financials " 225 | ] 226 | }, 227 | "execution_count": 3, 228 | "metadata": {}, 229 | "output_type": "execute_result" 230 | } 231 | ], 232 | "source": [ 233 | "equity_df.head()" 234 | ] 235 | }, 236 | { 237 | 
"cell_type": "code", 238 | "execution_count": 4, 239 | "metadata": { 240 | "collapsed": false 241 | }, 242 | "outputs": [ 243 | { 244 | "data": { 245 | "text/plain": [ 246 | "164" 247 | ] 248 | }, 249 | "execution_count": 4, 250 | "metadata": {}, 251 | "output_type": "execute_result" 252 | } 253 | ], 254 | "source": [ 255 | "healthcareUniverse = instrument_df.index[instrument_df.sectorCode=='HealthCare'].values\n", 256 | "len(healthcareUniverse)" 257 | ] 258 | }, 259 | { 260 | "cell_type": "code", 261 | "execution_count": 5, 262 | "metadata": { 263 | "collapsed": true 264 | }, 265 | "outputs": [], 266 | "source": [ 267 | "def equity_universe_filtering(equity_df, universe):\n", 268 | " universeFilter = [book_id in set(universe) for book_id in equity_df.index.get_level_values(level=1).values]\n", 269 | " return equity_df[universeFilter]" 270 | ] 271 | }, 272 | { 273 | "cell_type": "code", 274 | "execution_count": 6, 275 | "metadata": { 276 | "collapsed": false 277 | }, 278 | "outputs": [ 279 | { 280 | "data": { 281 | "text/html": [ 282 | "
\n", 283 | "\n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | "
returnclosetotal_turnovervolumeweekmonthreport_quartermarket_capa_share_market_val_2cash_received_from_sales_of_goodspb_rationet_profitps_ratiosectorCode
dateorder_book_id
2012-01-04000004.XSHE-0.0222507.91003763832.88465469.00.57200.75062011q36.642556e+086.634549e+085.949968e+078.81754.500363e+0637.5796HealthCare
000028.XSHE-0.04543319.84229326924.28450553.00.42010.27222011q35.872485e+094.753820e+091.053298e+104.34932.481834e+080.3414HealthCare
000150.XSHE-0.0302953.17373109304.50952600.00.34600.36102011q31.036800e+091.036800e+094.913279e+071.47633.657858e+067.8956HealthCare
000153.XSHE-0.0280535.77009673054.491596020.00.68302.45942011q31.531454e+091.360856e+091.329425e+092.11691.560397e+070.7818HealthCare
000403.XSHE0.0000003.16250.000.00.00000.0000NaNNaNNaNNaNNaNNaNNaNHealthCare
\n", 411 | "
" 412 | ], 413 | "text/plain": [ 414 | " return close total_turnover volume \\\n", 415 | "date order_book_id \n", 416 | "2012-01-04 000004.XSHE -0.022250 7.9100 3763832.88 465469.0 \n", 417 | " 000028.XSHE -0.045433 19.8422 9326924.28 450553.0 \n", 418 | " 000150.XSHE -0.030295 3.1737 3109304.50 952600.0 \n", 419 | " 000153.XSHE -0.028053 5.7700 9673054.49 1596020.0 \n", 420 | " 000403.XSHE 0.000000 3.1625 0.00 0.0 \n", 421 | "\n", 422 | " week month report_quarter market_cap \\\n", 423 | "date order_book_id \n", 424 | "2012-01-04 000004.XSHE 0.5720 0.7506 2011q3 6.642556e+08 \n", 425 | " 000028.XSHE 0.4201 0.2722 2011q3 5.872485e+09 \n", 426 | " 000150.XSHE 0.3460 0.3610 2011q3 1.036800e+09 \n", 427 | " 000153.XSHE 0.6830 2.4594 2011q3 1.531454e+09 \n", 428 | " 000403.XSHE 0.0000 0.0000 NaN NaN \n", 429 | "\n", 430 | " a_share_market_val_2 \\\n", 431 | "date order_book_id \n", 432 | "2012-01-04 000004.XSHE 6.634549e+08 \n", 433 | " 000028.XSHE 4.753820e+09 \n", 434 | " 000150.XSHE 1.036800e+09 \n", 435 | " 000153.XSHE 1.360856e+09 \n", 436 | " 000403.XSHE NaN \n", 437 | "\n", 438 | " cash_received_from_sales_of_goods pb_ratio \\\n", 439 | "date order_book_id \n", 440 | "2012-01-04 000004.XSHE 5.949968e+07 8.8175 \n", 441 | " 000028.XSHE 1.053298e+10 4.3493 \n", 442 | " 000150.XSHE 4.913279e+07 1.4763 \n", 443 | " 000153.XSHE 1.329425e+09 2.1169 \n", 444 | " 000403.XSHE NaN NaN \n", 445 | "\n", 446 | " net_profit ps_ratio sectorCode \n", 447 | "date order_book_id \n", 448 | "2012-01-04 000004.XSHE 4.500363e+06 37.5796 HealthCare \n", 449 | " 000028.XSHE 2.481834e+08 0.3414 HealthCare \n", 450 | " 000150.XSHE 3.657858e+06 7.8956 HealthCare \n", 451 | " 000153.XSHE 1.560397e+07 0.7818 HealthCare \n", 452 | " 000403.XSHE NaN NaN HealthCare " 453 | ] 454 | }, 455 | "execution_count": 6, 456 | "metadata": {}, 457 | "output_type": "execute_result" 458 | } 459 | ], 460 | "source": [ 461 | "healthcare_equity_df = equity_universe_filtering(equity_df, healthcareUniverse)\n", 
462 | "healthcare_equity_df.head()" 463 | ] 464 | }, 465 | { 466 | "cell_type": "code", 467 | "execution_count": 7, 468 | "metadata": { 469 | "collapsed": false 470 | }, 471 | "outputs": [ 472 | { 473 | "name": "stdout", 474 | "output_type": "stream", 475 | "text": [ 476 | "universe ratio: 6.210331877919959%\n" 477 | ] 478 | } 479 | ], 480 | "source": [ 481 | "print(\"universe ratio: {}%\".format(len(healthcare_equity_df)/len(equity_df)*100))" 482 | ] 483 | }, 484 | { 485 | "cell_type": "markdown", 486 | "metadata": {}, 487 | "source": [ 488 | "### benchmark" 489 | ] 490 | }, 491 | { 492 | "cell_type": "code", 493 | "execution_count": 27, 494 | "metadata": { 495 | "collapsed": true 496 | }, 497 | "outputs": [], 498 | "source": [ 499 | "benchmark_df = pd.read_csv(\"cn_SH_healthcare_index_2012_2018.csv\",names=['date','value'])\n", 500 | "benchmark_df = benchmark_df.set_index('date',drop=True)" 501 | ] 502 | }, 503 | { 504 | "cell_type": "code", 505 | "execution_count": 33, 506 | "metadata": { 507 | "collapsed": false 508 | }, 509 | "outputs": [ 510 | { 511 | "data": { 512 | "text/html": [ 513 | "
\n", 514 | "\n", 515 | " \n", 516 | " \n", 517 | " \n", 518 | " \n", 519 | " \n", 520 | " \n", 521 | " \n", 522 | " \n", 523 | " \n", 524 | " \n", 525 | " \n", 526 | " \n", 527 | " \n", 528 | " \n", 529 | " \n", 530 | " \n", 531 | " \n", 532 | " \n", 533 | " \n", 534 | " \n", 535 | " \n", 536 | " \n", 537 | " \n", 538 | " \n", 539 | " \n", 540 | " \n", 541 | " \n", 542 | " \n", 543 | " \n", 544 | " \n", 545 | " \n", 546 | " \n", 547 | " \n", 548 | " \n", 549 | " \n", 550 | " \n", 551 | " \n", 552 | " \n", 553 | " \n", 554 | "
valuereturn
date
2012-01-042891.4620.000000
2012-01-052766.9550.044015
2012-01-062744.7930.008042
2012-01-092833.219-0.031708
2012-01-102929.594-0.033450
\n", 555 | "
" 556 | ], 557 | "text/plain": [ 558 | " value return\n", 559 | "date \n", 560 | "2012-01-04 2891.462 0.000000\n", 561 | "2012-01-05 2766.955 0.044015\n", 562 | "2012-01-06 2744.793 0.008042\n", 563 | "2012-01-09 2833.219 -0.031708\n", 564 | "2012-01-10 2929.594 -0.033450" 565 | ] 566 | }, 567 | "execution_count": 33, 568 | "metadata": {}, 569 | "output_type": "execute_result" 570 | } 571 | ], 572 | "source": [ 573 | "benchmark_df['return'] = np.log(benchmark_df.shift(1)/benchmark_df).fillna(0)\n", 574 | "benchmark_df.head()" 575 | ] 576 | }, 577 | { 578 | "cell_type": "markdown", 579 | "metadata": {}, 580 | "source": [ 581 | "## Factor Returns" 582 | ] 583 | }, 584 | { 585 | "cell_type": "code", 586 | "execution_count": 8, 587 | "metadata": { 588 | "collapsed": true 589 | }, 590 | "outputs": [], 591 | "source": [ 592 | "def equity_factor_return(equity_df, factorColumn, nAllocations, longTop=True):\n", 593 | " equity_copy = equity_df.copy()\n", 594 | "# equity_copy[\"{}_rank\".format(factorColumn)] = equity_copy.groupby(level='date')[factorColumn].rank()\n", 595 | "# equity_copy[equity_copy.groupby(level='date')[factorColumn].nlargest(nAllocations).index][\"biggest_{}_{}\".format(nAllocations,factorColumn)]=True\n", 596 | " largest = equity_copy[factorColumn].groupby(level='date').nlargest(nAllocations).reset_index(level=0,drop=True)\n", 597 | " smallest = equity_copy[factorColumn].groupby(level='date').nsmallest(nAllocations).reset_index(level=0,drop=True)\n", 598 | " r_largest = equity_copy.loc[largest.index,'return'].groupby(level='date').mean()\n", 599 | " r_smallest = equity_copy.loc[smallest.index,'return'].groupby(level='date').mean()\n", 600 | " LMS = r_largest - r_smallest\n", 601 | " if(longTop):\n", 602 | " return LMS\n", 603 | " else:\n", 604 | " return -LMS" 605 | ] 606 | }, 607 | { 608 | "cell_type": "code", 609 | "execution_count": 9, 610 | "metadata": { 611 | "collapsed": false 612 | }, 613 | "outputs": [ 614 | { 615 | "data": { 616 | "text/plain": 
[ 617 | "date\n", 618 | "2012-01-04 0.005983\n", 619 | "2012-01-05 -0.009098\n", 620 | "2012-01-06 -0.004155\n", 621 | "2012-01-09 0.014615\n", 622 | "2012-01-10 0.006728\n", 623 | "Name: return, dtype: float64" 624 | ] 625 | }, 626 | "execution_count": 9, 627 | "metadata": {}, 628 | "output_type": "execute_result" 629 | } 630 | ], 631 | "source": [ 632 | "SMB = equity_factor_return(healthcare_equity_df, 'market_cap', 20,longTop=False)\n", 633 | "SMB.head()" 634 | ] 635 | }, 636 | { 637 | "cell_type": "code", 638 | "execution_count": 10, 639 | "metadata": { 640 | "collapsed": false 641 | }, 642 | "outputs": [ 643 | { 644 | "data": { 645 | "text/plain": [ 646 | "date\n", 647 | "2012-01-04 0.005302\n", 648 | "2012-01-05 -0.007223\n", 649 | "2012-01-06 0.006031\n", 650 | "2012-01-09 -0.002597\n", 651 | "2012-01-10 -0.010780\n", 652 | "Name: return, dtype: float64" 653 | ] 654 | }, 655 | "execution_count": 10, 656 | "metadata": {}, 657 | "output_type": "execute_result" 658 | } 659 | ], 660 | "source": [ 661 | "HML = equity_factor_return(healthcare_equity_df, 'pb_ratio', 20,longTop=True)\n", 662 | "HML.head()" 663 | ] 664 | }, 665 | { 666 | "cell_type": "code", 667 | "execution_count": 11, 668 | "metadata": { 669 | "collapsed": true 670 | }, 671 | "outputs": [], 672 | "source": [ 673 | "import itertools\n", 674 | "import statsmodels.api as sm\n", 675 | "from statsmodels import regression,stats\n", 676 | "import scipy\n", 677 | "\n", 678 | "data = healthcare_equity_df[['return']] # dataframe\n", 679 | "data = data.set_index(healthcare_equity_df.index) # elimilate redundant index (whole universe)\n", 680 | "asset_list_sizes = [group[1].size for group in data.groupby(level=0)]\n", 681 | "\n", 682 | "# Spreading the factor portfolio data across all assets for each day\n", 683 | "SMB_column = [[SMB.loc[group[0]]] * size for group, size \\\n", 684 | " in zip(data.groupby(level=0), asset_list_sizes)]\n", 685 | "data['SMB'] = list(itertools.chain(*SMB_column))\n", 686 | "\n", 
687 | "HML_column = [[HML.loc[group[0]]] * size for group, size \\\n", 688 | " in zip(data.groupby(level=0), asset_list_sizes)]\n", 689 | "data['HML'] = list(itertools.chain(*HML_column))\n", 690 | "data = sm.add_constant(data.dropna())" 691 | ] 692 | }, 693 | { 694 | "cell_type": "code", 695 | "execution_count": 12, 696 | "metadata": { 697 | "collapsed": false 698 | }, 699 | "outputs": [ 700 | { 701 | "data": { 702 | "text/html": [ 703 | "
\n", 704 | "\n", 705 | " \n", 706 | " \n", 707 | " \n", 708 | " \n", 709 | " \n", 710 | " \n", 711 | " \n", 712 | " \n", 713 | " \n", 714 | " \n", 715 | " \n", 716 | " \n", 717 | " \n", 718 | " \n", 719 | " \n", 720 | " \n", 721 | " \n", 722 | " \n", 723 | " \n", 724 | " \n", 725 | " \n", 726 | " \n", 727 | " \n", 728 | " \n", 729 | " \n", 730 | " \n", 731 | " \n", 732 | " \n", 733 | " \n", 734 | " \n", 735 | " \n", 736 | " \n", 737 | " \n", 738 | " \n", 739 | " \n", 740 | " \n", 741 | " \n", 742 | " \n", 743 | " \n", 744 | " \n", 745 | " \n", 746 | " \n", 747 | " \n", 748 | " \n", 749 | " \n", 750 | " \n", 751 | " \n", 752 | " \n", 753 | " \n", 754 | " \n", 755 | " \n", 756 | " \n", 757 | " \n", 758 | " \n", 759 | " \n", 760 | " \n", 761 | "
constreturnSMBHML
dateorder_book_id
2012-01-04000004.XSHE1.0-0.0222500.0059830.005302
000028.XSHE1.0-0.0454330.0059830.005302
000150.XSHE1.0-0.0302950.0059830.005302
000153.XSHE1.0-0.0280530.0059830.005302
000403.XSHE1.00.0000000.0059830.005302
\n", 762 | "
" 763 | ], 764 | "text/plain": [ 765 | " const return SMB HML\n", 766 | "date order_book_id \n", 767 | "2012-01-04 000004.XSHE 1.0 -0.022250 0.005983 0.005302\n", 768 | " 000028.XSHE 1.0 -0.045433 0.005983 0.005302\n", 769 | " 000150.XSHE 1.0 -0.030295 0.005983 0.005302\n", 770 | " 000153.XSHE 1.0 -0.028053 0.005983 0.005302\n", 771 | " 000403.XSHE 1.0 0.000000 0.005983 0.005302" 772 | ] 773 | }, 774 | "execution_count": 12, 775 | "metadata": {}, 776 | "output_type": "execute_result" 777 | } 778 | ], 779 | "source": [ 780 | "data.head()" 781 | ] 782 | }, 783 | { 784 | "cell_type": "markdown", 785 | "metadata": {}, 786 | "source": [ 787 | "## Factor Exposures ($\\beta$)" 788 | ] 789 | }, 790 | { 791 | "cell_type": "code", 792 | "execution_count": 13, 793 | "metadata": { 794 | "collapsed": true 795 | }, 796 | "outputs": [], 797 | "source": [ 798 | "assets = data.index.levels[1].unique()\n", 799 | "Y = [data.xs(asset,level=1)['return'] for asset in assets]\n", 800 | "X = [data.xs(asset,level=1)[['SMB','HML','const']] for asset in assets]\n", 801 | "reg_results = [regression.linear_model.OLS(y,x).fit().params for y,x in zip(Y,X) if not(x.empty or y.empty)]\n", 802 | "indices = [asset for y, x, asset in zip(Y, X, assets) if not(x.empty or y.empty)]\n", 803 | "betas = pd.DataFrame(reg_results, index=indices)" 804 | ] 805 | }, 806 | { 807 | "cell_type": "code", 808 | "execution_count": 15, 809 | "metadata": { 810 | "collapsed": false 811 | }, 812 | "outputs": [ 813 | { 814 | "data": { 815 | "text/html": [ 816 | "
\n", 817 | "\n", 818 | " \n", 819 | " \n", 820 | " \n", 821 | " \n", 822 | " \n", 823 | " \n", 824 | " \n", 825 | " \n", 826 | " \n", 827 | " \n", 828 | " \n", 829 | " \n", 830 | " \n", 831 | " \n", 832 | " \n", 833 | " \n", 834 | " \n", 835 | " \n", 836 | " \n", 837 | " \n", 838 | " \n", 839 | " \n", 840 | " \n", 841 | " \n", 842 | " \n", 843 | " \n", 844 | " \n", 845 | " \n", 846 | " \n", 847 | " \n", 848 | " \n", 849 | " \n", 850 | " \n", 851 | " \n", 852 | " \n", 853 | " \n", 854 | " \n", 855 | " \n", 856 | " \n", 857 | " \n", 858 | "
SMBHMLconst
000004.XSHE0.8839060.0487570.002002
000028.XSHE-0.003029-0.0642950.001073
000150.XSHE0.3541220.0660710.002031
000153.XSHE0.620706-0.0822290.001405
000403.XSHE2.03219211.457418-0.017412
\n", 859 | "
" 860 | ], 861 | "text/plain": [ 862 | " SMB HML const\n", 863 | "000004.XSHE 0.883906 0.048757 0.002002\n", 864 | "000028.XSHE -0.003029 -0.064295 0.001073\n", 865 | "000150.XSHE 0.354122 0.066071 0.002031\n", 866 | "000153.XSHE 0.620706 -0.082229 0.001405\n", 867 | "000403.XSHE 2.032192 11.457418 -0.017412" 868 | ] 869 | }, 870 | "execution_count": 15, 871 | "metadata": {}, 872 | "output_type": "execute_result" 873 | } 874 | ], 875 | "source": [ 876 | "betas.head()" 877 | ] 878 | }, 879 | { 880 | "cell_type": "markdown", 881 | "metadata": {}, 882 | "source": [ 883 | "## Factor Premium" 884 | ] 885 | }, 886 | { 887 | "cell_type": "code", 888 | "execution_count": 36, 889 | "metadata": { 890 | "collapsed": false 891 | }, 892 | "outputs": [ 893 | { 894 | "data": { 895 | "text/html": [ 896 | "\n", 897 | "\n", 898 | "\n", 899 | " \n", 900 | "\n", 901 | "\n", 902 | " \n", 903 | "\n", 904 | "\n", 905 | " \n", 906 | "\n", 907 | "\n", 908 | " \n", 909 | "\n", 910 | "\n", 911 | " \n", 912 | "\n", 913 | "\n", 914 | " \n", 915 | "\n", 916 | "\n", 917 | " \n", 918 | "\n", 919 | "\n", 920 | " \n", 921 | "\n", 922 | "\n", 923 | " \n", 924 | "\n", 925 | "
OLS Regression Results
Dep. Variable: return R-squared: 0.398
Model: OLS Adj. R-squared: 0.391
Method: Least Squares F-statistic: 53.26
Date: Sat, 05 May 2018 Prob (F-statistic): 1.77e-18
Time: 21:03:25 Log-Likelihood: 1012.1
No. Observations: 164 AIC: -2018.
Df Residuals: 161 BIC: -2009.
Df Model: 2
Covariance Type: nonrobust
\n", 926 | "\n", 927 | "\n", 928 | " \n", 929 | "\n", 930 | "\n", 931 | " \n", 932 | "\n", 933 | "\n", 934 | " \n", 935 | "\n", 936 | "\n", 937 | " \n", 938 | "\n", 939 | "
coef std err t P>|t| [0.025 0.975]
const 0.0017 6.72e-05 24.956 0.000 0.002 0.002
SMB -7.597e-05 0.000 -0.599 0.550 -0.000 0.000
HML 0.0005 4.81e-05 9.695 0.000 0.000 0.001
\n", 940 | "\n", 941 | "\n", 942 | " \n", 943 | "\n", 944 | "\n", 945 | " \n", 946 | "\n", 947 | "\n", 948 | " \n", 949 | "\n", 950 | "\n", 951 | " \n", 952 | "\n", 953 | "
Omnibus: 39.154 Durbin-Watson: 1.906
Prob(Omnibus): 0.000 Jarque-Bera (JB): 78.545
Skew: 1.087 Prob(JB): 8.80e-18
Kurtosis: 5.601 Cond. No. 3.92
" 954 | ], 955 | "text/plain": [ 956 | "\n", 957 | "\"\"\"\n", 958 | " OLS Regression Results \n", 959 | "==============================================================================\n", 960 | "Dep. Variable: return R-squared: 0.398\n", 961 | "Model: OLS Adj. R-squared: 0.391\n", 962 | "Method: Least Squares F-statistic: 53.26\n", 963 | "Date: Sat, 05 May 2018 Prob (F-statistic): 1.77e-18\n", 964 | "Time: 21:03:25 Log-Likelihood: 1012.1\n", 965 | "No. Observations: 164 AIC: -2018.\n", 966 | "Df Residuals: 161 BIC: -2009.\n", 967 | "Df Model: 2 \n", 968 | "Covariance Type: nonrobust \n", 969 | "==============================================================================\n", 970 | " coef std err t P>|t| [0.025 0.975]\n", 971 | "------------------------------------------------------------------------------\n", 972 | "const 0.0017 6.72e-05 24.956 0.000 0.002 0.002\n", 973 | "SMB -7.597e-05 0.000 -0.599 0.550 -0.000 0.000\n", 974 | "HML 0.0005 4.81e-05 9.695 0.000 0.000 0.001\n", 975 | "==============================================================================\n", 976 | "Omnibus: 39.154 Durbin-Watson: 1.906\n", 977 | "Prob(Omnibus): 0.000 Jarque-Bera (JB): 78.545\n", 978 | "Skew: 1.087 Prob(JB): 8.80e-18\n", 979 | "Kurtosis: 5.601 Cond. No. 
3.92\n", 980 | "==============================================================================\n", 981 | "\n", 982 | "Warnings:\n", 983 | "[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.\n", 984 | "\"\"\"" 985 | ] 986 | }, 987 | "execution_count": 36, 988 | "metadata": {}, 989 | "output_type": "execute_result" 990 | } 991 | ], 992 | "source": [ 993 | "betas = sm.add_constant(betas.drop('const', axis=1))\n", 994 | "\n", 995 | "R = data['return'].mean(axis=0, level=1)\n", 996 | "\n", 997 | "# Second regression step: estimating the risk premia\n", 998 | "risk_free_rate = benchmark_df['return'].mean()\n", 999 | "\n", 1000 | "final_results = regression.linear_model.OLS(R - risk_free_rate, betas).fit()\n", 1001 | "\n", 1002 | "final_results.summary()" 1003 | ] 1004 | }, 1005 | { 1006 | "cell_type": "markdown", 1007 | "metadata": {}, 1008 | "source": [ 1009 | "## Fama-Macbeth Test Conclusion: \n", 1010 | "although our individual factors are significant, we have a very low $R^2$ . What this may suggest is that there is a real link between our factors and the returns of our assets, but that there still remains a lot of unexplained noise!" 
1011 | ] 1012 | }, 1013 | { 1014 | "cell_type": "code", 1015 | "execution_count": null, 1016 | "metadata": { 1017 | "collapsed": true 1018 | }, 1019 | "outputs": [], 1020 | "source": [] 1021 | } 1022 | ], 1023 | "metadata": { 1024 | "kernelspec": { 1025 | "display_name": "Python 3", 1026 | "language": "python", 1027 | "name": "python3" 1028 | }, 1029 | "language_info": { 1030 | "codemirror_mode": { 1031 | "name": "ipython", 1032 | "version": 3 1033 | }, 1034 | "file_extension": ".py", 1035 | "mimetype": "text/x-python", 1036 | "name": "python", 1037 | "nbconvert_exporter": "python", 1038 | "pygments_lexer": "ipython3", 1039 | "version": "3.5.2" 1040 | } 1041 | }, 1042 | "nbformat": 4, 1043 | "nbformat_minor": 2 1044 | } 1045 | -------------------------------------------------------------------------------- /Alpha Trading Workflow.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jerryxyx/AlphaTrading/5e73923786297faeadb27c76f83ec81fad74af51/Alpha Trading Workflow.pdf -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Multi-Factor Models 2 | 3 | Author: Jerry Xia 4 | 5 | Date: 2018/07/27 6 | 7 | *Note: The advanced Marckdown features such as math expression may not be compatible in GitHub, please see README.pdf instead if you want more details* 8 | 9 | 10 | 11 | ## Project Introduction 12 | This is a research survey about alpha trading. In this project, I built up a pipeline of alpha trading including: 13 | 14 | * factor pretest 15 | * factor screening 16 | * factor combination (modeling) 17 | 18 | The models involed are APT models, Barra's risk models and dynamic factors model using Kalman filter. 
19 | 20 | ### Files 21 | 22 | * rqdata_utils.py: Utils dealing with the RiceQuant platform data 23 | 24 | * Step1_FactorPretest.ipynb: Factor returns profile visualization 25 | 26 | * Step2_FactorsScreening.ipynb: Factor returns turnover visualization and correlation coefficients 27 | 28 | * Step3\_FactorCombination\_AdaBoost\_Quantopian.ipynb: A Quantopian notebook file to combine alpha factors using AdaBoost 29 | 30 | * Step3\_FactorCombination\_BarraKalmanFilter.ipynb: Barra's risk model with three calibration schemes: 31 | * Scheme 1: Cross-sectional regression and weighted average 32 | * Scheme 2: Optimization problem: minimize the exponentially weighted average of squared error 33 | * Scheme 3: Dynamic linear model using Kalman filter 34 | 35 | * KalmanFilterIntro.ipynb: An introduction to the dynamic multi-factor model 36 | * APT_FammaMacbeth.ipynb: Using Fama-MacBeth regression to calibrate the APT model. 37 | 38 | ### Dataset 39 | The dataset is not available in GitHub as it is too large. Except for Step3\_FactorCombination\_AdaBoost\_Quantopian.ipynb, which uses US stock data in Quantopian, the other files use Chinese A-share data downloaded from RiceQuant instead (free US equity data is hard to obtain). 40 | 41 | The data frame is multi-indexed, similar to Quantopian's format (see both the Alphalens GitHub code and rqdata_utils.py). However, feel free to cast and apply your own dataset. 42 | 43 | 44 | ## TODO 45 | 46 | * Input more effective factors: take advice from people and industry reports 47 | * Should add technical analysis, because it matters! People care about it, which makes it a good sentiment index. 
48 | * Find well-known metrics to express results 49 | 50 | ## Workflow 51 | $\checkmark$ stands for finished and $\vartriangle$ stands for TODO 52 | 53 | * Universe definition 54 | * Factors collection and preprocessing 55 | * $\vartriangle$Factors collection 56 | - Sources 57 | - balance sheet 58 | - cash flow statement 59 | - income statement 60 | - earning report 61 | - Econometric Classifications 62 | - value 63 | - growth 64 | - profitability 65 | - market size 66 | - liquidity 67 | - volatility 68 | - Momentum 69 | - Financial leverage (debt-to-equity ratio) 70 | * Factors preprocessing 71 | - $\vartriangle$daily, quarterly, annually 72 | - continuous: rescale, outliers 73 | - $\checkmark$discrete: rank 74 | * Factors screening and combination 75 | * Factors screening 76 | - $\checkmark$Factors' correlation 77 | - $\checkmark$Factors' foreseeability 78 | - Fama-MacBeth regression 79 | * $\vartriangle$Factors combination 80 | - PCA, FA 81 | - Technical Analysis 82 | - Financial Modeling 83 | - $\checkmark$APT model 84 | - $\checkmark$Barra's risk model 85 | - $\checkmark$Dynamic multi-factors model 86 | - Linear combination to maximize Sharpe ratio 87 | - Non-linear learning algorithms 88 | - $\checkmark$AdaBoost 89 | - Reinforcement learning 90 | 91 | * Portfolio allocation 92 | 93 | 94 | ## Factors' Correlations 95 | Here, I use the correlation matrix as the measure. The difference from the second result is that the correlation matrix is calculated from the rank data rather than the raw data 96 | ### Two ICs comparison 97 | * Pearson's IC: measures linear relationship between components 98 | 99 | * Spearman's IC: measures monotonic relationship between components. Since we only care about monotonic relationships, Spearman's IC wins. 
100 | 101 | 102 | ### Regular IC(Pearson's correlation coefficient) for each factors 103 | ![](report/Corr_matrix_for_raw_factors.png) 104 | ### Spearman's Rank correlation coefficient for each factors 105 | ![](report/Corr_matrix_for_factor_ranks.png) 106 | 107 | ### How to rule out redundant factors and why Spearman's rank correlation coefficients? 108 | From the correlation coefficients below, we can again conclude that Spearman's rank IC is far more robust. Take ps_ratio and sales_yield as an example. 109 | $$ps\_ratio = \frac{\mbox{adjusted close price}}{\mbox{sales per share}}$$ 110 | whereas 111 | $$sales\_yield = \frac{\mbox{sales per share}}{\mbox{price}}$$ 112 | Although the price in the sales_yield formula is vague in our data source, we can see that, roughly speaking, these two variables should be inverses of each other. The Spearman's rank correlation coefficient is -0.98, which verifies this statement, and we should avoid using both of these factors, which would exaggerate the impact of this particular factor. However, we cannot see such identity in Pearson's regular correlation coefficients. It's quite misleading actually, and that's why we choose Spearman's rank IC. 113 | 114 | ## Factors' Foreseeability 115 | 116 | ### Methods 117 | * Spearman's rank correlation coefficients 118 | * Fama-Macbeth regression: Considers not only the foreseeability of each factor itself but also the co-variation of different factors, which means ruling out factors if their returns can be explained by the recent factors. 119 | 120 | 121 | ### Spearman's rank IC for factors vs. forward returns 122 | 123 | ![](report/mean_spearmans_rank_IC.png) 124 | 125 | ### Spearman's rank IC (absolute value) for factors vs. forward returns 126 | ![](report/mean_spearmans_rank_IC_absolute_value.png) 127 | 128 | ### Rank of the Spearman's rank IC (absolute value) for factors vs. 
forward returns 129 | ![](report/rank_of_mean_spearmans_rank_IC_absolute_value.png) 130 | 131 | ## Factors Preprocessing 132 | * Get ranked data 133 | * Obtain the valid stocks set 134 | * Reshape the data: only valid stocks set 135 | * Fill null: using daily average 136 | * Rescale the data: MinMaxScaler 137 | * Variable reduction: PCA analysis 138 | * Sanity check 139 | 140 | ![](report/corr_comparison_after_pca_analysis.png) 141 | 142 | Here, I use principal component analysis because it can bring two benefits to our data - orthogonality and dimensionality reduction. Orthogonality makes the data more separable; lower dimensionality makes the information more concentrated. Either of them is essential for machine learning algorithms. 143 | 144 | In the next part, I used this preprocessed data as the input to obtain a "mega alpha". 145 | 146 | ## Mega Alpha 147 | Construct an aggregate alpha factor whose return distribution is profitable. The term "profitable" here means condensed, little turnover, and significantly positive returns. 148 | ### Methods 149 | #### linear methods 150 | * normalize factors and try a linear combination 151 | * rank each factor and then sum up 152 | * Financial modeling: **See the appendix and Step3\_FactorCombination\_BarraKalmanFilter.ipynb** 153 | * linear combination to maximize Sharpe ratio 154 | 155 | #### Non-linear methods 156 | * AdaBoost: **See Step3\_FactorCombination\_AdaBoost\_Quantopian.ipynb** 157 | * Reinforcement Learning 158 | 159 | 160 | Here we only introduce the AdaBoost algorithm in this documentation. For more details about the linear models, please see the appendix and Step3\_FactorCombination\_BarraKalmanFilter.ipynb. 161 | 162 | ### AdaBoost 163 | #### Description 164 | The algorithm sequentially applies a weak classifier to modified versions of the data. By increasing the weights of the misclassified observations, each weak learner focuses on the error of the previous one. 
The predictions are aggregated through a weighted majority vote. 165 | 166 | #### Algorithm 167 | 168 | ![](report/adaboost_algorithm.png) 169 | 170 | #### Train set 171 | The adaboost classifier was applied to our fundamental dataset. The objective is to train a classifier which give a score for the bunch of factors. Or in other word, the mega alpha. Pink for the positive forward returns observations and blue for the negative forward returns observations. A good score system is to make the two classes more separated. 172 | ![](report/train_score_dist.png) 173 | We can see, in train set, AdaBoost classifier did so well! The next plot is the precision in each quantile of scores. In the top and bottom quantile, the predicted precision is nearly 100%! 174 | ![](report/train_accuracy_bar.png) 175 | 176 | #### Test set 177 | alpha values histogram 178 | ![](report/test_score_dist.png) 179 | quantile precision bar plot 180 | ![](report/test_accuracy_bar.png) 181 | The precision in the top and bottom quantile is only slightly higher than 50%. Far from good if we considered transaction cost. 182 | 183 | So, I added some technical analysis factors to see if we can tackle this problem. 184 | ![](report/train_score_dist2.png) 185 | Surprisingly, even the average accuracy in test set is about 67%. What if we only trade the extreme quantile? That is around 80% accuracy! It literally shows that technical factors are really important in US stock market and can be used to find arbitrage opportunity. 186 | 187 | ## References 188 | * Jonathan Larkin, *A Professional Quant Equity Workflow*. August 31, 2016 189 | * *A Practitioner‘s Guide to Factor Models*. 
The Research Foundation of The Institute of Chartered Financial Analysts 190 | * Thomas Wiecki, Machine Learning on Quantopian 191 | * Inigo Fraser Jenkins, *Using factors with different alpha decay times: The case for non-linear combination*  192 | * PNC, *Factor Analysis: What Drives Performance?* 193 | * O’Shaughnessy, *Alpha or Assets? — Factor Alpha vs. Smart Beta*. April 2016 194 | * *O’Shaughnessy Quarterly Investor Letter Q1 2018*  195 | * Jiantao Zhu, Orient Securities, *Alpha Forecasting - Factor-Based Strategy Research Series 13* 196 | * Yang Song, Bohai Securities, *Multi-Factor Models Research: Single Factor Testing*, 2017/10/11 197 | 198 | 199 | ## Appendix: Notes on Factor Models 200 | 201 | ### CAPM 202 | * Author: Markovitz(1959) 203 | * single-factor: 204 | * explain: security returns 205 | 206 | ### APT 207 | * Author: Stephen A. Ross(1976) 208 | * multi-factor 209 | * explain: security returns 210 | 211 | #### Postulates: 212 | - The linear model 213 | $$r_i(t) - \alpha_i = \sum_{k=1}^K \beta_{ik} \cdot f_k(t) + \epsilon_i(t)$$ 214 | 215 | where $f_k(t)$ is the realization(value) of risk factor at time t 216 | 217 | - No pure arbitrage profit 218 | 219 | #### Conclusion 220 | * Exposure of each security on each factor 221 | * Risk premium on each factor 222 | $$(Mean[r_i(t)])_i = P_0 + \sum_{k=1}^K \beta_{ik} \cdot P_k$$ 223 | or make $\beta_{0,k}$ equals 1 for each k, 224 | $$(Mean[r_i(t)])_i = \sum_{k=0}^K \bar{\beta}_{i,k} \cdot P_k$$ 225 | where $P_0$ is the risk free return 226 | 227 | * Portfolio exposure to each factor 228 | $$Portfolio_{it} = \beta_0 + \beta_k \cdot f_{kit}$$ 229 | 230 | 231 | 232 | #### Three alternative calibration methods 233 | * **statistical techniques** such as factor analysis, principle analysis 234 | - **Goodness**: good for determining the number of relevent risk factors 235 | - **Undesirable**: hard to interpret 236 | 237 | * **portfolios**: K different well-diversified portfolios as substitutions 238 | - 
**Goodness**: lead to insights 239 | - **Fama-Macbeth regression** 240 | 241 | * **economic theory** (highly developed art) 242 | - **Goodness**: Intuitively appealing set of factors that admit economic interpretation of risk exposures 243 | - **Goodness**: Using economic information in addition to stock return. Avoid using stock return to explain stock return 244 | - **factors**: 245 | 1. confidence risk 246 | 2. time horizon risk 247 | 3. inflation risk 248 | 4. bussiness cycle risk 249 | 5. market-timing risk 250 | 251 | #### Generalizations 252 | The simplicity of APT framework is a great virtue. It is helpful to understand the true sources of stock returns. The basic APT model can be enhanced in many ways. 253 | 254 | * Allow risk prices $P_k$ to vary over time 255 | * Allow risk exposures $\beta_{i,k}$ to vary over time 256 | * Use Bayesian mothods to produce optimal out-of-sample forcasts for the risk exposures and hence for the expected returns 257 | * Introduce additional factor with zero-risk prices. Although do not contribute to expected return, help to explain the volatility. 258 | 259 | ### Multi-Index Models (Factor Analysis & PCA) 260 | 261 | #### Goal 262 | Using historical return extract the factors 263 | 264 | $$r_{it} = \alpha_i + \sum_k \beta_{ik}\cdot f_{kt}$$ 265 | where 266 | $$E[\epsilon_{it} \epsilon_{jt}]=0$$ 267 | $$E[\epsilon_{it} f_{kt}]=0$$ 268 | 269 | $f_{kt}$: the return on index k inperiod t 270 | 271 | $\beta$: sensitivities 272 | 273 | #### Estimation 274 | Either exposure or factor return can be asserted on a priori grounds with the other identified empirically, or both can be identified empirically. 
275 | 276 | #### Characteristics 277 | * Have f(indexes) represents separate influence 278 | * The structure must be parsimonious: the returns can be described in terms of limited indexes 279 | 280 | #### Statistical Solutions 281 | Let the data design the model 282 | 283 | * PCA 284 | * Factor Analysis: better in heteroscedastic series 285 | 286 | #### Design Issue 287 | * **The Choice of Data**: Individul stocks vs portfolio 288 | * **The number of Index**: 289 | - Stactical techniques: Factor analysis, PCA 290 | - Common sense and economic significance play a major role in deciding on the number of factors 291 | * **The nonuniqueness of Factors**: The researcher should realize the resulting structure is not unique. Some researchers will examine alternative structures in an atempt to understand what influences are affecting security returns and to convince themself the overall separation make an intuitive sense 292 | * **Computational Problems**: 293 | - Roll and Ross: Multisample approach 294 | - Chen: Portfolio approach 295 | 296 | #### Applications 297 | * **Identify the Indexes set** 298 | * **Determine the number of factors**: PCA / Factor Analysis 299 | - Single-group tests for each sample 300 | - Factor Analysis on return-generating process 301 | - Criteria: Chi2, AIC, **BIC** 302 | - Multiple-group tests for all stocks 303 | - Canonical Correlation (CCA): 304 | 305 | take two sets of variables and see what is common amongst the two sets (can be two noncorresponding variables either on index or dimension) 306 | $$X_{N \times K}, Y_{N \times K^{\prime}}$$ 307 | $$\mbox{x_weights}_{K,n}$$ 308 | $$\mbox{y_weights}_{K^{\prime},n}$$ 309 | Use CCA / PLS: 310 | $$\mbox{X_score}_{N\times n} = \mbox{Normalized}[X]_{N \times K} \mbox{x_weights}_{K,n}$$ 311 | 312 | $$\mbox{Y_score}_{N\times n} = \mbox{Normalized}[Y]_{N \times K^{\prime}} \mbox{y_weights}_{K^{\prime},n}$$ 313 | - Determin the number: 314 | - r-value for $n=10$ 315 | - correlation matrix pattern for 
each number of components: $n \times n$ for $n=1,\cdots,10$ 316 | 317 | * **Generate Factors** 318 | 319 | * **Calibrate sensitivities**: 320 | 321 | - Portfolio exposure to each factor 322 | - $Adjusted R^2$ (Should be stable) 323 | - Explanatory power: Compare these results with those for the single-index model (Should depend on the market cap) 324 | 325 | * **Explanatory Power** of the Model for Each Stock: R2>0.7 excellent 326 | 327 | #### Conclusions 328 | * Goodness: simultaneously estimate the indexes and sensitivities in a multi-index model 329 | * Defect: Data Minning: Using return to explain return 330 | 331 | 332 | ### Multi-Factor Models for Portfolio Risk (BARRA) 333 | 334 | $$r_{i,t} = a_{i,t} + X_{i,k,t} \cdot f_{k,t}$$ 335 | where 336 | $X_{i,k,t}$: the exposure of asset i to factor k known at time t 337 | $f_{k,t}$: the factor return to factor k during the period from time $t$ to time $t+1$ 338 | $a_{i,t}$: the stock i's specific return during period from time $t$ to time $t+1$ 339 | $r_{i,t}$: the excess return (return above the risk-free return) on stock i during the period from time $t$ to time $t+1$ 340 | 341 | The risk structure 342 | $$V_{i,j} = X_{i,k1} F_{k1,k2} X_{j,k2}^T + \Delta_{i,j}$$ 343 | $$V = X^T F X + \Delta$$ 344 | where 345 | 346 | $F_{k1,k2}$ is the K by K covariance matrix for factor returns 347 | 348 | $\Delta_{i,j}$ is the N by N diagonal matrix of specific variance 349 | 350 | A portfolio described by an N-element vector $h_i$ 351 | 352 | * portfolio exposure: $x_p = X^T h_p$ 353 | * portfolio variance: $\sigma_p^2 = x_p^T F x_p + h_p^T \Delta h_p = h_p^T V h_p$ 354 | * Marginal Contribution for Total Risk 355 | $$MCTR = \frac{V h_p}{\sigma_p}$$ 356 | * Risk-adjusted expected return: 357 | $$U = h_p^T r_p - \lambda\cdot h_p^T V h_p$$ 358 | 359 | 360 | #### Choosing the Factors 361 | * External influences --> BARRA Model 362 | - Return in bond market (bond beta) 363 | - Unexpected changes in inflation 364 | - Change in oil 
price 365 | - Change in exchange rate 366 | * Cross-sectional comparisons 367 | - Fundamental 368 | - Market 369 | - volatility 370 | - price 371 | - share turnover 372 | * Purely internal or statistical factors 373 | - see multi-index model 374 | 375 | #### Exposures 376 | * Industry Exposures 377 | - 1/0 variable 378 | * Risk Index Exposures 379 | - Volatility: beta, daily return vol, option implied vol 380 | - Momentum 381 | - Size 382 | - Liquidity 383 | - Growth 384 | - Value(Fundamentals) 385 | - Earning volatility 386 | - Financial leverage: debt-to-equity ratios 387 | 388 | #### Applications 389 | * Rescale the Exposures 390 | * Regress the Factor Returns Against Exposures via Cross-sectional Regression 391 | $$f = (X^T W X)^{-1} (X^T W r)\\ 392 | = \sum_{i=1}^N C_{k,i} r_i$$ 393 | Here factor return can be interpreted as the return to a portfolio with weights $C_{k,i}$. So factor returns are the returns to factor portfolios. This portfolio has unit exposure to the particular factor 394 | * Factor Covariance and Specific 395 | - Stock returns 396 | - Factor exposures 397 | - Stock dividends, splits, and other adjustment 398 | 399 | #### Model Validation 400 | * Model Setting: 401 | - 50 factors 402 | - 1000 assets 403 | * Measures: 404 | 405 | - $R^2$: 30-40%. It can vary quite significantly from month to month. And depends on the market return level. 406 | - root mean square error: 6% roughly against 10% volatility 407 | - Portfolio Risk 408 | * Goal: 409 | - Expain the portfolio risk 410 | - Forecast variances and covariances of factors and specific returns 411 | - Providing incisive, intuitive and interesting risk analysis 412 | 413 | 414 | You can think of this as slicing through the other direction from the APT analysis, as now the factor returns are unknowns to be solved for, whereas originally the coefficients b were the unknowns. 
Another way to think about it is that you're determining how predictive of returns the factor was on that day, and therefore how much return you could have squeezed out of that factor. 415 | -------------------------------------------------------------------------------- /README.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jerryxyx/AlphaTrading/5e73923786297faeadb27c76f83ec81fad74af51/README.pdf -------------------------------------------------------------------------------- /README_old.md: -------------------------------------------------------------------------------- 1 | # Multi-Factor Models 2 | 3 | Author: Jerry Xia 4 | 5 | Date: 2018/05/21 6 | 7 | *Note: The advanced Marckdown features such as math expression may not be compatible in GitHub, please see README.pdf instead if you want more details* 8 | 9 | 10 | 11 | ## Project Introduction 12 | This is a research survey about alpha trading. In this project, I built up a pipeline of alpha trading including: 13 | 14 | * factor pretest 15 | * factor screening 16 | * factor combination (modeling) 17 | 18 | The models involed are APT models, Barra's risk models and dynamic factors model using Kalman filter. 
19 | 20 | ### Files 21 | 22 | * rqdata_utils.py: Utils dealing with the rice quant platform data 23 | 24 | * Step1_FactorPretest.ipynb: Factor returns profile visulization 25 | 26 | * Step2_FactorsScreening.ipynb: Factor returns turnover visulization and correlation coefficients 27 | 28 | * Step3\_FactorCombination\_AdaBoost\_Quantopian.ipynb: A Quantopian notebook file to combine alpha factors using Adaboost 29 | 30 | * Step3\_FactorCombination\_BarraKalmanFilter.ipynb: Barra's risk model with three calibration schemes: 31 | * Scheme 1: Cross-sectional regression and weighted average 32 | * Scheme 2: Optimization problem: minimize the exponential weighted average of squared error 33 | * Scheme 3: Dynamic linear model using Kalman filter 34 | 35 | * KalmanFilterIntro.ipynb: An introduction to the dynamic multi-factor model 36 | * APT_FammaBeth.ipynb: Using Famma-Macbeth regression to calibrate APT model. 37 | 38 | ### Dataset 39 | The dataset is not available in GitHub as it is too large. Except for Step3\_FactorCombination\_AdaBoost\_Quantopian.ipynb which we used US stock data in Quantopian, among other files, we used Chinese A-stocks data downloaded from RiceQuant instead (hard for free US equities' data). 40 | 41 | The data frame is multi-indexed similar to Quantopian's format(see both Alphalens github codes and rqdata_utils.py). However, feel free to cast and apply your own dataset. 
42 | 43 | 44 | ### Goal 45 | * **Equity Return Forecasting** 46 | 47 | * **Portfolio Risk Estimation** 48 | 49 | - APT 50 | - Risk Exposure: $\beta_{i,k}$ 51 | - Risk Premium: $P_k$ 52 | - Contribution of Risk Factor to Long Term Excess Return: 53 | $$E[r_i] - TB = \sum_k \beta_{i,k}P_k$$ 54 | - BARRA 55 | - Factor Return Covariance: V 56 | - Portfolio Risk: $\sigma_p$ 57 | - Portfolio Risk Exposures: $$x_p=X^T h_p$$ 58 | - Marginal Contribution for Total Risk: $$MCTR = \frac{V h_p}{\sigma_p}$$ 59 | - Portfolio Risk-Adjusted Expected Return: $$U = h_p^T r - \lambda \cdot h_p^T V h_p$$ 60 | 61 | ### Model Classification 62 | * CAPM 63 | - a kind of sigle-factor model 64 | - usually, a validity benchmark for other models 65 | 66 | * APT 67 | - factor returns are assumed to be known 68 | - factor exposure can be regressed from factor returns 69 | - aimed at forecasting 70 | - how to fit: Fama-Macbeth Algorithm 71 | 72 | * Multi-Index Models 73 | - statistical indogeneous model using factor analysis 74 | - useful at factors parsimouny and decouple 75 | 76 | * Multi-Factor Risk Models(BARRA) 77 | - factor exposures are assumed to be known (can be derived as the rescaled factor value) 78 | - factor return can be regressed from factor exposures 79 | - aimed at risk management 80 | - how to fit: cross-sectional regression 81 | 82 | ### Calibration Algorithms 83 | Here I used 2 traditional way add a novel Kalman filter technique (see KalmanFilter.ipynb or MultiFactorModel.ipynb) 84 | 85 | * Time-series regression (fix equity) 86 | * Cross-sectional regression (fix time-stamp) 87 | * Kalmn filter (APT model allowing risk exposure and risk premium to vary over time. In another word, a dynamic model with gaussian noise) 88 | 89 | ### Improvements 90 | 91 | * A percentage rank test is a good alternative to a z score 92 | * Beware of quarterly ratios (referring to ROA, ROE, gross margin, etc.) 93 | * Factor for quality: gross profitability a la Novy-Marx (2013). 
It's simply gross profits divided by total assets. 94 | * Substituting ROA/Gross Margin with gross profitability 95 | * 96 | 97 | ## Appendix: Notes on Factor Models 98 | 99 | ### CAPM 100 | * Author: Markovitz(1959) 101 | * single-factor: 102 | * explain: security returns 103 | 104 | ### APT 105 | * Author: Stephen A. Ross(1976) 106 | * multi-factor 107 | * explain: security returns 108 | 109 | #### Postulates: 110 | - The linear model 111 | $$r_i(t) - \alpha_i = \sum_{k=1}^K \beta_{ik} \cdot f_k(t) + \epsilon_i(t)$$ 112 | 113 | where $f_k(t)$ is the realization(value) of risk factor at time t 114 | 115 | - No pure arbitrage profit 116 | 117 | #### Conclusion 118 | * Exposure of each security on each factor 119 | * Risk premium on each factor 120 | $$(Mean[r_i(t)])_i = P_0 + \sum_{k=1}^K \beta_{ik} \cdot P_k$$ 121 | or make $\beta_{0,k}$ equals 1 for each k, 122 | $$(Mean[r_i(t)])_i = \sum_{k=0}^K \bar{\beta}_{i,k} \cdot P_k$$ 123 | where $P_0$ is the risk free return 124 | 125 | * Portfolio exposure to each factor 126 | $$Portfolio_{it} = \beta_0 + \beta_k \cdot f_{kit}$$ 127 | 128 | 129 | 130 | #### Three alternative calibration methods 131 | * **statistical techniques** such as factor analysis, principle analysis 132 | - **Goodness**: good for determining the number of relevent risk factors 133 | - **Undesirable**: hard to interpret 134 | 135 | * **portfolios**: K different well-diversified portfolios as substitutions 136 | - **Goodness**: lead to insights 137 | - **Fama-Macbeth regression** 138 | 139 | * **economic theory** (highly developed art) 140 | - **Goodness**: Intuitively appealing set of factors that admit economic interpretation of risk exposures 141 | - **Goodness**: Using economic information in addition to stock return. Avoid using stock return to explain stock return 142 | - **factors**: 143 | 1. confidence risk 144 | 2. time horizon risk 145 | 3. inflation risk 146 | 4. bussiness cycle risk 147 | 5. 
market-timing risk 148 | 149 | #### Generalizations 150 | The simplicity of APT framework is a great virtue. It is helpful to understand the true sources of stock returns. The basic APT model can be enhanced in many ways. 151 | 152 | * Allow risk prices $P_k$ to vary over time 153 | * Allow risk exposures $\beta_{i,k}$ to vary over time 154 | * Use Bayesian mothods to produce optimal out-of-sample forcasts for the risk exposures and hence for the expected returns 155 | * Introduce additional factor with zero-risk prices. Although do not contribute to expected return, help to explain the volatility. 156 | 157 | ### Multi-Index Models (Factor Analysis & PCA) 158 | 159 | #### Goal 160 | Using historical return extract the factors 161 | 162 | $$r_{it} = \alpha_i + \sum_k \beta_{ik}\cdot f_{kt}$$ 163 | where 164 | $$E[\epsilon_{it} \epsilon_{jt}]=0$$ 165 | $$E[\epsilon_{it} f_{kt}]=0$$ 166 | 167 | $f_{kt}$: the return on index k inperiod t 168 | 169 | $\beta$: sensitivities 170 | 171 | #### Estimation 172 | Either exposure or factor return can be asserted on a priori grounds with the other identified empirically, or both can be identified empirically. 173 | 174 | #### Characteristics 175 | * Have f(indexes) represents separate influence 176 | * The structure must be parsimonious: the returns can be described in terms of limited indexes 177 | 178 | #### Statistical Solutions 179 | Let the data design the model 180 | 181 | * PCA 182 | * Factor Analysis: better in heteroscedastic series 183 | 184 | #### Design Issue 185 | * **The Choice of Data**: Individul stocks vs portfolio 186 | * **The number of Index**: 187 | - Stactical techniques: Factor analysis, PCA 188 | - Common sense and economic significance play a major role in deciding on the number of factors 189 | * **The nonuniqueness of Factors**: The researcher should realize the resulting structure is not unique. 
Some researchers will examine alternative structures in an atempt to understand what influences are affecting security returns and to convince themself the overall separation make an intuitive sense 190 | * **Computational Problems**: 191 | - Roll and Ross: Multisample approach 192 | - Chen: Portfolio approach 193 | 194 | #### Applications 195 | * **Identify the Indexes set** 196 | * **Determine the number of factors**: PCA / Factor Analysis 197 | - Single-group tests for each sample 198 | - Factor Analysis on return-generating process 199 | - Criteria: Chi2, AIC, **BIC** 200 | - Multiple-group tests for all stocks 201 | - Canonical Correlation (CCA): 202 | 203 | take two sets of variables and see what is common amongst the two sets (can be two noncorresponding variables either on index or dimension) 204 | $$X_{N \times K}, Y_{N \times K^{\prime}}$$ 205 | $$\mbox{x_weights}_{K,n}$$ 206 | $$\mbox{y_weights}_{K^{\prime},n}$$ 207 | Use CCA / PLS: 208 | $$\mbox{X_score}_{N\times n} = \mbox{Normalized}[X]_{N \times K} \mbox{x_weights}_{K,n}$$ 209 | 210 | $$\mbox{Y_score}_{N\times n} = \mbox{Normalized}[Y]_{N \times K^{\prime}} \mbox{y_weights}_{K^{\prime},n}$$ 211 | - Determin the number: 212 | - r-value for $n=10$ 213 | - correlation matrix pattern for each number of components: $n \times n$ for $n=1,\cdots,10$ 214 | 215 | * **Generate Factors** 216 | 217 | * **Calibrate sensitivities**: 218 | 219 | - Portfolio exposure to each factor 220 | - $Adjusted R^2$ (Should be stable) 221 | - Explanatory power: Compare these results with those for the single-index model (Should depend on the market cap) 222 | 223 | * **Explanatory Power** of the Model for Each Stock: R2>0.7 excellent 224 | 225 | #### Conclusions 226 | * Goodness: simultaneously estimate the indexes and sensitivities in a multi-index model 227 | * Defect: Data Minning: Using return to explain return 228 | 229 | 230 | ### Multi-Factor Models for Portfolio Risk (BARRA) 231 | 232 | $$r_{i,t} = a_{i,t} + X_{i,k,t} 
\cdot f_{k,t}$$ 233 | where 234 | $X_{i,k,t}$: the exposure of asset i to factor k known at time t 235 | $f_{k,t}$: the factor return to factor k during the period from time $t$ to time $t+1$ 236 | $a_{i,t}$: the stock i's specific return during period from time $t$ to time $t+1$ 237 | $r_{i,t}$: the excess return (return above the risk-free return) on stock i during the period from time $t$ to time $t+1$ 238 | 239 | The risk structure 240 | $$V_{i,j} = X_{i,k1} F_{k1,k2} X_{j,k2}^T + \Delta_{i,j}$$ 241 | $$V = X^T F X + \Delta$$ 242 | where 243 | 244 | $F_{k1,k2}$ is the K by K covariance matrix for factor returns 245 | 246 | $\Delta_{i,j}$ is the N by N diagonal matrix of specific variance 247 | 248 | A portfolio described by an N-element vector $h_i$ 249 | 250 | * portfolio exposure: $x_p = X^T h_p$ 251 | * portfolio variance: $\sigma_p^2 = x_p^T F x_p + h_p^T \Delta h_p = h_p^T V h_p$ 252 | * Marginal Contribution for Total Risk 253 | $$MCTR = \frac{V h_p}{\sigma_p}$$ 254 | * Risk-adjusted expected return: 255 | $$U = h_p^T r_p - \lambda\cdot h_p^T V h_p$$ 256 | 257 | 258 | #### Choosing the Factors 259 | * External influences --> BARRA Model 260 | - Return in bond market (bond beta) 261 | - Unexpected changes in inflation 262 | - Change in oil price 263 | - Change in exchange rate 264 | * Cross-sectional comparisons 265 | - Fundamental 266 | - Market 267 | - volatility 268 | - price 269 | - share turnover 270 | * Purely internal or statistical factors 271 | - see multi-index model 272 | 273 | #### Exposures 274 | * Industry Exposures 275 | - 1/0 variable 276 | * Risk Index Exposures 277 | - Volatility: beta, daily return vol, option implied vol 278 | - Momentum 279 | - Size 280 | - Liquidity 281 | - Growth 282 | - Value(Fundamentals) 283 | - Earning volatility 284 | - Financial leverage: debt-to-equity ratios 285 | 286 | #### Applications 287 | * Rescale the Exposures 288 | * Regress the Factor Returns Against Exposures via Cross-sectional Regression 289 | $$f 
= (X^T W X)^{-1} (X^T W r)\\ 290 | = \sum_{i=1}^N C_{k,i} r_i$$ 291 | Here factor return can be interpreted as the return to a portfolio with weights $C_{k,i}$. So factor returns are the returns to factor portfolios. This portfolio has unit exposure to the particular factor 292 | * Factor Covariance and Specific 293 | - Stock returns 294 | - Factor exposures 295 | - Stock dividends, splits, and other adjustment 296 | 297 | #### Model Validation 298 | * Model Setting: 299 | - 50 factors 300 | - 1000 assets 301 | * Measures: 302 | 303 | - $R^2$: 30-40%. It can vary quite significantly from month to month. And depends on the market return level. 304 | - root mean square error: 6% roughly against 10% volatility 305 | - Portfolio Risk 306 | * Goal: 307 | - Expain the portfolio risk 308 | - Forecast variances and covariances of factors and specific returns 309 | - Providing incisive, intuitive and interesting risk analysis 310 | 311 | 312 | You can think of this as slicing through the other direction from the APT analysis, as now the factor returns are unknowns to be solved for, whereas originally the coefficients b were the unknowns. Another way to think about it is that you're determining how predictive of returns the factor was on that day, and therefore how much return you could have squeezed out of that factor. 
313 | -------------------------------------------------------------------------------- /output/factor_ic_analysis.csv: -------------------------------------------------------------------------------- 1 | factor,group,1D,27D,98D 2 | total_turnover,ConsumerDiscretionary,-0.052061319774850776,-0.11316983202573735,-0.1929657054345211 3 | total_turnover,ConsumerStaples,-0.06218330937378258,-0.12809022695465364,-0.23566073217524877 4 | total_turnover,Energy,-0.053310899809748104,-0.114291646238941,-0.1828861184315757 5 | total_turnover,Financials,-0.043483723699830806,-0.09250154138845446,-0.15258051713146398 6 | total_turnover,HealthCare,-0.03054709278550283,-0.07126898666786867,-0.1370994975862821 7 | total_turnover,Industrials,-0.06155338477832689,-0.12191628841788626,-0.1913136626300075 8 | total_turnover,InformationTechnology,-0.04331167468163195,-0.09994088680694369,-0.18793138112573995 9 | total_turnover,Materials,-0.06347263107873605,-0.1454435080579407,-0.23516052362957487 10 | total_turnover,TelecommunicationServices,-0.05623721881390593,-0.12195086829491936,-0.08384458077709611 11 | total_turnover,Utilities,-0.06710637414072558,-0.11822026772629673,-0.18005363953326778 12 | volume,ConsumerDiscretionary,-0.038821806113053185,-0.06517649803074897,-0.10564426190383719 13 | volume,ConsumerStaples,-0.04881955414007564,-0.07004912944675076,-0.10700285231017939 14 | volume,Energy,-0.05050533179526796,-0.05539159512201123,-0.04449211070962409 15 | volume,Financials,-0.037379891386358104,-0.06717310704498229,-0.11532946922437222 16 | volume,HealthCare,-0.018552307919222938,-0.031143940450130612,-0.06310080118478258 17 | volume,Industrials,-0.05369190239494951,-0.09056039930841121,-0.12912730395652844 18 | volume,InformationTechnology,-0.03270391959214507,-0.06303027646641893,-0.12181248844576387 19 | volume,Materials,-0.05213678473116451,-0.09941954650749407,-0.1433535572754988 20 | 
volume,TelecommunicationServices,-0.03401630796772099,-0.013292433537832311,-0.028629856850715747 21 | volume,Utilities,-0.053355632253003354,-0.07805499426744068,-0.09309882686634807 22 | market_cap,ConsumerDiscretionary,-0.018642170440300053,-0.08071467806160568,-0.15194577884185356 23 | market_cap,ConsumerStaples,-0.022631363847695745,-0.09367476053930583,-0.1991416188736341 24 | market_cap,Energy,-0.0215733019306775,-0.10099046052166827,-0.23276886886450332 25 | market_cap,Financials,-0.01783881472520913,-0.06584105506902525,-0.12906327329642123 26 | market_cap,HealthCare,-0.016506107254639694,-0.07322090125094058,-0.14809213265805843 27 | market_cap,Industrials,-0.024891064518646394,-0.08946681310150975,-0.1722836413867398 28 | market_cap,InformationTechnology,-0.018400836027949976,-0.06852777925101093,-0.16777140860823717 29 | market_cap,Materials,-0.028921427376547782,-0.12317603287175999,-0.2357473587754678 30 | market_cap,TelecommunicationServices,-0.024539877300613498,-0.12706334273254716,-0.18916155419222905 31 | market_cap,Utilities,-0.021505325470817266,-0.06974356783593436,-0.11709171910355881 32 | a_share_market_val_2,ConsumerDiscretionary,-0.018301595390333388,-0.0834047795877226,-0.1622640445487711 33 | a_share_market_val_2,ConsumerStaples,-0.021353636086466607,-0.08957475782545855,-0.19257272420530416 34 | a_share_market_val_2,Energy,-0.01589375064124426,-0.08269370057080777,-0.22624001661944343 35 | a_share_market_val_2,Financials,-0.013595140013004083,-0.05757239677747778,-0.11097273433879687 36 | a_share_market_val_2,HealthCare,-0.016442154211948196,-0.06823771497421101,-0.1313262609982461 37 | a_share_market_val_2,Industrials,-0.022288503369883857,-0.08422835061983074,-0.16421150228922107 38 | a_share_market_val_2,InformationTechnology,-0.01602024151930178,-0.06606228480096947,-0.16184394862584908 39 | a_share_market_val_2,Materials,-0.026673267375488252,-0.11840215884371369,-0.22834195826194792 40 | 
a_share_market_val_2,TelecommunicationServices,-0.019427402862985693,-0.050102249488752554,-0.006134969325153374 41 | a_share_market_val_2,Utilities,-0.01685842238760016,-0.07250987448493125,-0.12230908395788107 42 | cash_received_from_sales_of_goods,ConsumerDiscretionary,-0.00103083821123597,-0.03206445749325148,-0.06876501903118355 43 | cash_received_from_sales_of_goods,ConsumerStaples,0.0011848183502683814,-0.026253336689890584,-0.048118632633388406 44 | cash_received_from_sales_of_goods,Energy,-0.016016456712267646,-0.08156643860850851,-0.1719692794464853 45 | cash_received_from_sales_of_goods,Financials,0.002049322081554394,-0.01712216286244825,-0.04627365414791241 46 | cash_received_from_sales_of_goods,HealthCare,-0.007054075457333036,-0.04186122320226921,-0.09264530897763122 47 | cash_received_from_sales_of_goods,Industrials,-0.0033712276790714055,-0.026317880861985995,-0.07804687752350649 48 | cash_received_from_sales_of_goods,InformationTechnology,-0.003735428340115983,-0.03292082920277817,-0.09611089193235643 49 | cash_received_from_sales_of_goods,Materials,-0.009364496334334471,-0.062445293690082074,-0.13116837426913705 50 | cash_received_from_sales_of_goods,TelecommunicationServices,-0.0010224948875255625,0.013292433537832311,0.044989775051124746 51 | cash_received_from_sales_of_goods,Utilities,-0.004586561111603414,-0.020219037138779015,-0.02362352504750876 52 | pb_ratio,ConsumerDiscretionary,-0.02786139236091668,-0.04723618727888782,-0.06416866121080726 53 | pb_ratio,ConsumerStaples,-0.029475817225151503,-0.07143808423799812,-0.12540901349300188 54 | pb_ratio,Energy,-0.0036947086202947805,0.00801451621850763,0.050710527187911955 55 | pb_ratio,Financials,-0.02539575463258946,-0.05441037179139589,-0.08509951720797193 56 | pb_ratio,HealthCare,-0.01773232906393156,-0.005604430266716481,-0.0050057836964287755 57 | pb_ratio,Industrials,-0.02302510249518233,-0.04874561973483434,-0.0640169980335691 58 | 
pb_ratio,InformationTechnology,-0.02211961063266852,-0.05069932164964732,-0.0718467294959841 59 | pb_ratio,Materials,-0.01959489813139817,-0.039595205878501294,-0.06585604305979385 60 | pb_ratio,TelecommunicationServices,-0.002044989775051125,-0.016359918200409,-0.028629856850715747 61 | pb_ratio,Utilities,-0.026500309923510785,-0.07579696587327645,-0.1316871231270864 62 | net_profit,ConsumerDiscretionary,0.004534976198255834,-0.024587145489383917,-0.06697091876991088 63 | net_profit,ConsumerStaples,0.001762134488588463,-0.030557169605312656,-0.09508911979085215 64 | net_profit,Energy,-0.0037239591056138937,-0.05576234882359838,-0.14286158066900853 65 | net_profit,Financials,0.005619412446675389,-0.004552703253618796,-0.03282171908115013 66 | net_profit,HealthCare,0.004023627795232325,-0.0122560368659469,-0.054647179895811165 67 | net_profit,Industrials,0.005922822373344156,-0.004876234673987302,-0.037687191394743746 68 | net_profit,InformationTechnology,0.0034451143466464815,-0.01467883275591999,-0.07454590190755 69 | net_profit,Materials,0.007775417553720173,-0.017608951812301852,-0.097233933891964 70 | net_profit,TelecommunicationServices,-0.0010224948875255625,0.013292433537832311,0.044989775051124746 71 | net_profit,Utilities,0.003587155195185814,-0.009910187986564376,-0.04759442386949663 72 | ps_ratio,ConsumerDiscretionary,-0.012080771280731099,-0.018288440941963535,-0.014703891059025101 73 | ps_ratio,ConsumerStaples,-0.019200947719015923,-0.04964656897109551,-0.12391325426910196 74 | ps_ratio,Energy,0.009579091016352978,0.04035361462838712,0.07135150759411069 75 | ps_ratio,Financials,-0.014126989377467987,-0.014116943885514285,-0.0199666437383381 76 | ps_ratio,HealthCare,-0.0050853830262781955,-0.005927366792048748,0.0016129868796054226 77 | ps_ratio,Industrials,-0.008480756136386746,-0.010614464339011856,0.0075018939060060375 78 | ps_ratio,InformationTechnology,-0.01071146507594663,-0.016407405031958085,-0.013131536623179884 79 | 
ps_ratio,Materials,-0.00811664224948075,-0.003958926420953927,0.006930722032411271 80 | ps_ratio,TelecommunicationServices,-0.00408997955010225,-0.02556237218813906,-0.053169734151329244 81 | ps_ratio,Utilities,-0.011680459426909625,-0.03175334201449768,-0.08629345861643857 82 | -------------------------------------------------------------------------------- /report/Alpha Trading Workflow.md: -------------------------------------------------------------------------------- 1 | # Alpha Trading Workflow 2 | 3 | Analyst: Yuxuan Xia 4 | 5 | Date: 2018/06/04 6 | 7 | ## TODO 8 | 9 | * Input more effective factors: take advice from people and industry reports 10 | * Should add technical analysis, because it matters! People care about them and then make it good sentimental indexes. 11 | * Find well-known metrics to express results 12 | 13 | ## Workflow 14 | \checkmark stands for finished and \vartriangle stands for TODO 15 | 16 | * Universe definition 17 | * Factors collection and preprocessing 18 | * $\vartriangle$ Factors collection 19 | - Sources 20 | - balance sheet 21 | - cash flow statement 22 | - income statement 23 | - earning report 24 | - Econometric Classifications 25 | - value 26 | - growth 27 | - profitability 28 | - market size 29 | - liquidity 30 | - volatility 31 | - Momentom 32 | - Financial leverage (debt-to-equity ratio) 33 | * Factors preprocessing 34 | - $\vartriangle$daily, quaterly, annually 35 | - continuous: rescale, outliers 36 | - $\checkmark$discrete: rank 37 | * Factors screening and combination 38 | * Factors screening 39 | - $\checkmark$Factors' correlation 40 | - $\checkmark$Factors' foreseeablity 41 | - Fama-Macbeth regression 42 | * $\vartriangle$Factors combination 43 | - PCA, FA 44 | - Techniqual Analaysis 45 | - Financial Modeling 46 | - Linear combination to maximize Sharpe ratio 47 | - Non-linear learning algorithms 48 | - $\checkmark$AdaBoost 49 | - Reinforcement learning 50 | 51 | * Portfolio allocation 52 | 53 | 54 | ## Factors' 
Correlations 55 | Here, I use correlation matrix as the measure. The difference from the second result is that the correlation matrix is calculated by the rank data rather than the raw data 56 | ### Two ICs comparison 57 | * Pearson's IC: measures linear relationship between components 58 | 59 | * Spearman's IC: measures monotonic relationship between components. Since We only care about the monotonic relationships. Spearman's IC wins. 60 | 61 | 62 | ### Regular IC(Pearson's correlation coefficient) for each factors 63 | ![](Corr matrix for raw factors.png) 64 | ### Spearman's Rank correlation coefficient for each factors 65 | ![](Corr matrix for factor ranks.png) 66 | 67 | ### How to rule out redundant factors and why Spearman's rank correlation coefficients? 68 | From the correlation coefficients below, we can again conclude that Spearman's rank IC is far more robust. Take ps_ratio and sales_yield as a example. 69 | $$ps\_ratio = \frac{\mbox{adjusted close price}}{\mbox{sales per share}}$$ 70 | whereas 71 | $$sales\_yield = \frac{\mbox{sales per share}}{\mbox{price}}$$ 72 | Ahthogh the price in sales_yield formula is vague in our data source we can see roughly speaking, these two variable should be inverse of each other. The Spearman's rank correlation coefficient is -0.98 which verifies this statement, and we should avoid using both of these factors, which would exeggarate the impact of this peticular factor. However, we can not see such identity in the Pearson's regular correlation coefficients. It's quite misleading actually and that's why we choose Spearman's rank IC. 73 | 74 | ## Factors' Foreseeability 75 | 76 | ### Mehods 77 | * Spearman's rank correlation coefficients 78 | * Fama-Macbeth regression: Not only consider the foreseeability of factors itself but also consider the co-vary of different factors, which means rule out factors if the returns can be explained by the recent factors. 79 | 80 | 81 | ### Spearman's rank IC for factors vs. 
forward returns 82 | 83 | ![](mean spearmans rank IC.png) 84 | 85 | ### Spearman's rank IC (absolute value) for factors vs. forward returns 86 | ![](mean spearmans rank IC (absolute value).png) 87 | 88 | ### Rank of the Spearman's rank IC (absolute value) for factors vs. forward returns 89 | ![](rank of mean spearmans rank IC (absolute value).png) 90 | 91 | ## Factors Preprocessing 92 | * Get ranked data 93 | * Obtain the valid stocks set 94 | * Reshape the data: only valid stocks set 95 | * Fill null: using daily average 96 | * Rescale the data: MinMaxScaler 97 | * Variet reduction: PCA analysis 98 | * Sanity check 99 | 100 | ![](corr comparison after pca analysis.png) 101 | 102 | Here, I use principle component analysis because it can brings two benefits to our data - orthogonality and dimensionality reduction. Orthogonality makes data more separate, less dimensionality makes information more concentrated. Either of them is essential for machine learning algorithms. 103 | 104 | In the next part, I used this preprocessed data as the input to obtain a "mega alpha". 105 | 106 | ## Mega Alpha 107 | construct an aggregate alpha factor which has its return distribution profitable. The term "profitable" here means condense, little turnover, significant in the positive return. 108 | ### Methods 109 | #### linear methods 110 | * normalize factors and try a linear combination 111 | * rank each factor and then sum up 112 | * Financial modeling 113 | * linear combination to maximize Sharpe ratio 114 | 115 | #### Non-linear methods 116 | * AdaBoost 117 | * Reinforement Learning 118 | 119 | ### AdaBoost 120 | #### Description 121 | The algorithm sequentially applies a weak classification to modified versions of the data. By increasing the weights of the missclassified observations, each weak learner focuses on the error of the previous one. The predictions are aggregated through a weighted majority vote. 
122 | 123 | #### Algorithm 124 | 125 | ![](adaboost_algorithm.png) 126 | 127 | #### Train set 128 | The adaboost classifier was applied to our fundamental dataset. The objective is to train a classifier which give a score for the bunch of factors. Or in other word, the mega alpha. Pink for the positive forward returns observations and blue for the negative forward returns observations. A good score system is to make the two classes more separated. 129 | ![](train_score_dist.png) 130 | We can see, in train set, AdaBoost classifier did so well! The next plot is the precision in each quantile of scores. In the top and bottom quantile, the predicted precision is nearly 100%! 131 | ![](train_accuracy_bar.png) 132 | 133 | #### Test set 134 | alpha values histogram 135 | ![](test_score_dist.png) 136 | quantile precision bar plot 137 | ![](test_accuracy_bar.png) 138 | The precision in the top and bottom quantile is only slightly higher than 50%. Far from good if we considered transaction cost. Frankly, there are plenty of works should be done before we get some satisfied results. Anyway, this pipeline gives us a flexible routine and a judgement system. I'll continue to tweak the routine and factors to make sure it goes on the right direction. 139 | 140 | ## References 141 | * Jonathan Larkin, *A Professional Quant Equity Workflow*. August 31, 2016 142 | * *A Practitioner‘s Guide to Factor Models*. The Research Foundation of The Institute of Chartered Financial Analysts 143 | * Thomas Wiecki, Machine Learning on Quantopian 144 | * Inigo Fraser Jenkins, *Using factors with different alpha decay times: The case for non-linear combination*  145 | * PNC, *Factor Analysis: What Drives Performance?* 146 | * O’Shaughnessy, *Alpha or Assets? — Factor Alpha vs. Smart Beta*. 
April 2016 147 | * *O’Shaughnessy Quarterly Investor Letter Q1 2018*  148 | * Jiantao Zhu, Orient Securities, *Alpha Forecasting - Factor-Based Strategy Research Series 13* 149 | * Yang Song, Bohai Securities, *Multi-Factor Models Research: Single Factor Testing*, 2017/10/11 -------------------------------------------------------------------------------- /report/Corr_matrix_for_factor_ranks.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jerryxyx/AlphaTrading/5e73923786297faeadb27c76f83ec81fad74af51/report/Corr_matrix_for_factor_ranks.png -------------------------------------------------------------------------------- /report/Corr_matrix_for_raw_factors.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jerryxyx/AlphaTrading/5e73923786297faeadb27c76f83ec81fad74af51/report/Corr_matrix_for_raw_factors.png -------------------------------------------------------------------------------- /report/Quantitative Strategy Workflow.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jerryxyx/AlphaTrading/5e73923786297faeadb27c76f83ec81fad74af51/report/Quantitative Strategy Workflow.pptx -------------------------------------------------------------------------------- /report/adaboost_algorithm.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jerryxyx/AlphaTrading/5e73923786297faeadb27c76f83ec81fad74af51/report/adaboost_algorithm.png -------------------------------------------------------------------------------- /report/corr_comparison_after_pca_analysis.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jerryxyx/AlphaTrading/5e73923786297faeadb27c76f83ec81fad74af51/report/corr_comparison_after_pca_analysis.png 
-------------------------------------------------------------------------------- /report/mean_spearmans_rank_IC.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jerryxyx/AlphaTrading/5e73923786297faeadb27c76f83ec81fad74af51/report/mean_spearmans_rank_IC.png -------------------------------------------------------------------------------- /report/mean_spearmans_rank_IC_absolute_value.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jerryxyx/AlphaTrading/5e73923786297faeadb27c76f83ec81fad74af51/report/mean_spearmans_rank_IC_absolute_value.png -------------------------------------------------------------------------------- /report/rank_of_mean_spearmans_rank_IC_absolute_value.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jerryxyx/AlphaTrading/5e73923786297faeadb27c76f83ec81fad74af51/report/rank_of_mean_spearmans_rank_IC_absolute_value.png -------------------------------------------------------------------------------- /report/test_accuracy_bar.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jerryxyx/AlphaTrading/5e73923786297faeadb27c76f83ec81fad74af51/report/test_accuracy_bar.png -------------------------------------------------------------------------------- /report/test_score_dist.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jerryxyx/AlphaTrading/5e73923786297faeadb27c76f83ec81fad74af51/report/test_score_dist.png -------------------------------------------------------------------------------- /report/train_accuracy_bar.png: -------------------------------------------------------------------------------- 
import pandas as pd
import alphalens as al
import matplotlib.pyplot as plt
import numpy as np


def price_reader(price_path):
    """Load the wide close-price CSV into a date-indexed DataFrame.

    Parameters
    ----------
    price_path : str
        CSV whose first (unnamed) column holds trade dates and whose
        remaining columns are per-instrument close prices.

    Returns
    -------
    pd.DataFrame
        Close prices indexed by ``date``, instrument columns sorted by name.
    """
    price_df = pd.read_csv(price_path)
    price_df.rename(index=str, columns={"Unnamed: 0": "date"}, inplace=True)
    price_df.date = pd.to_datetime(price_df.date, format="%Y-%m-%d", errors='ignore')
    # price_df.date = price_df.date.apply(timezone.localize)
    price_df.set_index(['date'], drop=True, inplace=True)
    # DataFrame.sortlevel() was deprecated in pandas 0.20 and removed in
    # 0.25; sort_index(axis=1) is the documented replacement with the same
    # behavior (sorts the instrument columns lexicographically).
    price_df = price_df.sort_index(axis=1)
    return price_df


def instrument_reader(instrument_path):
    """Load instrument metadata, indexed and sorted by ``bookId``.

    Parameters
    ----------
    instrument_path : str
        CSV with a throwaway ``Unnamed: 0`` column and a ``bookId`` column.

    Returns
    -------
    pd.DataFrame
        One row per instrument, indexed by ``bookId``, sorted by index.
    """
    instrument_df = pd.read_csv(instrument_path)
    instrument_df.drop(['Unnamed: 0'], axis=1, inplace=True)
    instrument_df = instrument_df.set_index(['bookId'])
    instrument_df = instrument_df.sort_index()
    return instrument_df


def equity_reader(equity_path):
    """Load daily equity fundamentals with a (date, order_book_id) MultiIndex.

    Parameters
    ----------
    equity_path : str
        CSV containing at least ``date`` and ``order_book_id`` columns plus
        a throwaway ``Unnamed: 0`` column.

    Returns
    -------
    pd.DataFrame
        Fundamentals indexed by ``(date, order_book_id)``.
    """
    cn_df = pd.read_csv(equity_path)
    cn_df.date = pd.to_datetime(cn_df.date, format="%Y-%m-%d", errors='ignore')
    cn_df.set_index(['date', 'order_book_id'], drop=True, inplace=True)
    cn_df.drop(["Unnamed: 0"], axis=1, inplace=True)
    return cn_df


def benchmark_reader(benchmark_path):
    """Load a headerless two-column (date, value) benchmark CSV.

    Adds a ``return`` column of log returns, with the first (undefined)
    entry filled with 0.

    NOTE(review): the formula log(value[t-1] / value[t]) is the *negative*
    of the conventional log return log(value[t] / value[t-1]).  It is
    preserved as-is because downstream code may rely on the sign -- TODO
    confirm against callers.
    """
    benchmark_df = pd.read_csv(benchmark_path, names=['date', 'value'])
    benchmark_df = benchmark_df.set_index('date', drop=True)
    # Operate on the single 'value' Series explicitly (the original divided
    # the whole one-column frame); the resulting values are identical but
    # the column assignment is now a plain, unambiguous Series assignment.
    value = benchmark_df['value']
    benchmark_df['return'] = np.log(value.shift(1) / value).fillna(0)
    return benchmark_df


def equity_add_instrumentInfo(cn_df, instrument_df, instrument_column):
    """Append one instrument-level attribute to every equity row.

    Looks up ``instrument_column`` in ``instrument_df`` (indexed by bookId)
    for each row's ``order_book_id`` and stores the result as a new column
    on ``cn_df``.  ``cn_df`` is mutated in place and also returned.
    """
    instrumentInfoSeries = instrument_df[instrument_column]
    bookIdIdx = cn_df.index.get_level_values('order_book_id')
    # Index.get_values() was deprecated in pandas 0.25; ``.values`` is the
    # long-standing equivalent and returns the same ndarray.
    bookIdArray = bookIdIdx.values
    instrumentInfo = instrumentInfoSeries[bookIdArray].values
    cn_df[instrument_column] = instrumentInfo
    return cn_df


def get_price_instrument_equity(price_path, instrument_path, equity_path, addInstrumentColumn=None):
    """Convenience loader for the three core datasets.

    Parameters
    ----------
    price_path, instrument_path, equity_path : str
        Paths fed to :func:`price_reader`, :func:`instrument_reader` and
        :func:`equity_reader` respectively.
    addInstrumentColumn : str, optional
        Name of an instrument attribute (e.g. ``'sectorCode'``) to join
        onto the equity frame as an extra column.

    Returns
    -------
    (pd.DataFrame, pd.DataFrame, pd.DataFrame)
        ``(price_df, instrument_df, equity_df)``.
    """
    price_df = price_reader(price_path)
    instrument_df = instrument_reader(instrument_path)
    equity_df = equity_reader(equity_path)
    if addInstrumentColumn:
        equity_df = equity_add_instrumentInfo(equity_df, instrument_df, addInstrumentColumn)
    return price_df, instrument_df, equity_df


def ic_analysis(equity_df, price_df, factor_columns, group_column, periods=(1, 22, 66), group_adjust=False):
    """Run an alphalens information-coefficient study for each factor column.

    For every column in ``factor_columns``, computes the mean IC by group
    (printed and collected) and the mean monthly IC (plotted as a heatmap).

    Parameters
    ----------
    equity_df : pd.DataFrame
        Fundamentals indexed by ``(date, order_book_id)``; must contain
        ``factor_columns`` and ``group_column``.
    price_df : pd.DataFrame
        Wide close-price frame as returned by :func:`price_reader`.
    factor_columns : iterable of str
        Factor columns of ``equity_df`` to analyze.
    group_column : str
        Column of ``equity_df`` used as the alphalens grouping key.
    periods : tuple of int, optional
        Forward-return horizons in trading days.
    group_adjust : bool, optional
        Passed through to alphalens (demean returns within each group).

    Returns
    -------
    (pd.DataFrame, list)
        ``mean_ic_df`` indexed by ``(factor, group)`` and the list of
        mean-monthly-IC frames, one per factor.
    """
    factor_list = []
    ic_list = []
    monthly_ic_list = []
    groupby = equity_df[group_column]
    for col in factor_columns:
        factor_list.append(equity_df[col])

    for my_factor in factor_list:
        # max_loss=1 tolerates dropping up to 100% of observations during
        # alphalens' cleaning step instead of raising MaxLossExceededError.
        factor_data = al.utils.get_clean_factor_and_forward_returns(factor=my_factor,
                                                                    prices=price_df,
                                                                    groupby=groupby,
                                                                    periods=periods,
                                                                    max_loss=1)
        mean_ic = al.performance.mean_information_coefficient(factor_data, group_adjust=group_adjust,
                                                              by_group=True,
                                                              by_time=None)
        mean_monthly_ic = al.performance.mean_information_coefficient(factor_data, group_adjust=group_adjust,
                                                                      by_group=False,
                                                                      by_time='M')
        print("#######################################################")
        print("factor: {}".format(my_factor.name))
        print(mean_ic)
        # print(mean_monthly_ic)
        ic_list.append(mean_ic)
        monthly_ic_list.append(mean_monthly_ic)
        al.plotting.plot_monthly_ic_heatmap(mean_monthly_ic)
        plt.show()

    mean_ic_df = pd.concat(ic_list, keys=factor_columns)
    mean_ic_df.index = mean_ic_df.index.set_names(['factor', 'group'])
    return mean_ic_df, monthly_ic_list
date_end_dt.strftime('%Y-%m-%d')\n", 44 | "\n", 45 | "# Construct Stock Population\n", 46 | "stock_all = all_instruments(type=\"CS\", country='cn', date=date_start_dt)\n", 47 | "stock_list = stock_all['order_book_id'].tolist()\n", 48 | "print(\"Population Check - Initial #: {}\".format(stock_all.shape[0]))" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": 2, 54 | "metadata": { 55 | "collapsed": false 56 | }, 57 | "outputs": [], 58 | "source": [ 59 | "price_data = get_price(stock_list, start_date=date_start, end_date=date_end, frequency='1d', \n", 60 | " fields=['close'], \n", 61 | " adjust_type='pre', skip_suspended=False, country='cn')\n", 62 | "price_data.to_csv(\"cn_stock_price_{}_{}.csv\".format(year_start,year_end)) # Download price data" 63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": 3, 68 | "metadata": { 69 | "collapsed": false 70 | }, 71 | "outputs": [ 72 | { 73 | "data": { 74 | "text/html": [ 75 | "
\n", 76 | "\n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | "
300188.XSHE600337.XSHG600168.XSHG002337.XSHE600592.XSHG000950.XSHE600991.XSHG002473.XSHE600784.XSHG600736.XSHG...600345.XSHG600387.XSHG000063.XSHE002506.XSHE300151.XSHE002579.XSHE000563.XSHE000551.XSHE002578.XSHE000726.XSHE
2012-01-044.34013.17506.36932.87526.83144.016815.087.28985.01044.0135...9.43738.771113.36912.64062.00553.40372.31295.71811.89316.0165
2012-01-054.07432.85836.16972.67396.17373.910115.086.95054.50893.8534...8.84568.556013.28952.39201.88723.14522.29425.25251.78776.0005
2012-01-064.11732.69205.95112.71486.22074.142915.117.07654.61113.9570...8.93158.428912.94732.46241.91593.17242.24755.27871.78976.2645
2012-01-094.40412.77916.21722.81546.47444.278715.127.44494.87414.2114...9.37058.976513.29752.59912.00943.31942.33625.70501.88146.4965
2012-01-104.51242.91376.45492.95706.78454.443715.297.73575.13704.2962...9.77129.328513.88642.75652.11433.42552.40875.94102.00246.6565
\n", 226 | "

5 rows × 2320 columns

\n", 227 | "
" 228 | ], 229 | "text/plain": [ 230 | " 300188.XSHE 600337.XSHG 600168.XSHG 002337.XSHE 600592.XSHG \\\n", 231 | "2012-01-04 4.3401 3.1750 6.3693 2.8752 6.8314 \n", 232 | "2012-01-05 4.0743 2.8583 6.1697 2.6739 6.1737 \n", 233 | "2012-01-06 4.1173 2.6920 5.9511 2.7148 6.2207 \n", 234 | "2012-01-09 4.4041 2.7791 6.2172 2.8154 6.4744 \n", 235 | "2012-01-10 4.5124 2.9137 6.4549 2.9570 6.7845 \n", 236 | "\n", 237 | " 000950.XSHE 600991.XSHG 002473.XSHE 600784.XSHG 600736.XSHG \\\n", 238 | "2012-01-04 4.0168 15.08 7.2898 5.0104 4.0135 \n", 239 | "2012-01-05 3.9101 15.08 6.9505 4.5089 3.8534 \n", 240 | "2012-01-06 4.1429 15.11 7.0765 4.6111 3.9570 \n", 241 | "2012-01-09 4.2787 15.12 7.4449 4.8741 4.2114 \n", 242 | "2012-01-10 4.4437 15.29 7.7357 5.1370 4.2962 \n", 243 | "\n", 244 | " ... 600345.XSHG 600387.XSHG 000063.XSHE 002506.XSHE \\\n", 245 | "2012-01-04 ... 9.4373 8.7711 13.3691 2.6406 \n", 246 | "2012-01-05 ... 8.8456 8.5560 13.2895 2.3920 \n", 247 | "2012-01-06 ... 8.9315 8.4289 12.9473 2.4624 \n", 248 | "2012-01-09 ... 9.3705 8.9765 13.2975 2.5991 \n", 249 | "2012-01-10 ... 
9.7712 9.3285 13.8864 2.7565 \n", 250 | "\n", 251 | " 300151.XSHE 002579.XSHE 000563.XSHE 000551.XSHE 002578.XSHE \\\n", 252 | "2012-01-04 2.0055 3.4037 2.3129 5.7181 1.8931 \n", 253 | "2012-01-05 1.8872 3.1452 2.2942 5.2525 1.7877 \n", 254 | "2012-01-06 1.9159 3.1724 2.2475 5.2787 1.7897 \n", 255 | "2012-01-09 2.0094 3.3194 2.3362 5.7050 1.8814 \n", 256 | "2012-01-10 2.1143 3.4255 2.4087 5.9410 2.0024 \n", 257 | "\n", 258 | " 000726.XSHE \n", 259 | "2012-01-04 6.0165 \n", 260 | "2012-01-05 6.0005 \n", 261 | "2012-01-06 6.2645 \n", 262 | "2012-01-09 6.4965 \n", 263 | "2012-01-10 6.6565 \n", 264 | "\n", 265 | "[5 rows x 2320 columns]" 266 | ] 267 | }, 268 | "execution_count": 3, 269 | "metadata": {}, 270 | "output_type": "execute_result" 271 | } 272 | ], 273 | "source": [ 274 | "price_data.head()" 275 | ] 276 | }, 277 | { 278 | "cell_type": "code", 279 | "execution_count": 4, 280 | "metadata": { 281 | "collapsed": false 282 | }, 283 | "outputs": [], 284 | "source": [ 285 | "trade_data = get_price(stock_list, start_date=date_start, end_date=date_end, frequency='1d', \n", 286 | " fields=['close', 'total_turnover', 'volume'], \n", 287 | " adjust_type='pre', skip_suspended=False, country='cn')" 288 | ] 289 | }, 290 | { 291 | "cell_type": "code", 292 | "execution_count": 5, 293 | "metadata": { 294 | "collapsed": true 295 | }, 296 | "outputs": [], 297 | "source": [ 298 | "return_data = get_price_change_rate(stock_list, start_date=date_start, end_date=date_end)" 299 | ] 300 | }, 301 | { 302 | "cell_type": "code", 303 | "execution_count": 6, 304 | "metadata": { 305 | "collapsed": true 306 | }, 307 | "outputs": [], 308 | "source": [ 309 | "turnover_data = get_turnover_rate(stock_list, date_start, date_end, fields=['week', 'month'])" 310 | ] 311 | }, 312 | { 313 | "cell_type": "code", 314 | "execution_count": 9, 315 | "metadata": { 316 | "collapsed": false 317 | }, 318 | "outputs": [], 319 | "source": [ 320 | "instrument_info = instruments(stock_list)" 321 | ] 322 | }, 323 | 
{ 324 | "cell_type": "code", 325 | "execution_count": 25, 326 | "metadata": { 327 | "collapsed": false 328 | }, 329 | "outputs": [ 330 | { 331 | "data": { 332 | "text/plain": [ 333 | "Instrument(industry_name='软件和信息技术服务业', sector_code_name='信息技术', abbrev_symbol='MYBK', listed_date='2011-03-16', exchange='XSHE', symbol='美亚柏科', industry_code='I65', round_lot=100.0, order_book_id='300188.XSHE', special_type='Normal', shenwan_industry_name='计算机', de_listed_date='0000-00-00', type='CS', sector_code='InformationTechnology', board_type='GEM', shenwan_industry_code='801750.INDX', status='Active')" 334 | ] 335 | }, 336 | "execution_count": 25, 337 | "metadata": {}, 338 | "output_type": "execute_result" 339 | } 340 | ], 341 | "source": [ 342 | "instrument_info[0]" 343 | ] 344 | }, 345 | { 346 | "cell_type": "code", 347 | "execution_count": 27, 348 | "metadata": { 349 | "collapsed": false 350 | }, 351 | "outputs": [], 352 | "source": [ 353 | "# Download instrument information\n", 354 | "\n", 355 | "bookId_list = []\n", 356 | "exchange_list = []\n", 357 | "abbrevSymbol_list = []\n", 358 | "shenwanIndustryCode_list = []\n", 359 | "shenwanIndustryName_list = []\n", 360 | "industryCode_list = []\n", 361 | "industryName_list = []\n", 362 | "sectorCode_list = []\n", 363 | "sectorName_list = []\n", 364 | "for inst in instrument_info:\n", 365 | " bookId_list.append(inst.order_book_id)\n", 366 | " exchange_list.append(inst.exchange)\n", 367 | " abbrevSymbol_list.append(inst.abbrev_symbol)\n", 368 | " shenwanIndustryCode_list.append(inst.shenwan_industry_code)\n", 369 | " shenwanIndustryName_list.append(inst.shenwan_industry_name)\n", 370 | " industryCode_list.append(inst.industry_code)\n", 371 | " industryName_list.append(inst.industry_name)\n", 372 | " sectorCode_list.append(inst.sector_code)\n", 373 | " sectorName_list.append(inst.sector_code_name)\n", 374 | " \n", 375 | "instrument_df = pd.DataFrame({\"bookId\":bookId_list,\n", 376 | " \"exchange\":exchange_list,\n", 377 | " 
\"abbrevSymbol\":abbrevSymbol_list,\n", 378 | " \"shenwanIndustryCode\":shenwanIndustryCode_list,\n", 379 | " \"shenwanIndustryName\":shenwanIndustryName_list,\n", 380 | " \"industryCode\":industryCode_list,\n", 381 | " \"industryName\":industryName_list,\n", 382 | " \"sectorCode\":sectorCode_list,\n", 383 | " \"sectorName\":sectorName_list})" 384 | ] 385 | }, 386 | { 387 | "cell_type": "code", 388 | "execution_count": 29, 389 | "metadata": { 390 | "collapsed": false 391 | }, 392 | "outputs": [], 393 | "source": [ 394 | "instrument_df.to_csv(\"cn_instrument_info_{}_{}.csv\".format(year_start,year_end))" 395 | ] 396 | }, 397 | { 398 | "cell_type": "code", 399 | "execution_count": 7, 400 | "metadata": { 401 | "collapsed": false 402 | }, 403 | "outputs": [ 404 | { 405 | "name": "stdout", 406 | "output_type": "stream", 407 | "text": [ 408 | "Date: 2012-01-04 00:00:00 | Progress: 0.05875440658049354%\n", 409 | "Date: 2012-01-05 00:00:00 | Progress: 0.11750881316098707%\n", 410 | "Date: 2012-01-06 00:00:00 | Progress: 0.17626321974148063%\n", 411 | "Date: 2012-01-09 00:00:00 | Progress: 0.23501762632197415%\n", 412 | "Date: 2012-01-10 00:00:00 | Progress: 0.2937720329024677%\n", 413 | "Date: 2012-01-11 00:00:00 | Progress: 0.35252643948296125%\n", 414 | "Date: 2012-01-12 00:00:00 | Progress: 0.4112808460634548%\n", 415 | "Date: 2012-01-13 00:00:00 | Progress: 0.4700352526439483%\n", 416 | "Date: 2012-01-16 00:00:00 | Progress: 0.5287896592244419%\n", 417 | "Date: 2012-01-17 00:00:00 | Progress: 0.5875440658049353%\n", 418 | "Date: 2012-01-18 00:00:00 | Progress: 0.6462984723854289%\n", 419 | "Date: 2012-01-19 00:00:00 | Progress: 0.7050528789659225%\n", 420 | "Date: 2012-01-20 00:00:00 | Progress: 0.763807285546416%\n", 421 | "Date: 2012-01-30 00:00:00 | Progress: 0.8225616921269095%\n", 422 | "Date: 2012-01-31 00:00:00 | Progress: 0.881316098707403%\n", 423 | "Date: 2012-02-01 00:00:00 | Progress: 0.9400705052878966%\n" 424 | ] 425 | }, 426 | { 427 | "ename": 
"KeyboardInterrupt", 428 | "evalue": "", 429 | "traceback": [ 430 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 431 | "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", 432 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 15\u001b[0m \u001b[0;34m.\u001b[0m\u001b[0mfilter\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfundamentals\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mincome_statement\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstockcode\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0min_\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mstock_list\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 16\u001b[0m \u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 17\u001b[0;31m \u001b[0mentry_date\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdt\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0minterval\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'1q'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mreport_quarter\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 18\u001b[0m )\n\u001b[1;32m 19\u001b[0m \u001b[0m_fundamental_data\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_fundamental_data\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mto_frame\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 433 | "\u001b[0;32m/opt/conda/envs/ricequant/lib/python3.5/site-packages/rqcommons/facade.py\u001b[0m in \u001b[0;36mwrap\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 29\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'Not inited yet. 
Please call rqdatac.init() first.'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 30\u001b[0m \u001b[0;32mreturn\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 31\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 32\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mwrap\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 33\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", 434 | "\u001b[0;32m/opt/conda/envs/ricequant/lib/python3.5/site-packages/rqcommons/facade.py\u001b[0m in \u001b[0;36mget_fundamentals\u001b[0;34m(query, entry_date, interval, report_quarter, country)\u001b[0m\n\u001b[1;32m 314\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mget_fundamentals\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mquery\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mentry_date\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0minterval\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mreport_quarter\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mFalse\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcountry\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'cn'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 315\u001b[0m \u001b[0;34m\"\"\"获取财务数据\"\"\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 316\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mimplmentation\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget_fundamentals\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mquery\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mentry_date\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0minterval\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mreport_quarter\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcountry\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 317\u001b[0m 
\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 318\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", 435 | "\u001b[0;32m/opt/conda/envs/ricequant/lib/python3.5/site-packages/rqdatac/implementation.py\u001b[0m in \u001b[0;36mget_fundamentals\u001b[0;34m(cls, query, entry_date, interval, report_quarter, country)\u001b[0m\n\u001b[1;32m 251\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 252\u001b[0m \u001b[0mquery\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_unsafe_apply_query_filter\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mquery\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtrading_dates\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 253\u001b[0;31m \u001b[0mrecords\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcls\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_run_fundamental_query\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0m_compile_query\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mquery\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcountry\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mcountry\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 254\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mException\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 255\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'ERROR: internal error, please contact public@ricequant.com. 
exception: {}'\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mformat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0me\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 436 | "\u001b[0;32m/opt/conda/envs/ricequant/lib/python3.5/site-packages/rqdatac/implementation.py\u001b[0m in \u001b[0;36m_compile_query\u001b[0;34m(query)\u001b[0m\n\u001b[1;32m 30\u001b[0m \u001b[0mparams\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 31\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mk\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mcomp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpositiontup\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 32\u001b[0;31m \u001b[0mv\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcomp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mparams\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mk\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 33\u001b[0m \u001b[0mparams\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mappend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mescape_item\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mv\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mconversions\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mencoders\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 34\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", 437 | "\u001b[0;32m/opt/conda/envs/ricequant/lib/python3.5/site-packages/sqlalchemy/sql/compiler.py\u001b[0m in \u001b[0;36mparams\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 576\u001b[0m \"\"\"Return the bind param dictionary embedded into this\n\u001b[1;32m 577\u001b[0m compiled object, for those values that are present.\"\"\"\n\u001b[0;32m--> 578\u001b[0;31m \u001b[0;32mreturn\u001b[0m 
\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mconstruct_params\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0m_check\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mFalse\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 579\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 580\u001b[0m \u001b[0;34m@\u001b[0m\u001b[0mutil\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdependencies\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"sqlalchemy.engine.result\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 438 | "\u001b[0;32m/opt/conda/envs/ricequant/lib/python3.5/site-packages/sqlalchemy/sql/compiler.py\u001b[0m in \u001b[0;36mconstruct_params\u001b[0;34m(self, params, _group_number, _check)\u001b[0m\n\u001b[1;32m 569\u001b[0m \u001b[0mpd\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbind_names\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mbindparam\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mbindparam\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0meffective_value\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 570\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 571\u001b[0;31m \u001b[0mpd\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbind_names\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mbindparam\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mbindparam\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mvalue\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 572\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mpd\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 573\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", 439 | "\u001b[0;31mKeyboardInterrupt\u001b[0m: " 440 | ], 441 | "output_type": "error" 442 | } 443 | ], 444 | "source": [ 445 | "fundamental_data = {}\n", 446 | "query_dates = trade_dates[(trade_dates >= date_start_dt) & (trade_dates <= 
date_end_dt)]\n", 447 | "ndates = len(query_dates)\n", 448 | "for counter,dt in enumerate(query_dates):\n", 449 | " print(\"Date: {} | Progress: {}%\".format(dt,(counter+1)/ndates*100))\n", 450 | " _fundamental_data = get_fundamentals(\n", 451 | " query(\n", 452 | " fundamentals.eod_derivative_indicator.market_cap, #总市值\n", 453 | " fundamentals.eod_derivative_indicator.a_share_market_val_2, #流通市值\n", 454 | " fundamentals.cash_flow_statement.cash_received_from_sales_of_goods, #销售额 - 单季/同比\n", 455 | " fundamentals.eod_derivative_indicator.pb_ratio, #净资产/总市值=市净率\n", 456 | " fundamentals.income_statement.net_profit, #净利润\n", 457 | " fundamentals.eod_derivative_indicator.ps_ratio #市销率\n", 458 | " )\n", 459 | " .filter(fundamentals.income_statement.stockcode.in_(stock_list))\n", 460 | " , \n", 461 | " entry_date=dt, interval='1q', report_quarter=True\n", 462 | " )\n", 463 | " _fundamental_data = _fundamental_data.to_frame()\n", 464 | " _fundamental_data.index.names = ['date', 'order_book_id']\n", 465 | " fundamental_data[dt] = _fundamental_data\n", 466 | " \n", 467 | "fundamental_data = pd.concat(fundamental_data)\n", 468 | "fundamental_data.reset_index(level=0, drop=True, inplace=True)" 469 | ] 470 | }, 471 | { 472 | "cell_type": "code", 473 | "execution_count": null, 474 | "metadata": { 475 | "collapsed": false 476 | }, 477 | "outputs": [], 478 | "source": [ 479 | "trade_ts = trade_data.to_frame()\n", 480 | "trade_ts.index.names = ['date', 'order_book_id']\n", 481 | "\n", 482 | "return_ts = pd.DataFrame(return_data.stack(), columns=['return'])\n", 483 | "return_ts.index.names = ['date', 'order_book_id']\n", 484 | "\n", 485 | "turnover_ts = turnover_data.to_frame()\n", 486 | "turnover_ts.index.names = ['date', 'order_book_id']\n", 487 | "\n", 488 | "data = return_ts.merge(trade_ts, how='left', left_index=True, right_index=True)\n", 489 | "data = data.merge(turnover_ts, how='left', left_index=True, right_index=True)\n", 490 | "data = data.merge(fundamental_data, 
how='left', left_index=True, right_index=True)" 491 | ] 492 | }, 493 | { 494 | "cell_type": "code", 495 | "execution_count": null, 496 | "metadata": { 497 | "collapsed": false 498 | }, 499 | "outputs": [], 500 | "source": [ 501 | "data.head()" 502 | ] 503 | }, 504 | { 505 | "cell_type": "code", 506 | "execution_count": null, 507 | "metadata": { 508 | "collapsed": true 509 | }, 510 | "outputs": [], 511 | "source": [ 512 | "# Save Data\n", 513 | "data.to_csv(\"cn_equity_daily_{}_{}.csv\".format(year_start,year_end))" 514 | ] 515 | }, 516 | { 517 | "cell_type": "code", 518 | "execution_count": null, 519 | "metadata": { 520 | "collapsed": true 521 | }, 522 | "outputs": [], 523 | "source": [] 524 | } 525 | ], 526 | "metadata": { 527 | "kernelspec": { 528 | "display_name": "Python 3", 529 | "language": "python", 530 | "name": "python3" 531 | }, 532 | "language_info": { 533 | "codemirror_mode": { 534 | "name": "ipython", 535 | "version": 3 536 | }, 537 | "file_extension": ".py", 538 | "mimetype": "text/x-python", 539 | "name": "python", 540 | "nbconvert_exporter": "python", 541 | "pygments_lexer": "ipython3", 542 | "version": "3.5.5" 543 | } 544 | }, 545 | "nbformat": 4, 546 | "nbformat_minor": 2 547 | } 548 | -------------------------------------------------------------------------------- /source/DownloadData_bak.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 2, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [ 10 | { 11 | "name": "stdout", 12 | "output_type": "stream", 13 | "text": [ 14 | "Population Check - Initial #: 1059\n" 15 | ] 16 | } 17 | ], 18 | "source": [ 19 | "# Constructs Time Series Data for All Stocks\n", 20 | "import pandas as pd\n", 21 | "import numpy as np\n", 22 | "from datetime import datetime\n", 23 | "import tushare as ts\n", 24 | "\n", 25 | "from scipy.stats import rankdata\n", 26 | "\n", 27 | "import seaborn as sns\n", 
28 | "\n", 29 | "# Pull All Trade Dates\n", 30 | "trade_dates = pd.Series(data=[pd.Timestamp(date) for date in get_trading_dates('2001-01-01', '2018-12-31')], name='trade_date')\n", 31 | "\n", 32 | "year_start = 2001\n", 33 | "year_end = 2012\n", 34 | "\n", 35 | "# date_end_last_dt = max(trade_dates[trade_dates.dt.year == year_start-1])\n", 36 | "date_start_dt = min(trade_dates[trade_dates.dt.year == year_start])\n", 37 | "date_end_dt = max(trade_dates[trade_dates.dt.year == year_end])\n", 38 | "\n", 39 | "# date_end_last = date_end_last_dt.strftime('%Y-%m-%d')\n", 40 | "date_start = date_start_dt.strftime('%Y-%m-%d')\n", 41 | "date_end = date_end_dt.strftime('%Y-%m-%d')\n", 42 | "\n", 43 | "# Construct Stock Population\n", 44 | "stock_all = all_instruments(type=\"CS\", country='cn', date=date_start_dt)\n", 45 | "stock_list = stock_all['order_book_id'].tolist()\n", 46 | "print(\"Population Check - Initial #: {}\".format(stock_all.shape[0]))" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": 3, 52 | "metadata": { 53 | "collapsed": false 54 | }, 55 | "outputs": [ 56 | { 57 | "name": "stdout", 58 | "output_type": "stream", 59 | "text": [ 60 | "WARN: start_date is earlier than 2005-01-04, adjusted\n" 61 | ] 62 | } 63 | ], 64 | "source": [ 65 | "trade_data = get_price(stock_list, start_date=date_start, end_date=date_end, frequency='1d', \n", 66 | " fields=['close', 'total_turnover', 'volume'], \n", 67 | " adjust_type='pre', skip_suspended=False, country='cn')\n", 68 | "\n", 69 | "return_data = get_price_change_rate(stock_list, start_date=date_start, end_date=date_end)\n", 70 | "\n", 71 | "turnover_data = get_turnover_rate(stock_list, date_start, date_end, fields=['week', 'month'])\n", 72 | "\n", 73 | "fundamental_data = {}\n", 74 | "for dt in trade_dates[(trade_dates.dt.year >= year_start) & (trade_dates.dt.year <= year_end)]:\n", 75 | " _fundamental_data = get_fundamentals(\n", 76 | " query(\n", 77 | " 
fundamentals.eod_derivative_indicator.market_cap, #总市值\n", 78 | " fundamentals.eod_derivative_indicator.a_share_market_val_2, #流通市值\n", 79 | " fundamentals.cash_flow_statement.cash_received_from_sales_of_goods, #销售额 - 单季/同比\n", 80 | " fundamentals.eod_derivative_indicator.pb_ratio, #净资产/总市值=市净率\n", 81 | " fundamentals.income_statement.net_profit, #净利润\n", 82 | " fundamentals.eod_derivative_indicator.ps_ratio #市销率\n", 83 | " ).filter(fundamentals.income_statement.stockcode.in_(stock_list)), \n", 84 | " entry_date=dt, interval='1q', report_quarter=True\n", 85 | " )\n", 86 | " _fundamental_data = _fundamental_data.to_frame()\n", 87 | " _fundamental_data.index.names = ['date', 'order_book_id']\n", 88 | " fundamental_data[dt] = _fundamental_data\n", 89 | " \n", 90 | "fundamental_data = pd.concat(fundamental_data)\n", 91 | "fundamental_data.reset_index(level=0, drop=True, inplace=True)\n", 92 | "\n", 93 | "# Aggregate Data\n", 94 | "\n", 95 | "trade_ts = trade_data.to_frame()\n", 96 | "trade_ts.index.names = ['date', 'order_book_id']\n", 97 | "\n", 98 | "return_ts = pd.DataFrame(return_data.stack(), columns=['return'])\n", 99 | "return_ts.index.names = ['date', 'order_book_id']\n", 100 | "\n", 101 | "turnover_ts = turnover_data.to_frame()\n", 102 | "turnover_ts.index.names = ['date', 'order_book_id']\n", 103 | "\n", 104 | "data = return_ts.merge(trade_ts, how='left', left_index=True, right_index=True)\n", 105 | "data = data.merge(turnover_ts, how='left', left_index=True, right_index=True)\n", 106 | "data = data.merge(fundamental_data, how='left', left_index=True, right_index=True)\n", 107 | "\n", 108 | "# Save Data\n", 109 | "data.to_csv(\"stock_data_all_2016_2018.csv\")\n" 110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "execution_count": 4, 115 | "metadata": { 116 | "collapsed": false 117 | }, 118 | "outputs": [], 119 | "source": [ 120 | "trade_ts = trade_data.to_frame()\n", 121 | "trade_ts.index.names = ['date', 'order_book_id']\n", 122 | "\n", 123 | 
"return_ts = pd.DataFrame(return_data.stack(), columns=['return'])\n", 124 | "return_ts.index.names = ['date', 'order_book_id']\n", 125 | "\n", 126 | "turnover_ts = turnover_data.to_frame()\n", 127 | "turnover_ts.index.names = ['date', 'order_book_id']\n", 128 | "\n", 129 | "data = return_ts.merge(trade_ts, how='left', left_index=True, right_index=True)\n", 130 | "data = data.merge(turnover_ts, how='left', left_index=True, right_index=True)\n", 131 | "data = data.merge(fundamental_data, how='left', left_index=True, right_index=True)\n" 132 | ] 133 | }, 134 | { 135 | "cell_type": "code", 136 | "execution_count": 5, 137 | "metadata": { 138 | "collapsed": true 139 | }, 140 | "outputs": [], 141 | "source": [ 142 | "# data.to_csv(\"stock_data_all_2005_2012.csv\")\n", 143 | "\n", 144 | "# Break data into monthly chunks\n", 145 | "year = 2011\n", 146 | "for month in range(1,13):\n", 147 | " data_tmp = data.loc[(data['date'].dt.year == year) & (data['date'].dt.month == month), :]\n", 148 | " data_tmp.to_csv(\"stock_data_all_\"+str(year)+\"{0:0=2d}\".format(month)+\".csv\")" 149 | ] 150 | }, 151 | { 152 | "cell_type": "code", 153 | "execution_count": null, 154 | "metadata": { 155 | "collapsed": true 156 | }, 157 | "outputs": [], 158 | "source": [] 159 | } 160 | ], 161 | "metadata": { 162 | "kernelspec": { 163 | "display_name": "Python 3", 164 | "language": "python", 165 | "name": "python3" 166 | }, 167 | "language_info": { 168 | "codemirror_mode": { 169 | "name": "ipython", 170 | "version": 3 171 | }, 172 | "file_extension": ".py", 173 | "mimetype": "text/x-python", 174 | "name": "python", 175 | "nbconvert_exporter": "python", 176 | "pygments_lexer": "ipython3", 177 | "version": "3.5.2" 178 | } 179 | }, 180 | "nbformat": 4, 181 | "nbformat_minor": 2 182 | } 183 | -------------------------------------------------------------------------------- /source/FactorModeling.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 
| { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "from rqdata_utils import *\n", 10 | "import pandas\n", 11 | "import numpy as np\n", 12 | "import scipy as sp\n", 13 | "import alphalens as al\n", 14 | "%matplotlib inline" 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": {}, 20 | "source": [ 21 | "## Loading Data" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": 2, 27 | "metadata": {}, 28 | "outputs": [], 29 | "source": [ 30 | "price_df,instrument_df,equity_df = get_price_instrument_equity(\"cn_stock_price_2012_2018.csv\",\"cn_instrument_info_2012_2018.csv\",\"cn_equity_daily_2012_2018.csv\",\"sectorCode\")" 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": 3, 36 | "metadata": {}, 37 | "outputs": [ 38 | { 39 | "data": { 40 | "text/html": [ 41 | "
\n", 42 | "\n", 43 | " \n", 44 | " \n", 45 | " \n", 46 | " \n", 47 | " \n", 48 | " \n", 49 | " \n", 50 | " \n", 51 | " \n", 52 | " \n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | "
returnclosetotal_turnovervolumeweekmonthreport_quartermarket_capa_share_market_val_2cash_received_from_sales_of_goodspb_rationet_profitps_ratiosectorCode
dateorder_book_id
2012-01-04000001.XSHE-0.0275825.12242.275637e+0840894428.00.57750.4331NaNNaNNaNNaNNaNNaNNaNFinancials
000002.XSHE-0.0187426.05253.559891e+0847432958.00.37110.40302011q38.059489e+107.082120e+107.516785e+101.52164.106349e+090.8679Financials
000004.XSHE-0.0222507.91003.763833e+06465469.00.57200.75062011q36.642556e+086.634549e+085.949968e+078.81754.500363e+0637.5796HealthCare
000005.XSHE0.0000003.86000.000000e+000.00.00000.00002011q33.529328e+093.527048e+092.565851e+075.34801.365665e+07-347.2191Industrials
000006.XSHE-0.0097562.67667.619286e+062513811.00.14160.16672011q34.015370e+093.929464e+092.531436e+091.43482.763917e+081.4139Financials
\n", 170 | "
" 171 | ], 172 | "text/plain": [ 173 | " return close total_turnover volume \\\n", 174 | "date order_book_id \n", 175 | "2012-01-04 000001.XSHE -0.027582 5.1224 2.275637e+08 40894428.0 \n", 176 | " 000002.XSHE -0.018742 6.0525 3.559891e+08 47432958.0 \n", 177 | " 000004.XSHE -0.022250 7.9100 3.763833e+06 465469.0 \n", 178 | " 000005.XSHE 0.000000 3.8600 0.000000e+00 0.0 \n", 179 | " 000006.XSHE -0.009756 2.6766 7.619286e+06 2513811.0 \n", 180 | "\n", 181 | " week month report_quarter market_cap \\\n", 182 | "date order_book_id \n", 183 | "2012-01-04 000001.XSHE 0.5775 0.4331 NaN NaN \n", 184 | " 000002.XSHE 0.3711 0.4030 2011q3 8.059489e+10 \n", 185 | " 000004.XSHE 0.5720 0.7506 2011q3 6.642556e+08 \n", 186 | " 000005.XSHE 0.0000 0.0000 2011q3 3.529328e+09 \n", 187 | " 000006.XSHE 0.1416 0.1667 2011q3 4.015370e+09 \n", 188 | "\n", 189 | " a_share_market_val_2 \\\n", 190 | "date order_book_id \n", 191 | "2012-01-04 000001.XSHE NaN \n", 192 | " 000002.XSHE 7.082120e+10 \n", 193 | " 000004.XSHE 6.634549e+08 \n", 194 | " 000005.XSHE 3.527048e+09 \n", 195 | " 000006.XSHE 3.929464e+09 \n", 196 | "\n", 197 | " cash_received_from_sales_of_goods pb_ratio \\\n", 198 | "date order_book_id \n", 199 | "2012-01-04 000001.XSHE NaN NaN \n", 200 | " 000002.XSHE 7.516785e+10 1.5216 \n", 201 | " 000004.XSHE 5.949968e+07 8.8175 \n", 202 | " 000005.XSHE 2.565851e+07 5.3480 \n", 203 | " 000006.XSHE 2.531436e+09 1.4348 \n", 204 | "\n", 205 | " net_profit ps_ratio sectorCode \n", 206 | "date order_book_id \n", 207 | "2012-01-04 000001.XSHE NaN NaN Financials \n", 208 | " 000002.XSHE 4.106349e+09 0.8679 Financials \n", 209 | " 000004.XSHE 4.500363e+06 37.5796 HealthCare \n", 210 | " 000005.XSHE 1.365665e+07 -347.2191 Industrials \n", 211 | " 000006.XSHE 2.763917e+08 1.4139 Financials " 212 | ] 213 | }, 214 | "execution_count": 3, 215 | "metadata": {}, 216 | "output_type": "execute_result" 217 | } 218 | ], 219 | "source": [ 220 | "equity_df.head()" 221 | ] 222 | }, 223 | { 224 | 
"cell_type": "code", 225 | "execution_count": 4, 226 | "metadata": {}, 227 | "outputs": [ 228 | { 229 | "data": { 230 | "text/plain": [ 231 | "164" 232 | ] 233 | }, 234 | "execution_count": 4, 235 | "metadata": {}, 236 | "output_type": "execute_result" 237 | } 238 | ], 239 | "source": [ 240 | "healthcareUniverse = instrument_df.index[instrument_df.sectorCode=='HealthCare'].values\n", 241 | "len(healthcareUniverse)" 242 | ] 243 | }, 244 | { 245 | "cell_type": "code", 246 | "execution_count": 5, 247 | "metadata": {}, 248 | "outputs": [], 249 | "source": [ 250 | "def equity_universe_filtering(equity_df, universe):\n", 251 | " universeFilter = [book_id in set(universe) for book_id in equity_df.index.get_level_values(level=1).values]\n", 252 | " return equity_df[universeFilter]" 253 | ] 254 | }, 255 | { 256 | "cell_type": "code", 257 | "execution_count": 6, 258 | "metadata": {}, 259 | "outputs": [ 260 | { 261 | "data": { 262 | "text/html": [ 263 | "
\n", 264 | "\n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | "
returnclosetotal_turnovervolumeweekmonthreport_quartermarket_capa_share_market_val_2cash_received_from_sales_of_goodspb_rationet_profitps_ratiosectorCode
dateorder_book_id
2012-01-04000004.XSHE-0.0222507.91003763832.88465469.00.57200.75062011q36.642556e+086.634549e+085.949968e+078.81754.500363e+0637.5796HealthCare
000028.XSHE-0.04543319.84229326924.28450553.00.42010.27222011q35.872485e+094.753820e+091.053298e+104.34932.481834e+080.3414HealthCare
000150.XSHE-0.0302953.17373109304.50952600.00.34600.36102011q31.036800e+091.036800e+094.913279e+071.47633.657858e+067.8956HealthCare
000153.XSHE-0.0280535.77009673054.491596020.00.68302.45942011q31.531454e+091.360856e+091.329425e+092.11691.560397e+070.7818HealthCare
000403.XSHE0.0000003.16250.000.00.00000.0000NaNNaNNaNNaNNaNNaNNaNHealthCare
\n", 392 | "
" 393 | ], 394 | "text/plain": [ 395 | " return close total_turnover volume \\\n", 396 | "date order_book_id \n", 397 | "2012-01-04 000004.XSHE -0.022250 7.9100 3763832.88 465469.0 \n", 398 | " 000028.XSHE -0.045433 19.8422 9326924.28 450553.0 \n", 399 | " 000150.XSHE -0.030295 3.1737 3109304.50 952600.0 \n", 400 | " 000153.XSHE -0.028053 5.7700 9673054.49 1596020.0 \n", 401 | " 000403.XSHE 0.000000 3.1625 0.00 0.0 \n", 402 | "\n", 403 | " week month report_quarter market_cap \\\n", 404 | "date order_book_id \n", 405 | "2012-01-04 000004.XSHE 0.5720 0.7506 2011q3 6.642556e+08 \n", 406 | " 000028.XSHE 0.4201 0.2722 2011q3 5.872485e+09 \n", 407 | " 000150.XSHE 0.3460 0.3610 2011q3 1.036800e+09 \n", 408 | " 000153.XSHE 0.6830 2.4594 2011q3 1.531454e+09 \n", 409 | " 000403.XSHE 0.0000 0.0000 NaN NaN \n", 410 | "\n", 411 | " a_share_market_val_2 \\\n", 412 | "date order_book_id \n", 413 | "2012-01-04 000004.XSHE 6.634549e+08 \n", 414 | " 000028.XSHE 4.753820e+09 \n", 415 | " 000150.XSHE 1.036800e+09 \n", 416 | " 000153.XSHE 1.360856e+09 \n", 417 | " 000403.XSHE NaN \n", 418 | "\n", 419 | " cash_received_from_sales_of_goods pb_ratio \\\n", 420 | "date order_book_id \n", 421 | "2012-01-04 000004.XSHE 5.949968e+07 8.8175 \n", 422 | " 000028.XSHE 1.053298e+10 4.3493 \n", 423 | " 000150.XSHE 4.913279e+07 1.4763 \n", 424 | " 000153.XSHE 1.329425e+09 2.1169 \n", 425 | " 000403.XSHE NaN NaN \n", 426 | "\n", 427 | " net_profit ps_ratio sectorCode \n", 428 | "date order_book_id \n", 429 | "2012-01-04 000004.XSHE 4.500363e+06 37.5796 HealthCare \n", 430 | " 000028.XSHE 2.481834e+08 0.3414 HealthCare \n", 431 | " 000150.XSHE 3.657858e+06 7.8956 HealthCare \n", 432 | " 000153.XSHE 1.560397e+07 0.7818 HealthCare \n", 433 | " 000403.XSHE NaN NaN HealthCare " 434 | ] 435 | }, 436 | "execution_count": 6, 437 | "metadata": {}, 438 | "output_type": "execute_result" 439 | } 440 | ], 441 | "source": [ 442 | "healthcare_equity_df = equity_universe_filtering(equity_df, healthcareUniverse)\n", 
443 | "healthcare_equity_df.head()" 444 | ] 445 | }, 446 | { 447 | "cell_type": "code", 448 | "execution_count": 7, 449 | "metadata": {}, 450 | "outputs": [ 451 | { 452 | "name": "stdout", 453 | "output_type": "stream", 454 | "text": [ 455 | "universe ratio: 6.210331877919959%\n" 456 | ] 457 | } 458 | ], 459 | "source": [ 460 | "print(\"universe ratio: {}%\".format(len(healthcare_equity_df)/len(equity_df)*100))" 461 | ] 462 | }, 463 | { 464 | "cell_type": "markdown", 465 | "metadata": {}, 466 | "source": [ 467 | "### benchmark" 468 | ] 469 | }, 470 | { 471 | "cell_type": "code", 472 | "execution_count": 27, 473 | "metadata": {}, 474 | "outputs": [], 475 | "source": [ 476 | "benchmark_df = pd.read_csv(\"cn_SH_healthcare_index_2012_2018.csv\",names=['date','value'])\n", 477 | "benchmark_df = benchmark_df.set_index('date',drop=True)" 478 | ] 479 | }, 480 | { 481 | "cell_type": "code", 482 | "execution_count": 33, 483 | "metadata": {}, 484 | "outputs": [ 485 | { 486 | "data": { 487 | "text/html": [ 488 | "
\n", 489 | "\n", 490 | " \n", 491 | " \n", 492 | " \n", 493 | " \n", 494 | " \n", 495 | " \n", 496 | " \n", 497 | " \n", 498 | " \n", 499 | " \n", 500 | " \n", 501 | " \n", 502 | " \n", 503 | " \n", 504 | " \n", 505 | " \n", 506 | " \n", 507 | " \n", 508 | " \n", 509 | " \n", 510 | " \n", 511 | " \n", 512 | " \n", 513 | " \n", 514 | " \n", 515 | " \n", 516 | " \n", 517 | " \n", 518 | " \n", 519 | " \n", 520 | " \n", 521 | " \n", 522 | " \n", 523 | " \n", 524 | " \n", 525 | " \n", 526 | " \n", 527 | " \n", 528 | " \n", 529 | "
valuereturn
date
2012-01-042891.4620.000000
2012-01-052766.9550.044015
2012-01-062744.7930.008042
2012-01-092833.219-0.031708
2012-01-102929.594-0.033450
\n", 530 | "
" 531 | ], 532 | "text/plain": [ 533 | " value return\n", 534 | "date \n", 535 | "2012-01-04 2891.462 0.000000\n", 536 | "2012-01-05 2766.955 0.044015\n", 537 | "2012-01-06 2744.793 0.008042\n", 538 | "2012-01-09 2833.219 -0.031708\n", 539 | "2012-01-10 2929.594 -0.033450" 540 | ] 541 | }, 542 | "execution_count": 33, 543 | "metadata": {}, 544 | "output_type": "execute_result" 545 | } 546 | ], 547 | "source": [ 548 | "benchmark_df['return'] = np.log(benchmark_df.shift(1)/benchmark_df).fillna(0)\n", 549 | "benchmark_df.head()" 550 | ] 551 | }, 552 | { 553 | "cell_type": "markdown", 554 | "metadata": {}, 555 | "source": [ 556 | "## Factor Returns" 557 | ] 558 | }, 559 | { 560 | "cell_type": "code", 561 | "execution_count": 8, 562 | "metadata": {}, 563 | "outputs": [], 564 | "source": [ 565 | "def equity_factor_return(equity_df, factorColumn, nAllocations, longTop=True):\n", 566 | " equity_copy = equity_df.copy()\n", 567 | "# equity_copy[\"{}_rank\".format(factorColumn)] = equity_copy.groupby(level='date')[factorColumn].rank()\n", 568 | "# equity_copy[equity_copy.groupby(level='date')[factorColumn].nlargest(nAllocations).index][\"biggest_{}_{}\".format(nAllocations,factorColumn)]=True\n", 569 | " largest = equity_copy[factorColumn].groupby(level='date').nlargest(nAllocations).reset_index(level=0,drop=True)\n", 570 | " smallest = equity_copy[factorColumn].groupby(level='date').nsmallest(nAllocations).reset_index(level=0,drop=True)\n", 571 | " r_largest = equity_copy.loc[largest.index,'return'].groupby(level='date').mean()\n", 572 | " r_smallest = equity_copy.loc[smallest.index,'return'].groupby(level='date').mean()\n", 573 | " LMS = r_largest - r_smallest\n", 574 | " if(longTop):\n", 575 | " return LMS\n", 576 | " else:\n", 577 | " return -LMS" 578 | ] 579 | }, 580 | { 581 | "cell_type": "code", 582 | "execution_count": 9, 583 | "metadata": {}, 584 | "outputs": [ 585 | { 586 | "data": { 587 | "text/plain": [ 588 | "date\n", 589 | "2012-01-04 0.005983\n", 590 | 
"2012-01-05 -0.009098\n", 591 | "2012-01-06 -0.004155\n", 592 | "2012-01-09 0.014615\n", 593 | "2012-01-10 0.006728\n", 594 | "Name: return, dtype: float64" 595 | ] 596 | }, 597 | "execution_count": 9, 598 | "metadata": {}, 599 | "output_type": "execute_result" 600 | } 601 | ], 602 | "source": [ 603 | "SMB = equity_factor_return(healthcare_equity_df, 'market_cap', 20,longTop=False)\n", 604 | "SMB.head()" 605 | ] 606 | }, 607 | { 608 | "cell_type": "code", 609 | "execution_count": 10, 610 | "metadata": {}, 611 | "outputs": [ 612 | { 613 | "data": { 614 | "text/plain": [ 615 | "date\n", 616 | "2012-01-04 0.005302\n", 617 | "2012-01-05 -0.007223\n", 618 | "2012-01-06 0.006031\n", 619 | "2012-01-09 -0.002597\n", 620 | "2012-01-10 -0.010780\n", 621 | "Name: return, dtype: float64" 622 | ] 623 | }, 624 | "execution_count": 10, 625 | "metadata": {}, 626 | "output_type": "execute_result" 627 | } 628 | ], 629 | "source": [ 630 | "HML = equity_factor_return(healthcare_equity_df, 'pb_ratio', 20,longTop=True)\n", 631 | "HML.head()" 632 | ] 633 | }, 634 | { 635 | "cell_type": "code", 636 | "execution_count": 11, 637 | "metadata": {}, 638 | "outputs": [], 639 | "source": [ 640 | "import itertools\n", 641 | "import statsmodels.api as sm\n", 642 | "from statsmodels import regression,stats\n", 643 | "import scipy\n", 644 | "\n", 645 | "data = healthcare_equity_df[['return']] # dataframe\n", 646 | "data = data.set_index(healthcare_equity_df.index) # elimilate redundant index (whole universe)\n", 647 | "asset_list_sizes = [group[1].size for group in data.groupby(level=0)]\n", 648 | "\n", 649 | "# Spreading the factor portfolio data across all assets for each day\n", 650 | "SMB_column = [[SMB.loc[group[0]]] * size for group, size \\\n", 651 | " in zip(data.groupby(level=0), asset_list_sizes)]\n", 652 | "data['SMB'] = list(itertools.chain(*SMB_column))\n", 653 | "\n", 654 | "HML_column = [[HML.loc[group[0]]] * size for group, size \\\n", 655 | " in zip(data.groupby(level=0), 
asset_list_sizes)]\n", 656 | "data['HML'] = list(itertools.chain(*HML_column))\n", 657 | "data = sm.add_constant(data.dropna())" 658 | ] 659 | }, 660 | { 661 | "cell_type": "code", 662 | "execution_count": 12, 663 | "metadata": {}, 664 | "outputs": [ 665 | { 666 | "data": { 667 | "text/html": [ 668 | "
\n", 669 | "\n", 670 | " \n", 671 | " \n", 672 | " \n", 673 | " \n", 674 | " \n", 675 | " \n", 676 | " \n", 677 | " \n", 678 | " \n", 679 | " \n", 680 | " \n", 681 | " \n", 682 | " \n", 683 | " \n", 684 | " \n", 685 | " \n", 686 | " \n", 687 | " \n", 688 | " \n", 689 | " \n", 690 | " \n", 691 | " \n", 692 | " \n", 693 | " \n", 694 | " \n", 695 | " \n", 696 | " \n", 697 | " \n", 698 | " \n", 699 | " \n", 700 | " \n", 701 | " \n", 702 | " \n", 703 | " \n", 704 | " \n", 705 | " \n", 706 | " \n", 707 | " \n", 708 | " \n", 709 | " \n", 710 | " \n", 711 | " \n", 712 | " \n", 713 | " \n", 714 | " \n", 715 | " \n", 716 | " \n", 717 | " \n", 718 | " \n", 719 | " \n", 720 | " \n", 721 | " \n", 722 | " \n", 723 | " \n", 724 | " \n", 725 | " \n", 726 | "
constreturnSMBHML
dateorder_book_id
2012-01-04000004.XSHE1.0-0.0222500.0059830.005302
000028.XSHE1.0-0.0454330.0059830.005302
000150.XSHE1.0-0.0302950.0059830.005302
000153.XSHE1.0-0.0280530.0059830.005302
000403.XSHE1.00.0000000.0059830.005302
\n", 727 | "
" 728 | ], 729 | "text/plain": [ 730 | " const return SMB HML\n", 731 | "date order_book_id \n", 732 | "2012-01-04 000004.XSHE 1.0 -0.022250 0.005983 0.005302\n", 733 | " 000028.XSHE 1.0 -0.045433 0.005983 0.005302\n", 734 | " 000150.XSHE 1.0 -0.030295 0.005983 0.005302\n", 735 | " 000153.XSHE 1.0 -0.028053 0.005983 0.005302\n", 736 | " 000403.XSHE 1.0 0.000000 0.005983 0.005302" 737 | ] 738 | }, 739 | "execution_count": 12, 740 | "metadata": {}, 741 | "output_type": "execute_result" 742 | } 743 | ], 744 | "source": [ 745 | "data.head()" 746 | ] 747 | }, 748 | { 749 | "cell_type": "markdown", 750 | "metadata": {}, 751 | "source": [ 752 | "## Factor Exposures ($\\beta$)" 753 | ] 754 | }, 755 | { 756 | "cell_type": "code", 757 | "execution_count": 13, 758 | "metadata": {}, 759 | "outputs": [], 760 | "source": [ 761 | "assets = data.index.levels[1].unique()\n", 762 | "Y = [data.xs(asset,level=1)['return'] for asset in assets]\n", 763 | "X = [data.xs(asset,level=1)[['SMB','HML','const']] for asset in assets]\n", 764 | "reg_results = [regression.linear_model.OLS(y,x).fit().params for y,x in zip(Y,X) if not(x.empty or y.empty)]\n", 765 | "indices = [asset for y, x, asset in zip(Y, X, assets) if not(x.empty or y.empty)]\n", 766 | "betas = pd.DataFrame(reg_results, index=indices)" 767 | ] 768 | }, 769 | { 770 | "cell_type": "code", 771 | "execution_count": 15, 772 | "metadata": {}, 773 | "outputs": [ 774 | { 775 | "data": { 776 | "text/html": [ 777 | "
\n", 778 | "\n", 779 | " \n", 780 | " \n", 781 | " \n", 782 | " \n", 783 | " \n", 784 | " \n", 785 | " \n", 786 | " \n", 787 | " \n", 788 | " \n", 789 | " \n", 790 | " \n", 791 | " \n", 792 | " \n", 793 | " \n", 794 | " \n", 795 | " \n", 796 | " \n", 797 | " \n", 798 | " \n", 799 | " \n", 800 | " \n", 801 | " \n", 802 | " \n", 803 | " \n", 804 | " \n", 805 | " \n", 806 | " \n", 807 | " \n", 808 | " \n", 809 | " \n", 810 | " \n", 811 | " \n", 812 | " \n", 813 | " \n", 814 | " \n", 815 | " \n", 816 | " \n", 817 | " \n", 818 | " \n", 819 | "
SMBHMLconst
000004.XSHE0.8839060.0487570.002002
000028.XSHE-0.003029-0.0642950.001073
000150.XSHE0.3541220.0660710.002031
000153.XSHE0.620706-0.0822290.001405
000403.XSHE2.03219211.457418-0.017412
\n", 820 | "
" 821 | ], 822 | "text/plain": [ 823 | " SMB HML const\n", 824 | "000004.XSHE 0.883906 0.048757 0.002002\n", 825 | "000028.XSHE -0.003029 -0.064295 0.001073\n", 826 | "000150.XSHE 0.354122 0.066071 0.002031\n", 827 | "000153.XSHE 0.620706 -0.082229 0.001405\n", 828 | "000403.XSHE 2.032192 11.457418 -0.017412" 829 | ] 830 | }, 831 | "execution_count": 15, 832 | "metadata": {}, 833 | "output_type": "execute_result" 834 | } 835 | ], 836 | "source": [ 837 | "betas.head()" 838 | ] 839 | }, 840 | { 841 | "cell_type": "markdown", 842 | "metadata": {}, 843 | "source": [ 844 | "## Factor Premium" 845 | ] 846 | }, 847 | { 848 | "cell_type": "code", 849 | "execution_count": 36, 850 | "metadata": {}, 851 | "outputs": [ 852 | { 853 | "data": { 854 | "text/html": [ 855 | "\n", 856 | "\n", 857 | "\n", 858 | " \n", 859 | "\n", 860 | "\n", 861 | " \n", 862 | "\n", 863 | "\n", 864 | " \n", 865 | "\n", 866 | "\n", 867 | " \n", 868 | "\n", 869 | "\n", 870 | " \n", 871 | "\n", 872 | "\n", 873 | " \n", 874 | "\n", 875 | "\n", 876 | " \n", 877 | "\n", 878 | "\n", 879 | " \n", 880 | "\n", 881 | "\n", 882 | " \n", 883 | "\n", 884 | "
OLS Regression Results
Dep. Variable: return R-squared: 0.398
Model: OLS Adj. R-squared: 0.391
Method: Least Squares F-statistic: 53.26
Date: Sat, 05 May 2018 Prob (F-statistic): 1.77e-18
Time: 21:03:25 Log-Likelihood: 1012.1
No. Observations: 164 AIC: -2018.
Df Residuals: 161 BIC: -2009.
Df Model: 2
Covariance Type: nonrobust
\n", 885 | "\n", 886 | "\n", 887 | " \n", 888 | "\n", 889 | "\n", 890 | " \n", 891 | "\n", 892 | "\n", 893 | " \n", 894 | "\n", 895 | "\n", 896 | " \n", 897 | "\n", 898 | "
coef std err t P>|t| [0.025 0.975]
const 0.0017 6.72e-05 24.956 0.000 0.002 0.002
SMB -7.597e-05 0.000 -0.599 0.550 -0.000 0.000
HML 0.0005 4.81e-05 9.695 0.000 0.000 0.001
\n", 899 | "\n", 900 | "\n", 901 | " \n", 902 | "\n", 903 | "\n", 904 | " \n", 905 | "\n", 906 | "\n", 907 | " \n", 908 | "\n", 909 | "\n", 910 | " \n", 911 | "\n", 912 | "
Omnibus: 39.154 Durbin-Watson: 1.906
Prob(Omnibus): 0.000 Jarque-Bera (JB): 78.545
Skew: 1.087 Prob(JB): 8.80e-18
Kurtosis: 5.601 Cond. No. 3.92
" 913 | ], 914 | "text/plain": [ 915 | "\n", 916 | "\"\"\"\n", 917 | " OLS Regression Results \n", 918 | "==============================================================================\n", 919 | "Dep. Variable: return R-squared: 0.398\n", 920 | "Model: OLS Adj. R-squared: 0.391\n", 921 | "Method: Least Squares F-statistic: 53.26\n", 922 | "Date: Sat, 05 May 2018 Prob (F-statistic): 1.77e-18\n", 923 | "Time: 21:03:25 Log-Likelihood: 1012.1\n", 924 | "No. Observations: 164 AIC: -2018.\n", 925 | "Df Residuals: 161 BIC: -2009.\n", 926 | "Df Model: 2 \n", 927 | "Covariance Type: nonrobust \n", 928 | "==============================================================================\n", 929 | " coef std err t P>|t| [0.025 0.975]\n", 930 | "------------------------------------------------------------------------------\n", 931 | "const 0.0017 6.72e-05 24.956 0.000 0.002 0.002\n", 932 | "SMB -7.597e-05 0.000 -0.599 0.550 -0.000 0.000\n", 933 | "HML 0.0005 4.81e-05 9.695 0.000 0.000 0.001\n", 934 | "==============================================================================\n", 935 | "Omnibus: 39.154 Durbin-Watson: 1.906\n", 936 | "Prob(Omnibus): 0.000 Jarque-Bera (JB): 78.545\n", 937 | "Skew: 1.087 Prob(JB): 8.80e-18\n", 938 | "Kurtosis: 5.601 Cond. No. 
3.92\n", 939 | "==============================================================================\n", 940 | "\n", 941 | "Warnings:\n", 942 | "[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.\n", 943 | "\"\"\"" 944 | ] 945 | }, 946 | "execution_count": 36, 947 | "metadata": {}, 948 | "output_type": "execute_result" 949 | } 950 | ], 951 | "source": [ 952 | "betas = sm.add_constant(betas.drop('const', axis=1))\n", 953 | "\n", 954 | "R = data['return'].mean(axis=0, level=1)\n", 955 | "\n", 956 | "# Second regression step: estimating the risk premia\n", 957 | "risk_free_rate = benchmark_df['return'].mean()\n", 958 | "\n", 959 | "final_results = regression.linear_model.OLS(R - risk_free_rate, betas).fit()\n", 960 | "\n", 961 | "final_results.summary()" 962 | ] 963 | }, 964 | { 965 | "cell_type": "markdown", 966 | "metadata": {}, 967 | "source": [ 968 | "## Fama-Macbeth Test Conclusion: \n", 969 | "although our individual factors are significant, we have a very low $R^2$ . What this may suggest is that there is a real link between our factors and the returns of our assets, but that there still remains a lot of unexplained noise!" 
970 | ] 971 | }, 972 | { 973 | "cell_type": "code", 974 | "execution_count": null, 975 | "metadata": {}, 976 | "outputs": [], 977 | "source": [] 978 | } 979 | ], 980 | "metadata": { 981 | "kernelspec": { 982 | "display_name": "Python 3", 983 | "language": "python", 984 | "name": "python3" 985 | }, 986 | "language_info": { 987 | "codemirror_mode": { 988 | "name": "ipython", 989 | "version": 3 990 | }, 991 | "file_extension": ".py", 992 | "mimetype": "text/x-python", 993 | "name": "python", 994 | "nbconvert_exporter": "python", 995 | "pygments_lexer": "ipython3", 996 | "version": "3.5.4" 997 | } 998 | }, 999 | "nbformat": 4, 1000 | "nbformat_minor": 2 1001 | } 1002 | -------------------------------------------------------------------------------- /source/rqdata_utils.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import alphalens as al 3 | import matplotlib.pyplot as plt 4 | 5 | def price_reader(price_path): 6 | price_df = pd.read_csv(price_path) 7 | price_df.rename(index=str,columns={"Unnamed: 0":"date"},inplace=True) 8 | price_df.date = pd.to_datetime(price_df.date,format="%Y-%m-%d",errors='ignore') 9 | # price_df.date = price_df.date.apply(timezone.localize) 10 | price_df.set_index(['date'],drop=True,inplace=True) 11 | price_df = price_df.sortlevel(axis=1) 12 | return price_df 13 | 14 | def instrument_reader(instrument_path): 15 | instrument_df = pd.read_csv(instrument_path) 16 | instrument_df.drop(['Unnamed: 0'],axis=1,inplace=True) 17 | instrument_df = instrument_df.set_index(['bookId']) 18 | instrument_df = instrument_df.sort_index() 19 | return instrument_df 20 | 21 | def equity_reader(equity_path): 22 | cn_df = pd.read_csv(equity_path) 23 | cn_df.date = pd.to_datetime(cn_df.date,format="%Y-%m-%d",errors='ignore') 24 | cn_df.set_index(['date','order_book_id'],drop=True,inplace=True) 25 | cn_df.drop(["Unnamed: 0"],axis=1,inplace=True) 26 | return cn_df 27 | 28 | def 
def equity_add_instrumentInfo(cn_df, instrument_df, instrument_column):
    """Broadcast one static per-instrument attribute onto an equity panel.

    Parameters
    ----------
    cn_df : pd.DataFrame
        Equity panel indexed by a ('date', 'order_book_id') MultiIndex.
        Mutated in place: the new column is added to it.
    instrument_df : pd.DataFrame
        Per-instrument table indexed by book id.
    instrument_column : str
        Column of ``instrument_df`` to copy onto every matching row of
        ``cn_df``.

    Returns
    -------
    pd.DataFrame
        ``cn_df`` with ``instrument_column`` added.
    """
    instrument_series = instrument_df[instrument_column]
    book_ids = cn_df.index.get_level_values('order_book_id')
    # Label-based lookup repeats the static attribute for every
    # (date, order_book_id) row.  The original used Index.get_values(),
    # which was deprecated in pandas 0.25 and removed in 1.0; ``.values``
    # is the portable equivalent, and the ``[:]`` copy was redundant.
    cn_df[instrument_column] = instrument_series.loc[book_ids.values].values
    return cn_df

def get_price_instrument_equity(price_path, instrument_path, equity_path, addInstrumentColumn=None):
    """Load the price, instrument-info and equity CSV files in one call.

    Parameters
    ----------
    price_path, instrument_path, equity_path : str
        Paths to the three CSV exports consumed by ``price_reader``,
        ``instrument_reader`` and ``equity_reader`` respectively.
    addInstrumentColumn : str, optional
        If given (and truthy), this instrument-info column is broadcast
        onto the equity panel via ``equity_add_instrumentInfo``.

    Returns
    -------
    tuple of pd.DataFrame
        ``(price_df, instrument_df, equity_df)``.
    """
    price_df = price_reader(price_path)
    instrument_df = instrument_reader(instrument_path)
    equity_df = equity_reader(equity_path)
    if addInstrumentColumn:
        equity_df = equity_add_instrumentInfo(equity_df, instrument_df, addInstrumentColumn)
    return price_df, instrument_df, equity_df

def ic_analysis(equity_df, price_df, factor_columns, group_column, periods=(1,22,66), group_adjust=False):
    """Compute mean Spearman rank ICs for several factors via alphalens.

    For each column in ``factor_columns``, builds alphalens "clean factor"
    data against ``price_df``, computes (a) the mean information
    coefficient by group and (b) the mean monthly IC, prints the per-group
    table, and plots the monthly IC heatmap (side effects: stdout and a
    matplotlib figure per factor).

    Parameters
    ----------
    equity_df : pd.DataFrame
        Panel indexed by ('date', 'order_book_id') containing the factor
        columns and ``group_column``.
    price_df : pd.DataFrame
        Wide price frame (dates x assets) for forward-return computation.
    factor_columns : sequence of str
        Factor columns of ``equity_df`` to analyse.
    group_column : str
        Column of ``equity_df`` used as the alphalens grouping key.
    periods : tuple of int, optional
        Forward-return horizons in trading days (default daily, monthly,
        quarterly).
    group_adjust : bool, optional
        Passed through to alphalens' IC computation.

    Returns
    -------
    (pd.DataFrame, list of pd.DataFrame)
        The per-group mean ICs concatenated across factors (MultiIndex
        levels ['factor', 'group']), and the list of monthly-IC frames in
        ``factor_columns`` order.
    """
    ic_list = []
    monthly_ic_list = []
    groupby = equity_df[group_column]
    for col in factor_columns:
        my_factor = equity_df[col]
        # max_loss=1 tolerates dropping up to 100% of the factor data when
        # aligning with prices, so sparse factors never raise MaxLossError.
        factor_data = al.utils.get_clean_factor_and_forward_returns(factor=my_factor,
                                                                    prices=price_df,
                                                                    groupby=groupby,
                                                                    periods=periods,
                                                                    max_loss=1)
        mean_ic = al.performance.mean_information_coefficient(factor_data, group_adjust=group_adjust,
                                                              by_group=True,
                                                              by_time=None)
        mean_monthly_ic = al.performance.mean_information_coefficient(factor_data, group_adjust=group_adjust,
                                                                      by_group=False,
                                                                      by_time='M')
        print("#######################################################")
        print("factor: {}".format(my_factor.name))
        print(mean_ic)
        ic_list.append(mean_ic)
        monthly_ic_list.append(mean_monthly_ic)
        al.plotting.plot_monthly_ic_heatmap(mean_monthly_ic)
        plt.show()

    mean_ic_df = pd.concat(ic_list, keys=factor_columns)
    mean_ic_df.index = mean_ic_df.index.set_names(['factor', 'group'])
    return mean_ic_df, monthly_ic_list