├── .gitignore
├── LICENSE
├── MANIFEST.in
├── README.md
├── docs
│   └── API文档.md
├── jqfactor_analyzer
│   ├── __init__.py
│   ├── analyze.py
│   ├── attribution.py
│   ├── compat.py
│   ├── config.json
│   ├── data.py
│   ├── exceptions.py
│   ├── factor_cache.py
│   ├── performance.py
│   ├── plot_utils.py
│   ├── plotting.py
│   ├── prepare.py
│   ├── preprocess.py
│   ├── sample.py
│   ├── sample_data
│   │   ├── VOL5.csv
│   │   ├── index_weight_info.csv
│   │   └── weight_info.csv
│   ├── utils.py
│   ├── version.py
│   └── when.py
├── requirements.txt
├── setup.cfg
├── setup.py
└── tests
    ├── __init__.py
    ├── test_attribution.py
    ├── test_data.py
    ├── test_performance.py
    └── test_prepare.py

/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | *.egg-info/
24 | .installed.cfg
25 | *.egg
26 | MANIFEST
27 |
28 | # PyInstaller
29 | # Usually these files are written by a python script from a template
30 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
31 | *.manifest
32 | *.spec
33 |
34 | # Installer logs
35 | pip-log.txt
36 | pip-delete-this-directory.txt
37 |
38 | # Unit test / coverage reports
39 | htmlcov/
40 | .tox/
41 | .coverage
42 | .coverage.*
43 | .cache
44 | nosetests.xml
45 | coverage.xml
46 | *.cover
47 | .hypothesis/
48 | .pytest_cache/
49 |
50 | # Translations
51 | *.mo
52 | *.pot
53 |
54 | # Django stuff:
55 | *.log
56 | local_settings.py
57 | db.sqlite3
58 |
59 | # Flask stuff:
60 | instance/
61 | .webassets-cache
62 |
63 | # Scrapy stuff:
64 | .scrapy
65 |
66 | # Sphinx documentation
67 | docs/_build/
68 |
69 | # PyBuilder
70 | target/
71 |
72 | # Jupyter Notebook
73 | .ipynb_checkpoints
74 |
75 | # pyenv
76 | .python-version
77 |
78 | # celery beat schedule file
79 | celerybeat-schedule
80 |
81 | # SageMath parsed files
82 | *.sage.py
83 |
84 | # Environments
85 | .env
86 | .venv
87 | env/
88 | venv/
89 | ENV/
90 | env.bak/
91 | venv.bak/
92 |
93 | # Spyder project settings
94 | .spyderproject
95 | .spyproject
96 |
97 | # Rope project settings
98 | .ropeproject
99 |
100 | # mkdocs documentation
101 | /site
102 |
103 | # mypy
104 | .mypy_cache/
105 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2019 JoinQuant
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include LICENSE
2 | include *.txt
3 | include jqfactor_analyzer/sample_data/*.csv
4 | include jqfactor_analyzer/config.json
5 |
--------------------------------------------------------------------------------
/docs/API文档.md:
--------------------------------------------------------------------------------
1 | # **API文档**
2 |
3 | ## 一、因子缓存factor_cache模块
4 |
5 | 在本地进行分析时,为了提高数据获取的速度并避免反复从服务端获取数据,增加了本地数据缓存的方法。
6 |
7 | 注意缓存格式为pyarrow.feather格式,pyarrow库不同版本之间可能存在兼容问题,建议不要随意修改pyarrow库的版本,如果修改后产生大量缓存文件无法读取(提示已损坏)的情况,建议删除整个缓存目录后重新缓存。
8 |
9 | ### 1. 设置缓存目录
10 |
11 | 对于单因子分析和归因分析中使用到的市值/价格和风格因子等数据,默认会缓存到用户的主目录( `os.path.expanduser('~/jqfactor_datacache/bundle')` )。一般地,在 Unix 系统上可能是 `/home/username/jqfactor_datacache/bundle`,而在 Windows 系统上可能是 `C:\Users\username\jqfactor_datacache\bundle`。
12 |
13 | 您可以通过以下代码修改配置信息来设置为其他路径,设置过一次后,后续都将沿用这个路径,不用重复设置。
14 |
15 | ```python
16 | from jqfactor_analyzer.factor_cache import set_cache_dir, get_cache_dir
17 | set_cache_dir(my_path)   # 设置缓存目录为my_path
18 | print(get_cache_dir())   # 输出缓存目录
19 | ```
20 |
21 | ### 2. 缓存/检查缓存和读取已缓存数据
22 |
23 | 除了对单因子分析及归因分析依赖的数据进行缓存外,factor_cache还可以缓存自定义的因子组(仅限聚宽因子库中支持的因子)。
24 |
25 | ```python
26 | def save_factor_values_by_group(start_date, end_date, factor_names='prices',
27 |                                 group_name=None, overwrite=False, cache_dir=None, show_progress=True):
28 |     """将因子库数据按因子组储存到本地,根据factor_names因子列表(顺序无关)自动生成因子组的名称
29 |     start_date : 开始时间
30 |     end_date : 结束时间
31 |     factor_names : 因子组所含因子的名称,除了因子库中支持的因子外,还支持指定为'prices'缓存价格数据
32 |     group_name : 因子组名称,不指定时使用get_factor_folder自动生成因子组名(即缓存文件夹名),如果指定则按照指定的名称生成文件夹名(使用get_factor_values_by_cache时,需要自行指定factor_path)
33 |     overwrite : 文件已存在时是否覆盖更新,默认为False即增量更新,文件已存在时跳过
34 |     cache_dir : 缓存的路径,如果没有指定则使用配置信息中的路径,一般不用指定
35 |     show_progress : 是否展示缓存进度,默认为True
36 |     返回 : 因子组储存的路径 , 文件以天为单位储存为feather文件,每天一个feather文件,每月一个文件夹,columns为因子名称, index为当天在市的所有标的代码
37 |     """
38 | def get_factor_values_by_cache(date, codes=None, factor_names=None, group_name=None,
39 |                                factor_path=None):
40 |     """从缓存的文件读取因子数据,文件不存在时返回空的dataframe
41 |     date : 日期
42 |     codes : 标的代码,默认为None获取当天在市的所有标的
43 |     factor_names : 因子列表(顺序无关),当指定factor_path/group_name时失效
44 |     group_name : 因子组名,如果缓存时指定了group_name,则获取时必须也指定group_name或factor_path
45 |     factor_path : 可选参数,因子组的路径,一般不用指定
46 |     返回:
47 |     如果缓存文件存在,则返回当天的因子数据,index是标的代码,columns是因子名
48 |     如果缓存文件不存在,则返回空的dataframe, 建议在使用get_factor_values_by_cache前,先运行save_factor_values_by_group检查时间区间内的缓存文件是否完整
49 |     """
50 | def get_factor_folder(factor_names, group_name=None):
51 |     """获取因子组的文件夹名(文件夹位于get_cache_dir()获取的缓存目录下)
52 |     factor_names : 因子储存时,如果未指定group_name,则根据因子列表(顺序无关)获取md5值生成因子组名(即储存的文件夹名),使用此方法可以获取生成的文件夹名称
53 |     group_name : 如果储存时指定了因子组名,则直接返回此因子组名
54 |     """
55 |
56 | ```
57 |
58 | **示例**
59 |
60 | ```python
61 | from jqfactor_analyzer.factor_cache import save_factor_values_by_group,get_factor_values_by_cache,get_factor_folder,get_cache_dir
62 | # import jqdatasdk as jq
63 | # jq.auth("账号",'密码') #登录jqdatasdk来从服务端缓存数据
64 | import pandas as pd
65 | all_factors = jq.get_all_factors()
66 | factor_names = all_factors[all_factors.category=='growth'].factor.tolist() #将聚宽因子库中的成长类因子作为一组因子
67 | group_name = 'growth_factors' #因子组名定义为'growth_factors'
68 | start_date = '2021-01-01'
69 | end_date = '2021-06-01'
70 | # 检查/缓存因子数据
71 | factor_path = save_factor_values_by_group(start_date,end_date,factor_names=factor_names,group_name=group_name,overwrite=False,show_progress=True)
72 | # factor_path = os.path.join(get_cache_dir(), get_factor_folder(factor_names,group_name=group_name)) #等同于save_factor_values_by_group返回的路径
73 |
74 | # 循环获取缓存的因子数据,并拼接
75 | trade_days = jq.get_trade_days(start_date,end_date)
76 | factor_values = {}
77 | for date in trade_days:
78 |     factor_values[date] = get_factor_values_by_cache(date,codes=None,factor_names=factor_names,group_name=group_name, factor_path=factor_path) #这里实际只需要指定group_name或factor_path其中一个;缓存时指定了group_name时,factor_names不生效
79 | factor_values = pd.concat(factor_values)
80 | ```
81 |
82 | ## 二、归因分析模块
83 |
84 | ```python
85 | from jqfactor_analyzer import AttributionAnalysis
86 | AttributionAnalysis(weights, daily_return, style_type='style_pro', industry='sw_l1', use_cn=True, show_data_progress=True)
87 | ```
88 |
89 | **参数 :**
90 |
91 | - `weights`:持仓权重信息,index是日期,columns是标的代码, value对应的是组合当天的仓位占比(单日仓位占比总和不为1时,剩余部分认为是当天的现金)
92 | - `daily_return`:Series,index是日期,values为当天组合的收益率
93 | - `style_type`:归因分析所使用的风格因子类型,可选'style'和'style_pro'中的一个
94 | - `industry`:归因分析所使用的行业分类,可选'sw_l1'和'jq_l1'中的一个
95 | - `use_cn`:绘图时是否使用中文
96 | - `show_data_progress`:是否展示数据获取进度(使用本地缓存,第一次运行时速度较慢,后续对于本地不存在的数据将增量缓存)
97 |
98 | **示例**
99 |
100 | ```python
101 | import pandas as pd
102 | # position_weights.csv 是一个储存了组合权重信息的csv文件,index是日期,columns是股票代码
103 | # position_daily_return.csv 是一个储存了组合日收益率的csv文件,index是日期,daily_return列是日收益
104 | weights = pd.read_csv("position_weights.csv", index_col=0)
105 | returns = pd.read_csv("position_daily_return.csv", index_col=0)['daily_return']
106 |
107 | An = AttributionAnalysis(weights, returns, style_type='style_pro', industry='sw_l1', show_data_progress=True)
108 | ```
109 |
110 |
111 |
112 | ### 1. 属性
113 |
114 | - `style_exposure` : 组合的风格暴露
115 | - `industry_exposure` : 组合的行业暴露
116 | - `exposure_portfolio` : 组合的风格+行业及country暴露
117 | - `attr_daily_returns` : 组合的风格+行业及country日度归因收益率
118 | - `attr_returns` : 组合的风格+行业及country累积归因收益率
119 |
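下面是一个访问这些属性的最小示例(基于上文示例中创建的 `An` 对象,假设其已成功初始化并已连接数据源;其中暴露在初始化时即已计算,归因收益相关属性为惰性属性,首次访问时才会计算):

```python
# 组合每个交易日收盘后的风格+行业+country 暴露
print(An.exposure_portfolio.tail())

# 日度归因收益率与累积归因收益率
daily_attr = An.attr_daily_returns
cum_attr = An.attr_returns
print(cum_attr[['common_return', 'specific_return', 'total_return']].tail())
```
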
120 | ### 2. 方法
121 |
122 | #### (1) 获取组合相对于指数的暴露
123 |
124 | ```python
125 | get_exposure2bench(index_symbol)
126 | ```
127 |
128 | **参数 :**
129 |
130 | - `index_symbol` : 基准指数, 可选`['000300.XSHG','000905.XSHG','000906.XSHG','000852.XSHG','932000.CSI','000985.XSHG']`中的一个
131 |
132 | **返回 :**
133 |
134 | - 一个dataframe,index为日期,columns为风格因子+行业因子+country , 其中country为股票总持仓占比
135 |
136 | #### (2) 获取组合相对于指数的日度归因收益率
137 |
138 | ```python
139 | get_attr_daily_returns2bench(index_symbol)
140 | ```
141 |
142 | 假设组合相对于指数的收益由以下部分构成 : 风格+行业暴露收益(common_return) , 现金闲置收益(cash) , 策略本身的超额收益(specific_return)
143 | **参数 :**
144 |
145 | - `index_symbol` : 基准指数, 可选`['000300.XSHG','000905.XSHG','000906.XSHG','000852.XSHG','932000.CSI','000985.XSHG']`中的一个
146 |
147 | **返回 :**
148 |
149 | - 一个dataframe,index为日期,columns为`风格因子+行业因子+cash+common_return,specific_return,total_return`
150 |
151 | 其中:
152 | cash是假设现金收益(0)相对指数带来的收益率
153 | common_return 为风格+行业总收益率
154 | specific_return 为特异收益率
155 | total_return 为组合相对于指数的总收益
156 |
157 | #### (3) 获取相对于指数的累积归因收益率
158 |
159 | ```python
160 | get_attr_returns2bench(index_symbol)
161 | ```
162 |
163 | 假设组合相对于指数的收益由以下部分构成 : 风格+行业暴露收益(common_return) , 现金闲置收益(cash) , 策略本身的超额收益(specific_return)
164 |
165 | **参数 :**
166 |
167 | - `index_symbol` : 基准指数, 可选`['000300.XSHG','000905.XSHG','000906.XSHG','000852.XSHG','932000.CSI','000985.XSHG']`中的一个
168 |
169 | **返回 :**
170 |
171 | - 一个dataframe,index为日期,columns为`风格因子+行业因子+cash+common_return,specific_return,total_return`
172 |
173 | 其中:
174 | cash是假设现金收益(0)相对指数带来的收益率
175 | common_return 为风格+行业总收益率
176 | specific_return 为特异收益率
177 | total_return 为组合相对于指数的总收益(减法超额)
178 |
179 | ### 3. 绘图方法
180 |
181 | #### (1) 绘制风格暴露时序图
182 |
183 | ```python
184 | plot_exposure(factors='style',index_symbol=None,figsize=(15,7))
185 | ```
186 |
187 | 绘制风格暴露时序
188 |
189 | **参数**
190 |
191 | - factors : 绘制的暴露类型 , 可选 'style'(所有风格因子) , 'industry'(所有行业因子),也可以传递一个list,list为exposure_portfolio中columns的一个或者多个
192 | - index_symbol : 基准指数代码,指定时绘制相对于指数的暴露 , 默认None为组合本身的暴露
193 | - figsize : 画布大小
194 |
195 | #### (2) 绘制归因分析收益时序图
196 |
197 | ```python
198 | plot_returns(factors='style',index_symbol=None,figsize=(15,7))
199 | ```
200 |
201 | 绘制归因分析收益时序
202 |
203 | **参数**
204 |
205 | - factors : 绘制的暴露类型 , 可选 'style'(所有风格因子) , 'industry'(所有行业因子),也可以传递一个list,list为exposure_portfolio中columns的一个或者多个
206 |   同时也支持指定['common_return'(风格总收益),'specific_return'(特异收益),'total_return'(总收益),'country'(国家因子收益,当指定index_symbol时会用现金相对于指数的收益替代)]
207 | - index_symbol : 基准指数代码,指定时绘制相对于指数的暴露 , 默认None为组合本身的暴露
208 | - figsize : 画布大小
209 |
210 | #### (3) 绘制暴露与收益对照图
211 |
212 | ```python
213 | plot_exposure_and_returns(factors='style',index_symbol=None,show_factor_perf=False,figsize=(12,6))
214 | ```
215 |
216 | 将因子暴露与收益同时绘制在多个子图上
217 |
218 | **参数**
219 |
220 | - factors : 绘制的暴露类型 , 可选 'style'(所有风格因子) , 'industry'(所有行业因子),也可以传递一个list,list为exposure_portfolio中columns的一个或者多个
221 |   (当指定index_symbol时,country会用现金相对于指数的收益替代)
222 | - index_symbol : 基准指数代码,指定时绘制相对于指数的暴露及收益 , 默认None为组合本身的暴露和收益
223 | - show_factor_perf : 是否同时绘制因子表现
224 | - figsize : 画布大小,这里第一个参数是画布的宽度, 第二个参数为单个子图的高度
225 |
226 | #### (4) 关闭中文图例显示
227 |
228 | ```python
229 | plot_disable_chinese_label()
230 | ```
231 |
232 | 画图时默认会从系统中查找中文字体以显示中文图例
233 | 如果找不到中文字体则默认使用英文图例
234 | 当找到中文字体但中文显示乱码时, 可调用此 API 关闭中文图例显示而使用英文
235 |
236 |
237 |
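这些接口可以组合起来对归因结果做快速校验。下面是一个最小示例(仍基于上文的 `An` 对象,基准指数以 000300.XSHG 为例):按上文的定义,`total_return` 应等于 `common_return + specific_return + cash`,因此残差应接近 0:

```python
rets = An.get_attr_daily_returns2bench('000300.XSHG')
residual = rets['total_return'] - (
    rets['common_return'] + rets['specific_return'] + rets['cash']
)
print(residual.abs().max())  # 按定义应接近 0(仅剩浮点误差)
```
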
238 | ## 三、单因子分析模块
239 |
240 | ```python
241 | from jqfactor_analyzer import analyze_factor
242 | analyze_factor(factor, industry='jq_l1', quantiles=5, periods=(1, 5, 10), weight_method='avg', max_loss=0.25, allow_cache=True, show_data_progress=True)
243 | ```
244 |
245 | 单因子分析函数
246 |
247 |
248 |
249 | **参数**
250 |
251 | * `factor`: 因子值,
252 |
253 |   pandas.DataFrame格式的数据
254 |
255 |   - index为日期,格式为pandas日期通用的DatetimeIndex,转换方法见[将自有因子值转换成 DataFrame 格式的数据](#将自有因子值转换成-dataframe-格式的数据)
256 |   - columns为股票代码,格式要求符合聚宽的代码定义规则(如:平安银行的股票代码为000001.XSHE)
257 |     - 如果是深交所上市的股票,在股票代码后面需要加入.XSHE
258 |     - 如果是上交所上市的股票,在股票代码后面需要加入.XSHG
259 |
260 |   或 pd.Series格式的数据
261 |   - index为日期和股票代码组成的MultiIndex
262 |
263 | * `industry`: 行业分类, 默认为 `'jq_l1'`
264 |
265 |   * `'sw_l1'`: 申万一级行业
266 |   * `'sw_l2'`: 申万二级行业
267 |   * `'sw_l3'`: 申万三级行业
268 |   * `'jq_l1'`: 聚宽一级行业
269 |   * `'jq_l2'`: 聚宽二级行业
270 |   * `'zjw'`: 证监会行业
271 |
272 | * `quantiles`: 分位数数量, 默认为 `5`
273 |
274 |   `int`
275 |
276 |   在因子分组中按照因子值大小平均分组的组数.
277 |
278 | * `periods`: 调仓周期, 默认为 (1, 5, 10)
279 |
280 |   `int` or `list[int]`
281 |
282 | * `weight_method`: 计算分位数收益时的加权方法, 默认为 `'avg'`
283 |
284 |   * `'avg'`: 等权重
285 |   * `'mktcap'`: 按总市值加权
286 |   * `'ln_mktcap'`: 按总市值的对数加权
287 |   * `'cmktcap'`: 按流通市值加权
288 |   * `'ln_cmktcap'`: 按流通市值的对数加权
289 |
290 | * `max_loss`: 因重复值或nan值太多而无效的因子值的最大占比, 默认为 0.25
291 |
292 |   `float`
293 |
294 |   允许丢弃的因子数据的最大百分比 (0.00 到 1.00),
295 |
296 |   该比例通过比较输入因子索引中的条目数与输出 DataFrame 索引中的条目数计算得到.
297 |
298 |   因子数据本身可能存在缺陷 (例如 NaN),
299 |
300 |   或没有提供足够的价格数据来计算所有因子值的远期收益,
301 |
302 |   或者因为分组失败, 因此允许部分丢弃因子数据
303 |
304 | * `allow_cache` : 是否允许对价格,市值等信息进行本地缓存(按天缓存,初次运行可能比较慢,但后续重新获取对应区间的数据将非常快,且分析时仅消耗较小的jqdatasdk流量)
305 |
306 | * `show_data_progress`: 是否展示数据获取的进度信息
307 |
308 |
309 |
310 | **示例**
311 |
312 | ```python
313 | # 载入函数库
314 | import pandas as pd
315 | import jqfactor_analyzer as ja
316 |
317 | # 获取 jqdatasdk 授权
318 | # 输入用户名、密码,申请地址:http://t.cn/EINDOxE
319 | # 聚宽官网及金融终端,使用方法参见:http://t.cn/EINcS4j
320 | import jqdatasdk
321 | jqdatasdk.auth('username', 'password')
322 |
323 | # 对因子进行分析
324 | far = ja.analyze_factor(
325 |     factor_data,  # factor_data 为因子值的 pandas.DataFrame
326 |     quantiles=10,
327 |     periods=(1, 10),
328 |     industry='jq_l1',
329 |     weight_method='avg',
330 |     max_loss=0.1
331 | )
332 |
333 | # 生成统计图表
334 | far.create_full_tear_sheet(
335 |     demeaned=False, group_adjust=False, by_group=False,
336 |     turnover_periods=None, avgretplot=(5, 15), std_bar=False
337 | )
338 | ```
339 |
340 |
341 |
342 |
343 |
344 |
345 |
346 | ### 1. 
绘制结果 347 | 348 | #### 展示全部分析 349 | 350 | ``` 351 | far.create_full_tear_sheet(demeaned=False, group_adjust=False, by_group=False, 352 | turnover_periods=None, avgretplot=(5, 15), std_bar=False) 353 | ``` 354 | 355 | **参数:** 356 | 357 | - demeaned: 358 | - True: 使用超额收益计算 (基准收益被认为是每日所有股票收益按照weight列中权重加权的均值) 359 | - False: 不使用超额收益 360 | - group_adjust: 361 | - True: 使用行业中性化后的收益计算 (行业收益被认为是每日各个行业股票收益按照weight列中权重加权的均值) 362 | - False: 不使用行业中性化后的收益 363 | - by_group: 364 | - True: 按行业展示 365 | - False: 不按行业展示 366 | - turnover_periods: 调仓周期 367 | - avgretplot: tuple 因子预测的天数-(计算过去的天数, 计算未来的天数) 368 | - std_bar: 369 | - True: 显示标准差 370 | - False: 不显示标准差 371 | 372 | #### 因子值特征分析 373 | 374 | ``` 375 | far.create_summary_tear_sheet(demeaned=False, group_adjust=False) 376 | ``` 377 | 378 | **参数:** 379 | 380 | - demeaned: 381 | - True: 对每日因子收益去均值求得因子收益表 382 | - False: 因子收益表 383 | - group_adjust: 384 | - True: 按行业对因子收益去均值后求得因子收益表 385 | - False: 因子收益表 386 | 387 | #### 因子收益分析 388 | 389 | ``` 390 | far.create_returns_tear_sheet(demeaned=False, group_adjust=False, by_group=False) 391 | 392 | ``` 393 | 394 | **参数:** 395 | 396 | - demeaned: 397 | - True: 使用超额收益计算 (基准收益被认为是每日所有股票收益按照weight列中权重加权的均值) 398 | - False: 不使用超额收益 399 | - group_adjust: 400 | - True: 使用行业中性化后的收益计算 (行业收益被认为是每日各个行业股票收益按照weight列中权重加权的均值) 401 | - False: 不使用行业中性化后的收益 402 | - by_group: 403 | - True: 画各行业的各分位数平均收益图 404 | - False: 不画各行业的各分位数平均收益图 405 | 406 | #### 因子 IC 分析 407 | 408 | ``` 409 | far.create_information_tear_sheet(group_adjust=False, by_group=False) 410 | 411 | ``` 412 | 413 | **参数:** 414 | 415 | - group_adjust: 416 | - True: 使用行业中性收益 (行业收益被认为是每日各个行业股票收益按照weight列中权重的加权的均值) 417 | - False: 不使用行业中性收益 418 | - by_group: 419 | - True: 画按行业分组信息比率(IC)图 420 | - False: 画月度信息比率(IC)图 421 | 422 | #### 因子换手率分析 423 | 424 | ``` 425 | far.create_turnover_tear_sheet(turnover_periods=None) 426 | 427 | ``` 428 | 429 | **参数:** 430 | 431 | - turnover_periods: 调仓周期 432 | 433 | #### 因子预测能力分析 434 | 435 | ``` 436 | far.create_event_returns_tear_sheet(avgretplot=(5, 15),demeaned=False, group_adjust=False,std_bar=False) 437 | 438 | ``` 439 | 440 | **参数:** 441 | 442 | - avgretplot: tuple 因子预测的天数-(计算过去的天数, 计算未来的天数) 443 | - demeaned: 444 | - True: 使用超额收益计算累积收益 (基准收益被认为是每日所有股票收益按照weight列中权重加权的均值) 445 | - False: 不使用超额收益 446 | - group_adjust: 447 | - True: 使用行业中性化后的收益计算累积收益 (行业收益被认为是每日各个行业股票收益按照weight列中权重加权的均值) 448 | - False: 不使用行业中性化后的收益 449 | - std_bar: 450 | - True: 显示标准差 451 | - False: 不显示标准差 452 | 453 | #### 打印因子收益表 454 | 455 | ``` 456 | far.plot_returns_table(demeaned=False, group_adjust=False) 457 | 458 | ``` 459 | 460 | **参数:** 461 | 462 | - demeaned: 463 | - True:使用超额收益计算 (基准收益被认为是每日所有股票收益按照weight列中权重的加权的均值) 464 | - False:不使用超额收益 465 | - group_adjust: 466 | - True:使用行业中性收益 (行业收益被认为是每日各个行业股票收益按照weight列中权重的加权的均值) 467 | - False:不使用行业中性收益 468 | 469 | #### 打印换手率表 470 | 471 | ``` 472 | far.plot_turnover_table() 473 | 474 | ``` 475 | 476 | #### 打印信息比率(IC)相关表 477 | 478 | ``` 479 | far.plot_information_table(group_adjust=False, method='rank') 480 | 481 | ``` 482 | 483 | **参数:** 484 | 485 | - group_adjust: 486 | - True:使用行业中性收益 (行业收益被认为是每日各个行业股票收益按照weight列中权重的加权的均值) 487 | - False:不使用行业中性收益 488 | - method: 489 | - 'rank':用秩相关系数计算IC值 490 | - 'normal': 用相关系数计算IC值 491 | 492 | #### 打印个分位数统计表 493 | 494 | ``` 495 | far.plot_quantile_statistics_table() 496 | 497 | ``` 498 | 499 | #### 画信息比率(IC)时间序列图 500 | 501 | ``` 502 | far.plot_ic_ts(group_adjust=False, method='rank') 503 | 504 | ``` 505 | 506 | **参数:** 507 | 508 | - group_adjust: 509 | - True:使用行业中性收益 
(行业收益被认为是每日各个行业股票收益按照weight列中权重的加权的均值) 510 | - False:不使用行业中性收益 511 | - method: 512 | - 'rank':用秩相关系数计算IC值 513 | - 'normal': 用相关系数计算IC值 514 | 515 | #### 画信息比率分布直方图 516 | 517 | ``` 518 | far.plot_ic_hist(group_adjust=False, method='rank') 519 | 520 | ``` 521 | 522 | **参数:** 523 | 524 | - group_adjust: 525 | - True:使用行业中性收益 (行业收益被认为是每日各个行业股票收益按照weight列中权重的加权的均值) 526 | - False:不使用行业中性收益 527 | - method: 528 | - 'rank':用秩相关系数计算IC值 529 | - 'normal': 用相关系数计算IC值 530 | 531 | #### 画信息比率 qq 图 532 | 533 | ``` 534 | far.plot_ic_qq(group_adjust=False, method='rank', theoretical_dist='norm') 535 | 536 | ``` 537 | 538 | **参数:** 539 | 540 | - group_adjust: 541 | - True:使用行业中性收益 (行业收益被认为是每日各个行业股票收益按照weight列中权重的加权的均值) 542 | - False:不使用行业中性收益 543 | - method: 544 | - 'rank':用秩相关系数计算IC值 545 | - 'normal': 用相关系数计算IC值 546 | - theoretical_dist: 547 | - 'norm':正态分布 548 | - 't':t分布 549 | 550 | #### 画各分位数平均收益图 551 | 552 | ``` 553 | far.plot_quantile_returns_bar(by_group=False, demeaned=False, group_adjust=False) 554 | 555 | ``` 556 | 557 | **参数:** 558 | 559 | - by_group: 560 | - True:各行业的各分位数平均收益图 561 | - False:各分位数平均收益图 562 | - demeaned: 563 | - True:使用超额收益计算累积收益 (基准收益被认为是每日所有股票收益按照weight列中权重加权的均值) 564 | - False:不使用超额收益 565 | - group_adjust: 566 | - True:使用行业中性化后的收益计算累积收益 (行业收益被认为是每日各个行业股票收益按照weight列中权重加权的均值) 567 | - False:不使用行业中性化后的收益 568 | 569 | #### 画最高分位减最低分位收益图 570 | 571 | ``` 572 | far.plot_mean_quantile_returns_spread_time_series(demeaned=False, group_adjust=False, bandwidth=1) 573 | 574 | ``` 575 | 576 | **参数:** 577 | 578 | - demeaned: 579 | - True:使用超额收益计算累积收益 (基准收益被认为是每日所有股票收益按照weight列中权重加权的均值) 580 | - False:不使用超额收益 581 | - group_adjust: 582 | - True:使用行业中性化后的收益计算累积收益 (行业收益被认为是每日各个行业股票收益按照weight列中权重加权的均值) 583 | - False:不使用行业中性化后的收益 584 | - bandwidth:n,加减n倍当日标准差 585 | 586 | #### 画按行业分组信息比率(IC)图 587 | 588 | ``` 589 | far.plot_ic_by_group(group_adjust=False, method='rank') 590 | 591 | ``` 592 | 593 | **参数:** 594 | 595 | - group_adjust: 596 | - True:使用行业中性收益 (行业收益被认为是每日各个行业股票收益按照weight列中权重的加权的均值) 597 | - False:不使用行业中性收益 598 | - method: 599 | - 'rank':用秩相关系数计算IC值 600 | - 'normal': 用相关系数计算IC值 601 | 602 | #### 画因子自相关图 603 | 604 | ``` 605 | far.plot_factor_auto_correlation(rank=True) 606 | 607 | ``` 608 | 609 | **参数:** 610 | 611 | - rank: 612 | - True:用秩相关系数 613 | - False:用相关系数 614 | 615 | #### 画最高最低分位换手率图 616 | 617 | ``` 618 | far.plot_top_bottom_quantile_turnover(periods=(1, 3, 9)) 619 | 620 | ``` 621 | 622 | **参数:** 623 | 624 | - periods:调仓周期 625 | 626 | #### 画月度信息比率(IC)图 627 | 628 | ``` 629 | far.plot_monthly_ic_heatmap(group_adjust=False) 630 | 631 | ``` 632 | 633 | **参数:** 634 | 635 | - group_adjust: 636 | - True:使用行业中性收益 (行业收益被认为是每日各个行业股票收益按照weight列中权重的加权的均值) 637 | - False:不使用行业中性收益 638 | 639 | #### 画按因子值加权多空组合每日累积收益图 640 | 641 | ``` 642 | far.plot_cumulative_returns(period=1, demeaned=False, group_adjust=False) 643 | 644 | ``` 645 | 646 | **参数:** 647 | 648 | - periods:调仓周期 649 | - demeaned: 650 | - True:对因子值加权组合每日收益的权重去均值 (每日权重 = 每日权重 - 每日权重的均值),使组合转换为cash-neutral多空组合 651 | - False:不对权重去均值 652 | - group_adjust: 653 | - True:对权重分行业去均值 (每日权重 = 每日权重 - 每日各行业权重的均值),使组合转换为 industry-neutral 多空组合 654 | - False:不对权重分行业去均值 655 | 656 | #### 画做多最大分位数做空最小分位数组合每日累积收益图 657 | 658 | ``` 659 | far.plot_top_down_cumulative_returns(period=1, demeaned=False, group_adjust=False) 660 | 661 | ``` 662 | 663 | **参数:** 664 | 665 | - periods:指定调仓周期 666 | - demeaned: 667 | - True:使用超额收益计算累积收益 (基准收益被认为是每日所有股票收益按照weight列中权重加权的均值) 668 | - False:不使用超额收益 669 | - group_adjust: 670 | - True:使用行业中性化后的收益计算累积收益 
(行业收益被认为是每日各个行业股票收益按照weight列中权重加权的均值) 671 | - False:不使用行业中性化后的收益 672 | 673 | #### 画各分位数每日累积收益图 674 | 675 | ``` 676 | far.plot_cumulative_returns_by_quantile(period=(1, 3, 9), demeaned=False, group_adjust=False) 677 | 678 | ``` 679 | 680 | **参数:** 681 | 682 | - periods:调仓周期 683 | - demeaned: 684 | - True:使用超额收益计算累积收益 (基准收益被认为是每日所有股票收益按照weight列中权重加权的均值) 685 | - False:不使用超额收益 686 | - group_adjust: 687 | - True:使用行业中性化后的收益计算累积收益 (行业收益被认为是每日各个行业股票收益按照weight列中权重加权的均值) 688 | - False:不使用行业中性化后的收益 689 | 690 | #### 因子预测能力平均累计收益图 691 | 692 | ``` 693 | far.plot_quantile_average_cumulative_return(periods_before=5, periods_after=10, by_quantile=False, std_bar=False, demeaned=False, group_adjust=False) 694 | 695 | ``` 696 | 697 | **参数:** 698 | 699 | - periods_before: 计算过去的天数 700 | - periods_after: 计算未来的天数 701 | - by_quantile: 702 | - True:各分位数分别显示因子预测能力平均累计收益图 703 | - False:不用各分位数分别显示因子预测能力平均累计收益图 704 | - std_bar: 705 | - True:显示标准差 706 | - False:不显示标准差 707 | - demeaned: 708 | - True: 使用超额收益计算累积收益 (基准收益被认为是每日所有股票收益按照weight列中权重加权的均值) 709 | - False: 不使用超额收益 710 | - group_adjust: 711 | - True: 使用行业中性化后的收益计算累积收益 (行业收益被认为是每日各个行业股票收益按照weight列中权重加权的均值) 712 | - False: 不使用行业中性化后的收益 713 | 714 | #### 画有效因子数量统计图 715 | 716 | ``` 717 | far.plot_events_distribution(num_days=1) 718 | 719 | ``` 720 | 721 | **参数:** 722 | 723 | - num_days:统计间隔天数 724 | 725 | #### 关闭中文图例显示 726 | 727 | ``` 728 | far.plot_disable_chinese_label() 729 | 730 | ``` 731 | 732 | 733 | 734 | ### 2. 属性列表 735 | 736 | 用于访问因子分析的结果,大部分为惰性属性,在访问才会计算结果并返回 737 | 738 | 739 | 740 | #### 查看因子值 741 | 742 | ``` 743 | far.factor_data 744 | ``` 745 | 746 | - 类型:pandas.Series 747 | - index:为日期和股票代码的MultiIndex 748 | 749 | #### 去除 nan/inf,整理后的因子值、forward_return 和分位数 750 | 751 | ``` 752 | far.clean_factor_data 753 | ``` 754 | 755 | - 类型:pandas.DataFrame index:为日期和股票代码的MultiIndex 756 | - columns:根据period选择后的forward_return(如果调仓周期为1天,那么forward_return为[第二天的收盘价-今天的收盘价]/今天的收盘价)、因子值、行业分组、分位数数组、权重 757 | 758 | #### 按分位数分组加权平均因子收益 759 | 760 | ``` 761 | far.mean_return_by_quantile 762 | ``` 763 | 764 | - 类型:pandas.DataFrame 765 | - index:分位数分组 766 | - columns:调仓周期 767 | 768 | #### 按分位数分组加权因子收益标准差 769 | 770 | ``` 771 | far.mean_return_std_by_quantile 772 | ``` 773 | 774 | - 类型:pandas.DataFrame 775 | - index:分位数分组 776 | - columns:调仓周期 777 | 778 | #### 按分位数及日期分组加权平均因子收益 779 | 780 | ``` 781 | far.mean_return_by_date 782 | ``` 783 | 784 | - 类型:pandas.DataFrame 785 | - index:为日期和分位数的MultiIndex 786 | - columns:调仓周期 787 | 788 | #### 按分位数及日期分组加权因子收益标准差 789 | 790 | ``` 791 | far.mean_return_std_by_date 792 | ``` 793 | 794 | - 类型:pandas.DataFrame 795 | - index:为日期和分位数的MultiIndex 796 | - columns:调仓周期 797 | 798 | #### 按分位数及行业分组加权平均因子收益 799 | 800 | ``` 801 | far.mean_return_by_group 802 | ``` 803 | 804 | - 类型:pandas.DataFrame 805 | - index:为行业和分位数的MultiIndex 806 | - columns:调仓周期 807 | 808 | #### 按分位数及行业分组加权因子收益标准差 809 | 810 | ``` 811 | far.mean_return_std_by_group 812 | ``` 813 | 814 | - 类型:pandas.DataFrame 815 | - index:为行业和分位数的MultiIndex 816 | - columns:调仓周期 817 | 818 | #### 最高分位数因子收益减最低分位数因子收益每日均值 819 | 820 | ``` 821 | far.mean_return_spread_by_quantile 822 | ``` 823 | 824 | - 类型:pandas.DataFrame 825 | - index:日期 826 | - columns:调仓周期 827 | 828 | #### 最高分位数因子收益减最低分位数因子收益每日标准差 829 | 830 | ``` 831 | far.mean_return_spread_std_by_quantile 832 | ``` 833 | 834 | - 类型:pandas.DataFrame 835 | - index:日期 836 | - columns:调仓周期 837 | 838 | #### 信息比率 839 | 840 | ``` 841 | far.ic 842 | ``` 843 | 844 | - 类型:pandas.DataFrame 845 | - index:日期 846 | - columns:调仓周期 847 | 848 | #### 分行业信息比率 849 | 850 | ``` 851 
| far.ic_by_group 852 | ``` 853 | 854 | - 类型:pandas.DataFrame 855 | - index:行业 856 | - columns:调仓周期 857 | 858 | #### 月度信息比率 859 | 860 | ``` 861 | far.ic_monthly 862 | ``` 863 | 864 | - 类型:pandas.DataFrame 865 | - index:月度 866 | - columns:调仓周期表 867 | 868 | #### 换手率 869 | 870 | ``` 871 | far.quantile_turnover 872 | ``` 873 | 874 | - 键:调仓周期 875 | - 值: pandas.DataFrame 换手率 876 | - index:日期 877 | - columns:分位数分组 878 | 879 | #### 计算按分位数分组加权因子收益和标准差 880 | 881 | ``` 882 | mean, std = far.calc_mean_return_by_quantile(by_date=True, by_group=False, demeaned=False, group_adjust=False) 883 | ``` 884 | 885 | **参数:** 886 | 887 | - by_date: 888 | - True:按天计算收益 889 | - False:不按天计算收益 890 | - by_group: 891 | - True: 按行业计算收益 892 | - False:不按行业计算收益 893 | - demeaned: 894 | - True:使用超额收益计算各分位数收益,超额收益=收益-基准收益 (基准收益被认为是每日所有股票收益按照weight列中权重的加权的均值) 895 | - False:不使用超额收益 896 | - group_adjust: 897 | - True:使用行业中性收益计算各分位数收益,行业中性收益=收益-行业收益 (行业收益被认为是每日各个行业股票收益按照weight列中权重的加权的均值) 898 | - False:不使用行业中性收益 899 | 900 | #### 计算按因子值加权多空组合每日收益 901 | 902 | ``` 903 | far.calc_factor_returns(demeaned=True, group_adjust=False) 904 | ``` 905 | 906 | 权重 = 每日因子值 / 每日因子值的绝对值的和 907 | 正的权重代表买入, 负的权重代表卖出 908 | 909 | **参数:** 910 | 911 | - demeaned: 912 | - True: 对权重去均值 (每日权重 = 每日权重 - 每日权重的均值), 使组合转换为 cash-neutral 多空组合 913 | - False:不对权重去均值 914 | - group_adjust: 915 | - True:对权重分行业去均值 (每日权重 = 每日权重 - 每日各行业权重的均值),使组合转换为 industry-neutral 多空组合 916 | - False:不对权重分行业去均值 917 | 918 | #### 计算两个分位数相减的因子收益和标准差 919 | 920 | ``` 921 | mean, std = far.compute_mean_returns_spread(upper_quant=None, lower_quant=None, by_date=False, by_group=False, demeaned=False, group_adjust=False) 922 | ``` 923 | 924 | **参数:** 925 | 926 | - upper_quant:用upper_quant选择的分位数减去lower_quant选择的分位数,只能在已有的范围内选择 927 | - lower_quant:用upper_quant选择的分位数减去lower_quant选择的分位数,只能在已有的范围内选择 928 | - by_date: 929 | - True:按天计算两个分位数相减的因子收益和标准差 930 | - False:不按天计算两个分位数相减的因子收益和标准差 931 | - by_group: 932 | - True: 分行业计算两个分位数相减的因子收益和标准差 933 | - False:不分行业计算两个分位数相减的因子收益和标准差 934 | - demeaned: 935 | - True:使用超额收益 (基准收益被认为是每日所有股票收益按照weight列中权重加权的均值) 936 | - False:不使用超额收益 937 | - group_adjust: 938 | - True:使用行业中性收益 (行业收益被认为是每日各个行业股票收益按照weight列中权重加权的均值) 939 | - False:不使用行业中性收益 940 | 941 | 942 | #### 计算因子的 alpha 和 beta 943 | 944 | ``` 945 | far.calc_factor_alpha_beta(demeaned=True, group_adjust=False) 946 | ``` 947 | 948 | 因子值加权组合每日收益 = beta * 市场组合每日收益 + alpha 949 | 950 | 因子值加权组合每日收益计算方法见 calc_factor_returns 函数 951 | 952 | 市场组合每日收益是每日所有股票收益按照weight列中权重加权的均值 953 | 954 | 结果中的 alpha 是年化 alpha 955 | 956 | **参数:** 957 | 958 | - demeaned: 959 | - True: 对因子值加权组合每日收益的权重去均值 (每日权重 = 每日权重 - 每日权重的均值),使组合转换为cash-neutral多空组合 960 | - False:不对权重去均值 961 | - group_adjust: 962 | - True:对权重分行业去均值 (每日权重 = 每日权重 - 每日各行业权重的均值),使组合转换为 industry-neutral 多空组合 963 | - False:不对权重分行业去均值 964 | 965 | #### 计算每日因子信息比率(IC值) 966 | 967 | ``` 968 | far.calc_factor_information_coefficient(group_adjust=False, by_group=False, method='rank') 969 | ``` 970 | 971 | **参数:** 972 | 973 | - group_adjust: 974 | - True:使用行业中性收益计算 IC (行业收益被认为是每日各个行业股票收益按照weight列中权重加权的均值) 975 | - False:不使用行业中性收益 976 | - by_group: 977 | - True:分行业计算 IC 978 | - False:不分行业计算 IC 979 | - method: 980 | - 'rank':用秩相关系数计算IC值 981 | - 'normal':用普通相关系数计算IC值 982 | 983 | #### 计算因子信息比率均值(IC值均值) 984 | 985 | ``` 986 | far.calc_mean_information_coefficient(group_adjust=False, by_group=False, by_time=None, method='rank') 987 | ``` 988 | 989 | **参数:** 990 | 991 | - group_adjust: 992 | - True:使用行业中性收益计算 IC (行业收益被认为是每日各个行业股票收益按照weight列中权重加权的均值) 993 | - False:不使用行业中性收益 994 | - by_group: 995 | - 
True:分行业计算 IC 996 | - False:不分行业计算 IC 997 | - by_time: 998 | - 'Y':按年求均值 999 | - 'M':按月求均值 1000 | - None:对所有日期求均值 1001 | - method: 1002 | - 'rank':用秩相关系数计算IC值 1003 | - 'normal':用普通相关系数计算IC值 1004 | 1005 | #### 按照当天的分位数算分位数未来和过去的收益均值和标准差 1006 | 1007 | ``` 1008 | far.calc_average_cumulative_return_by_quantile(periods_before=5, periods_after=15, demeaned=False, group_adjust=False) 1009 | ``` 1010 | 1011 | **参数:** 1012 | 1013 | - periods_before:计算过去的天数 1014 | - periods_after:计算未来的天数 1015 | - demeaned: 1016 | - True:使用超额收益计算累积收益 (基准收益被认为是每日所有股票收益按照weight列中权重加权的均值) 1017 | - False:不使用超额收益 1018 | - group_adjust: 1019 | - True:使用行业中性化后的收益计算累积收益 1020 | - False:不使用行业中性化后的收益 1021 | 1022 | #### 计算指定调仓周期的各分位数每日累积收益 1023 | 1024 | ``` 1025 | far.calc_cumulative_return_by_quantile(period=None, demeaned=False, group_adjust=False) 1026 | ``` 1027 | 1028 | **参数:** 1029 | 1030 | - period:指定调仓周期 1031 | - demeaned: 1032 | - True:使用超额收益计算累积收益 (基准收益被认为是每日所有股票收益按照weight列中权重加权的均值) 1033 | - False:不使用超额收益 1034 | - group_adjust: 1035 | - True:使用行业中性化后的收益计算累积收益 1036 | - False:不使用行业中性化后的收益 1037 | 1038 | #### 计算指定调仓周期的按因子值加权多空组合每日累积收益 1039 | 1040 | ``` 1041 | far.calc_cumulative_returns(period=5, demeaned=False, group_adjust=False) 1042 | ``` 1043 | 1044 | 当 period > 1 时,组合的累积收益计算方法为: 1045 | 1046 | 组合每日收益 = (从第0天开始每period天一调仓的组合每日收益 + 从第1天开始每period天一调仓的组合每日收益 + ... + 从第period-1天开始每period天一调仓的组合每日收益) / period 1047 | 1048 | 组合累积收益 = 组合每日收益的累积 1049 | 1050 | **参数:** 1051 | 1052 | - period:指定调仓周期 1053 | - demeaned: 1054 | - True:对权重去均值 (每日权重 = 每日权重 - 每日权重的均值),使组合转换为 cash-neutral 多空组合 1055 | - False:不对权重去均值 1056 | - group_adjust: 1057 | - True:对权重分行业去均值 (每日权重 = 每日权重 - 每日各行业权重的均值),使组合转换为 industry-neutral 多空组合 1058 | - False:不对权重分行业去均值 1059 | 1060 | #### 计算指定调仓周期和前面定义好的加权方式计算多空组合每日累计收益 1061 | 1062 | ``` 1063 | far.calc_top_down_cumulative_returns(period=5, demeaned=False, group_adjust=False) 1064 | ``` 1065 | 1066 | **参数:** 1067 | 1068 | - period:指定调仓周期 1069 | - demeaned: 1070 | - True:使用超额收益计算累积收益 (基准收益被认为是每日所有股票收益按照weight列中权重加权的均值) 1071 | - False:不使用超额收益 1072 | - group_adjust: 1073 | - True:使用行业中性化后的收益计算累积收益 (行业收益被认为是每日各个行业股票收益按照weight列中权重加权的均值) 1074 | - False:不使用行业中性化后的收益 1075 | 1076 | #### 根据调仓周期确定滞后期的每天计算因子自相关性 1077 | 1078 | ``` 1079 | far.calc_autocorrelation(rank=True) 1080 | ``` 1081 | 1082 | **参数:** 1083 | 1084 | - rank: 1085 | - True:秩相关系数 1086 | - False:普通相关系数 1087 | 1088 | #### 滞后n天因子值自相关性 1089 | 1090 | ``` 1091 | far.calc_autocorrelation_n_days_lag(n=9,rank=True) 1092 | ``` 1093 | 1094 | **参数:** 1095 | 1096 | - n:滞后n天到1天的因子值自相关性 1097 | - rank: 1098 | - True:秩相关系数 1099 | - False:普通相关系数 1100 | 1101 | #### 各分位数滞后1天到n天的换手率均值 1102 | 1103 | ``` 1104 | far.calc_quantile_turnover_mean_n_days_lag(n=10) 1105 | ``` 1106 | 1107 | **参数:** 1108 | 1109 | - n:滞后 1 天到 n 天的换手率 1110 | 1111 | #### 滞后 0 - n 天因子收益信息比率(IC)的移动平均 1112 | 1113 | ``` 1114 | far.calc_ic_mean_n_days_lag(n=10,group_adjust=False,by_group=False,method=None) 1115 | ``` 1116 | 1117 | **参数:** 1118 | 1119 | - n:滞后0-n天因子收益的信息比率(IC)的移动平均 1120 | - group_adjust: 1121 | - True:使用行业中性收益计算 IC (行业收益被认为是每日各个行业股票收益按照weight列中权重加权的均值) 1122 | - False:不使用行业中性收益 1123 | - by_group: 1124 | - True:分行业计算 IC 1125 | - False:不分行业计算 IC 1126 | - method: 1127 | - 'rank':用秩相关系数计算IC值 1128 | - 'normal':用普通相关系数计算IC值 1129 | 1130 | 1131 | 1132 | ### 3. 获取聚宽因子库数据的方法 1133 | 1134 | 1. [聚宽因子库](https://www.joinquant.com/help/api/help?name=factor_values)包含数百个质量、情绪、风险等其他类目的因子 1135 | 1136 | 2. 
连接jqdatasdk获取数据:需调用聚宽 [`jqdatasdk`](https://github.com/JoinQuant/jqdatasdk/blob/master/README.md) 接口获取金融数据([试用注册地址](http://t.cn/EINDOxE))
1137 |
1138 | ```python
1139 | # 获取因子数据:以5日平均换手率为例,该数据可以直接用于因子分析
1140 | # 具体使用方法可以参照jqdatasdk的API文档
1141 | import jqdatasdk
1142 | jqdatasdk.auth('username', 'password')
1143 | # 获取聚宽因子库中的VOL5数据
1144 | factor_data = jqdatasdk.get_factor_values(
1145 |     securities=jqdatasdk.get_index_stocks('000300.XSHG'),
1146 |     factors=['VOL5'],
1147 |     start_date='2018-01-01',
1148 |     end_date='2018-12-31')['VOL5']
1149 | ```
1150 |
1151 |
1152 |
1153 | ### 4. 将自有因子值转换成 DataFrame 格式的数据
1154 |
1155 | - index 为日期,格式为 pandas 日期通用的 DatetimeIndex
1156 |
1157 | - columns 为股票代码,格式要求符合聚宽的代码定义规则(如:平安银行的股票代码为 000001.XSHE)
1158 |
1159 |   - 如果是深交所上市的股票,在股票代码后面需要加入.XSHE
1160 |   - 如果是上交所上市的股票,在股票代码后面需要加入.XSHG
1161 |
1162 | - 将 pandas.DataFrame 转换成满足格式要求的数据格式
1163 |
1164 |   首先要保证 index 为 `DatetimeIndex` 格式
1165 |
1166 |   一般是通过 pandas 提供的 [`pandas.to_datetime`](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.to_datetime.html) 函数进行转换, 在转换前应确保 index 中的值都为合理的日期格式, 如 `'2018-01-01'` / `'20180101'`, 之后再调用 `pandas.to_datetime` 进行转换
1167 |
1168 |   另外应确保 index 的日期是按照从小到大的顺序排列的, 可以通过 [`sort_index`](https://pandas.pydata.org/pandas-docs/version/0.23.3/generated/pandas.DataFrame.sort_index.html) 进行排序
1169 |
1170 |   最后请检查 columns 中的股票代码是否都满足聚宽的代码定义
1171 |
1172 | ```python
1173 | import pandas as pd
1174 |
1175 | sample_data = pd.DataFrame(
1176 |     [[0.84, 0.43, 2.33, 0.86, 0.96],
1177 |      [1.06, 0.51, 2.60, 0.90, 1.09],
1178 |      [1.12, 0.54, 2.68, 0.94, 1.12],
1179 |      [1.07, 0.64, 2.65, 1.33, 1.15],
1180 |      [1.21, 0.73, 2.97, 1.65, 1.19]],
1181 |     index=['2018-01-02', '2018-01-03', '2018-01-04', '2018-01-05', '2018-01-08'],
1182 |     columns=['000001.XSHE', '000002.XSHE', '000063.XSHE', '000069.XSHE', '000100.XSHE']
1183 | )
1184 |
1185 | print(sample_data)
1186 |
1187 | factor_data = sample_data.copy()
1188 | # 将 index 转换为 DatetimeIndex
1189 | factor_data.index = pd.to_datetime(factor_data.index)
1190 | # 将 DataFrame 按照日期顺序排列
1191 | factor_data = factor_data.sort_index()
1192 | # 检查 columns 是否满足聚宽股票代码格式
1193 | if not factor_data.columns.astype(str).str.match(r'\d{6}\.XSH[EG]').all():
1194 |     print("有不满足聚宽股票代码格式的股票")
1195 |     print(factor_data.columns[~factor_data.columns.astype(str).str.match(r'\d{6}\.XSH[EG]')])
1196 |
1197 | print(factor_data)
1198 | ```
1199 |
1200 | - 将键为日期, 值为各股票因子值的 `Series` 的 `dict` 转换成 `pandas.DataFrame`
1201 |
1202 |   可以直接利用 `pandas.DataFrame` 生成
1203 |
1204 | ```python
1205 | import pandas as pd
1206 | sample_data = \
1207 |     {'2018-01-02': pd.Series([0.84, 0.43, 2.33, 0.86, 0.96],
1208 |                              index=['000001.XSHE', '000002.XSHE', '000063.XSHE', '000069.XSHE', '000100.XSHE']),
1209 |      '2018-01-03': pd.Series([1.06, 0.51, 2.60, 0.90, 1.09],
1210 |                              index=['000001.XSHE', '000002.XSHE', '000063.XSHE', '000069.XSHE', '000100.XSHE']),
1211 |      '2018-01-04': pd.Series([1.12, 0.54, 2.68, 0.94, 1.12],
1212 |                              index=['000001.XSHE', '000002.XSHE', '000063.XSHE', '000069.XSHE', '000100.XSHE']),
1213 |      '2018-01-05': pd.Series([1.07, 0.64, 2.65, 1.33, 1.15],
1214 |                              index=['000001.XSHE', '000002.XSHE', '000063.XSHE', '000069.XSHE', '000100.XSHE']),
1215 |      '2018-01-08': pd.Series([1.21, 0.73, 2.97, 1.65, 1.19],
1216 |                              index=['000001.XSHE', '000002.XSHE', '000063.XSHE', '000069.XSHE', '000100.XSHE'])}
1217 |
1218 | # 直接调用 pd.DataFrame 将 dict 转换为 DataFrame
1219 | factor_data = pd.DataFrame(sample_data).T
1220 |
1221 | print(factor_data)
1222 |
1223 | # 之后请按照上文 DataFrame 的方法转换成满足格式要求的数据格式
1224 | ```
1225 |
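补充一个格式互转的小示例:`analyze_factor` 既接受上述宽表 DataFrame,也接受 index 为日期和股票代码 MultiIndex 的 `pd.Series`(见前文参数说明)。两种格式可以用 pandas 标准的 `stack` / `unstack` 互相转换(以上面构造好的 `factor_data` 为例):

```python
# 宽表 DataFrame -> MultiIndex Series(index 为 (日期, 股票代码))
factor_series = factor_data.stack()
print(factor_series.head())

# MultiIndex Series -> 宽表 DataFrame(逆向还原)
factor_wide = factor_series.unstack()
print(factor_wide.head())
```
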
1226 | ## 四、数据处理函数
1227 |
1228 | #### 1. 中性化
1229 |
1230 | ```python
1231 | from jqfactor_analyzer import neutralize
1232 | neutralize(data, how=None, date=None, axis=1, fillna=None, add_constant=False)
1233 | ```
1234 |
1235 | **参数 :**
1236 |
1237 | - data: pd.Series/pd.DataFrame, 待中性化的序列, 序列的 index/columns 为股票的 code
1238 | - how: str 或 list, 中性化使用的因子名称列表.
1239 |   默认为 ['jq_l1', 'market_cap'], 支持的中性化方法有:
1240 |   (1) 行业: sw_l1, sw_l2, sw_l3, jq_l1, jq_l2
1241 |   (2) mktcap(总市值), ln_mktcap(对数总市值), cmktcap(流通市值), ln_cmktcap(对数流通市值)
1242 |   (3) 自定义的中性化数据: 支持同时传入额外的 Series 或者 DataFrame 用来进行中性化, index 必须是标的代码
1243 |
1244 | - date: 日期, 将用 date 这天的相关变量数据对 series 进行中性化 (注意依赖数据的实际可用时间, 如市值数据当天盘中是无法获取到的)
1245 | - axis: 默认为 1. 仅在 data 为 pd.DataFrame 时生效. 表示沿哪个方向做中性化, 0 为对每列做中性化, 1 为对每行做中性化
1246 | - fillna: 缺失值填充方式, 默认为None, 表示不填充. 支持的值:
1247 |   'jq_l1': 聚宽一级行业
1248 |   'jq_l2': 聚宽二级行业
1249 |   'sw_l1': 申万一级行业
1250 |   'sw_l2': 申万二级行业
1251 |   'sw_l3': 申万三级行业,表示使用对应行业分类的均值进行填充.
1252 | - add_constant: 中性化时是否添加常数项, 默认为 False
1253 |
1254 | **返回 :**
1255 |
1256 | - 中性化后的因子数据
1257 |
1258 |
1259 |
1260 | #### 2. 去极值
1261 |
1262 | ```python
1263 | from jqfactor_analyzer import winsorize
1264 | winsorize(data, scale=None, range=None, qrange=None, inclusive=True, inf2nan=True, axis=1)
1265 | ```
1266 |
1267 | **参数 :**
1268 |
1269 | - data: pd.Series/pd.DataFrame/np.array, 待缩尾的序列
1270 | - scale: 标准差倍数,与 range,qrange 三选一,不可同时使用。会将位于 [mu - scale * sigma, mu + scale * sigma] 边界之外的值替换为边界值
1271 | - range: 列表, 缩尾的上下边界。与 scale,qrange 三选一,不可同时使用。
1272 | - qrange: 列表,缩尾的上下分位数边界,值应在 0 到 1 之间,如 [0.05, 0.95]。与 scale,range 三选一,不可同时使用。
1273 | - inclusive: 是否将位于边界之外的值替换为边界值,默认为 True。如果为 True,则将边界之外的值替换为边界值,否则替换为 np.nan
1274 | - inf2nan: 是否将 np.inf 和 -np.inf 替换成 np.nan,默认为 True。如果为 True,在缩尾之前会先将 np.inf 和 -np.inf 替换成 np.nan,缩尾的时候不会考虑 np.nan,否则 inf 被认为是在上界之上,-inf 被认为在下界之下
1275 | - axis: 在 data 为 pd.DataFrame 时使用,沿哪个方向做缩尾,默认为 1。0 为对每列做缩尾,1 为对每行做缩尾。
1276 |
1277 | **返回 :**
1278 |
1279 | - 去极值处理之后的因子数据
1280 |
1281 |
1282 |
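下面是一个可独立运行的去极值小示例(数据为随意构造,仅用于演示 `scale` 与 `qrange` 两种用法;`winsorize` 默认按行 (axis=1) 处理):

```python
import pandas as pd
from jqfactor_analyzer import winsorize

# 构造一行带有极端值的因子数据(股票代码仅为示意)
data = pd.DataFrame(
    [[1.0, 1.2, 0.9, 1.1, 50.0]],
    columns=['000001.XSHE', '000002.XSHE', '000063.XSHE', '000069.XSHE', '000100.XSHE']
)

# 按 3 倍标准差缩尾:位于 [mu - 3*sigma, mu + 3*sigma] 之外的值被替换为边界值
print(winsorize(data, scale=3))

# 按分位数缩尾:位于 5% / 95% 分位数之外的值被替换为边界值
print(winsorize(data, qrange=[0.05, 0.95]))
```
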
1283 | #### 3. 中位数去极值
1284 |
1285 | ```python
1286 | from jqfactor_analyzer import winsorize_med
1287 | winsorize_med(data, scale=1, inclusive=True, inf2nan=True, axis=1)
1288 | ```
1289 |
1290 | **参数 :**
1291 |
1292 | - data: pd.Series/pd.DataFrame/np.array, 待缩尾的序列
1293 | - scale: 倍数,默认为 1.0。会将位于 [med - scale * distance, med + scale * distance] 边界之外的值替换为边界值/np.nan
1294 | - inclusive: 是否将位于边界之外的值替换为边界值,默认为 True。如果为 True,则将边界之外的值替换为边界值,否则替换为 np.nan
1295 | - inf2nan: 是否将 np.inf 和 -np.inf 替换成 np.nan,默认为 True。如果为 True,在缩尾之前会先将 np.inf 和 -np.inf 替换成 np.nan,缩尾的时候不会考虑 np.nan,否则 inf 被认为是在上界之上,-inf 被认为在下界之下
1296 | - axis: 在 data 为 pd.DataFrame 时使用,沿哪个方向做缩尾,默认为 1。0 为对每列做缩尾,1 为对每行做缩尾
1297 |
1298 | **返回 :**
1299 |
1300 | - 中位数去极值之后的因子数据
1301 |
1302 |
1303 |
1304 | #### 4. 标准化(z-score)
1305 |
1306 | ```python
1307 | from jqfactor_analyzer import standardlize
1308 | standardlize(data, inf2nan=True, axis=1)
1309 | ```
1310 |
1311 | **参数 :**
1312 |
1313 | - data: pd.Series/pd.DataFrame/np.array, 待标准化的序列
1314 | - inf2nan: 是否将 np.inf 和 -np.inf 替换成 np.nan。默认为 True
1315 | - axis: 在 data 为 pd.DataFrame 时使用,沿哪个方向做标准化,默认为 1。0 为对每列做标准化,1 为对每行做标准化
1316 |
1317 | **返回 :**
1318 |
1319 | - 标准化后的因子数据
1320 |
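上述函数可以串联成一个简单的因子预处理流程。下面是一个最小示例(数据为随机构造;`neutralize` 需要连接 jqdatasdk 获取行业与市值数据,此处从略):

```python
import numpy as np
import pandas as pd
from jqfactor_analyzer import winsorize_med, standardlize

raw = pd.DataFrame(
    np.random.randn(3, 5),
    index=pd.to_datetime(['2018-01-02', '2018-01-03', '2018-01-04']),
    columns=['000001.XSHE', '000002.XSHE', '000063.XSHE', '000069.XSHE', '000100.XSHE']
)

# 先做中位数去极值,再做横截面标准化(两者默认都按行 (axis=1) 处理)
clean = standardlize(winsorize_med(raw, scale=3), axis=1)
print(clean)
```
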
--------------------------------------------------------------------------------
/jqfactor_analyzer/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | from .version import __version__
4 | from .analyze import FactorAnalyzer
5 | from .attribution import AttributionAnalysis
6 | from .data import DataApi
7 | from .preprocess import winsorize, winsorize_med, standardlize, neutralize
8 | from .factor_cache import save_factor_values_by_group, get_factor_values_by_cache, get_cache_dir
9 |
10 |
11 | def analyze_factor(
12 |     factor, industry='jq_l1', quantiles=5, periods=(1, 5, 10),
13 |     weight_method='avg', max_loss=0.25, allow_cache=True, show_data_progress=True
14 | ):
15 |     """单因子分析
16 |
17 |     输入:
18 |         factor: pandas.DataFrame: 因子值, columns 为股票代码 (如 '000001.XSHE'),
19 |                 index 为日期的 DatetimeIndex
20 |                 或 pandas.Series: 因子值, index 为日期和股票代码的 MultiIndex
21 |         industry: 行业分类, 默认为 'jq_l1'
22 |             - 'jq_l1': 聚宽一级行业
23 |             - 'jq_l2': 聚宽二级行业
24 |             - 'sw_l1': 申万一级行业
25 |             - 'sw_l2': 申万二级行业
26 |             - 'sw_l3': 申万三级行业
27 |             - 'zjw': 证监会行业
28 |         quantiles: 分位数数量, 默认为 5
29 |         periods: 调仓周期, int 或 int 的列表, 默认为 (1, 5, 10)
30 |         weight_method: 计算分位数收益时的加权方法, 默认为 'avg'
31 |             - 'avg': 等权重
32 |             - 'mktcap': 按总市值加权
33 |             - 'ln_mktcap': 按总市值的对数加权
34 |             - 'cmktcap': 按流通市值加权
35 |             - 'ln_cmktcap': 按流通市值的对数加权
36 |         max_loss: 因重复值或nan值太多而无效的因子值的最大占比, 默认为 0.25
37 |         allow_cache: 是否允许对价格,市值等信息进行本地缓存(按天缓存,初次运行可能比较慢,但后续重新获取对应区间的数据将非常快,且分析时仅消耗较小的jqdatasdk流量)
38 |         show_data_progress: 是否展示数据获取的进度信息
39 |
40 |     """
41 |
42 |     dataapi = DataApi(industry=industry, weight_method=weight_method,
43 |                       allow_cache=allow_cache, show_progress=show_data_progress)
44 |     return FactorAnalyzer(factor,
45 |                           quantiles=quantiles,
46 |                           periods=periods,
47 |                           max_loss=max_loss,
48 |                           **dataapi.apis)
49 |
50 |
51 | def attribution_analysis(
52 |     weights, daily_return, style_type='style_pro', industry='sw_l1',
53 |     use_cn=True, show_data_progress=True
54 | ):
55 |     """归因分析
56 |
57 |     用户需要提供的数据:
58 |     1. 日度股票持仓权重 (加总不为 1 的剩余部分视为现金)
59 |     2. 组合的日度收益率 (使用 T 日持仓盘后的因子暴露与 T+1 日的收益进行归因分析)
60 |
61 |     组合风格因子暴露 (含行业, country) = sum(组合权重 * 个股因子值), country 暴露为总的股票持仓权重
62 |     组合风格收益率 (含行业, country) = sum(组合风格因子暴露 * factor_return)
63 |     组合特异收益率 = 组合总收益率 - 组合风格收益率(含行业, country 或 cash)
64 |     """
65 |     return AttributionAnalysis(weights,
66 |                                daily_return=daily_return,
67 |                                style_type=style_type,
68 |                                industry=industry,
69 |                                use_cn=use_cn,
70 |                                show_data_progress=show_data_progress)
71 |
--------------------------------------------------------------------------------
/jqfactor_analyzer/attribution.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pandas as pd
3 | import datetime
4 | from tqdm import tqdm
5 | from functools import partial
6 | import matplotlib.pyplot as plt
7 | from matplotlib.lines import Line2D
8 |
9 | from jqfactor_analyzer.data import DataApi
10 | from jqfactor_analyzer.factor_cache import save_factor_values_by_group, get_factor_values_by_cache
11 | from jqfactor_analyzer.plot_utils import _use_chinese
12 | from functools import lru_cache
13 |
14 |
15 | dataapi = DataApi(allow_cache=True, show_progress=True)
16 |
17 |
18 | def get_factor_style_returns(factors=None, start_date=None, end_date=None,
19 |                              count=None, universe=None, industry='sw_l1'):
20 |     if dataapi._api_name == 'jqdatasdk':
21 |         func = dataapi.api.get_factor_style_returns
22 |     else:
23 |         import jqfactor
24 |         func = jqfactor.get_factor_style_returns
25 |     return func(factors=factors, start_date=start_date, end_date=end_date,
26 |                 count=count, universe=universe, industry=industry)
27 |
28 |
29 | def get_price(security, start_date, end_date, fields):
30 |     func = partial(dataapi.api.get_price, security=security,
31 |                    start_date=start_date, end_date=end_date, fields=fields)
32 |     if dataapi._api_name == 'jqdatasdk':
33 |         return func()
34 |     else:
35 |         return func(pre_factor_ref_date=datetime.date.today())
36 |
37 |
38 | def get_index_style_exposure(index, factors=None,
39 |                              start_date=None, end_date=None, count=None):
40 |     if dataapi._api_name == 'jqdatasdk':
41 |         func = dataapi.api.get_index_style_exposure
42 |     else:
43 |         import jqfactor
44 |         func = jqfactor.get_index_style_exposure
45 |     return func(index=index, factors=factors,
46 |                 start_date=start_date, end_date=end_date, count=count)
47 |
48 |
49 | class AttributionAnalysis():
50 |     """归因分析
51 |
52 |     用户需要提供的数据:
53 |     1. 日度股票持仓权重 (加总不为 1 的剩余部分视为现金)
54 |     2. 组合的日度收益率 (使用 T 日持仓盘后的因子暴露与 T+1 日的收益进行归因分析)
55 |
56 |     组合风格因子暴露 (含行业, country) = sum(组合权重 * 个股因子值), country 暴露为总的股票持仓权重
57 |     组合风格收益率 (含行业, country) = sum(组合风格因子暴露 * factor_return)
58 |     组合特异收益率 = 组合总收益率 - 组合风格收益率(含行业, country 或 cash)
59 |     """
60 |
61 |     def __init__(self, weights, daily_return,
62 |                  style_type='style_pro', industry='sw_l1',
63 |                  use_cn=True, show_data_progress=True):
64 |         """
65 |         参数
66 |         ----------
67 |         weights:
68 |             持仓权重信息, index 是日期, columns 是标的代码, value 对应的是当天的仓位占比 (单日仓位占比总和不为 1 时, 剩余部分认为是当天的现金)
69 |         daily_return:
70 |             Series, index 是日期, values 为当天账户的收益率
71 |         style_type:
72 |             所选的风格因子类型, 'style' 和 'style_pro' 中的一个
73 |         industry:
74 |             行业分类, 可选: 'sw_l1' 或 'jq_l1'
75 |         use_cn:
76 |             绘图时是否使用中文
77 |         show_data_progress:
78 |             是否展示数据获取进度 (使用本地缓存, 第一次运行时速度较慢, 后续对于本地不存在的数据将增量缓存)
79 |
80 |         所有属性列表
81 |         ----------
82 |         style_exposure:
83 |             组合风格因子暴露
84 |         industry_exposure:
85 |             组合行业因子暴露
86 |         exposure_portfolio:
87 |             组合风格 / 行业及 country 的暴露
88 |         attr_daily_returns:
89 |             组合归因日收益率
90 |         attr_returns:
91 |             组合归因累积收益汇总
92 |
93 |         所有方法列表
94 |         ----------
95 |         get_exposure2bench(index_symbol):
96 |             获取相对于指数的暴露
97 |         get_attr_daily_returns2bench(index_symbol):
98 |             获取相对于指数的日归因收益
99 |         get_attr_returns2bench(index_symbol):
100 |             获取相对于指数的累积归因收益
101 |
102 |         plot_exposure(factors='style', index_symbol=None, figsize=(15, 7))
103 |             绘制风格或行业暴露, 当指定 index_symbol 时, 返回的是相对指数的暴露, 否则为组合自身的暴露
104 |         plot_returns(factors='style', index_symbol=None, figsize=(15, 7))
105 |             绘制风格或者行业的暴露收益, 当指定 index_symbol 时, 返回的是相对指数的暴露收益, 否则为组合自身的暴露收益
106 |         plot_exposure_and_returns(factors='style', index_symbol=None, show_factor_perf=False, figsize=(12, 6))
107 |             同时绘制暴露和收益信息
108 |         """
109 |
110 |         self.STYLE_TYPE_DICT = {
111 |             'style': ['size', 'beta', 'momentum', 'residual_volatility', 'non_linear_size',
112 |                       'book_to_price_ratio', 'liquidity', 'earnings_yield', 'growth', 'leverage'],
113 |             'style_pro': ['btop', 'divyild', 'earnqlty', 'earnvar', 'earnyild', 'financial_leverage',
114 |                           'invsqlty', 'liquidty', 'long_growth', 'ltrevrsl', 'market_beta', 'market_size',
115 |                           'midcap', 'profit', 'relative_momentum', 'resvol']
116 |         }
117 |         weights.index = pd.to_datetime(weights.index)
118 |         daily_return.index = pd.to_datetime(daily_return.index)
119 |         weights.loc[weights.sum(axis=1) > 1] = weights.div(weights.sum(axis=1), axis=0)
120 |         self.weights = weights.replace(0, np.nan)
121 |         self.daily_return = daily_return
122 |         self.style_factor_names = self.STYLE_TYPE_DICT[style_type]
123 |         self.industry = industry
124 |         self.industry_code = list(
125 |             set(dataapi.api.get_industries(industry, date=weights.index[0]).index) |
126 |             set(dataapi.api.get_industries(industry, date=weights.index[-1]).index)
127 |         )
128 |         self.style_type = style_type
129 |         self.show_progress = show_data_progress
130 |         self.factor_cache_directory = self.check_factor_values()
131 |
132 |         # 当日收盘后的风格因子暴露
133 |         self.style_exposure = self.calc_style_exposure()
134 |         # 当日收盘后的行业因子暴露
135 |         self.industry_exposure = self.calc_industry_exposure()
136 |         # 当日收盘后的风格+行业+country 合并暴露
137 |         self.exposure_portfolio = pd.concat([self.style_exposure, self.industry_exposure], axis=1)
138 |         self.exposure_portfolio['country'] = self.weights.sum(axis=1)
139 |         self.use_cn = use_cn
140 |         if use_cn:
141 |             _use_chinese(True)
142 |
143 |         self._attr_daily_returns = None
144 |         self._attr_returns = None
145 |         self._factor_returns = None
146 |         self._factor_cn_name = None
147 |
148 |     def _get_factor_cn_name(self):
149 |         """获取行业及风格因子的中文名称"""
150 |         industry_info = dataapi.api.get_industries(self.industry).name
151 | 
factor_info = dataapi.api.get_all_factors() 152 | factor_info = factor_info[factor_info.category == 153 | self.style_type].set_index("factor").factor_intro 154 | factor_info = pd.concat([industry_info, factor_info]) 155 | factor_info['common_return'] = '因子收益' 156 | factor_info['specific_return'] = '特异收益' 157 | factor_info['total_return'] = '总收益' 158 | factor_info['cash'] = '现金' 159 | factor_info['country'] = 'country' 160 | self._factor_cn_name = factor_info 161 | return factor_info 162 | 163 | @property 164 | def factor_cn_name(self): 165 | if self._factor_cn_name is None: 166 | return self._get_factor_cn_name() 167 | else: 168 | return self._factor_cn_name 169 | 170 | def check_factor_values(self): 171 | """检查并缓存因子数据到本地""" 172 | start_date = self.weights.index[0] 173 | end_date = self.weights.index[-1] 174 | return save_factor_values_by_group(start_date, end_date, 175 | self.style_factor_names, 176 | show_progress=self.show_progress) 177 | 178 | def _get_style_exposure_daily(self, date, weight): 179 | weight = weight.dropna() 180 | resdaily = get_factor_values_by_cache( 181 | date, 182 | codes=weight.index, 183 | factor_names=self.style_factor_names, 184 | factor_path=self.factor_cache_directory).T 185 | resdaily = resdaily.mul(weight).sum(axis=1, min_count=1) 186 | resdaily.name = date 187 | return resdaily 188 | 189 | def calc_style_exposure(self): 190 | """计算组合的风格因子暴露 191 | 返回: 一个 dataframe, index 为日期, columns 为风格因子名, values 为暴露值""" 192 | 193 | iters = self.weights.iterrows() 194 | 195 | if self.show_progress: 196 | iters = tqdm(iters, total=self.weights.shape[0], desc='calc_style_exposure ') 197 | results = [] 198 | for date, weight in iters: 199 | results.append(self._get_style_exposure_daily(date, weight)) 200 | return pd.DataFrame(results) 201 | 202 | def _get_industry_exposure_daily(self, date, weight): 203 | weight = weight.dropna() 204 | resdaily = pd.get_dummies(dataapi._get_cached_industry_one_day( 205 | str(date.date()), securities=weight.index, industry=self.industry)) 206 | resdaily = resdaily.mul(weight, axis=0).sum(axis=0, min_count=1) 207 | resdaily.name = date 208 | return resdaily 209 | 210 | def calc_industry_exposure(self): 211 | """计算组合的行业因子暴露 212 | 返回: 一个 dataframe, index 为日期, columns为风格因子名, values为暴露值""" 213 | iters = self.weights.iterrows() 214 | if self.show_progress: 215 | iters = tqdm(iters, total=self.weights.shape[0], desc='calc_industry_exposure ') 216 | results = [] 217 | for date, weight in iters: 218 | results.append(self._get_industry_exposure_daily(date, weight)) 219 | return pd.DataFrame(results).reindex(columns=self.industry_code, fill_value=0) 220 | 221 | @property 222 | def attr_daily_returns(self): 223 | if self._attr_daily_returns is None: 224 | return self.calc_attr_returns()[0] 225 | else: 226 | return self._attr_daily_returns 227 | 228 | @property 229 | def attr_returns(self): 230 | if self._attr_returns is None: 231 | return self.calc_attr_returns()[1] 232 | else: 233 | return self._attr_returns 234 | 235 | @property 236 | def factor_returns(self): 237 | if self._factor_returns is None: 238 | exposure_portfolio = self.exposure_portfolio.copy() 239 | self._factor_returns = get_factor_style_returns( 240 | exposure_portfolio.columns.tolist(), 241 | self.exposure_portfolio.index[0], 242 | dataapi.api.get_trade_days(self.exposure_portfolio.index[-1], count=2)[-1], 243 | industry=self.industry, 244 | universe='zzqz') 245 | return self._factor_returns 246 | else: 247 | return self._factor_returns 248 | 249 | @lru_cache() 250 | def 
_get_index_returns(self, index_symbol, start_date, end_date):
251 |         index_return = get_price(index_symbol,
252 |                                  start_date=start_date,
253 |                                  end_date=end_date,
254 |                                  fields='close')['close'].pct_change()
255 |         return index_return
256 |
257 |     @lru_cache()
258 |     def _get_index_exposure(self, index_symbol):
259 |         index_exposure = get_index_style_exposure(
260 |             index_symbol,
261 |             factors=self.style_exposure.columns.tolist() + self.industry_exposure.columns.tolist(),
262 |             start_date=str(self.weights.index[0]),
263 |             end_date=str(self.weights.index[-1]))
264 |         index_exposure = index_exposure.mul(self.weights.sum(axis=1), axis=0)
265 |         index_exposure['country'] = 1
266 |         return index_exposure
267 |
268 |     @lru_cache()
269 |     def get_exposure2bench(self, index_symbol):
270 |         """获取相对于指数的暴露"""
271 |         index_exposure = self._get_index_exposure(index_symbol)
272 |         return self.exposure_portfolio - index_exposure
273 |
274 |     @lru_cache()
275 |     def get_attr_daily_returns2bench(self, index_symbol):
276 |         """获取相对于指数的日归因收益率
277 |         返回: 一个 dataframe, index 是日期, value 为对应日期的收益率值
278 |         columns 为风格因子/行业因子/现金cash/因子总收益common_return(含风格,行业)/特异收益率 specific_return 及组合总收益率 total_return
279 |         注意: 日收益率直接加总, 可能和实际的最终收益率不一致, 因为没考虑到资产的变动情况
280 |         """
281 |         exposure2bench = self.get_exposure2bench(index_symbol)
282 |         exposure2bench = exposure2bench.reindex(self.factor_returns.index)
283 |
284 |         index_return = self._get_index_returns(index_symbol,
285 |                                                start_date=exposure2bench.index[0],
286 |                                                end_date=exposure2bench.index[-1])
287 |         daily_return = self.daily_return - index_return
288 |
289 |         attr_daily_returns2bench = exposure2bench.shift()[1:].mul(self.factor_returns)
290 |         # country 收益为 0, 无意义
291 |         del attr_daily_returns2bench['country']
292 |         attr_daily_returns2bench['common_return'] = attr_daily_returns2bench[self.style_exposure.columns.tolist() +
293 |                                                                              self.industry_exposure.columns.tolist()].sum(axis=1)
294 |         attr_daily_returns2bench['cash'] = index_return * exposure2bench.country.shift()
295 |         attr_daily_returns2bench['specific_return'] = daily_return - \
296 |             attr_daily_returns2bench['common_return'] - \
297 |             attr_daily_returns2bench['cash']
298 |         attr_daily_returns2bench['total_return'] = daily_return
299 |         return attr_daily_returns2bench
300 |
301 |     @lru_cache()
302 |     def get_attr_returns2bench(self, index_symbol):
303 |         """获取相对于指数的累积归因收益
304 |         将超额收益分解成了:
305 |         1.common_return (因子收益, 又可进一步拆分成风格和行业);
306 |         2.cash (现金收益, 假设组合本身现金部分的收益为0, 则相对于指数的超额收益为"-1 * 指数收益");
307 |           累积算法: (组合收益率 + 1).cumprod() - (日现金收益率+组合收益率 + 1).cumprod()
308 |         3.specific_return: 残差, 无法被风格和行业因子解释的部分, 即为主动收益, 现金收益实际也可划分到主动收益中
309 |         """
310 |         index_return = self._get_index_returns(index_symbol,
311 |                                                start_date=self.factor_returns.index[0],
312 |                                                end_date=self.factor_returns.index[-1])
313 |
314 |         attr_daily_returns2bench = self.get_attr_daily_returns2bench(index_symbol)
315 |         # 假设持仓的现金用于购买指数时的净值
316 |         position_with_cash_net = ((-attr_daily_returns2bench.cash + self.daily_return).fillna(0) + 1).cumprod()
317 |         # 持仓本身的净值
318 |         position_net = (self.daily_return.fillna(0) + 1).cumprod()
319 |         # 假设指数满仓时的超额
320 |         t_net = position_net - (index_return + 1).fillna(1).cumprod()
321 |         # 假设指数调整仓位到和组合一致(风格暴露)的超额
322 |         net = position_net - (index_return * self.weights.sum(axis=1).shift() + 1).fillna(1).cumprod()
323 |         # 超额的暴露收益
324 |         attr_returns2bench2 = attr_daily_returns2bench.mul(net.shift() + 1, axis=0).cumsum()
325 |         # 现金的收益 = 持仓本身的净值 - 假设持仓的现金用于购买指数的净值
326 |         attr_returns2bench2['cash'] = position_net - position_with_cash_net
327 |         # 超额收益
328 | 
attr_returns2bench2['total_return'] = t_net
329 | # 风格 + 行业因子收益, 不含现金
330 | attr_returns2bench2['common_return'] = attr_returns2bench2[self.style_exposure.columns.tolist() +
331 | self.industry_exposure.columns.tolist()].sum(axis=1)
332 | attr_returns2bench2.loc[attr_returns2bench2.cash.isna(), 'common_return'] = np.nan
333 | # 除风格,现金以外的无法解释的收益
334 | attr_returns2bench2['specific_return'] = (
335 | attr_returns2bench2['total_return'] - attr_returns2bench2['common_return'] - attr_returns2bench2['cash']
336 | )
337 | return attr_returns2bench2
338 |
339 | def calc_attr_returns(self):
340 | """计算风格归因收益, country 收益率为国家收益 (这里的国家收益是用均衡大小市值后 (根号市值) 回归得到的)"""
341 | self._attr_daily_returns = self.exposure_portfolio.reindex(
342 | self.factor_returns.index).shift(1).mul(self.factor_returns)
343 | self._attr_daily_returns['common_return'] = self._attr_daily_returns.sum(axis=1)
344 | self._attr_daily_returns['specific_return'] = self.daily_return - self._attr_daily_returns['common_return']
345 | self._attr_daily_returns['total_return'] = self.daily_return
346 |
347 | cum_return = (self._attr_daily_returns.total_return.fillna(0) + 1).cumprod()
348 | self._attr_returns = self._attr_daily_returns.mul(cum_return.shift(1), axis=0).cumsum()
349 |
350 | return self._attr_daily_returns, self._attr_returns
351 |
352 | def plot_data(self, data, title=None, figsize=(15, 8)):
353 | ax = data.plot(figsize=figsize, title=title)
354 | ax.legend(loc='upper left', bbox_to_anchor=(1, 1))
355 | plt.tight_layout(rect=[0, 0, 0.85, 1])
356 | plt.show()
357 |
358 | def plot_exposure(self, factors='style', index_symbol=None, figsize=(15, 7)):
359 | """绘制风格暴露
360 | factors: 绘制的暴露类型, 可选 'style'(所有风格因子), 'industry'(所有行业因子), 也可以传递一个 list, list 为 exposure_portfolio 中 columns 的一个或者多个
361 | index_symbol: 基准指数代码, 指定时绘制相对于指数的暴露, 默认 None 为组合本身的暴露
362 | figsize: 画布大小
363 | """
364 | exposure = self.exposure_portfolio if index_symbol is None else self.get_exposure2bench(index_symbol)
365 | if isinstance(factors, str):
366 | if factors == 'style':
367 | exposure = exposure[self.style_exposure.columns]
368 | elif factors == 'industry':
369 | exposure = exposure[self.industry_exposure.columns]
370 | else:
371 | exposure = exposure[[factors]]
372 | else:
373 | exposure = exposure[factors]
374 |
375 | if self.use_cn:
376 | exposure = exposure.rename(columns=self.factor_cn_name)
377 | title = '组合相对{}暴露'.format(index_symbol) if index_symbol else '组合暴露'
378 | else:
379 | title = 'exposure of {}'.format(index_symbol) if index_symbol else 'exposure'
380 | self.plot_data(exposure, title=title, figsize=figsize)
381 |
382 | def plot_returns(self, factors='style', index_symbol=None, figsize=(15, 7)):
383 | """绘制归因分析收益信息
384 | factors: 绘制的收益类型, 可选 'style'(所有风格因子), 'industry'(所有行业因子), 也可以传递一个 list, list 为 exposure_portfolio 中 columns 的一个或者多个
385 | 同时也支持指定 ['common_return'(风格总收益), 'specific_return'(特异收益), 'total_return'(总收益),
386 | 'country'(国家因子收益, 当指定 index_symbol 时会用现金相对于指数的收益替代)]
387 | index_symbol: 基准指数代码, 指定时绘制相对于指数的收益, 默认 None 为组合本身的收益
388 | figsize: 画布大小
389 | """
390 | returns = self.attr_returns if index_symbol is None else self.get_attr_returns2bench(index_symbol)
391 | if isinstance(factors, str):
392 | if factors == 'style':
393 | returns = returns[self.style_exposure.columns]
394 | elif factors == 'industry':
395 | returns = returns[self.industry_exposure.columns]
396 | else:
397 | if index_symbol and factors == 'country':
398 | factors = 'cash'
399 | if factors not in returns.columns:
400 | raise ValueError("错误的因子名称: {}".format(factors))
401 | returns = returns[[factors]]
402 | else:
403 | if index_symbol and 'country' in factors:
404 | factors = [x if x != 'country' else 'cash' for x in factors]
405 | wrong_factors = [x for x in factors if x not in returns.columns]
406 | if wrong_factors:
407 | raise ValueError("错误的因子名称: {}".format(wrong_factors))
408 | returns = returns[factors]
409 |
410 | if self.use_cn:
411 | returns = returns.rename(columns=self.factor_cn_name)
412 | title = "累积归因收益 (相对{})".format(
413 | index_symbol) if index_symbol else "累积归因收益"
414 | else:
415 | title = 'cum return to {}'.format(
416 | index_symbol) if index_symbol else "cum return"
417 | self.plot_data(returns, title=title, figsize=figsize)
418 |
419 | def plot_exposure_and_returns(self, factors='style', index_symbol=None, show_factor_perf=False, figsize=(12, 6)):
420 | """将因子暴露与收益同时绘制在多个子图上
421 | factors: 绘制的暴露类型, 可选 'style'(所有风格因子), 'industry'(所有行业因子), 也可以传递一个 list, list 为 exposure_portfolio 中 columns 的一个或者多个
422 | (当指定 index_symbol 时, country 会用现金相对于指数的收益替代)
423 | index_symbol: 基准指数代码, 指定时绘制相对于指数的暴露及收益, 默认 None 为组合本身的暴露和收益
424 | show_factor_perf: 是否同时绘制因子表现
425 | figsize: 画布大小, 这里第一个参数是画布的宽度, 第二个参数为单个子图的高度
426 | """
427 | if isinstance(factors, str):
428 | if factors == 'style':
429 | factors = self.style_exposure.columns
430 | elif factors == 'industry':
431 | factors = self.industry_exposure.columns
432 | else:
433 | factors = [factors]
434 | if index_symbol:
435 | exposure = self.get_exposure2bench(index_symbol).rename(columns={"country": "cash"})
436 | returns = self.get_attr_returns2bench(index_symbol)
437 | else:
438 | exposure = self.exposure_portfolio
439 | returns = self.attr_returns
440 | exposure, returns = exposure.align(returns, join='outer')
441 | if show_factor_perf:
442 | factor_performance = self.factor_returns.cumsum().reindex(exposure.index)
443 |
444 | num_factors = len(factors)
445 | # 每行最多两个子图
446 | ncols = 2 if num_factors > 1 else 1
447 | nrows = (num_factors + 1) // ncols if num_factors > 1 else 1
448 |
449 | fixed_width, base_height_per_row = figsize
450 | height_per_row = base_height_per_row if ncols == 1 else base_height_per_row / 2
451 | total_height = max(1, nrows) * height_per_row
452 |
453 | fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(fixed_width, total_height))
454 | axes = axes.flatten() if num_factors > 1 else [axes]
455 |
456 | # 删除多余的子图
457 | for j in range(len(factors), len(axes)):
458 | fig.delaxes(axes[j])
459 |
460 | for i, factor_name in enumerate(factors):
461 | if index_symbol and factor_name == 'country':
462 | factor_name = 'cash'
463 | if factor_name not in exposure.columns:
464 | raise ValueError("错误的因子名称: {}".format(factor_name))
465 | e = exposure[factor_name]
466 | r = returns[factor_name]
467 |
468 | ax1 = axes[i]
469 | e.plot(kind='area', stacked=False, alpha=0.5, ax=ax1, color='skyblue')
470 |
471 | ax2 = ax1.twinx()
472 | r.plot(ax=ax2, color='red')
473 | if factor_name != 'cash' and show_factor_perf:
474 | factor_performance[factor_name].plot(ax=ax2, color='blue')
475 | ax1.set_title(factor_name if not self.use_cn else self.factor_cn_name.get(factor_name))
476 | labels = ['暴露', '因子收益', '因子表现'] if self.use_cn else ['exposure', 'return', 'factor performance']
477 | fig.legend(labels[:1], loc='upper left')
478 |
479 | # 手动创建图例条目
480 | custom_lines = [Line2D([0], [0], color='red', lw=2),
481 | Line2D([0], [0], color='blue', lw=2)]
482 | # 创建自定义图例
483 | fig.legend(custom_lines, labels[1:], loc='upper right',
484 | bbox_to_anchor=(1, 1.02), bbox_transform=plt.gcf().transFigure)
485 |
fig.suptitle('因子暴露与收益图' if self.use_cn else 'factor exposure and return', y=1.02) 486 | plt.tight_layout() 487 | plt.show() 488 | 489 | def plot_disable_chinese_label(self): 490 | """关闭中文图例显示 491 | 492 | 画图时默认会从系统中查找中文字体显示以中文图例 493 | 如果找不到中文字体则默认使用英文图例 494 | 当找到中文字体但中文显示乱码时, 可调用此 API 关闭中文图例显示而使用英文 495 | """ 496 | _use_chinese(False) 497 | self.use_cn = False 498 | -------------------------------------------------------------------------------- /jqfactor_analyzer/compat.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """pandas库版本兼容模块""" 4 | 5 | import warnings 6 | 7 | import pandas as pd 8 | 9 | 10 | # pandas 11 | PD_VERSION = pd.__version__ 12 | 13 | 14 | def rolling_apply( 15 | x, 16 | window, 17 | func, 18 | min_periods=None, 19 | freq=None, 20 | center=False, 21 | args=None, 22 | kwargs=None 23 | ): 24 | if args is None: 25 | args = tuple() 26 | if kwargs is None: 27 | kwargs = dict() 28 | 29 | if PD_VERSION >= '0.23.0': 30 | return x.rolling( 31 | window, min_periods=min_periods, center=center 32 | ).apply( 33 | func, False, args=args, kwargs=kwargs 34 | ) 35 | elif PD_VERSION >= '0.18.0': 36 | return x.rolling( 37 | window, min_periods=min_periods, center=center 38 | ).apply( 39 | func, args=args, kwargs=kwargs 40 | ) 41 | else: 42 | return pd.rolling_apply( 43 | x, 44 | window, 45 | func, 46 | min_periods=min_periods, 47 | freq=freq, 48 | center=center, 49 | args=args, 50 | kwargs=kwargs 51 | ) 52 | 53 | 54 | def rolling_mean(x, window, min_periods=None, center=False): 55 | if PD_VERSION >= '0.18.0': 56 | return x.rolling(window, min_periods=min_periods, center=center).mean() 57 | else: 58 | return pd.rolling_mean( 59 | x, window, min_periods=min_periods, center=center 60 | ) 61 | 62 | 63 | def rolling_std(x, window, min_periods=None, center=False, ddof=1): 64 | if PD_VERSION >= '0.18.0': 65 | return x.rolling( 66 | window, min_periods=min_periods, center=center 67 | ).std(ddof=ddof) 68 | else: 69 | return pd.rolling_std( 70 | x, window, min_periods=min_periods, center=center, ddof=ddof 71 | ) 72 | 73 | 74 | # statsmodels 75 | with warnings.catch_warnings(): 76 | # 有的版本依赖的 pandas 库会有 deprecated warning 77 | warnings.simplefilter("ignore") 78 | import statsmodels 79 | from statsmodels.api import OLS, qqplot, ProbPlot 80 | from statsmodels.tools.tools import add_constant 81 | -------------------------------------------------------------------------------- /jqfactor_analyzer/config.json: -------------------------------------------------------------------------------- 1 | {"default_dir": "~/jqfactor_datacache/bundle", "user_dir": ""} 2 | -------------------------------------------------------------------------------- /jqfactor_analyzer/data.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import os 3 | import numpy as np 4 | import pandas as pd 5 | from tqdm import tqdm 6 | from fastcache import lru_cache 7 | from functools import partial 8 | import pyarrow.feather as feather 9 | 10 | from .when import date2str, convert_date, today, now, Time, Date 11 | from .factor_cache import save_factor_values_by_group, get_factor_values_by_cache, get_cache_dir 12 | 13 | 14 | class DataApi(object): 15 | 16 | def __init__(self, price='close', fq='post', 17 | industry='jq_l1', weight_method='avg', allow_cache=True, show_progress=True): 18 | """数据接口, 用于因子分析获取数据 19 | 20 | 参数 21 | ---------- 22 | price : 使用开盘价/收盘价计算收益 (请注意避免未来函数), 默认为 'close' 23 | - 'close': 
使用当日收盘价和次日收盘价计算当日因子的远期收益
24 | - 'open' : 使用当日开盘价和次日开盘价计算当日因子的远期收益
25 | fq : 价格数据的复权方式, 默认为 'post'
26 | - 'post': 后复权
27 | - 'pre': 前复权
28 | - None: 不复权
29 | industry : 行业分类, 默认为 'jq_l1'
30 | - 'jq_l1': 聚宽一级行业
31 | - 'jq_l2': 聚宽二级行业
32 | - 'sw_l1': 申万一级行业
33 | - 'sw_l2': 申万二级行业
34 | - 'sw_l3': 申万三级行业
35 | - 'zjw': 证监会行业
36 | weight_method : 计算各分位收益时, 每只股票权重, 默认为 'avg'
37 | - 'avg': 等权重
38 | - 'mktcap': 按总市值加权
39 | - 'ln_mktcap': 按总市值的对数加权
40 | - 'cmktcap': 按流通市值加权
41 | - 'ln_cmktcap': 按流通市值的对数加权
42 | allow_cache : 是否允许将分析必需的数据以文件的形式缓存至本地, 默认允许, 缓存开启时, 首次加载耗时较长
43 | show_progress : 是否展示数据获取进度
44 |
45 | 使用示例
46 | ----------
47 | from jqfactor_analyzer import DataApi, FactorAnalyzer
48 |
49 | api = DataApi(fq='pre', industry='sw_l1', weight_method='ln_mktcap')
50 | api.auth('username', 'password')
51 |
52 | factor = FactorAnalyzer(factor_data,
53 | price=api.get_prices,
54 | groupby=api.get_groupby,
55 | weights=api.get_weights)
56 | # 或者
57 | # factor = FactorAnalyzer(factor_data, **api.apis)
58 |
59 |
60 | 方法列表
61 | ----------
62 | auth : 登录 jqdatasdk
63 | 参数 :
64 | username : jqdatasdk 用户名
65 | password : jqdatasdk 密码
66 | 返回值 :
67 | None
68 |
69 | get_prices : 价格数据获取接口
70 | 参数 :
71 | securities : 股票代码列表
72 | start_date : 开始日期
73 | end_date : 结束日期
74 | count : 交易日长度
75 | (可与 start_date 搭配使用, 代替 end_date)
76 | 返回值 :
77 | pd.DataFrame
78 | 价格数据, columns 为股票代码, index 为日期
79 |
80 | get_groupby : 行业分类数据获取接口
81 | 参数 :
82 | securities : 股票代码列表
83 | start_date : 开始日期
84 | end_date : 结束日期
85 | 返回值 :
86 | dict
87 | 行业分类, {股票代码 -> 行业分类名称}
88 |
89 | get_weights : 股票权重获取接口
90 | 参数 :
91 | securities : 股票代码列表
92 | start_date : 开始日期
93 | end_date : 结束日期
94 | 返回值 :
95 | pd.DataFrame
96 | 权重数据, columns 为股票代码, index 为日期
97 |
98 |
99 | 属性列表
100 | ----------
101 | apis : dict, {'prices': get_prices, 'groupby': get_groupby,
102 | 'weights': get_weights}
103 |
104 | """
105 | try:
106 | import jqdata
107 | self._api = jqdata.apis
108 | self._api_name = 'jqdata'
109 | except ImportError:
110 | import jqdatasdk
111 | self._api = jqdatasdk
112 | self._api_name = 'jqdatasdk'
113 |
114 | self.show_progress = show_progress
115 | valid_price = ('close', 'open')
116 | if price in valid_price:
117 | self.price = price
118 | else:
119 | raise ValueError("invalid 'price' parameter, "
120 | "should be one of %s" % str(valid_price))
121 |
122 | valid_fq = ('post', 'pre', None)
123 | if fq in valid_fq:
124 | self.fq = fq
125 | else:
126 | raise ValueError("invalid 'fq' parameter, "
127 | "should be one of %s" % str(valid_fq))
128 |
129 | valid_industry = ('sw_l1', 'sw_l2', 'sw_l3', 'jq_l1', 'jq_l2', 'zjw')
130 | if industry in valid_industry:
131 | self.industry = industry
132 | else:
133 | raise ValueError("invalid 'industry' parameter, "
134 | "should be one of %s" % str(valid_industry))
135 |
136 | valid_weight_method = ('avg', 'mktcap', 'ln_mktcap', 'cmktcap', 'ln_cmktcap')
137 | if weight_method in valid_weight_method:
138 | self.weight_method = weight_method
139 | else:
140 | raise ValueError("invalid 'weight_method' parameter, "
141 | "should be one of %s" % str(valid_weight_method))
142 | self.ini_cache_cfg(allow_cache)
143 |
144 | @lru_cache(10)
145 | def get_ind_record(self, industry):
146 | mapping = self.api.get_industries(industry).to_dict()['name']
147 | ind_record = self.api.get_history_industry(industry).set_index('stock')
148 | ind_record['industry_name'] = ind_record['code'].map(mapping)
149 | ind_record.end_date = ind_record.end_date.fillna(Date(2040, 1, 1))
150 | return ind_record
151 |
152 | def ini_cache_cfg(self, allow_cache):
153 | self.allow_cache = allow_cache
154 |
155 | if self._api_name != 'jqdatasdk':
156 | self.allow_cache = False
157 | self.allow_industry_cache = False
158 |
159 | def auth(self, username='', password=''):
160 | if self._api_name == 'jqdata':
161 | return
162 | if username:
163 | import jqdatasdk
164 | jqdatasdk.auth(username, password)
165 |
166 | @property
167 | def api(self):
168 | if not hasattr(self, "_api"):
169 | raise NotImplementedError('api not specified')
170 | if self.allow_cache:
171 | if not self._api.is_auth():
172 | raise Exception("Please run jqdatasdk.auth first")
173 | privilege = self._api.get_privilege()
174 | if 'GET_HISTORY_INDUSTRY' in privilege:
175 | self.allow_industry_cache = True
176 | else:
177 | self.allow_industry_cache = False
178 | if 'FACTOR_BASICS' in privilege or 'GET_FACTOR_VALUES' in privilege:
179 | self.mkt_cache_api = 'factor'
180 | else:
181 | self.mkt_cache_api = 'valuation'
182 | return self._api
183 |
184 | @lru_cache(2)
185 | def _get_trade_days(self, start_date=None, end_date=None):
186 | if start_date is not None:
187 | start_date = date2str(start_date)
188 | if end_date is not None:
189 | end_date = date2str(end_date)
190 | return list(self.api.get_trade_days(start_date=start_date,
191 | end_date=end_date))
192 |
193 | def _get_price(self, securities, start_date=None, end_date=None, count=None,
194 | fields=None, skip_paused=False, fq='post', round=False):
195 | start_date = date2str(start_date) if start_date is not None else None
196 | end_date = date2str(end_date) if end_date is not None else None
197 | if self._api_name == 'jqdata':
198 | if 'panel' in self.api.get_price.__code__.co_varnames:
199 | get_price = partial(self.api.get_price,
200 | panel=False,
201 | pre_factor_ref_date=end_date)
202 | else:
203 | get_price = partial(self.api.get_price,
204 | pre_factor_ref_date=end_date)
205 | else:
206 | get_price = self.api.get_price
207 | p = get_price(
208 | securities, start_date=start_date, end_date=end_date, count=count,
209 | fields=fields, skip_paused=skip_paused, fq=fq, round=round
210 | )
211 | if hasattr(p, 'to_frame'):
212 | p = p.to_frame()
213 | p.index.names = ['time', 'code']
214 | p.reset_index(inplace=True)
215 |
216 | return p
217 |
218 | def _get_cached_price(self, securities, start_date=None, end_date=None, fq=None, overwrite=False):
219 | """获取缓存价格数据, 缓存文件中存储的为未复权价格和后复权因子"""
220 | save_factor_values_by_group(start_date, end_date,
221 | factor_names='prices',
222 | overwrite=overwrite,
223 | show_progress=self.show_progress)
224 | trade_days = pd.to_datetime(self._get_trade_days(start_date, end_date))
225 |
226 | ret = []
227 | if self.show_progress:
228 | trade_days = tqdm(trade_days, desc="load price info : ")
229 | for day in trade_days:
230 | if day < today():
231 | p = get_factor_values_by_cache(
232 | day, securities, factor_names='prices').reset_index()
233 | else:
234 | p = self.api.get_price(securities, start_date=day, end_date=day,
235 | skip_paused=False, round=False,
236 | fields=['open', 'close', 'factor'],
237 | fq='post', panel=False)
238 | p[['open', 'close']] = p[['open', 'close']].div(p['factor'], axis=0)
239 | p['time'] = day
240 | ret.append(p)
241 | ret = pd.concat(ret, ignore_index=True).sort_values(['code', 'time']).reset_index(drop=True)
242 | if fq == 'pre':
243 | # 前复权基准日期为最新一天
244 | latest_factor = self.api.get_price(securities,
245 | end_date=today(),
246 | count=1,
247 | skip_paused=False,
248 | round=False,
249 | fields=['factor'],
250 | fq='post',
251 |
panel=False).set_index('code') 252 | ret = ret.set_index('code') 253 | ret.factor = ret.factor / latest_factor.factor 254 | ret = ret.reset_index().reindex(columns=['time', 'code', 'open', 'close', 'factor']) 255 | elif fq is None: 256 | ret.loc[ret['factor'].notna(), 'factor'] = 1.0 257 | ret[['open', 'close']] = ret[['open', 'close']].mul(ret['factor'], axis=0) 258 | return ret 259 | 260 | def get_prices(self, securities, start_date=None, end_date=None, 261 | period=None): 262 | if period is not None: 263 | trade_days = self._get_trade_days(start_date=end_date) 264 | if len(trade_days): 265 | end_date = trade_days[:period + 1][-1] 266 | if self.allow_cache: 267 | p = self._get_cached_price( 268 | securities, start_date, end_date, fq=self.fq) 269 | else: 270 | p = self._get_price( 271 | fields=[self.price], securities=securities, 272 | start_date=start_date, end_date=end_date, round=False, 273 | fq=self.fq 274 | ) 275 | p = p.set_index(['time', 'code'])[self.price].unstack('code').sort_index() 276 | return p 277 | 278 | def _get_industry(self, securities, start_date, end_date, industry='jq_l1'): 279 | trade_days = self._get_trade_days(start_date, end_date) 280 | industries = map(partial(self.api.get_industry, securities), trade_days) 281 | day_ind = zip(trade_days, industries) 282 | if self.show_progress: 283 | day_ind = tqdm(day_ind, desc='load industry info : ', 284 | total=len(trade_days)) 285 | industries = { 286 | d: { 287 | s: ind.get(s).get(industry, dict()).get('industry_name', 'NA') 288 | for s in securities 289 | } 290 | for d, ind in day_ind 291 | } 292 | return pd.DataFrame(industries).T.sort_index() 293 | 294 | def _get_cached_industry_one_day(self, date, securities=None, industry=None): 295 | date = convert_date(date) 296 | if self.allow_industry_cache: 297 | ind_record = self.get_ind_record(industry) 298 | if securities is not None: 299 | ind_record = ind_record[ind_record.index.isin(securities)] 300 | return ind_record[(ind_record.start_date <= date) & (date <= ind_record.end_date)].code 301 | else: 302 | ind_record = self.api.get_industry(securities, date, df=True) 303 | ind_record = ind_record[ind_record['type'] == 304 | industry].set_index("code").industry_code 305 | return ind_record 306 | 307 | def _get_cached_industry(self, securities, start_date, end_date): 308 | ind_record = self.get_ind_record(self.industry) 309 | start_date = convert_date(start_date) 310 | end_date = convert_date(end_date) 311 | trade_days = self._get_trade_days(start_date, end_date) 312 | ind_record = ind_record[(ind_record.index.isin(securities))] 313 | if self.show_progress: 314 | trade_days = tqdm(trade_days, desc="load industry info : ") 315 | df_list = [] 316 | for d in trade_days: 317 | rec = ind_record[(ind_record.start_date <= d) & ( 318 | d <= ind_record.end_date)].industry_name 319 | rec.name = d 320 | df_list.append(rec) 321 | df = pd.DataFrame(df_list).reindex(columns=securities) 322 | return df.fillna('NA') 323 | 324 | def get_groupby(self, securities, start_date, end_date): 325 | if self.allow_industry_cache: 326 | return self._get_cached_industry(securities, start_date, end_date) 327 | else: 328 | return self._get_industry(securities=securities, 329 | start_date=start_date, end_date=end_date, 330 | industry=self.industry) 331 | 332 | def _get_cached_mkt_cap_by_valuation(self, securities, date, field, overwrite=False): 333 | """市值处理函数, 将获取的市值数据缓存到本地""" 334 | if not securities: 335 | return pd.Series(dtype='float64', name=date) 336 | 337 | query = self.api.query 338 | valuation = 
self.api.valuation
339 | cache_dir = os.path.join(get_cache_dir(), 'mkt_cap', date.strftime("%Y%m"))
340 | fp = os.path.join(cache_dir, date.strftime("%Y%m%d")) + '.feather'
341 |
342 | if os.path.exists(fp) and not overwrite:
343 | data = feather.read_feather(fp)
344 | else:
345 | if not os.path.exists(cache_dir):
346 | os.makedirs(cache_dir)
347 | codes = self.api.get_all_securities('stock').index.tolist()
348 | q = query(valuation.code,
349 | valuation.market_cap,
350 | valuation.circulating_market_cap).filter(
351 | valuation.code.in_(codes))
352 | data = self.api.get_fundamentals(q, date=date2str(date))
353 | data[['market_cap', 'circulating_market_cap']] = data[
354 | ['market_cap', 'circulating_market_cap']] * (10 ** 8)
355 | if date < today() or (date == today() and now().time() >= Time(16, 30)):
356 | data.to_feather(fp)
357 |
358 | return data[data.code.isin(securities)].set_index('code')[field]
359 |
360 | def _get_market_cap(self, securities, start_date, end_date, ln=False, field='market_cap'):
361 | trade_days = self._get_trade_days(start_date, end_date)
362 |
363 | def get_mkt_cap(s, date, field):
364 | if not s:
365 | return pd.Series(dtype='float64', name=date)
366 | data = self.api.get_fundamentals(
367 | q, date=date2str(date)
368 | ).set_index('code')[field] * (10 ** 8)
369 | return data
370 |
371 | def get_mkt_cap_cache(s, date, field):
372 | cap = get_factor_values_by_cache(
373 | date, securities, factor_path=cache_dir).reindex(columns=[field])
374 | return cap[field]
375 |
376 | if self.allow_cache and len(trade_days) > 5:
377 | if self.mkt_cache_api == 'factor':
378 | desc = 'check/save cap cache :' if self.show_progress else False
379 | cache_dir = save_factor_values_by_group(start_date,
380 | end_date,
381 | factor_names=['market_cap', 'circulating_market_cap'],
382 | group_name='mkt_cap',
383 | show_progress=desc)
384 | market_api = get_mkt_cap_cache
385 | else:
386 | market_api = self._get_cached_mkt_cap_by_valuation
387 | else:
388 | market_api = get_mkt_cap
389 | query = self.api.query
390 | valuation = self.api.valuation
391 |
392 | if field == 'market_cap':
393 | q = query(valuation.code, valuation.market_cap).filter(
394 | valuation.code.in_(securities))
395 | elif field == 'circulating_market_cap':
396 | q = query(valuation.code, valuation.circulating_market_cap).filter(
397 | valuation.code.in_(securities))
398 | else:
399 | raise ValueError("不支持的字段 : {}".format(field))
400 |
401 | if self.show_progress:
402 | trade_days = tqdm(trade_days, desc="load cap info : ")
403 |
404 | market_cap = []
405 | for date in trade_days:
406 | cap = market_api(securities, date, field)
407 | cap.name = date
408 | market_cap.append(cap)
409 | market_cap = pd.concat(market_cap, axis=1).astype(float).reindex(index=securities)
410 |
411 | if ln:
412 | market_cap = np.log(market_cap)
413 |
414 | return market_cap.T
415 |
416 | def _get_circulating_market_cap(self, securities, start_date, end_date,
417 | ln=False):
418 | return self._get_market_cap(securities, start_date, end_date,
419 | ln=ln, field='circulating_market_cap')
420 |
421 | def _get_average_weights(self, securities, start_date, end_date):
422 | return {sec: 1.0 for sec in securities}
423 |
424 | def get_weights(self, securities, start_date, end_date):
425 | start_date = date2str(start_date)
426 | end_date = date2str(end_date)
427 |
428 | if self.weight_method == 'avg':
429 | weight_api = self._get_average_weights
430 | elif self.weight_method == 'mktcap':
431 | weight_api = partial(self._get_market_cap, ln=False)
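# 编者注: 以下注释为补充的示意用法, 不属于原仓库代码.
# weight_method 的五种取值最终分别落到等权函数 _get_average_weights
# 和上面的两个市值辅助函数上. 一个最小的调用示意 (假设已通过
# jqdatasdk.auth 登录, 其中的股票代码与日期仅为举例):
#
#   api = DataApi(weight_method='ln_mktcap')
#   w = api.get_weights(['000001.XSHE', '600000.XSHG'],
#                       start_date='2023-01-04', end_date='2023-01-06')
#   # w 为 pd.DataFrame: index 为交易日, columns 为股票代码, 值为 ln(总市值)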
432 | elif self.weight_method == 'ln_mktcap': 433 | weight_api = partial(self._get_market_cap, ln=True) 434 | elif self.weight_method == 'cmktcap': 435 | weight_api = partial(self._get_circulating_market_cap, ln=False) 436 | elif self.weight_method == 'ln_cmktcap': 437 | weight_api = partial(self._get_circulating_market_cap, ln=True) 438 | else: 439 | raise ValueError('invalid weight_method') 440 | 441 | return weight_api(securities=securities, start_date=start_date, 442 | end_date=end_date) 443 | 444 | @property 445 | def apis(self): 446 | return dict(prices=self.get_prices, 447 | groupby=self.get_groupby, 448 | weights=self.get_weights) 449 | -------------------------------------------------------------------------------- /jqfactor_analyzer/exceptions.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | 4 | from functools import wraps 5 | 6 | 7 | def rethrow(exception, additional_message): 8 | """ 9 | 重新抛出当前作用域中的最后一个异常, 保留堆栈信息, 并且在报错信息中添加其他内容 10 | """ 11 | e = exception 12 | m = additional_message 13 | if not e.args: 14 | e.args = (m,) 15 | else: 16 | e.args = (e.args[0] + m,) + e.args[1:] 17 | raise e 18 | 19 | 20 | def non_unique_bin_edges_error(func): 21 | """ 22 | 捕获 pd.qcut 的异常, 添加提示信息并报错 23 | """ 24 | message = u""" 25 | 根据输入的 quantiles 计算时发生错误. 26 |     这通常发生在输入包含太多相同值, 使得它们跨越多个分位. 27 | 每天的因子值是按照分位数平均分组的, 相同的值不能跨越多个分位数. 28 |     可能的解决方法: 29 | 1. 减少分位数 30 | 2. 调整因子减少重复值 31 | 3. 尝试不同的股票池 32 | """ 33 | 34 | @wraps(func) 35 | def dec(*args, **kwargs): 36 | try: 37 | return func(*args, **kwargs) 38 | except ValueError as e: 39 | if 'Bin edges must be unique' in str(e): 40 | rethrow(e, message) 41 | raise 42 | 43 | return dec 44 | 45 | 46 | class MaxLossExceededError(Exception): 47 | pass 48 | -------------------------------------------------------------------------------- /jqfactor_analyzer/factor_cache.py: -------------------------------------------------------------------------------- 1 | import hashlib 2 | from itertools import groupby 3 | import pandas as pd 4 | import os 5 | import json 6 | import functools 7 | import logging 8 | from .when import today, now, TimeDelta 9 | from tqdm import tqdm 10 | 11 | 12 | try: 13 | import jqdata 14 | api = jqdata.apis 15 | api_name = 'jqdata' 16 | except ImportError: 17 | import jqdatasdk 18 | api = jqdatasdk 19 | api_name = 'jqdatasdk' 20 | 21 | 22 | def get_cache_config(): 23 | """获取缓存目录""" 24 | config_path = os.path.join( 25 | os.path.dirname(os.path.abspath(__file__)), 'config.json' 26 | ) 27 | if not os.path.exists(config_path): 28 | return set_cache_dir("") 29 | else: 30 | with open(config_path, 'r') as conf: 31 | return json.load(conf) 32 | 33 | 34 | def set_cache_dir(path): 35 | """设置缓存目录""" 36 | cfg = {'default_dir': '~/jqfactor_datacache/bundle', 37 | 'user_dir': os.path.expanduser(path)} 38 | config_path = os.path.join( 39 | os.path.dirname(os.path.abspath(__file__)), 'config.json' 40 | ) 41 | with open(config_path, 'w') as conf: 42 | json.dump(cfg, conf) 43 | get_cache_dir.cache_clear() 44 | return cfg 45 | 46 | 47 | def get_factor_values(securities, factors=None, start_date=None, end_date=None, count=None): 48 | if api_name == 'jqdatasdk': 49 | func = api.get_factor_values 50 | else: 51 | from jqfactor import get_factor_values 52 | func = get_factor_values 53 | return func(securities, factors, start_date, end_date, count) 54 | 55 | 56 | @functools.lru_cache() 57 | def get_cache_dir(): 58 | # 优先获取用户配置的缓存目录, 若无, 则使用默认目录 59 | cfg = get_cache_config() 60 | user_path = 
cfg.get('user_dir', "") 61 | if user_path != "": 62 | return os.path.expanduser(user_path) 63 | return os.path.expanduser(cfg['default_dir']) 64 | 65 | 66 | def list_to_tuple_converter(func): 67 | @functools.wraps(func) 68 | def wrapper(*args, **kwargs): 69 | # 将所有位置参数中的 list 转换为 tuple 70 | args = tuple(tuple(arg) if isinstance( 71 | arg, list) else arg for arg in args) 72 | 73 | # 将关键字参数中的 list 转换为 tuple 74 | kwargs = {k: tuple(v) if isinstance(v, list) 75 | else v for k, v in kwargs.items()} 76 | 77 | return func(*args, **kwargs) 78 | return wrapper 79 | 80 | 81 | @list_to_tuple_converter 82 | @functools.lru_cache() 83 | def get_factor_folder(factor_names, group_name=None): 84 | """获取因子组的文件夹 85 | factor_names : 因子名列表 86 | group_name : 因子组的名称, 如果指定则使用指定的名称作为文件夹名 87 | 否则用 jqfactor_cache_ + 因子名的 md5 值 (顺序无关) 作为文件夹名 88 | """ 89 | if group_name: 90 | return group_name 91 | else: 92 | if factor_names == 'prices': 93 | return 'jqprice_cache' 94 | if isinstance(factor_names, str): 95 | factor_names = [factor_names] 96 | factor_names = sorted(factor_names) 97 | factor_names = ''.join(factor_names) 98 | hash_object = hashlib.md5(factor_names.encode()) 99 | hash_hex = hash_object.hexdigest() 100 | return f"jqfactor_cache_{hash_hex}" 101 | 102 | 103 | def get_date_miss_group(A, B): 104 | '''将A相比B缺失的部分按连续性进行分组''' 105 | group_values = [] 106 | masks = [(x not in A) for x in B] 107 | for key, group in groupby(zip(B, masks), lambda x: x[1]): 108 | if key: 109 | group_values.append([item[0] for item in group]) 110 | return group_values 111 | 112 | 113 | def save_data_by_month(factor_names, start, end, month_path): 114 | """按时间段获取储存数据(不要跨月) 115 | """ 116 | start = pd.to_datetime(start) 117 | end = pd.to_datetime(end) 118 | security_info = api.get_all_securities() 119 | security_info.start_date = pd.to_datetime(security_info.start_date) 120 | security_info.end_date = pd.to_datetime(security_info.end_date) 121 | 122 | month_value = {} 123 | stocks = security_info[(security_info.start_date <= end) & ( 124 | security_info.end_date >= start)].index.tolist() 125 | if factor_names == 'prices': 126 | month_value = api.get_price(stocks, start_date=start, end_date=end, 127 | skip_paused=False, round=False, 128 | fields=['open', 'close', 'factor'], 129 | fq='post', panel=False) 130 | if month_value.empty: 131 | return 0 132 | month_value.set_index(['code', 'time'], inplace=True) 133 | month_value[['open', 'close']] = month_value[[ 134 | 'open', 'close']].div(month_value['factor'], axis=0) 135 | else: 136 | for factor in factor_names: 137 | month_value.update(get_factor_values(stocks, 138 | start_date=start, 139 | end_date=end, 140 | factors=factor)) 141 | if not month_value: 142 | return 0 143 | month_value = pd.concat(month_value).unstack(level=1).T 144 | month_value.index.names = ('code', 'date') 145 | 146 | for date, data in month_value.groupby(month_value.index.get_level_values(1)): 147 | data = data.reset_index(level=1, drop=True) 148 | data = data.reindex(security_info[(security_info.start_date <= date) & ( 149 | security_info.end_date >= date)].index.tolist()) 150 | # 数据未产生, 或者已经生产了但是全为 nan 151 | if data.isna().values.all(): 152 | continue 153 | path = os.path.join(month_path, date.strftime("%Y%m%d") + ".feather") 154 | data.reset_index().to_feather(path) 155 | return month_value 156 | 157 | 158 | def save_factor_values_by_group(start_date, end_date, 159 | factor_names='prices', group_name=None, 160 | overwrite=False, cache_dir=None, show_progress=True): 161 | """将因子库数据按因子组储存到本地 162 | start_date : 开始时间 163 | 
end_date : 结束时间
164 | factor_names : 因子组所含因子的名称,除过因子库中支持的因子外,还支持指定为'prices'缓存价格数据
165 | overwrite : 文件已存在时是否覆盖更新
166 | 返回 : 因子组储存的路径, 文件以天为单位储存, 每天一个 feather 文件, 每月一个文件夹; 文件第一列为标的代码 (code), 其后各列为因子值, 每行对应当天在市的一个标的
167 | """
168 | if cache_dir is None:
169 | cache_dir = get_cache_dir()
170 |
171 | start_date = pd.to_datetime(start_date).date()
172 | last_day = today() - TimeDelta(days=1) if now().hour > 8 else today() - TimeDelta(days=2)
173 | end_date = min(pd.to_datetime(end_date).date(), last_day)
174 | date_range = pd.date_range(start_date, end_date, freq='1M')
175 | _date = pd.to_datetime(end_date)
176 | if len(date_range) == 0 or date_range[-1] < _date:
177 | date_range = date_range.append(pd.Index([_date]))
178 |
179 | if show_progress:
180 | if isinstance(show_progress, str):
181 | desc = show_progress
182 | elif factor_names == 'prices':
183 | desc = 'check/save price cache '
184 | else:
185 | desc = 'check/save factor cache '
186 | date_range = tqdm(date_range, total=len(date_range), desc=desc)
187 | root_path = os.path.join(
188 | cache_dir, get_factor_folder(factor_names, group_name))
189 |
190 | for end in date_range:
191 | start = max(end.replace(day=1).date(), start_date)
192 | month_path = os.path.join(root_path, end.strftime("%Y%m"))
193 | if not os.path.exists(month_path):
194 | os.makedirs(month_path)
195 | elif not overwrite:
196 | dates = [x.split(".")[0] for x in os.listdir(month_path)]
197 | dates = pd.to_datetime(dates).date
198 | trade_days = api.get_trade_days(start, end)
199 | miss_group = get_date_miss_group(dates, trade_days)
200 | if miss_group:
201 | for group in miss_group:
202 | save_data_by_month(
203 | factor_names, group[0], group[-1], month_path)
204 | continue
205 | save_data_by_month(factor_names, start, end, month_path)
206 |
207 | return root_path
208 |
209 |
210 | def get_factor_values_by_cache(date, codes=None, factor_names=None, group_name=None, factor_path=None):
211 | """从缓存的文件读取因子数据, 文件不存在时返回空的 DataFrame"""
212 | date = pd.to_datetime(date)
213 | if factor_path:
214 | path = os.path.join(factor_path,
215 | date.strftime("%Y%m"),
216 | date.strftime("%Y%m%d") + ".feather")
217 | elif group_name:
218 | path = os.path.join(get_cache_dir(),
219 | group_name,
220 | date.strftime("%Y%m"),
221 | date.strftime("%Y%m%d") + ".feather")
222 | elif factor_names:
223 | path = os.path.join(get_cache_dir(),
224 | get_factor_folder(factor_names),
225 | date.strftime("%Y%m"),
226 | date.strftime("%Y%m%d") + ".feather")
227 | else:
228 | raise ValueError("factor_names, factor_path 和 group_name 至少指定其中一个")
229 | # 数据未产生, 或者已经产生了但是全为 nan
230 | if not os.path.exists(path):
231 | factor_names = factor_names if factor_names != 'prices' else [
232 | 'open', 'close', 'factor']
233 | data = pd.DataFrame(index=codes, columns=factor_names)
234 | data.index.name = 'code'
235 | return data
236 |
237 | try:
238 | data = pd.read_feather(path, use_threads=False).set_index('code')
239 | except Exception as e:
240 | if factor_names:
241 | logging.error("\n{} 缓存文件可能已损坏, 正在重新下载".format(date))
242 | save_data_by_month(factor_names,
243 | date, date,
244 | os.path.dirname(path))
245 | data = get_factor_values_by_cache(
246 | date, codes, factor_names, factor_path=factor_path)
247 | else:
248 | raise ValueError(
249 | "\n{} 缓存文件可能已损坏, 请重新下载 (指定 factor_names 时会自动下载) {} ".format(date, e))
250 |
251 | if codes is not None:
252 | data = data.reindex(codes)
253 |
254 | return data
255 |
--------------------------------------------------------------------------------
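编者注: 下面补充一个简短示例 (非仓库原文件), 演示上面 factor_cache 模块中两个纯函数的行为:
get_factor_folder 生成的缓存文件夹名与因子列表顺序无关 (排序后取 md5), get_date_miss_group
则把缺失日期按连续区间分组, 供增量更新时按区间补拉数据. 示例假设已安装 jqfactor_analyzer
及其依赖 jqdatasdk (导入该模块时需要); 其中的因子名与日期字符串仅为举例:

    from jqfactor_analyzer.factor_cache import get_factor_folder, get_date_miss_group

    # 因子列表顺序无关: 两种顺序得到同一个文件夹名
    assert (get_factor_folder(['roe', 'total_asset_turnover_rate'])
            == get_factor_folder(['total_asset_turnover_rate', 'roe']))
    # 价格数据使用固定的文件夹名
    assert get_factor_folder('prices') == 'jqprice_cache'

    # 已缓存日期相对交易日列表的缺失部分, 按连续性分为两组
    cached = ['0102', '0103']
    trade_days = ['0101', '0102', '0103', '0104', '0105']
    print(get_date_miss_group(cached, trade_days))
    # 输出: [['0101'], ['0104', '0105']]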
/jqfactor_analyzer/performance.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | 4 | import numpy as np 5 | from scipy import stats 6 | import pandas as pd 7 | from statsmodels.api import OLS, add_constant 8 | 9 | from .compat import rolling_apply 10 | from .prepare import demean_forward_returns, common_start_returns 11 | from .utils import get_forward_returns_columns 12 | 13 | 14 | def factor_information_coefficient( 15 | factor_data, group_adjust=False, by_group=False, method=stats.spearmanr 16 | ): 17 | """ 18 | 通过因子值与因子远期收益计算信息系数(IC). 19 | 20 | 参数 21 | ---------- 22 | factor_data : pd.DataFrame - MultiIndex 23 | 一个 DataFrame, index 为日期 (level 0) 和资产(level 1) 的 MultiIndex, 24 | values 包括因子的值, 各期因子远期收益, 因子分位数, 25 | 因子分组(可选), 因子权重(可选) 26 | group_adjust : bool 27 | 是否使用分组去均值后的因子远期收益计算 IC. 28 | by_group : bool 29 | 是否分组计算 IC. 30 | Returns 31 | ------- 32 | ic : pd.DataFrame 33 | 因子信息系数(IC). 34 | """ 35 | 36 | def src_ic(group): 37 | f = group['factor'] 38 | _ic = group[get_forward_returns_columns(factor_data.columns)] \ 39 | .apply(lambda x: method(x, f)[0]) 40 | return _ic 41 | 42 | factor_data = factor_data.copy() 43 | 44 | grouper = [factor_data.index.get_level_values('date')] 45 | 46 | if group_adjust: 47 | factor_data = demean_forward_returns(factor_data, grouper + ['group']) 48 | if by_group: 49 | grouper.append('group') 50 | 51 | with np.errstate(divide='ignore', invalid='ignore'): 52 | ic = factor_data.groupby(grouper).apply(src_ic) 53 | 54 | return ic 55 | 56 | 57 | def mean_information_coefficient( 58 | factor_data, 59 | group_adjust=False, 60 | by_group=False, 61 | by_time=None, 62 | method=stats.spearmanr 63 | ): 64 | """ 65 | 根据不同分组求因子 IC 均值. 66 | 67 | 参数 68 | ---------- 69 | factor_data : pd.DataFrame - MultiIndex 70 | 一个 DataFrame, index 为日期 (level 0) 和资产(level 1) 的 MultiIndex, 71 | values 包括因子的值, 各期因子远期收益, 因子分位数, 72 | 因子分组(可选), 因子权重(可选) 73 | group_adjust : bool 74 | 是否使用分组去均值后的因子远期收益计算 IC. 75 | by_group : bool 76 | 是否分组计算 IC. 77 | by_time : str (pd time_rule), optional 78 | 根据相应的时间频率计算 IC 均值 79 | 时间频率参见 http://pandas.pydata.org/pandas-docs/stable/timeseries.html 80 | 81 | 返回值 82 | ------- 83 | ic : pd.DataFrame 84 | 根据不同分组求出的因子 IC 均值序列 85 | """ 86 | 87 | ic = factor_information_coefficient( 88 | factor_data, group_adjust, by_group, method=method 89 | ) 90 | 91 | grouper = [] 92 | if by_time is not None: 93 | grouper.append(pd.Grouper(freq=by_time)) 94 | if by_group: 95 | grouper.append('group') 96 | 97 | if len(grouper) == 0: 98 | ic = ic.mean() 99 | 100 | else: 101 | ic = (ic.reset_index().set_index('date').groupby(grouper).mean()) 102 | 103 | return ic 104 | 105 | 106 | def factor_returns(factor_data, demeaned=True, group_adjust=False): 107 | """ 108 | 计算按因子值加权的投资组合的收益 109 | 权重为去均值的因子除以其绝对值之和 (实现总杠杆率为1). 110 | 111 | 参数 112 | ---------- 113 | factor_data : pd.DataFrame - MultiIndex 114 | 一个 DataFrame, index 为日期 (level 0) 和资产(level 1) 的 MultiIndex, 115 | values 包括因子的值, 各期因子远期收益, 因子分位数, 116 | 因子分组(可选), 因子权重(可选) 117 | demeaned : bool 118 | 因子分析是否基于一个多空组合? 如果是 True, 则计算权重时因子值需要去均值 119 | group_adjust : bool 120 | 因子分析是否基于一个分组(行业)中性的组合? 
121 | 如果是 True, 则计算权重时因子值需要根据分组和日期去均值 122 | 123 | 返回值 124 | ------- 125 | returns : pd.DataFrame 126 | 每期零风险暴露的多空组合收益 127 | """ 128 | 129 | def to_weights(group, is_long_short): 130 | if is_long_short: 131 | demeaned_vals = group - group.mean() 132 | return demeaned_vals / demeaned_vals.abs().sum() 133 | else: 134 | return group / group.abs().sum() 135 | 136 | grouper = [factor_data.index.get_level_values('date')] 137 | if group_adjust: 138 | grouper.append('group') 139 | 140 | weights = factor_data.groupby(grouper)['factor'] \ 141 | .apply(to_weights, demeaned) 142 | 143 | if group_adjust: 144 | weights = weights.groupby(level='date').apply(to_weights, False) 145 | 146 | weighted_returns = \ 147 | factor_data[get_forward_returns_columns(factor_data.columns)] \ 148 | .multiply(weights, axis=0) 149 | 150 | returns = weighted_returns.groupby(level='date').sum() 151 | 152 | return returns 153 | 154 | 155 | def factor_alpha_beta(factor_data, demeaned=True, group_adjust=False): 156 | """ 157 | 计算因子的alpha(超额收益), 158 | alpha t-统计量 (alpha 显著性)和 beta(市场暴露). 159 | 使用每期平均远期收益作为自变量(视为市场组合收益) 160 | 因子值加权平均的远期收益作为因变量(视为因子收益), 进行回归. 161 | 162 | Parameters 163 | ---------- 164 | factor_data : pd.DataFrame - MultiIndex 165 | 一个 DataFrame, index 为日期 (level 0) 和资产(level 1) 的 MultiIndex, 166 | values 包括因子的值, 各期因子远期收益, 因子分位数, 167 | 因子分组(可选), 因子权重(可选) 168 | demeaned : bool 169 | 因子分析是否基于一个多空组合? 如果是 True, 则计算权重时因子值需要去均值 170 | group_adjust : bool 171 | 因子分析是否基于一个分组(行业)中性的组合? 172 | 如果是 True, 则计算权重时因子值需要根据分组和日期去均值 173 | Returns 174 | ------- 175 | alpha_beta : pd.Series 176 | 一个包含 alpha, beta, a t-统计量(alpha) 的序列 177 | """ 178 | 179 | returns = factor_returns(factor_data, demeaned, group_adjust) 180 | 181 | universe_ret = factor_data.groupby(level='date')[ 182 | get_forward_returns_columns(factor_data.columns)] \ 183 | .mean().loc[returns.index] 184 | 185 | if isinstance(returns, pd.Series): 186 | returns.name = universe_ret.columns.values[0] 187 | returns = pd.DataFrame(returns) 188 | 189 | alpha_beta = pd.DataFrame() 190 | for period in returns.columns.values: 191 | x = universe_ret[period].values 192 | y = returns[period].values 193 | x = add_constant(x) 194 | period_int = int(period.replace('period_', '')) 195 | 196 | reg_fit = OLS(y, x).fit() 197 | alpha, beta = reg_fit.params 198 | 199 | alpha_beta.loc['Ann. alpha', period] = \ 200 | (1 + alpha) ** (250.0 / period_int) - 1 201 | alpha_beta.loc['beta', period] = beta 202 | 203 | return alpha_beta 204 | 205 | 206 | def cumulative_returns(returns, period): 207 | """ 208 | 从'N 期'因子远期收益率构建累积收益 209 | 当 'period' N 大于 1 时, 建立平均 N 个交错的投资组合 (在随后的时段 1,2,3,...,N 开始), 210 | 每个 N 个周期重新调仓, 最后计算 N 个投资组合累积收益的均值。 211 | 212 | 参数 213 | ---------- 214 | returns: pd.Series 215 | N 期因子远期收益序列 216 | period: integer 217 | 对应的因子远期收益时间跨度 218 | 219 | 返回值 220 | ------- 221 | pd.Series 222 | 累积收益序列 223 | """ 224 | 225 | returns = returns.fillna(0) 226 | 227 | if period == 1: 228 | return returns.add(1).cumprod() 229 | # 230 | # 构建 N 个交错的投资组合 231 | # 232 | 233 | def split_portfolio(ret, period): 234 | return pd.DataFrame(np.diag(ret)) 235 | 236 | sub_portfolios = returns.groupby( 237 | np.arange(len(returns.index)) // period, axis=0 238 | ).apply(split_portfolio, period) 239 | sub_portfolios.index = returns.index 240 | 241 | # 242 | # 将 N 期收益转换为 1 期收益, 方便计算累积收益 243 | # 244 | 245 | def rate_of_returns(ret, period): 246 | return ((np.nansum(ret) + 1)**(1. 
/ period)) - 1 247 | 248 | sub_portfolios = rolling_apply( 249 | sub_portfolios, 250 | window=period, 251 | func=rate_of_returns, 252 | min_periods=1, 253 | args=(period,) 254 | ) 255 | sub_portfolios = sub_portfolios.add(1).cumprod() 256 | 257 | # 258 | # 求 N 个投资组合累积收益均值 259 | # 260 | return sub_portfolios.mean(axis=1) 261 | 262 | 263 | def weighted_mean_return(factor_data, grouper): 264 | """计算(年化)加权平均/标准差""" 265 | forward_returns_columns = get_forward_returns_columns(factor_data.columns) 266 | 267 | def agg(values, weights): 268 | count = len(values) 269 | average = np.average(values, weights=weights, axis=0) 270 | # Fast and numerically precise 271 | variance = np.average( 272 | (values - average)**2, weights=weights, axis=0 273 | ) * count / max((count - 1), 1) 274 | return pd.Series( 275 | [average, np.sqrt(variance), count], index=['mean', 'std', 'count'] 276 | ) 277 | 278 | group_stats = factor_data.groupby(grouper)[ 279 | forward_returns_columns.append(pd.Index(['weights']))] \ 280 | .apply(lambda x: x[forward_returns_columns].apply( 281 | agg, weights=x['weights'].fillna(0.0).values 282 | )) 283 | 284 | mean_ret = group_stats.xs('mean', level=-1) 285 | 286 | std_error_ret = group_stats.xs('std', level=-1) \ 287 | / np.sqrt(group_stats.xs('count', level=-1)) 288 | 289 | return mean_ret, std_error_ret 290 | 291 | 292 | def mean_return_by_quantile( 293 | factor_data, 294 | by_date=False, 295 | by_group=False, 296 | demeaned=True, 297 | group_adjust=False 298 | ): 299 | """ 300 | 计算各分位数的因子远期收益均值和标准差 301 | 302 | 参数 303 | ---------- 304 | factor_data : pd.DataFrame - MultiIndex 305 | 一个 DataFrame, index 为日期 (level 0) 和资产(level 1) 的 MultiIndex, 306 | values 包括因子的值, 各期因子远期收益, 因子分位数, 307 | 因子分组(可选), 因子权重(可选) 308 | by_date : bool 309 | 如果为 True, 则按日期计算各分位数的因子远期收益均值 310 | by_group : bool 311 | 如果为 True, 则分组计算各分位数的因子远期收益均值 312 | demeaned : bool 313 | 是否按日期对因子远期收益去均值 314 | group_adjust : bool 315 | 是否按日期和分组对因子远期收益去均值 316 | Returns 317 | ------- 318 | mean_ret : pd.DataFrame 319 | 各分位数因子远期收益均值 320 | std_error_ret : pd.DataFrame 321 | 各分位数因子远期收益标准差 322 | """ 323 | 324 | if group_adjust: 325 | grouper = [factor_data.index.get_level_values('date')] + ['group'] 326 | factor_data = demean_forward_returns(factor_data, grouper) 327 | elif demeaned: 328 | factor_data = demean_forward_returns(factor_data) 329 | else: 330 | factor_data = factor_data.copy() 331 | 332 | grouper = ['factor_quantile'] 333 | if by_date: 334 | grouper.append(factor_data.index.get_level_values('date')) 335 | 336 | if by_group: 337 | grouper.append('group') 338 | 339 | mean_ret, std_error_ret = weighted_mean_return(factor_data, grouper=grouper) 340 | 341 | return mean_ret, std_error_ret 342 | 343 | 344 | def compute_mean_returns_spread( 345 | mean_returns, upper_quant, lower_quant, std_err=None 346 | ): 347 | """ 348 | 计算两个分位数的平均收益之差, 和(可选)计算此差异的标准差 349 | 350 | 参数 351 | ---------- 352 | mean_returns : pd.DataFrame 353 | 各分位数因子远期收益均值 354 | upper_quant : int 355 | 作为被减数的因子分位数 356 | lower_quant : int 357 | 作为减数的因子分位数 358 | std_err : pd.DataFrame 359 | 各分位数因子远期收益标准差 360 | 361 | 返回值 362 | ------- 363 | mean_return_difference : pd.Series 364 | 每期两个分位数的平均收益之差 365 | joint_std_err : pd.Series 366 | 每期两个分位数的平均收益标准差之差 367 | """ 368 | if isinstance(mean_returns.index, pd.MultiIndex): 369 | mean_return_difference = mean_returns.xs(upper_quant, 370 | level='factor_quantile') \ 371 | - mean_returns.xs(lower_quant, level='factor_quantile') 372 | else: 373 | mean_return_difference = mean_returns.loc[ 374 | upper_quant] - 
mean_returns.loc[lower_quant]
375 |
376 | if std_err is not None and isinstance(std_err.index, pd.MultiIndex):
377 | std1 = std_err.xs(upper_quant, level='factor_quantile')
378 | std2 = std_err.xs(lower_quant, level='factor_quantile')
379 | elif std_err is not None:
380 | std1 = std_err.loc[upper_quant]
381 | std2 = std_err.loc[lower_quant]
382 | joint_std_err = np.sqrt(std1**2 + std2**2) if std_err is not None else None
383 |
384 | return mean_return_difference, joint_std_err
385 |
386 |
387 | def quantile_turnover(quantile_factor, quantile, period=1):
388 | """
389 | 计算当期在分位数中的因子不在上一期分位数中的比例
390 |
391 | Parameters
392 | ----------
393 | quantile_factor : pd.Series
394 | 以日期和资产为 index 的因子分位数序列.
395 | quantile : int
396 | 对应的分位数
397 | period: int, optional
398 | 对应的因子远期收益时间跨度
399 | Returns
400 | -------
401 | quant_turnover : pd.Series
402 | 每期对应分位数因子的换手率
403 | """
404 |
405 | quant_names = quantile_factor[quantile_factor == quantile]
406 | quant_name_sets = quant_names.groupby(
407 | level=['date']
408 | ).apply(lambda x: set(x.index.get_level_values('asset')))
409 | new_names = (quant_name_sets - quant_name_sets.shift(period)).dropna()
410 | quant_turnover = new_names.apply(lambda x: len(x)) / quant_name_sets.apply(
411 | lambda x: len(x)
412 | )
413 | quant_turnover.name = quantile
414 | return quant_turnover
415 |
416 |
417 | def factor_autocorrelation(factor_data, period=1, rank=True):
418 | """
419 | 计算指定时间跨度内平均因子排名/因子值的自相关性.
420 | 该指标对于衡量因子的换手率非常有用.
421 | 如果每个因子值在一个周期内随机变化, 我们预计自相关为 0.
422 |
423 | 参数
424 | ----------
425 | factor_data : pd.DataFrame - MultiIndex
426 | 一个 DataFrame, index 为日期 (level 0) 和资产(level 1) 的 MultiIndex,
427 | values 包括因子的值, 各期因子远期收益, 因子分位数,
428 | 因子分组(可选), 因子权重(可选)
429 | period: int, optional
430 | 对应的因子远期收益时间跨度
431 | Returns
432 | -------
433 | autocorr : pd.Series
434 | 滞后 period 期的因子自相关性
435 | """
436 |
437 | grouper = [factor_data.index.get_level_values('date')]
438 |
439 | if rank:
440 | ranks = factor_data.groupby(grouper)[['factor']].rank()
441 | else:
442 | ranks = factor_data[['factor']]
443 | asset_factor_rank = ranks.reset_index().pivot(
444 | index='date', columns='asset', values='factor'
445 | )
446 |
447 | autocorr = asset_factor_rank.corrwith(
448 | asset_factor_rank.shift(period), axis=1
449 | )
450 | autocorr.name = period
451 | return autocorr
452 |
453 |
454 | def average_cumulative_return_by_quantile(
455 | factor_data,
456 | prices,
457 | periods_before=10,
458 | periods_after=15,
459 | demeaned=True,
460 | group_adjust=False,
461 | by_group=False
462 | ):
463 | """
464 | 计算由 periods_before 到 periods_after 定义的周期范围内的因子分位数的平均累积收益率
465 |
466 | 参数
467 | ----------
468 | factor_data : pd.DataFrame - MultiIndex
469 | 一个 DataFrame, index 为日期 (level 0) 和资产(level 1) 的 MultiIndex,
470 | values 包括因子的值, 各期因子远期收益, 因子分位数,
471 | 因子分组(可选), 因子权重(可选)
472 | prices : pd.DataFrame
473 | 用于计算因子远期收益的价格数据
474 | columns 为资产, index 为日期.
475 | 价格数据必须覆盖因子分析时间段以及额外远期收益计算中的最大预期期数.
476 | periods_before : int, optional 477 | 之前多少期 478 | periods_after : int, optional 479 | 之后多少期 480 | demeaned : bool, optional 481 | 是否按日期对因子远期收益去均值 482 | group_adjust : bool 483 | 是否按日期和分组对因子远期收益去均值 484 | by_group : bool 485 | 如果为 True, 则分组计算各分位数的因子远期累积收益 486 | Returns 487 | ------- 488 | cumulative returns and std deviation : pd.DataFrame 489 | 一个 DataFrame, index 为分位数 (level 0) 和 'mean'/'std' (level 1) 的 MultiIndex 490 | columns 为取值范围从 -periods_before 到 periods_after 的整数 491 | 如果 by_group=True, 则 index 会多出一个 'group' level 492 | """ 493 | 494 | def cumulative_return(q_fact, demean_by): 495 | return common_start_returns( 496 | q_fact, prices, periods_before, periods_after, True, True, demean_by 497 | ) 498 | 499 | def average_cumulative_return(q_fact, demean_by): 500 | q_returns = cumulative_return(q_fact, demean_by) 501 | return pd.DataFrame( 502 | { 503 | 'mean': q_returns.mean(axis=1), 504 | 'std': q_returns.std(axis=1) 505 | } 506 | ).T 507 | 508 | if by_group: 509 | 510 | returns_bygroup = [] 511 | 512 | for group, g_data in factor_data.groupby('group'): 513 | g_fq = g_data['factor_quantile'] 514 | if group_adjust: 515 | demean_by = g_fq # demeans at group level 516 | elif demeaned: 517 | demean_by = factor_data['factor_quantile'] # demean by all 518 | else: 519 | demean_by = None 520 | # 521 | # Align cumulative return from different dates to the same index 522 | # then compute mean and std 523 | # 524 | avgcumret = g_fq.groupby(g_fq).apply( 525 | average_cumulative_return, demean_by 526 | ) 527 | avgcumret['group'] = group 528 | avgcumret.set_index('group', append=True, inplace=True) 529 | returns_bygroup.append(avgcumret) 530 | 531 | return pd.concat(returns_bygroup, axis=0) 532 | 533 | else: 534 | 535 | if group_adjust: 536 | all_returns = [] 537 | for group, g_data in factor_data.groupby('group'): 538 | g_fq = g_data['factor_quantile'] 539 | avgcumret = g_fq.groupby(g_fq).apply(cumulative_return, g_fq) 540 | all_returns.append(avgcumret) 541 | q_returns = pd.concat(all_returns, axis=1) 542 | q_returns = pd.DataFrame( 543 | { 544 | 'mean': q_returns.mean(axis=1), 545 | 'std': q_returns.std(axis=1) 546 | } 547 | ) 548 | return q_returns.unstack(level=1).stack(level=0) 549 | elif demeaned: 550 | fq = factor_data['factor_quantile'] 551 | return fq.groupby(fq).apply(average_cumulative_return, fq) 552 | else: 553 | fq = factor_data['factor_quantile'] 554 | return fq.groupby(fq).apply(average_cumulative_return, None) 555 | -------------------------------------------------------------------------------- /jqfactor_analyzer/plot_utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | 4 | import sys 5 | import subprocess 6 | from functools import wraps 7 | 8 | import matplotlib as mpl 9 | import seaborn as sns 10 | import pandas as pd 11 | 12 | 13 | def customize(func): 14 | 15 | @wraps(func) 16 | def call_w_context(*args, **kwargs): 17 | 18 | if not PlotConfig.FONT_SETTED: 19 | _use_chinese(True) 20 | 21 | set_context = kwargs.pop('set_context', True) 22 | if set_context: 23 | with plotting_context(), axes_style(): 24 | sns.despine(left=True) 25 | return func(*args, **kwargs) 26 | else: 27 | return func(*args, **kwargs) 28 | 29 | return call_w_context 30 | 31 | 32 | def plotting_context(context='notebook', font_scale=1.5, rc=None): 33 | 34 | if rc is None: 35 | rc = {} 36 | 37 | rc_default = {'lines.linewidth': 1.5} 38 | 39 | for name, val in rc_default.items(): 40 | rc.setdefault(name, val) 41 | 42 | return 
sns.plotting_context(context=context, font_scale=font_scale, rc=rc) 43 | 44 | 45 | def axes_style(style='darkgrid', rc=None): 46 | 47 | if rc is None: 48 | rc = {} 49 | 50 | rc_default = {} 51 | 52 | for name, val in rc_default.items(): 53 | rc.setdefault(name, val) 54 | 55 | return sns.axes_style(style=style, rc=rc) 56 | 57 | 58 | def print_table(table, name=None, fmt=None): 59 | 60 | from IPython.display import display 61 | 62 | if isinstance(table, pd.Series): 63 | table = pd.DataFrame(table) 64 | 65 | if isinstance(table, pd.DataFrame): 66 | table.columns.name = name 67 | 68 | prev_option = pd.get_option('display.float_format') 69 | if fmt is not None: 70 | pd.set_option('display.float_format', lambda x: fmt.format(x)) 71 | 72 | display(table) 73 | 74 | if fmt is not None: 75 | pd.set_option('display.float_format', prev_option) 76 | 77 | 78 | class PlotConfig(object): 79 | FONT_SETTED = False 80 | USE_CHINESE_LABEL = False 81 | MPL_FONT_FAMILY = mpl.rcParams["font.family"] 82 | MPL_FONT = mpl.rcParams["font.sans-serif"] 83 | MPL_UNICODE_MINUS = mpl.rcParams["axes.unicode_minus"] 84 | 85 | 86 | def get_chinese_font(): 87 | if sys.platform.startswith('linux'): 88 | cmd = 'fc-list :lang=zh -f "%{family}\n"' 89 | output = subprocess.check_output(cmd, shell=True) 90 | if isinstance(output, bytes): 91 | output = output.decode("utf-8") 92 | zh_fonts = [ 93 | f.split(',', 1)[0] for f in output.split('\n') if f.split(',', 1)[0] 94 | ] 95 | return zh_fonts 96 | 97 | return [] 98 | 99 | 100 | def _use_chinese(use=None): 101 | if use is None: 102 | return PlotConfig.USE_CHINESE_LABEL 103 | elif use: 104 | PlotConfig.USE_CHINESE_LABEL = use 105 | PlotConfig.FONT_SETTED = True 106 | _set_chinese_fonts() 107 | else: 108 | PlotConfig.USE_CHINESE_LABEL = use 109 | PlotConfig.FONT_SETTED = True 110 | _set_default_fonts() 111 | 112 | 113 | def _set_chinese_fonts(): 114 | default_chinese_font = ['SimHei', 'FangSong', 'STXihei', 'Hiragino Sans GB', 115 | 'Heiti SC', 'WenQuanYi Micro Hei'] 116 | chinese_font = default_chinese_font + get_chinese_font() 117 | # 设置中文字体 118 | mpl.rc( 119 | "font", **{ 120 | # seaborn 需要设置 sans-serif 121 | "sans-serif": chinese_font, 122 | "family": ','.join(chinese_font) + ',sans-serif' 123 | } 124 | ) 125 | # 防止负号乱码 126 | mpl.rcParams["axes.unicode_minus"] = False 127 | 128 | 129 | def _set_default_fonts(): 130 | mpl.rc( 131 | "font", **{ 132 | "sans-serif": PlotConfig.MPL_FONT, 133 | "family": PlotConfig.MPL_FONT_FAMILY 134 | } 135 | ) 136 | mpl.rcParams["axes.unicode_minus"] = PlotConfig.MPL_UNICODE_MINUS 137 | 138 | 139 | class _PlotLabels(object): 140 | 141 | def get(self, v): 142 | if _use_chinese(): 143 | return getattr(self, v + "_CN") 144 | else: 145 | return getattr(self, v + "_EN") 146 | 147 | 148 | class ICTS(_PlotLabels): 149 | TITLE_CN = "{} 天 IC" 150 | TITLE_EN = "{} Period Forward Return Information Coefficient (IC)" 151 | LEGEND_CN = ["IC", "1个月移动平均"] 152 | LEGEND_EN = ["IC", "1 month moving avg"] 153 | TEXT_CN = "均值 {:.3f} \n方差 {:.3f}" 154 | TEXT_EN = "Mean {:.3f} \nStd. {:.3f}" 155 | 156 | 157 | ICTS = ICTS() 158 | 159 | 160 | class ICHIST(_PlotLabels): 161 | TITLE_CN = "%s 天 IC 分布直方图" 162 | TITLE_EN = "%s Period IC" 163 | LEGEND_CN = "均值 {:.3f} \n方差 {:.3f}" 164 | LEGEND_EN = "Mean {:.3f} \nStd. 
{:.3f}" 165 | 166 | 167 | ICHIST = ICHIST() 168 | 169 | 170 | class ICQQ(_PlotLabels): 171 | NORM_CN = "正态" 172 | NORM_EN = "Normal" 173 | T_CN = "T" 174 | T_EN = "T" 175 | CUSTOM_CN = "自定义" 176 | CUSTOM_EN = "Theoretical" 177 | TITLE_CN = "{} 天 IC {}分布 Q-Q 图" 178 | TITLE_EN = "{} Period IC {} Dist. Q-Q" 179 | XLABEL_CN = "{} 分布分位数" 180 | XLABEL_EN = "{} Distribution Quantile" 181 | YLABEL_CN = "Observed Quantile" 182 | YLABEL_EN = "Observed Quantile" 183 | 184 | 185 | ICQQ = ICQQ() 186 | 187 | 188 | class QRETURNBAR(_PlotLabels): 189 | COLUMN_CN = "{} 天" 190 | COLUMN_EN = "{} Day" 191 | TITLE_CN = "各分位数平均收益" 192 | TITLE_EN = "Mean Period Wise Return By Factor Quantile" 193 | YLABEL_CN = "平均收益 (bps)" 194 | YLABEL_EN = "Mean Return (bps)" 195 | 196 | 197 | QRETURNBAR = QRETURNBAR() 198 | 199 | 200 | class QRETURNVIOLIN(_PlotLabels): 201 | LEGENDNAME_CN = "滞后天数" 202 | LEGENDNAME_EN = "forward periods" 203 | TITLE_CN = "各分位数收益分布图" 204 | TITLE_EN = "Period Wise Return By Factor Quantile" 205 | YLABEL_CN = "收益 (bps)" 206 | YLABEL_EN = "Return (bps)" 207 | 208 | 209 | QRETURNVIOLIN = QRETURNVIOLIN() 210 | 211 | 212 | class QRETURNTS(_PlotLabels): 213 | TITLE_CN = "最大分位收益减最小分位收益 ({} 天)" 214 | TITLE_EN = "Top Minus Bottom Quantile Mean Return ({} Period Forward Return)" 215 | LEGEND0_CN = "当日收益 (加减 {:.2f} 倍当日标准差)" 216 | LEGEND0_EN = "mean returns spread (+/- {:.2f} std)" 217 | LEGEND1_CN = "1 个月移动平均" 218 | LEGEND1_EN = "1 month moving avg" 219 | YLABEL_CN = "分位数平均收益差 (bps)" 220 | YLABEL_EN = "Difference In Quantile Mean Return (bps)" 221 | 222 | 223 | QRETURNTS = QRETURNTS() 224 | 225 | 226 | class ICGROUP(_PlotLabels): 227 | TITLE_CN = "分组 IC" 228 | TITLE_EN = "Information Coefficient By Group" 229 | 230 | 231 | ICGROUP = ICGROUP() 232 | 233 | 234 | class AUTOCORR(_PlotLabels): 235 | TITLE_CN = "因子自相关性 (滞后 {} 天)" 236 | TITLE_EN = "{} Period Factor Autocorrelation" 237 | YLABEL_CN = "自相关性" 238 | YLABEL_EN = "Autocorrelation Coefficient" 239 | TEXT_CN = "均值 {:.3f}" 240 | TEXT_EN = "Mean {:.3f}" 241 | 242 | 243 | AUTOCORR = AUTOCORR() 244 | 245 | 246 | class TBTURNOVER(_PlotLabels): 247 | TURNOVER_CN = "{:d} 分位换手率" 248 | TURNOVER_EN = "quantile {:d} turnover" 249 | TITLE_CN = "{} 天换手率" 250 | TITLE_EN = "{} Period Top and Bottom Quantile Turnover" 251 | YLABEL_CN = "分位数换手率" 252 | YLABEL_EN = "Proportion Of Names New To Quantile" 253 | 254 | 255 | TBTURNOVER = TBTURNOVER() 256 | 257 | 258 | class ICHEATMAP(_PlotLabels): 259 | TITLE_CN = "{} 天 IC 月度均值" 260 | TITLE_EN = "Monthly Mean {} Period IC" 261 | 262 | 263 | ICHEATMAP = ICHEATMAP() 264 | 265 | 266 | class CUMRET(_PlotLabels): 267 | YLABEL_CN = "累积收益" 268 | YLABEL_EN = "Cumulative Returns" 269 | TITLE_CN = "因子值加权多空组合累积收益 ({} 天平均)" 270 | TITLE_EN = """Factor Weighted Long/Short Portfolio Cumulative Return 271 | ({} Fwd Period)""" 272 | 273 | 274 | CUMRET = CUMRET() 275 | 276 | 277 | class TDCUMRET(_PlotLabels): 278 | YLABEL_CN = "累积收益" 279 | YLABEL_EN = "Cumulative Returns" 280 | TITLE_CN = "做多最大分位做空最小分位组合累积收益 ({} 天平均)" 281 | TITLE_EN = """Long Top/Short Bottom Factor Portfolio Cumulative Return 282 | ({} Fwd Period)""" 283 | 284 | 285 | TDCUMRET = TDCUMRET() 286 | 287 | 288 | class CUMRETQ(_PlotLabels): 289 | YLABEL_CN = "累积收益(对数轴)" 290 | YLABEL_EN = "Log Cumulative Returns" 291 | TITLE_CN = "分位数 {} 天 Forward Return 累积收益 (对数轴)" 292 | TITLE_EN = """Cumulative Return by Quantile 293 | ({} Period Forward Return)""" 294 | 295 | 296 | CUMRETQ = CUMRETQ() 297 | 298 | 299 | class AVGCUMRET(_PlotLabels): 300 | TITLE_CN = "因子预测能力 (前 {} 天, 后 {} 天)" 301 | 
TITLE_EN = "Average Cumulative Returns by Quantile ({} days backword, {} days forward)" 302 | COLUMN_CN = "{} 分位" 303 | COLUMN_EN = "Quantile {}" 304 | XLABEL_CN = "天数" 305 | XLABEL_EN = "Periods" 306 | YLABEL_CN = "平均累积收益 (bps)" 307 | YLABEL_EN = "Mean Return (bps)" 308 | 309 | 310 | AVGCUMRET = AVGCUMRET() 311 | 312 | 313 | class EVENTSDIST(_PlotLabels): 314 | TITLE_CN = "因子数量随时间分布" 315 | TITLE_EN = "Distribution of events in time" 316 | XLABEL_CN = "日期" 317 | XLABEL_EN = "Date" 318 | YLABEL_CN = "因子数量" 319 | YLABEL_EN = "Number of events" 320 | 321 | 322 | EVENTSDIST = EVENTSDIST() 323 | 324 | 325 | class MISSIINGEVENTSDIST(_PlotLabels): 326 | TITLE_CN = "因子数量随时间分布" 327 | TITLE_EN = "Distribution of missing events in time" 328 | XLABEL_CN = "日期" 329 | XLABEL_EN = "Date" 330 | YLABEL_CN = "因子缺失率" 331 | YLABEL_EN = "Rate of missing events" 332 | 333 | 334 | MISSIINGEVENTSDIST = MISSIINGEVENTSDIST() 335 | -------------------------------------------------------------------------------- /jqfactor_analyzer/plotting.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | 4 | from __future__ import division, print_function 5 | 6 | import pandas as pd 7 | import numpy as np 8 | from scipy import stats 9 | from statsmodels.api import qqplot 10 | import matplotlib.pyplot as plt 11 | import matplotlib.cm as cm 12 | from matplotlib.ticker import ScalarFormatter 13 | import seaborn as sns 14 | 15 | from .compat import rolling_mean 16 | from .plot_utils import ( 17 | print_table, customize, ICTS, ICHIST, ICQQ, QRETURNBAR, QRETURNVIOLIN, 18 | QRETURNTS, ICGROUP, AUTOCORR, TBTURNOVER, ICHEATMAP, CUMRET, TDCUMRET, 19 | CUMRETQ, AVGCUMRET, EVENTSDIST, MISSIINGEVENTSDIST 20 | ) 21 | from .performance import cumulative_returns 22 | from .utils import (ignore_warning, convert_to_forward_returns_columns) 23 | 24 | 25 | DECIMAL_TO_BPS = 10000 26 | 27 | 28 | def plot_returns_table(alpha_beta, mean_ret_quantile, mean_ret_spread_quantile): 29 | returns_table = pd.DataFrame() 30 | returns_table = returns_table.append(alpha_beta) 31 | returns_table.loc["Mean Period Wise Return Top Quantile (bps)"] = \ 32 | mean_ret_quantile.iloc[-1] * DECIMAL_TO_BPS 33 | returns_table.loc["Mean Period Wise Return Bottom Quantile (bps)"] = \ 34 | mean_ret_quantile.iloc[0] * DECIMAL_TO_BPS 35 | returns_table.loc["Mean Period Wise Spread (bps)"] = \ 36 | mean_ret_spread_quantile.mean() * DECIMAL_TO_BPS 37 | 38 | print("收益分析") 39 | print_table(returns_table.apply(lambda x: x.round(3))) 40 | 41 | 42 | def plot_turnover_table(autocorrelation_data, quantile_turnover, return_df=False): 43 | turnover_table = pd.DataFrame() 44 | for period in sorted(quantile_turnover.keys()): 45 | for quantile, p_data in quantile_turnover[period].iteritems(): 46 | turnover_table.loc["Quantile {} Mean Turnover ".format(quantile), 47 | "{}".format(period)] = p_data.mean() 48 | auto_corr = pd.DataFrame() 49 | for period, p_data in autocorrelation_data.iteritems(): 50 | auto_corr.loc["Mean Factor Rank Autocorrelation", "{}" 51 | .format(period)] = p_data.mean() 52 | 53 | if return_df: 54 | return turnover_table.apply(lambda x: x.round(3)), auto_corr.apply(lambda x: x.round(3)) 55 | else: 56 | print("换手率分析") 57 | print_table(turnover_table.apply(lambda x: x.round(3))) 58 | print_table(auto_corr.apply(lambda x: x.round(3))) 59 | 60 | 61 | def plot_information_table(ic_data, return_df=False): 62 | ic_summary_table = pd.DataFrame() 63 | ic_summary_table["IC Mean"] = ic_data.mean() 64 | 
ic_summary_table["IC Std."] = ic_data.std() 65 | ic_summary_table["IR"] = ic_data.mean() / ic_data.std() 66 | t_stat, p_value = stats.ttest_1samp(ic_data, 0) 67 | ic_summary_table["t-stat(IC)"] = t_stat 68 | ic_summary_table["p-value(IC)"] = p_value 69 | ic_summary_table["IC Skew"] = stats.skew(ic_data) 70 | ic_summary_table["IC Kurtosis"] = stats.kurtosis(ic_data) 71 | 72 | if return_df: 73 | return ic_summary_table.apply(lambda x: x.round(3)).T 74 | else: 75 | print("IC 分析") 76 | print_table(ic_summary_table.apply(lambda x: x.round(3)).T) 77 | 78 | 79 | def plot_quantile_statistics_table(factor_data, return_df=False): 80 | quantile_stats = factor_data.groupby('factor_quantile') \ 81 | .agg(['min', 'max', 'mean', 'std', 'count'])['factor'] 82 | quantile_stats['count %'] = quantile_stats['count'] \ 83 | / quantile_stats['count'].sum() * 100. 84 | 85 | if return_df: 86 | return quantile_stats 87 | else: 88 | print("分位数统计") 89 | print_table(quantile_stats) 90 | 91 | 92 | @customize 93 | def plot_ic_ts(ic, ax=None): 94 | 95 | ic = ic.copy() 96 | 97 | num_plots = len(ic.columns) 98 | if ax is None: 99 | f, ax = plt.subplots(num_plots, 1, figsize=(18, num_plots * 7)) 100 | ax = np.asarray([ax]).flatten() 101 | 102 | ymin, ymax = (None, None) 103 | for a, (period, ic) in zip(ax, ic.iteritems()): 104 | period_num = period.replace('period_', '') 105 | ic.plot(alpha=0.7, ax=a, lw=0.7, color='steelblue') 106 | rolling_mean( 107 | ic, window=22 108 | ).plot( 109 | ax=a, color='forestgreen', lw=2, alpha=0.8 110 | ) 111 | 112 | a.axhline(0.0, linestyle='-', color='black', lw=1, alpha=0.8) 113 | a.set(ylabel='IC', xlabel="") 114 | a.set_title(ICTS.get("TITLE").format(period_num)) 115 | a.legend(ICTS.get("LEGEND"), loc='upper right') 116 | a.text( 117 | .05, 118 | .95, 119 | ICTS.get("TEXT").format(ic.mean(), ic.std()), 120 | fontsize=16, 121 | bbox={ 122 | 'facecolor': 'white', 123 | 'alpha': 1, 124 | 'pad': 5 125 | }, 126 | transform=a.transAxes, 127 | verticalalignment='top' 128 | ) 129 | 130 | curr_ymin, curr_ymax = a.get_ylim() 131 | ymin = curr_ymin if ymin is None else min(ymin, curr_ymin) 132 | ymax = curr_ymax if ymax is None else max(ymax, curr_ymax) 133 | 134 | for a in ax: 135 | a.set_ylim([ymin, ymax]) 136 | 137 | return ax 138 | 139 | 140 | @ignore_warning(message='Using a non-tuple sequence for multidimensional indexing is deprecated', 141 | category=FutureWarning) 142 | @customize 143 | def plot_ic_hist(ic, ax=None): 144 | 145 | ic = ic.copy() 146 | 147 | num_plots = len(ic.columns) 148 | 149 | v_spaces = ((num_plots - 1) // 3) + 1 150 | 151 | if ax is None: 152 | f, ax = plt.subplots(v_spaces, 3, figsize=(18, v_spaces * 6)) 153 | ax = ax.flatten() 154 | 155 | for a, (period, ic) in zip(ax, ic.iteritems()): 156 | period_num = period.replace('period_', '') 157 | sns.distplot(ic.replace(np.nan, 0.), norm_hist=True, ax=a) 158 | a.set_xlim([-1, 1]) 159 | a.set(title=ICHIST.get("TITLE") % period_num, xlabel='IC') 160 | a.text( 161 | .05, 162 | .95, 163 | ICHIST.get("LEGEND").format(ic.mean(), ic.std()), 164 | fontsize=16, 165 | bbox={ 166 | 'facecolor': 'white', 167 | 'alpha': 1, 168 | 'pad': 5 169 | }, 170 | transform=a.transAxes, 171 | verticalalignment='top' 172 | ) 173 | a.axvline(ic.mean(), color='w', linestyle='dashed', linewidth=2) 174 | 175 | if num_plots < len(ax): 176 | for a in ax[num_plots:]: 177 | a.set_visible(False) 178 | 179 | return ax 180 | 181 | 182 | @customize 183 | def plot_ic_qq(ic, theoretical_dist=stats.norm, ax=None): 184 | 185 | ic = ic.copy() 186 | 187 | num_plots = 
len(ic.columns) 188 | 189 | v_spaces = ((num_plots - 1) // 3) + 1 190 | 191 | if ax is None: 192 | f, ax = plt.subplots(v_spaces, 3, figsize=(18, v_spaces * 6)) 193 | ax = ax.flatten() 194 | 195 | if isinstance(theoretical_dist, stats.norm.__class__): 196 | dist_name = ICQQ.get("NORM") 197 | elif isinstance(theoretical_dist, stats.t.__class__): 198 | dist_name = ICQQ.get("T") 199 | else: 200 | dist_name = ICQQ.get("CUSTOM") 201 | 202 | for a, (period, ic) in zip(ax, ic.iteritems()): 203 | period_num = period.replace('period_', '') 204 | qqplot( 205 | ic.replace(np.nan, 0.).values, 206 | theoretical_dist, 207 | fit=True, 208 | line='45', 209 | ax=a 210 | ) 211 | a.set( 212 | title=ICQQ.get("TITLE").format(period_num, dist_name), 213 | xlabel=ICQQ.get("XLABEL").format(dist_name), 214 | ylabel=ICQQ.get("YLABEL"), 215 | ) 216 | 217 | if num_plots < len(ax): 218 | for a in ax[num_plots:]: 219 | a.set_visible(False) 220 | 221 | return ax 222 | 223 | 224 | @customize 225 | def plot_quantile_returns_bar( 226 | mean_ret_by_q, by_group=False, ylim_percentiles=None, ax=None 227 | ): 228 | mean_ret_by_q = mean_ret_by_q.copy() 229 | mean_ret_by_q.columns = mean_ret_by_q.columns.map( 230 | lambda x: QRETURNBAR.get("COLUMN").format(x.replace("period_", "")) 231 | ) 232 | 233 | if ylim_percentiles is not None: 234 | ymin = ( 235 | np.nanpercentile(mean_ret_by_q.values, ylim_percentiles[0]) * 236 | DECIMAL_TO_BPS 237 | ) 238 | ymax = ( 239 | np.nanpercentile(mean_ret_by_q.values, ylim_percentiles[1]) * 240 | DECIMAL_TO_BPS 241 | ) 242 | else: 243 | ymin = None 244 | ymax = None 245 | 246 | if by_group: 247 | num_group = len(mean_ret_by_q.index.get_level_values('group').unique()) 248 | 249 | if ax is None: 250 | v_spaces = ((num_group - 1) // 2) + 1 251 | f, ax = plt.subplots( 252 | v_spaces, 253 | 2, 254 | sharex=False, 255 | sharey=True, 256 | figsize=( 257 | max( 258 | 18, 259 | mean_ret_by_q.index.get_level_values('factor_quantile') 260 | .max() 261 | ), 6 * v_spaces 262 | ) 263 | ) 264 | ax = ax.flatten() 265 | 266 | for a, (sc, cor) in zip(ax, mean_ret_by_q.groupby(level='group')): 267 | ( 268 | cor.xs(sc, level='group').multiply(DECIMAL_TO_BPS).plot( 269 | kind='bar', title=sc, ax=a 270 | ) 271 | ) 272 | 273 | a.set(xlabel='', ylabel=QRETURNBAR.get("YLABEL"), ylim=(ymin, ymax)) 274 | 275 | if num_group < len(ax): 276 | for a in ax[num_group:]: 277 | a.set_visible(False) 278 | 279 | return ax 280 | 281 | else: 282 | if ax is None: 283 | f, ax = plt.subplots( 284 | 1, 285 | 1, 286 | figsize=( 287 | max( 288 | 18, 289 | mean_ret_by_q.index.get_level_values( 290 | 'factor_quantile' 291 | ).max() // 2 292 | ), 6 293 | ) 294 | ) 295 | 296 | mean_ret_by_q.multiply(DECIMAL_TO_BPS).plot( 297 | kind='bar', title=QRETURNBAR.get("TITLE"), ax=ax 298 | ) 299 | ax.set(xlabel="", ylabel=QRETURNBAR.get("YLABEL"), ylim=(ymin, ymax)) 300 | 301 | return ax 302 | 303 | 304 | @customize 305 | def plot_quantile_returns_violin(return_by_q, ylim_percentiles=None, ax=None): 306 | 307 | return_by_q = return_by_q.copy() 308 | 309 | if ylim_percentiles is not None: 310 | ymin = ( 311 | np.nanpercentile(return_by_q.values, ylim_percentiles[0]) * 312 | DECIMAL_TO_BPS 313 | ) 314 | ymax = ( 315 | np.nanpercentile(return_by_q.values, ylim_percentiles[1]) * 316 | DECIMAL_TO_BPS 317 | ) 318 | else: 319 | ymin = None 320 | ymax = None 321 | 322 | if ax is None: 323 | f, ax = plt.subplots(1, 1, figsize=(18, 6)) 324 | 325 | unstacked_dr = (return_by_q.multiply(DECIMAL_TO_BPS)) 326 | unstacked_dr.columns = 
unstacked_dr.columns.astype(str).str.replace( 327 | 'period_', '' 328 | ).astype(int).set_names(QRETURNVIOLIN.get("LEGENDNAME")) 329 | unstacked_dr = unstacked_dr.stack() 330 | unstacked_dr.name = 'return' 331 | unstacked_dr = unstacked_dr.reset_index() 332 | 333 | sns.violinplot( 334 | data=unstacked_dr, 335 | x='factor_quantile', 336 | hue=QRETURNVIOLIN.get("LEGENDNAME"), 337 | y='return', 338 | orient='v', 339 | cut=0, 340 | inner='quartile', 341 | ax=ax 342 | ) 343 | ax.set( 344 | xlabel='', 345 | ylabel=QRETURNVIOLIN.get("YLABEL"), 346 | title=QRETURNVIOLIN.get("TITLE"), 347 | ylim=(ymin, ymax) 348 | ) 349 | 350 | ax.axhline(0.0, linestyle='-', color='black', lw=0.7, alpha=0.6) 351 | 352 | return ax 353 | 354 | 355 | @customize 356 | def plot_mean_quantile_returns_spread_time_series( 357 | mean_returns_spread, std_err=None, bandwidth=1, ax=None 358 | ): 359 | if isinstance(mean_returns_spread, pd.DataFrame): 360 | if ax is None: 361 | ax = [None for a in mean_returns_spread.columns] 362 | 363 | ymin, ymax = (None, None) 364 | for (i, a), (name, fr_column 365 | ) in zip(enumerate(ax), mean_returns_spread.iteritems()): 366 | stdn = None if std_err is None else std_err[name] 367 | a = plot_mean_quantile_returns_spread_time_series( 368 | fr_column, std_err=stdn, bandwidth=bandwidth, ax=a 369 | ) 370 | ax[i] = a 371 | curr_ymin, curr_ymax = a.get_ylim() 372 | ymin = curr_ymin if ymin is None else min(ymin, curr_ymin) 373 | ymax = curr_ymax if ymax is None else max(ymax, curr_ymax) 374 | 375 | for a in ax: 376 | a.set_ylim([ymin, ymax]) 377 | 378 | return ax 379 | 380 | periods = mean_returns_spread.name 381 | title = QRETURNTS.get( 382 | "TITLE" 383 | ).format(periods.replace('period_', '') if periods is not None else '') 384 | 385 | if ax is None: 386 | f, ax = plt.subplots(figsize=(18, 6)) 387 | 388 | mean_returns_spread_bps = mean_returns_spread * DECIMAL_TO_BPS 389 | 390 | mean_returns_spread_bps.plot(alpha=0.4, ax=ax, lw=0.7, color='forestgreen') 391 | rolling_mean( 392 | mean_returns_spread_bps, window=22 393 | ).plot( 394 | color='orangered', alpha=0.7, ax=ax 395 | ) 396 | ax.legend( 397 | [QRETURNTS.get("LEGEND0").format(bandwidth), 398 | QRETURNTS.get("LEGEND1")], 399 | loc='upper right' 400 | ) 401 | 402 | if std_err is not None: 403 | std_err_bps = std_err * DECIMAL_TO_BPS 404 | upper = mean_returns_spread_bps.values + (std_err_bps * bandwidth) 405 | lower = mean_returns_spread_bps.values - (std_err_bps * bandwidth) 406 | ax.fill_between( 407 | mean_returns_spread.index, 408 | lower, 409 | upper, 410 | alpha=0.3, 411 | color='steelblue' 412 | ) 413 | 414 | ylim = np.nanpercentile(abs(mean_returns_spread_bps.values), 95) 415 | ax.set( 416 | ylabel=QRETURNTS.get("YLABEL"), 417 | xlabel="", 418 | title=title, 419 | ylim=(-ylim, ylim) 420 | ) 421 | ax.axhline(0.0, linestyle='-', color='black', lw=1, alpha=0.8) 422 | 423 | return ax 424 | 425 | 426 | @customize 427 | def plot_ic_by_group(ic_group, ax=None): 428 | 429 | ic_group = ic_group.copy() 430 | ic_group.columns = ic_group.columns.astype(str).str.replace('period_', '') 431 | if ax is None: 432 | f, ax = plt.subplots(1, 1, figsize=(max(18, len(ic_group)), 6)) 433 | ic_group.plot(kind='bar', ax=ax) 434 | ax.set(title=ICGROUP.get("TITLE"), xlabel="") 435 | ax.set_xticklabels(ic_group.index, rotation=45) 436 | 437 | return ax 438 | 439 | 440 | @customize 441 | def plot_factor_rank_auto_correlation( 442 | factor_autocorrelation, period=1, ax=None 443 | ): 444 | 445 | if ax is None: 446 | f, ax = plt.subplots(1, 1, figsize=(18, 6)) 
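    # 因子排名自相关性衡量相邻期因子排名的稳定程度:
    # 自相关越接近 1, 说明因子排名越稳定, 按该因子构建的组合换手率越低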
447 | 448 | factor_autocorrelation.plot( 449 | title=AUTOCORR.get("TITLE").format(period), ax=ax 450 | ) 451 | ax.set(ylabel=AUTOCORR.get("YLABEL").format(period), xlabel="") 452 | ax.axhline(0.0, linestyle='-', color='black', lw=1) 453 | ax.text( 454 | .05, 455 | .95, 456 | AUTOCORR.get("TEXT").format(factor_autocorrelation.mean()), 457 | fontsize=16, 458 | bbox={ 459 | 'facecolor': 'white', 460 | 'alpha': 1, 461 | 'pad': 5 462 | }, 463 | transform=ax.transAxes, 464 | verticalalignment='top' 465 | ) 466 | 467 | return ax 468 | 469 | 470 | @customize 471 | def plot_top_bottom_quantile_turnover(quantile_turnover, period=1, ax=None): 472 | 473 | if ax is None: 474 | f, ax = plt.subplots(1, 1, figsize=(18, 6)) 475 | 476 | max_quantile = quantile_turnover.columns.max() 477 | min_quantile = quantile_turnover.columns.min() 478 | turnover = pd.DataFrame() 479 | 480 | turnover[TBTURNOVER.get("TURNOVER").format(max_quantile) 481 | ] = quantile_turnover[max_quantile] 482 | turnover[TBTURNOVER.get("TURNOVER").format(min_quantile) 483 | ] = quantile_turnover[min_quantile] 484 | turnover.plot( 485 | title=TBTURNOVER.get("TITLE").format(period), ax=ax, alpha=0.6, lw=0.8 486 | ) 487 | 488 | ax.set(ylabel=TBTURNOVER.get("YLABEL"), xlabel="") 489 | 490 | return ax 491 | 492 | 493 | @customize 494 | def plot_monthly_ic_heatmap(mean_monthly_ic, ax=None): 495 | 496 | mean_monthly_ic = mean_monthly_ic.copy() 497 | 498 | num_plots = len(mean_monthly_ic.columns) 499 | 500 | v_spaces = ((num_plots - 1) // 3) + 1 501 | 502 | if ax is None: 503 | f, ax = plt.subplots(v_spaces, 3, figsize=(18, v_spaces * 6)) 504 | ax = ax.flatten() 505 | 506 | new_index_year = [] 507 | new_index_month = [] 508 | for date in mean_monthly_ic.index: 509 | new_index_year.append(date.year) 510 | new_index_month.append(date.month) 511 | 512 | mean_monthly_ic.index = pd.MultiIndex.from_arrays( 513 | [new_index_year, new_index_month], names=["year", "month"] 514 | ) 515 | 516 | for a, (period, ic) in zip(ax, mean_monthly_ic.iteritems()): 517 | periods_num = period.replace('period_', '') 518 | 519 | sns.heatmap( 520 | ic.unstack(), 521 | annot=True, 522 | alpha=1.0, 523 | center=0.0, 524 | annot_kws={"size": 15}, 525 | linewidths=0.01, 526 | linecolor='white', 527 | cmap=cm.RdYlGn, 528 | cbar=False, 529 | ax=a 530 | ) 531 | a.set(ylabel='', xlabel='') 532 | a.set_title(ICHEATMAP.get("TITLE").format(periods_num)) 533 | 534 | if num_plots < len(ax): 535 | for a in ax[num_plots:]: 536 | a.set_visible(False) 537 | 538 | return ax 539 | 540 | 541 | @customize 542 | def plot_cumulative_returns(factor_returns, period=1, overlap=True, ax=None): 543 | 544 | if ax is None: 545 | f, ax = plt.subplots(1, 1, figsize=(18, 6)) 546 | 547 | overlapping_period = period if overlap else 1 548 | factor_returns = cumulative_returns(factor_returns, overlapping_period) 549 | 550 | factor_returns.plot(ax=ax, lw=3, color='forestgreen', alpha=0.6) 551 | ax.set( 552 | ylabel=CUMRET.get("YLABEL"), 553 | title=CUMRET.get("TITLE").format(period), 554 | xlabel="" 555 | ) 556 | 557 | ax.axhline(1.0, linestyle='-', color='black', lw=1) 558 | 559 | return ax 560 | 561 | 562 | @customize 563 | def plot_top_down_cumulative_returns(factor_returns, period=1, ax=None): 564 | 565 | if ax is None: 566 | f, ax = plt.subplots(1, 1, figsize=(18, 6)) 567 | 568 | factor_returns.plot(ax=ax, lw=3, color='forestgreen', alpha=0.6) 569 | 570 | ax.set( 571 | ylabel=TDCUMRET.get("YLABEL"), 572 | title=TDCUMRET.get("TITLE").format(period), 573 | xlabel="" 574 | ) 575 | 576 | ax.axhline(1.0, 
linestyle='-', color='black', lw=1)
577 | 
578 |     return ax
579 | 
580 | 
581 | @customize
582 | def plot_cumulative_returns_by_quantile(
583 |     quantile_returns, period=1, overlap=True, ax=None
584 | ):
585 | 
586 |     if ax is None:
587 |         f, ax = plt.subplots(1, 1, figsize=(18, 6))
588 | 
589 |     ret_wide = quantile_returns.reset_index()\
590 |         .pivot(index='date', columns='factor_quantile',
591 |                values=convert_to_forward_returns_columns(period))
592 | 
593 |     overlapping_period = period if overlap else 1
594 |     cum_ret = ret_wide.apply(cumulative_returns, args=(overlapping_period,))
595 |     cum_ret = cum_ret.loc[:, ::-1]
596 | 
597 |     cum_ret.plot(lw=2, ax=ax, cmap=cm.RdYlGn_r)
598 |     ax.legend()
599 |     ymin, ymax = cum_ret.min().min(), cum_ret.max().max()
600 |     ax.set(
601 |         ylabel=CUMRETQ.get("YLABEL"),
602 |         title=CUMRETQ.get("TITLE").format(period),
603 |         xlabel='',
604 |         ylim=(ymin, ymax)
605 |     )
606 |     ax.set_yscale('symlog', linthresh=1)
607 |     ax.set_yticks(np.linspace(ymin, ymax, 8))
608 |     ax.yaxis.set_major_formatter(ScalarFormatter())
609 |     ax.axhline(1.0, linestyle='-', color='black', lw=1)
610 | 
611 |     return ax
612 | 
613 | 
614 | @customize
615 | def plot_quantile_average_cumulative_return(
616 |     avg_cumulative_returns,
617 |     by_quantile=False,
618 |     std_bar=False,
619 |     ax=None,
620 |     periods_before='',
621 |     periods_after=''
622 | ):
623 | 
624 |     avg_cumulative_returns = avg_cumulative_returns.multiply(DECIMAL_TO_BPS)
625 |     quantiles = len(avg_cumulative_returns.index.levels[0].unique())
626 |     palette = [cm.RdYlGn_r(i) for i in np.linspace(0, 1, quantiles)]
627 | 
628 |     if by_quantile:
629 | 
630 |         if ax is None:
631 |             v_spaces = ((quantiles - 1) // 2) + 1
632 |             f, ax = plt.subplots(
633 |                 v_spaces,
634 |                 2,
635 |                 sharex=False,
636 |                 sharey=False,
637 |                 figsize=(18, 6 * v_spaces)
638 |             )
639 |             ax = ax.flatten()
640 | 
641 |         for i, (quantile, q_ret) in enumerate(
642 |             avg_cumulative_returns.groupby(level='factor_quantile')
643 |         ):
644 | 
645 |             mean = q_ret.loc[(quantile, 'mean')]
646 |             mean.name = AVGCUMRET.get("COLUMN").format(quantile)
647 |             mean.plot(ax=ax[i], color=palette[i])
648 |             ax[i].set_ylabel(AVGCUMRET.get("YLABEL"))
649 | 
650 |             if std_bar:
651 |                 std = q_ret.loc[(quantile, 'std')]
652 |                 ax[i].errorbar(
653 |                     std.index,
654 |                     mean,
655 |                     yerr=std,
656 |                     fmt='none',
657 |                     ecolor=palette[i],
658 |                     label=None
659 |                 )
660 | 
661 |             ax[i].axvline(x=0, color='k', linestyle='--')
662 |             ax[i].legend()
663 |             i += 1
664 | 
665 |     else:
666 | 
667 |         if ax is None:
668 |             f, ax = plt.subplots(1, 1, figsize=(18, 6))
669 | 
670 |         for i, (quantile, q_ret) in enumerate(
671 |             avg_cumulative_returns.groupby(level='factor_quantile')
672 |         ):
673 | 
674 |             mean = q_ret.loc[(quantile, 'mean')]
675 |             mean.name = AVGCUMRET.get("COLUMN").format(quantile)
676 |             mean.plot(ax=ax, color=palette[i])
677 | 
678 |             if std_bar:
679 |                 std = q_ret.loc[(quantile, 'std')]
680 |                 ax.errorbar(
681 |                     std.index,
682 |                     mean,
683 |                     yerr=std,
684 |                     fmt='none',
685 |                     ecolor=palette[i],
686 |                     label=None
687 |                 )
688 |             i += 1
689 | 
690 |         ax.axvline(x=0, color='k', linestyle='--')
691 |         ax.legend()
692 |         ax.set(
693 |             title=AVGCUMRET.get("TITLE").format(periods_before, periods_after),
694 |             xlabel=AVGCUMRET.get("XLABEL"),
695 |             ylabel=AVGCUMRET.get("YLABEL"),
696 |         )
697 | 
698 |     return ax
699 | 
700 | 
701 | @customize
702 | def plot_events_distribution(events, num_days=5, full_dates=None, ax=None):
703 | 
704 |     if ax is None:
705 |         f, ax = plt.subplots(1, 1, figsize=(18, 6))
706 | 
707 |     if full_dates is None:
708 |         full_dates = 
events.index.get_level_values('date').unique() 709 | 710 | group = pd.Series(range(len(full_dates)), index=full_dates) // num_days 711 | grouper_label = group.drop_duplicates() 712 | grouper = group.reindex(events.index.get_level_values('date')) 713 | 714 | count = events.groupby(grouper.values).count() 715 | count = count.reindex(grouper_label.values, fill_value=0) 716 | count.index = grouper_label.index.map(lambda x: x.strftime('%Y-%m-%d')) 717 | count.plot(kind="bar", grid=False, ax=ax) 718 | 719 | def annotateBars(x, dt, ax=ax): 720 | color = 'black' 721 | vertalign = 'top' 722 | ax.text( 723 | x, 724 | count.loc[dt], 725 | "{:d}".format(count.loc[dt]), 726 | rotation=45, 727 | color=color, 728 | horizontalalignment='center', 729 | verticalalignment=vertalign, 730 | fontsize=15, 731 | weight='heavy' 732 | ) 733 | 734 | [annotateBars(x, dt, ax=ax) for x, dt in enumerate(list(count.index))] 735 | ax.set( 736 | ylabel=EVENTSDIST.get("YLABEL"), 737 | title=EVENTSDIST.get("TITLE"), 738 | xlabel=EVENTSDIST.get("XLABEL"), 739 | ) 740 | return ax 741 | 742 | 743 | @customize 744 | def plot_missing_events_distribution( 745 | events, num_days=5, full_dates=None, ax=None 746 | ): 747 | 748 | if ax is None: 749 | f, ax = plt.subplots(1, 1, figsize=(18, 6)) 750 | 751 | if full_dates is None: 752 | full_dates = events.index.get_level_values('date').unique() 753 | 754 | daily_count = events.groupby(level='date').count() 755 | most_common_count = np.argmax(np.bincount(daily_count)) 756 | daily_missing = daily_count / most_common_count - 1 757 | daily_missing = daily_missing.reindex(full_dates, fill_value=-1.0) 758 | 759 | grouper = pd.Series(range(len(full_dates)), index=full_dates) // num_days 760 | grouper_label = grouper.drop_duplicates() 761 | 762 | missing = daily_missing.groupby(grouper.values).mean() 763 | missing = missing.reindex(grouper_label.values, fill_value=-1.0) 764 | missing.index = grouper_label.index.map(lambda x: x.strftime('%Y-%m-%d')) 765 | missing.plot(kind="bar", grid=False, ax=ax) 766 | 767 | def annotateBars(x, dt, ax=ax): 768 | color = 'black' 769 | vertalign = 'top' 770 | ax.text( 771 | x, 772 | missing.loc[dt], 773 | "{:+.1f}%".format(missing.loc[dt] * 100), 774 | rotation=45, 775 | color=color, 776 | horizontalalignment='center', 777 | verticalalignment=vertalign, 778 | fontsize=15, 779 | weight='heavy' 780 | ) 781 | 782 | [annotateBars(x, dt, ax=ax) for x, dt in enumerate(list(missing.index))] 783 | ax.set( 784 | ylabel=MISSIINGEVENTSDIST.get("YLABEL"), 785 | title=MISSIINGEVENTSDIST.get("TITLE"), 786 | xlabel=MISSIINGEVENTSDIST.get("XLABEL") 787 | ) 788 | 789 | return ax 790 | -------------------------------------------------------------------------------- /jqfactor_analyzer/prepare.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | 4 | from __future__ import division 5 | 6 | import pandas as pd 7 | import numpy as np 8 | 9 | from .exceptions import MaxLossExceededError, non_unique_bin_edges_error 10 | from .utils import get_forward_returns_columns 11 | 12 | 13 | @non_unique_bin_edges_error 14 | def quantize_factor( 15 | factor_data, quantiles=5, bins=None, by_group=False, no_raise=False, zero_aware=False, 16 | ): 17 | """ 18 | 计算每期因子分位数 19 | 20 | 参数 21 | ---------- 22 | factor_data : pd.DataFrame - MultiIndex 23 | 一个 DataFrame, index 为日期 (level 0) 和资产(level 1) 的 MultiIndex, 24 | values 包括因子的值, 各期因子远期收益, 因子分位数, 25 | 因子分组(可选), 因子权重(可选) 26 | quantiles : int or sequence[float] 27 | 在因子分组中按照因子值大小平均分组的组数。 
28 |          或分位数序列, 允许不均匀分组
29 |         例如 [0, .10, .5, .90, 1.] 或 [.05, .5, .95]
30 |         'quantiles' 和 'bins' 有且只能有一个不为 None
31 |     bins : int or sequence[float]
32 |         在因子分组中使用的等宽 (按照因子值) 区间的数量
33 |         或边界值序列, 允许不均匀的区间宽度
34 |         例如 [-4, -2, -0.5, 0, 10]
35 |         'quantiles' 和 'bins' 有且只能有一个不为 None
36 |     by_group : bool
37 |         如果是 True, 按照 group 分别计算分位数
38 |     no_raise: bool, optional
39 |         如果为 True,则不抛出任何异常,并且将抛出异常的值设置为 np.NaN
40 |     zero_aware : bool, optional
41 |         如果为True,则分别为正负因子值计算分位数。
42 |         适用于您的信号聚集并且零是正值和负值的分界线的情况.
43 | 
44 |     返回值
45 |     -------
46 |     factor_quantile : pd.Series
47 |         index 为日期 (level 0) 和资产(level 1) 的因子分位数
48 |     """
49 |     if not ((quantiles is not None and bins is None) or
50 |             (quantiles is None and bins is not None)):
51 |         raise ValueError('quantiles 和 bins 有且只能有一个不为 None')
52 | 
53 |     if zero_aware and not (isinstance(quantiles, int)
54 |                            or isinstance(bins, int)):
55 |         msg = ("只有 quantiles 或 bins 为 int 类型时, 'zero_aware' 才能为 True")
56 |         raise ValueError(msg)
57 | 
58 |     def quantile_calc(x, _quantiles, _bins, _zero_aware, _no_raise):
59 |         try:
60 |             if _quantiles is not None and _bins is None and not _zero_aware:
61 |                 return pd.qcut(x, _quantiles, labels=False) + 1
62 |             elif _quantiles is not None and _bins is None and _zero_aware:
63 |                 pos_quantiles = pd.qcut(x[x >= 0], _quantiles // 2,
64 |                                         labels=False) + _quantiles // 2 + 1
65 |                 neg_quantiles = pd.qcut(x[x < 0], _quantiles // 2,
66 |                                         labels=False) + 1
67 |                 return pd.concat([pos_quantiles, neg_quantiles]).sort_index()
68 |             elif _bins is not None and _quantiles is None and not _zero_aware:
69 |                 return pd.cut(x, _bins, labels=False) + 1
70 |             elif _bins is not None and _quantiles is None and _zero_aware:
71 |                 pos_bins = pd.cut(x[x >= 0], _bins // 2,
72 |                                   labels=False) + _bins // 2 + 1
73 |                 neg_bins = pd.cut(x[x < 0], _bins // 2,
74 |                                   labels=False) + 1
75 |                 return pd.concat([pos_bins, neg_bins]).sort_index()
76 |         except Exception as e:
77 |             if _no_raise:
78 |                 return pd.Series(index=x.index)
79 |             raise e
80 | 
81 |     grouper = [factor_data.index.get_level_values('date')]
82 |     if by_group:
83 |         if 'group' not in factor_data.columns:
84 |             raise ValueError('只有输入了 groupby 参数时 binning_by_group 才能为 True')
85 |         grouper.append('group')
86 | 
87 |     factor_quantile = factor_data.groupby(grouper)['factor'] \
88 |         .apply(quantile_calc, quantiles, bins, zero_aware, no_raise)
89 |     factor_quantile.name = 'factor_quantile'
90 | 
91 |     return factor_quantile.dropna()
92 | 
93 | 
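# 用法示意 (假设性示例, 仅演示 quantize_factor 的输入输出约定):
#   idx = pd.MultiIndex.from_product(
#       [pd.date_range('2024-01-02', periods=3), list('ABCDE')],
#       names=['date', 'asset'])
#   fd = pd.DataFrame({'factor': np.random.randn(15)}, index=idx)
#   quantize_factor(fd, quantiles=5)  # 每天按因子值从小到大等分为 5 组, 返回取值 1~5 的分位标签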
94 | def compute_forward_returns(factor,
95 |                             prices,
96 |                             periods=(1, 5, 10)):
97 |     """
98 |     计算每个因子值对应的 N 期因子远期收益
99 | 
100 |     参数
101 |     ----------
102 |     factor : pd.Series - MultiIndex
103 |         一个 Series, index 为日期 (level 0) 和资产(level 1) 的 MultiIndex,
104 |         values 为因子值
105 |     prices : pd.DataFrame
106 |         用于计算因子远期收益的价格数据
107 |         columns 为资产, index 为 日期.
108 |         价格数据必须覆盖因子分析时间段以及额外远期收益计算中的最大预期期数.
109 |     periods : sequence[int]
110 |         远期收益的期数
111 |     返回值
112 |     -------
113 |     forward_returns : pd.DataFrame - MultiIndex
114 |         因子远期收益
115 |         index 为日期 (level 0) 和资产(level 1) 的 MultiIndex
116 |         column 为远期收益的期数
117 |     """
118 | 
119 |     factor_dateindex = factor.index.levels[0]
120 |     factor_dateindex = factor_dateindex.intersection(prices.index)
121 | 
122 |     if len(factor_dateindex) == 0:
123 |         raise ValueError("Factor and prices indices don't match: make sure "
124 |                          "they have the same convention in terms of datetimes "
125 |                          "and symbol-names")
126 | 
127 |     prices = prices.filter(items=factor.index.levels[1])
128 | 
129 |     forward_returns = pd.DataFrame(
130 |         index=pd.MultiIndex
131 |         .from_product([prices.index, prices.columns], names=['date', 'asset'])
132 |     )
133 | 
134 |     for period in periods:
135 |         delta = prices.pct_change(period).shift(-period).reindex(factor_dateindex)
136 |         forward_returns['period_{p}'.format(p=period)] = delta.stack()
137 | 
138 |     forward_returns.index = forward_returns.index.rename(['date', 'asset'])
139 | 
140 |     return forward_returns
141 | 
142 | 
143 | def demean_forward_returns(factor_data, grouper=None):
144 |     """
145 |     根据相关分组为因子远期收益去均值.
146 |     分组去均值包含了投资组合分组中性化约束的假设,因此允许跨组评估因子.
147 | 
148 |     参数
149 |     ----------
150 |     factor_data : pd.DataFrame - MultiIndex
151 |         因子远期收益
152 |         index 为日期 (level 0) 和资产(level 1) 的 MultiIndex
153 |         column 为远期收益的期数
154 |     grouper : list
155 |         如果为 None, 则只根据日期去均值
156 |         否则根据列表中提供的组分组去均值
157 | 
158 |     返回值
159 |     -------
160 |     adjusted_forward_returns : pd.DataFrame - MultiIndex
161 |         和 factor_data 相同形状的 DataFrame, 但每个收益都被分组去均值了
162 |     """
163 | 
164 |     factor_data = factor_data.copy()
165 | 
166 |     if not grouper:
167 |         grouper = factor_data.index.get_level_values('date')
168 | 
169 |     cols = get_forward_returns_columns(factor_data.columns)
170 |     factor_data[cols] = factor_data.groupby(
171 |         grouper, as_index=False
172 |     )[cols.append(pd.Index(['weights']))].apply(
173 |         lambda x: x[cols].subtract(
174 |             np.average(x[cols], axis=0, weights=x['weights'].fillna(0.0).values),
175 |             axis=1
176 |         )
177 |     )
178 | 
179 |     return factor_data
180 | 
181 | 
182 | def get_clean_factor(factor,
183 |                      forward_returns,
184 |                      groupby=None,
185 |                      weights=None,
186 |                      binning_by_group=False,
187 |                      quantiles=5,
188 |                      bins=None,
189 |                      max_loss=0.35,
190 |                      zero_aware=False):
191 |     """
192 |     将因子值, 因子远期收益, 因子分组数据, 因子权重数据
193 |     格式化为以时间和资产的 MultiIndex 作为索引的 DataFrame.
194 | 
195 |     参数
196 |     ----------
197 |     factor : pd.Series - MultiIndex
198 |         一个 Series, index 为日期 (level 0) 和资产(level 1) 的 MultiIndex,
199 |         values 为因子的值
200 |     forward_returns : pd.DataFrame - MultiIndex
201 |         一个 DataFrame, index 为日期 (level 0) 和资产(level 1) 的 MultiIndex,
202 |         values 为因子的远期收益, columns 为因子远期收益的期数.
203 |     groupby : pd.Series - MultiIndex or dict
204 |         index 为日期和资产的 Series,为每个资产每天的分组,或资产-分组映射的字典.
205 |         如果传递了dict,则假定分组映射在整个时间段内保持不变.
206 |     weights : pd.Series - MultiIndex or dict
207 |         index 为日期和资产的 Series,为每个资产每天的权重,或资产-权重映射的字典.
208 |         如果传递了dict,则假定权重映射在整个时间段内保持不变.
209 |     binning_by_group : bool
210 |         如果为 True, 则对每个组分别计算分位数.
211 |         适用于因子值范围在各个组上变化很大的情况.
212 |         如果要分析分组(行业)中性的组合, 您最好设置为 True
213 |     quantiles : int or sequence[float]
214 |         在因子分组中按照因子值大小平均分组的组数。
215 |          或分位数序列, 允许不均匀分组
216 |         例如 [0, .10, .5, .90, 1.] 或 [.05, .5, .95]
217 |         'quantiles' 和 'bins' 有且只能有一个不为 None
218 |     bins : int or sequence[float]
219 |         在因子分组中使用的等宽 (按照因子值) 区间的数量
220 |         或边界值序列, 允许不均匀的区间宽度
221 |         例如 [-4, -2, -0.5, 0, 10]
222 |         'quantiles' 和 'bins' 有且只能有一个不为 None
223 |     max_loss : float, optional
224 |         允许的丢弃因子数据的最大百分比 (0.00 到 1.00),
225 |         计算比较输入因子索引中的项目数和输出 DataFrame 索引中的项目数.
226 |         因子数据本身存在缺陷 (例如 NaN),
227 |         没有提供足够的价格数据来计算所有因子值的远期收益,
228 |         或者因为分组失败, 因此可以部分地丢弃因子数据
229 |         设置 max_loss = 0 以停止异常捕获.
230 |     zero_aware : bool, optional
231 |         如果为True,则分别为正负因子值计算分位数。
232 |         适用于您的信号聚集并且零是正值和负值的分界线的情况.
233 | 
234 |     返回值
235 |     -------
236 |     merged_data : pd.DataFrame - MultiIndex
237 |         一个 DataFrame, index 为日期 (level 0) 和资产(level 1) 的 MultiIndex,
238 |         values 包括因子的值, 各期因子远期收益, 因子分位数,
239 |         因子分组(可选), 因子权重(可选)
240 |         - 各期因子远期收益的列名满足 'period_1', 'period_5' 的格式
241 |     """
242 | 
243 |     initial_amount = float(len(factor.index))
244 | 
245 |     factor_copy = factor.copy()
246 |     factor_copy.index = factor_copy.index.rename(['date', 'asset'])
247 | 
248 |     merged_data = forward_returns.copy()
249 |     merged_data['factor'] = factor_copy
250 | 
251 |     if groupby is not None:
252 |         if isinstance(groupby, dict):
253 |             diff = set(factor_copy.index.get_level_values(
254 |                 'asset')) - set(groupby.keys())
255 |             if len(diff) > 0:
256 |                 raise KeyError(
257 |                     "Assets {} not in group mapping".format(
258 |                         list(diff)))
259 | 
260 |             ss = pd.Series(groupby)
261 |             groupby = pd.Series(index=factor_copy.index,
262 |                                 data=ss[factor_copy.index.get_level_values(
263 |                                     'asset')].values)
264 |         elif isinstance(groupby, pd.DataFrame):
265 |             groupby = groupby.stack()
266 |         merged_data['group'] = groupby
267 | 
268 |     if weights is not None:
269 |         if isinstance(weights, dict):
270 |             diff = set(factor_copy.index.get_level_values(
271 |                 'asset')) - set(weights.keys())
272 |             if len(diff) > 0:
273 |                 raise KeyError(
274 |                     "Assets {} not in weights mapping".format(
275 |                         list(diff)))
276 | 
277 |             ww = pd.Series(weights)
278 |             weights = pd.Series(index=factor_copy.index,
279 |                                 data=ww[factor_copy.index.get_level_values(
280 |                                     'asset')].values)
281 |         elif isinstance(weights, pd.DataFrame):
282 |             weights = weights.stack()
283 |         merged_data['weights'] = weights
284 | 
285 |     merged_data = merged_data.dropna()
286 | 
287 |     quantile_data = quantize_factor(
288 |         merged_data,
289 |         quantiles,
290 |         bins,
291 |         binning_by_group,
292 |         True,
293 |         zero_aware
294 |     )
295 | 
296 |     merged_data['factor_quantile'] = quantile_data
297 |     merged_data = merged_data.dropna()
298 |     merged_data['factor_quantile'] = merged_data['factor_quantile'].astype(int)
299 | 
300 |     if 'weights' in merged_data.columns:
301 |         merged_data['weights'] = merged_data.set_index(
302 |             'factor_quantile', append=True
303 |         ).groupby(level=['date', 'factor_quantile'])['weights'].apply(
304 |             lambda s: s.divide(s.sum())
305 |         ).reset_index('factor_quantile', drop=True)
306 | 
307 |     binning_amount = float(len(merged_data.index))
308 | 
309 |     tot_loss = (initial_amount - binning_amount) / initial_amount
310 | 
311 |     no_raise = True if max_loss == 0 else False
312 |     if tot_loss > max_loss and not no_raise:
313 |         message = ("丢弃的因子数据比例 (%.1f%%) 超过 max_loss (%.1f%%)"
314 |                    % (tot_loss * 100, max_loss * 100))
315 |         raise MaxLossExceededError(message)
316 | 
317 |     return merged_data
318 | 
319 | 
320 | def get_clean_factor_and_forward_returns(factor,
321 |                                          prices,
322 |                                          groupby=None,
323 |                                          weights=None,
324 |                                          binning_by_group=False,
325 |                                          quantiles=5,
326 |                                          bins=None,
327 |                                          periods=(1, 5, 10),
328 |                                          max_loss=0.35,
329 |                                          zero_aware=False):
330 |     """
331 |     将因子数据, 价格数据, 
分组映射和权重映射格式化为 332 | 由包含时间和资产的 MultiIndex 作为索引的 DataFrame 333 | 334 | 参数 335 | ---------- 336 | factor : pd.Series - MultiIndex 337 | 一个 Series, index 为日期 (level 0) 和资产(level 1) 的 MultiIndex, 338 | values 为因子的值 339 | prices : pd.DataFrame 340 | 用于计算因子远期收益的价格数据 341 | columns 为资产, index 为 日期. 342 | 价格数据必须覆盖因子分析时间段以及额外远期收益计算中的最大预期期数. 343 | groupby : pd.Series - MultiIndex or dict 344 | index 为日期和资产的 Series,为每个资产每天的分组,或资产-分组映射的字典. 345 | 如果传递了dict,则假定分组映射在整个时间段内保持不变. 346 | weights : pd.Series - MultiIndex or dict 347 | index 为日期和资产的 Series,为每个资产每天的权重,或资产-权重映射的字典. 348 | 如果传递了dict,则假定权重映射在整个时间段内保持不变. 349 | binning_by_group : bool 350 | 如果为 True, 则对每个组分别计算分位数. 351 | 适用于因子值范围在各个组上变化很大的情况. 352 | 如果要分析分组(行业)中性的组合, 您最好设置为 True 353 | quantiles : int or sequence[float] 354 | 在因子分组中按照因子值大小平均分组的组数。 355 |          或分位数序列, 允许不均匀分组 356 | 例如 [0, .10, .5, .90, 1.] 或 [.05, .5, .95] 357 | 'quantiles' 和 'bins' 有且只能有一个不为 None 358 | bins : int or sequence[float] 359 | 在因子分组中使用的等宽 (按照因子值) 区间的数量 360 | 或边界值序列, 允许不均匀的区间宽度 361 | 例如 [-4, -2, -0.5, 0, 10] 362 | 'quantiles' 和 'bins' 有且只能有一个不为 None 363 | periods : sequence[int] 364 | 远期收益的期数 365 | max_loss : float, optional 366 | 允许的丢弃因子数据的最大百分比 (0.00 到 1.00), 367 | 计算比较输入因子索引中的项目数和输出 DataFrame 索引中的项目数. 368 | 因子数据本身存在缺陷 (例如 NaN), 369 | 没有提供足够的价格数据来计算所有因子值的远期收益, 370 | 或者因为分组失败, 因此可以部分地丢弃因子数据 371 | 设置 max_loss = 0 以停止异常捕获. 372 | zero_aware : bool, optional 373 | 如果为True,则分别为正负因子值计算分位数。 374 | 适用于您的信号聚集并且零是正值和负值的分界线的情况. 375 | 376 | 返回值 377 | ------- 378 | merged_data : pd.DataFrame - MultiIndex 379 | 一个 DataFrame, index 为日期 (level 0) 和资产(level 1) 的 MultiIndex, 380 | values 包括因子的值, 各期因子远期收益, 因子分位数, 381 | 因子分组(可选), 因子权重(可选) 382 | - 各期因子远期收益的列名满足 'period_1', 'period_5' 的格式 383 | """ 384 | 385 | forward_returns = compute_forward_returns(factor, prices, periods) 386 | 387 | factor_data = get_clean_factor(factor, forward_returns, groupby=groupby, 388 | weights=weights, 389 | quantiles=quantiles, bins=bins, 390 | binning_by_group=binning_by_group, 391 | max_loss=max_loss, zero_aware=zero_aware) 392 | 393 | return factor_data 394 | 395 | 396 | def common_start_returns( 397 | factor, 398 | prices, 399 | before, 400 | after, 401 | cumulative=False, 402 | mean_by_date=False, 403 | demean_by=None 404 | ): 405 | 406 | if cumulative: 407 | returns = prices 408 | else: 409 | returns = prices.pct_change(axis=0) 410 | 411 | all_returns = [] 412 | 413 | for timestamp, df in factor.groupby(level='date'): 414 | 415 | equities = df.index.get_level_values('asset') 416 | 417 | try: 418 | day_zero_index = returns.index.get_loc(timestamp) 419 | except KeyError: 420 | continue 421 | 422 | starting_index = max(day_zero_index - before, 0) 423 | ending_index = min(day_zero_index + after + 1, len(returns.index)) 424 | 425 | equities_slice = set(equities) 426 | if demean_by is not None: 427 | demean_equities = demean_by.loc[timestamp] \ 428 | .index.get_level_values('asset') 429 | equities_slice |= set(demean_equities) 430 | 431 | series = returns.loc[returns. 
432 | index[starting_index:ending_index], equities_slice] 433 | series.index = range( 434 | starting_index - day_zero_index, ending_index - day_zero_index 435 | ) 436 | 437 | if cumulative: 438 | series = (series / series.loc[0, :]) - 1 439 | 440 | if demean_by is not None: 441 | mean = series.loc[:, demean_equities].mean(axis=1) 442 | series = series.loc[:, equities] 443 | series = series.sub(mean, axis=0) 444 | 445 | if mean_by_date: 446 | series = series.mean(axis=1) 447 | 448 | all_returns.append(series) 449 | 450 | return pd.concat(all_returns, axis=1) 451 | 452 | 453 | def rate_of_return(period_ret): 454 | """ 455 | 转换回报率为"每期"回报率:如果收益以稳定的速度增长, 则相当于每期的回报率 456 | """ 457 | period = int(period_ret.name.replace('period_', '')) 458 | return period_ret.add(1).pow(1. / period).sub(1) 459 | 460 | 461 | def std_conversion(period_std): 462 | """ 463 | 转换回报率标准差为"每期"回报率标准差 464 | """ 465 | period_len = int(period_std.name.replace('period_', '')) 466 | return period_std / np.sqrt(period_len) 467 | -------------------------------------------------------------------------------- /jqfactor_analyzer/preprocess.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | import warnings 4 | 5 | import pandas as pd 6 | import numpy as np 7 | from scipy.stats.mstats import winsorize as spwinsorize 8 | from decimal import Decimal 9 | from .utils import ignore_warning 10 | 11 | from .data import DataApi,convert_date 12 | from fastcache import lru_cache 13 | from functools import partial 14 | from statsmodels.api import OLS, add_constant as sm_add_constant 15 | 16 | 17 | 18 | def winsorize(data, scale=None, range=None, qrange=None, inclusive=True, inf2nan=True, axis=1): 19 | 20 | if isinstance(data, pd.DataFrame): 21 | return data.apply( 22 | winsorize, 23 | axis, 24 | scale=scale, 25 | range=range, 26 | qrange=qrange, 27 | inclusive=inclusive, 28 | inf2nan=inf2nan 29 | ) 30 | elif (isinstance(data, np.ndarray) and data.ndim > 1): 31 | return np.apply_along_axis( 32 | winsorize, 33 | axis, 34 | arr=data, 35 | scale=scale, 36 | range=range, 37 | qrange=qrange, 38 | inclusive=inclusive, 39 | inf2nan=inf2nan 40 | ) 41 | 42 | if isinstance(data, pd.Series): 43 | v = data.values 44 | else: 45 | v = data 46 | 47 | if not np.isfinite(v).any(): 48 | return data 49 | 50 | # 如果v是int arrary,无法给 array 赋值 np.nan,因为 np.nan 是个 float 51 | v = v.astype(float) 52 | 53 | if inf2nan: 54 | v[~np.isfinite(v)] = np.nan 55 | 56 | if qrange: 57 | if not ((0 <= qrange[0] <= 1) and (0 <= qrange[1] <= 1)): 58 | raise Exception(u'qrange 值应在 0 到 1 之间,如 [0.05, 0.95]') 59 | qrange = (Decimal(str(qrange[0])), 1 - Decimal(str(qrange[1]))) 60 | 61 | if inclusive: 62 | v[~np.isnan(v)] = spwinsorize(v[~np.isnan(v)], qrange, inclusive=[True, True]) 63 | else: 64 | # 如果v是int arrary,无法给 array 赋值 np.nan,因为 np.nan 是个 float 65 | v = v.astype(float) 66 | not_nan = v[~np.isnan(v)] 67 | not_nan[not_nan != spwinsorize(not_nan, qrange, inclusive=[True, True])] = np.nan 68 | v[~np.isnan(v)] = not_nan 69 | 70 | else: 71 | if range: 72 | range_ = (Decimal(str(range[0])) if not np.isnan(range[0]) else np.nan, 73 | Decimal(str(range[1])) if not np.isnan(range[1]) else np.nan) 74 | else: 75 | mu = np.mean(data[np.isfinite(data)]) 76 | sigma = np.std(data[np.isfinite(data)]) 77 | range_ = (np.nanmin(v[v > mu - scale * sigma]), 78 | np.nanmax(v[v < mu + scale * sigma])) 79 | 80 | if inclusive: 81 | not_nan = ~np.isnan(v) 82 | v[not_nan] = np.where(v[not_nan] < range_[0], range_[0], v[not_nan]) 83 | not_nan = 
~np.isnan(v) 84 | v[not_nan] = np.where(v[not_nan] > range_[1], range_[1], v[not_nan]) 85 | else: 86 | not_nan = ~np.isnan(v) 87 | v_not_nan = v[not_nan] 88 | v[not_nan] = np.where( 89 | np.logical_and(v_not_nan >= range_[0], v_not_nan <= range_[1]), v_not_nan, np.nan 90 | ) 91 | 92 | if isinstance(data, pd.Series): 93 | return pd.Series(v, index=data.index) 94 | else: 95 | return v 96 | 97 | 98 | def winsorize_med(data, scale=1, inclusive=True, inf2nan=True, axis=1): 99 | 100 | if isinstance(data, pd.DataFrame): 101 | return data.apply(winsorize_med, axis, scale=scale, inclusive=inclusive, inf2nan=inf2nan) 102 | elif (isinstance(data, np.ndarray) and data.ndim > 1): 103 | return np.apply_along_axis( 104 | winsorize_med, axis, arr=data, scale=scale, inclusive=inclusive, inf2nan=inf2nan 105 | ) 106 | 107 | if isinstance(data, pd.Series): 108 | v = data.values 109 | else: 110 | v = data 111 | 112 | if not np.isfinite(v).any(): 113 | return data 114 | 115 | # 如果v是int arrary,无法给 array 赋值 np.nan,因为 np.nan 是个 float 116 | v = v.astype(float) 117 | 118 | if inf2nan: 119 | v[~np.isfinite(v)] = np.nan 120 | 121 | med = np.median(v[~np.isnan(v)]) 122 | 123 | data_minus_med = v[~np.isnan(v)] - med 124 | median_absolute = np.median(np.abs(data_minus_med)) 125 | 126 | if inclusive: 127 | not_nan = ~np.isnan(v) 128 | v[not_nan] = np.where( 129 | v[not_nan] > med + scale * median_absolute, med + scale * median_absolute, v[not_nan] 130 | ) 131 | not_nan = ~np.isnan(v) 132 | v[not_nan] = np.where( 133 | v[not_nan] < med - scale * median_absolute, med - scale * median_absolute, v[not_nan] 134 | ) 135 | else: 136 | # 如果v是int arrary,np.nan 会被转换成一个极小的数,比如 -2147483648 137 | v = v.astype(float) 138 | not_nan = ~np.isnan(v) 139 | v_not_nan = v[not_nan] 140 | v[not_nan] = np.where( 141 | np.logical_and( 142 | v_not_nan <= med + scale * median_absolute, 143 | v_not_nan >= med - scale * median_absolute 144 | ), v_not_nan, np.nan 145 | ) 146 | 147 | if isinstance(data, pd.Series): 148 | return pd.Series(v, index=data.index) 149 | else: 150 | return v 151 | 152 | 153 | @ignore_warning(message='Mean of empty slice', category=RuntimeWarning) 154 | @ignore_warning(message='Degrees of freedom <= 0 for slice', 155 | category=RuntimeWarning) 156 | @ignore_warning(message='invalid value encountered in true_divide', 157 | category=RuntimeWarning) 158 | def standardlize(data, inf2nan=True, axis=1): 159 | if inf2nan: 160 | data = data.astype('float64') 161 | data[np.isinf(data)] = np.nan 162 | 163 | axis = min(data.ndim - 1, axis) 164 | 165 | if not np.any(np.isfinite(data)): 166 | return data 167 | 168 | mu = np.nanmean(np.where(~np.isinf(data), data, np.nan), axis=axis) 169 | std = np.nanstd(np.where(~np.isinf(data), data, np.nan), axis=axis) 170 | 171 | rep = np.tile if axis == 0 else np.repeat 172 | mu = np.asarray(rep(mu, data.shape[axis])).reshape(data.shape) 173 | std = np.asarray(rep(std, data.shape[axis])).reshape(data.shape) 174 | 175 | if isinstance(data, (pd.Series, pd.DataFrame)): 176 | data = data.where(np.isinf(data), (data - mu) / std) 177 | else: 178 | data = np.where(np.isinf(data), data, (data - mu) / std) 179 | return data 180 | 181 | 182 | @lru_cache(3) 183 | def cache_dataapi(allow_cache=True, show_progress=False): 184 | return DataApi(allow_cache=allow_cache, show_progress=show_progress) 185 | 186 | 187 | def get_neu_basicdata(how, securities, date=None): 188 | """获取中性化的依赖数据 189 | 返回: 一个 DataFrame, index 是股票代码 190 | """ 191 | if isinstance(how, str): 192 | how = [how] 193 | 194 | if isinstance(how, 
(pd.Series, pd.DataFrame)):
195 |         return how
196 |     elif isinstance(how, (list, tuple)):
197 |         how_datas = []
198 |     else:
199 |         raise ValueError("错误的 how 参数格式 : {}".format(how))
200 | 
201 |     dataapi = cache_dataapi()
202 |     for how_name in how:
203 |         if isinstance(how_name, pd.Series):
204 |             how_datas.append(how_name.to_frame())
205 |         elif isinstance(how_name, pd.DataFrame):
206 |             how_datas.append(how_name)
207 |         elif how_name in ['jq_l1', 'jq_l2', 'sw_l1', 'sw_l2', 'sw_l3', 'zjw']:
208 |             industry_info = pd.get_dummies(dataapi._get_cached_industry_one_day(
209 |                 date, securities, industry=how_name)).reindex(securities, fill_value=0)
210 |             how_datas.append(industry_info)
211 |         elif how_name in ['mktcap', 'ln_mktcap', 'cmktcap', 'ln_cmktcap']:
212 |             if how_name == 'mktcap':
213 |                 mkt_api = partial(dataapi._get_market_cap, ln=False)
214 |             elif how_name == 'ln_mktcap':
215 |                 mkt_api = partial(dataapi._get_market_cap, ln=True)
216 |             elif how_name == 'cmktcap':
217 |                 mkt_api = partial(dataapi._get_circulating_market_cap, ln=False)
218 |             elif how_name == 'ln_cmktcap':
219 |                 mkt_api = partial(dataapi._get_circulating_market_cap, ln=True)
220 | 
221 |             market_info = mkt_api(securities=securities, start_date=date, end_date=date).T
222 |             market_info.columns = [how_name]
223 |             how_datas.append(market_info)
224 |         else:
225 |             raise ValueError("不支持的因子名称 : {} ".format(how_name))
226 | 
227 |     return pd.concat(how_datas, axis=1)
228 | 
229 | 
230 | def neutralize(data, how=None, date=None, axis=1, fillna=None, add_constant=False):
231 |     """中性化
232 |     data: pd.Series/pd.DataFrame, 待中性化的序列, 序列的 index/columns 为股票的 code
233 |     how: str list. 中性化使用的因子名称列表. 默认为 ['jq_l1', 'mktcap'], 支持的中性化方法有:
234 |         1. 行业: sw_l1, sw_l2, sw_l3, jq_l1, jq_l2
235 |         2. 市值因子: mktcap(总市值), ln_mktcap(对数总市值), cmktcap(流通市值), ln_cmktcap(对数流通市值)
236 |         3. 自定义的中性化数据: 支持同时传入额外的 Series 或者 DataFrame 用来进行中性化, index 必须是标的代码
237 |         以上三类参数可同时传入参数列表
238 |     date: 日期, 将用 date 这天的相关变量数据对 series 进行中性化 (注意依赖数据的实际可用时间, 如市值数据当天盘中是无法获取到的)
239 |     axis: 默认为 1. 仅在 data 为 pd.DataFrame 时生效. 表示沿哪个方向做中性化, 0 为对每列做中性化, 1 为对每行做中性化
240 |     fillna: 缺失值填充方式, 默认为None, 表示不填充. 支持的值:
241 |         'jq_l1': 聚宽一级行业
242 |         'jq_l2': 聚宽二级行业
243 |         'sw_l1': 申万一级行业
244 |         'sw_l2': 申万二级行业
245 |         'sw_l3': 申万三级行业 表示使用某行业分类的均值进行填充. 
246 | add_constant: 中性化时是否添加常数项, 默认为 False 247 | """ 248 | if data.dropna(how='all').empty: 249 | return data 250 | 251 | if how is None: 252 | how = ['jq_l1', 'mktcap'] 253 | elif isinstance(how, str): 254 | how = [how] 255 | 256 | if isinstance(data, pd.Series) or axis == 0: 257 | securities = data.index.astype(str) 258 | else: 259 | securities = data.columns.astype(str) 260 | invalid_securities = securities[~(securities.str.endswith("XSHG") | securities.str.endswith("XSHE"))].tolist() 261 | if invalid_securities: 262 | raise ValueError('neutralize: 找不到股票: {sym:s}'.format(sym=str(invalid_securities))) 263 | 264 | exposure = get_neu_basicdata(how, securities.tolist(), date=date) 265 | 266 | with pd.option_context('mode.use_inf_as_null', True): 267 | exposure.dropna(axis=1, how='all', inplace=True) 268 | exposure.dropna(inplace=True) 269 | exposure = exposure.astype(np.float64) 270 | 271 | if exposure.empty: 272 | return data 273 | 274 | if fillna is not None: 275 | dataapi = cache_dataapi() 276 | ind = dataapi._get_cached_industry_one_day(date, securities) 277 | 278 | def valid_index(s): 279 | return s[np.isfinite(s)].index.intersection(exposure.index) 280 | 281 | def get_resid(s): 282 | valid_index_ = valid_index(s) 283 | if len(valid_index_) > 1: 284 | resid = OLS( 285 | s.loc[valid_index_].values, 286 | (sm_add_constant(exposure.loc[valid_index_].values) if add_constant 287 | else exposure.loc[valid_index_].values), 288 | missing='drop' 289 | ).fit().resid 290 | resid = pd.Series(resid, index=valid_index_) 291 | resid = resid.reindex(s.index, fill_value=np.nan) 292 | if fillna is not None: 293 | resid = resid.groupby(ind.loc[s.index]).apply(lambda x: x.fillna(x.mean())) 294 | else: 295 | resid = pd.Series(np.nan, index=s.index) 296 | return resid 297 | 298 | if isinstance(data, pd.Series): 299 | return get_resid(data) 300 | else: 301 | return data.apply(get_resid, axis) 302 | 303 | 304 | __all__ = [ 305 | 'neutralize', 306 | 'winsorize', 307 | 'winsorize_med', 308 | 'standardlize', 309 | ] 310 | -------------------------------------------------------------------------------- /jqfactor_analyzer/sample.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import os 4 | import pandas as pd 5 | 6 | 7 | VOL5 = pd.read_csv( 8 | os.path.abspath(os.path.join(os.path.dirname(__file__), 9 | 'sample_data', 10 | 'VOL5.csv')), 11 | header=0, index_col=0, encoding='utf-8' 12 | ) 13 | 14 | VOL5.index = pd.to_datetime(VOL5.index) 15 | VOL5.index.set_names(['date'], inplace=True) 16 | VOL5.columns.set_names(['asset'], inplace=True) 17 | -------------------------------------------------------------------------------- /jqfactor_analyzer/utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | 4 | import re 5 | import six 6 | import warnings 7 | from functools import wraps 8 | try: 9 | from collections import Iterable 10 | except ImportError: 11 | from collections.abc import Iterable 12 | 13 | import pandas as pd 14 | 15 | 16 | def get_forward_returns_columns(columns): 17 | syntax = re.compile("^period_\\d+$") 18 | return columns[columns.astype('str').str.contains(syntax, regex=True)] 19 | 20 | 21 | def convert_to_forward_returns_columns(period): 22 | try: 23 | return 'period_{:d}'.format(period) 24 | except ValueError: 25 | return period 26 | 27 | 28 | def ignore_warning(message='', category=Warning, module='', lineno=0, append=False): 29 | """过滤 warnings""" 30 | def 
decorator(func): 31 | @wraps(func) 32 | def func_wrapper(*args, **kwargs): 33 | with warnings.catch_warnings(): 34 | warnings.filterwarnings('ignore', message=message, category=category, 35 | module=module, lineno=lineno, append=append) 36 | return func(*args, **kwargs) 37 | return func_wrapper 38 | 39 | return decorator 40 | 41 | 42 | def ensure_tuple(x): 43 | if isinstance(x, six.string_types) or not isinstance(x, Iterable): 44 | return (x,) 45 | else: 46 | return tuple(x) 47 | -------------------------------------------------------------------------------- /jqfactor_analyzer/version.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | 4 | __version__ = '1.1.0' 5 | -------------------------------------------------------------------------------- /jqfactor_analyzer/when.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import six 4 | import datetime 5 | 6 | import pandas as pd 7 | 8 | 9 | DateTime = datetime.datetime 10 | Date = datetime.date 11 | Time = datetime.time 12 | TimeDelta = datetime.timedelta 13 | 14 | today = datetime.date.today 15 | now = datetime.datetime.now 16 | 17 | 18 | def date2str(date, format='%Y-%m-%d'): 19 | return pd.to_datetime(date).strftime(format) 20 | 21 | 22 | def convert_date(date): 23 | if isinstance(date, six.string_types): 24 | if ':' in date: 25 | date = date[:10] 26 | return datetime.datetime.strptime(date, '%Y-%m-%d').date() 27 | elif isinstance(date, datetime.datetime): 28 | return date.date() 29 | elif isinstance(date, datetime.date): 30 | return date 31 | raise Exception("date 必须是datetime.date, datetime.datetime或者如下格式的字符串:'2015-01-05'") 32 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | six 2 | fastcache>=1.0.2 3 | SQLAlchemy>=1.2.8 4 | cached_property>=1.5.1 5 | statsmodels 6 | scipy 7 | numpy>=1.15.0 8 | pandas>=1.0.0 9 | matplotlib 10 | seaborn 11 | jqdatasdk 12 | pyarrow 13 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [bdist_wheel] 2 | universal=1 3 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | from __future__ import print_function 5 | 6 | from os.path import join as path_join, dirname as path_dirname 7 | 8 | from setuptools import setup, find_packages 9 | 10 | try: 11 | # for pip >= 10 12 | from pip._internal.req import parse_requirements 13 | except ImportError: 14 | # for pip <= 9.0.3 15 | from pip.req import parse_requirements 16 | 17 | try: 18 | requirements = [str(ir.req) for ir in parse_requirements("requirements.txt", session=False)] 19 | except AttributeError: 20 | requirements = [str(ir.requirement) for ir in parse_requirements("requirements.txt", session=False)] 21 | 22 | 23 | def get_version(): 24 | scope = {} 25 | with open(path_join(path_dirname(__file__), "jqfactor_analyzer", "version.py")) as fp: 26 | exec(fp.read(), scope) 27 | return scope.get('__version__', '1.0') 28 | 29 | 30 | def get_long_description(): 31 | with open(path_join(path_dirname(__file__), 'README.md'), 'rb') as fp: 32 | long_desc = fp.read() 33 | 34 | long_desc = 
long_desc.replace( 35 | u'docs/API文档.md'.encode('utf-8'), 36 | u'https://github.com/JoinQuant/jqfactor_analyzer/blob/master/docs/API%E6%96%87%E6%A1%A3.md'.encode('utf-8'), 37 | ) 38 | 39 | return long_desc.decode('utf-8') 40 | 41 | 42 | setup_args = dict( 43 | name='jqfactor_analyzer', 44 | version=get_version(), 45 | packages=find_packages(exclude=("tests", "tests.*")), 46 | author='JoinQuant', 47 | author_email='xlx@joinquant.com', 48 | maintainer="", 49 | maintainer_email="", 50 | url='https://www.joinquant.com', 51 | description='JoinQuant single factor analyzer', 52 | long_description=get_long_description(), 53 | long_description_content_type='text/markdown', 54 | zip_safe=False, 55 | platforms=["all"], 56 | license='Apache License v2', 57 | classifiers=[ 58 | 'Programming Language :: Python', 59 | 'Operating System :: Microsoft :: Windows', 60 | 'Operating System :: Unix', 61 | 'Programming Language :: Python :: 2.7', 62 | 'Programming Language :: Python :: 3.4', 63 | 'Programming Language :: Python :: 3.5', 64 | 'Programming Language :: Python :: 3.6', 65 | 'Programming Language :: Python :: 3.7', 66 | ], 67 | install_requires=requirements, 68 | include_package_data=True, 69 | package_data={'jqfactor_analyzer': ['jqfactor_analyzer/sample_data/*.csv', 'jqfactor_analyzer/config.json']}, 70 | ) 71 | 72 | 73 | def main(): 74 | setup(**setup_args) 75 | 76 | 77 | if __name__ == "__main__": 78 | main() 79 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JoinQuant/jqfactor_analyzer/69e677dc0dd9bed9fece02a70b9c81ce3d0afc53/tests/__init__.py -------------------------------------------------------------------------------- /tests/test_attribution.py: -------------------------------------------------------------------------------- 1 | import os 2 | import datetime 3 | import pandas as pd 4 | from functools import partial 5 | 6 | from jqfactor_analyzer import AttributionAnalysis, DataApi 7 | 8 | try: 9 | import jqdata 10 | except: 11 | # 使用 sdk 进行测试时可能需要先登陆 12 | import jqdatasdk 13 | 14 | weights = pd.read_csv( 15 | os.path.join(os.getcwd(), "jqfactor_analyzer/sample_data/weight_info.csv"), index_col=0) 16 | returns = weights.pop("return") 17 | index_weights = pd.read_csv( 18 | os.path.join(os.getcwd(), "jqfactor_analyzer/sample_data/index_weight_info.csv"), index_col=0) 19 | index_returns = index_weights.pop("return") 20 | 21 | dataapi = DataApi(allow_cache=True, show_progress=True) 22 | w2 = index_weights.div(index_weights.sum(axis=1), axis=0) * 0.1 23 | r2 = dataapi.api.get_price('000905.XSHG', 24 | start_date='2020-01-01', 25 | end_date='2024-07-01', 26 | fields='close', 27 | fq=None)['close'].pct_change() * 0.1 28 | An = AttributionAnalysis(w2, r2, style_type='style' ) 29 | df = An.get_attr_returns2bench("000905.XSHG") 30 | 31 | 32 | def test_get_attr_returns2bench(): 33 | assert df.shape == (1088, 46) 34 | assert set(df.columns) == set([ 35 | 'beta', 'book_to_price_ratio', 'earnings_yield', 'growth', 'leverage', 36 | 'liquidity', 'momentum', 'non_linear_size', 'residual_volatility', 37 | 'size', '801750', '801160', '801200', '801780', '801050', '801040', 38 | '801960', '801170', '801760', '801790', '801720', '801130', '801080', 39 | '801110', '801890', '801140', '801120', '801180', '801880', '801030', 40 | '801770', '801740', '801730', '801950', '801010', '801230', '801710', 41 | '801970', '801210', '801150', '801020', 
'801980', 'common_return', 42 | 'cash', 'specific_return', 'total_return'] 43 | ) 44 | 45 | 46 | def test_net(): 47 | func = partial(dataapi.api.get_price, 48 | '000905.XSHG', 49 | start_date='2020-01-01', 50 | end_date='2024-07-01', 51 | fields='close') 52 | if dataapi._api_name == 'jqdata': 53 | index_return = func(pre_factor_ref_date=datetime.date.today())['close'].pct_change()[1:] 54 | else: 55 | index_return = func()['close'].pct_change()[1:] 56 | index_net = (index_return.fillna(0) + 1).cumprod() 57 | assert len(index_net) == 1087 58 | -------------------------------------------------------------------------------- /tests/test_data.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | 4 | from jqfactor_analyzer.data import DataApi 5 | from jqfactor_analyzer.preprocess import * 6 | from jqfactor_analyzer.factor_cache import * 7 | 8 | 9 | try: 10 | import jqdata 11 | except: 12 | # 使用 sdk 进行测试时可能需要先登陆 13 | import jqdatasdk 14 | 15 | 16 | def test_preprocess(): 17 | api = DataApi(weight_method='mktcap') 18 | codes = api._api.get_all_securities('stock').index.tolist() 19 | start_date = '2024-07-05' 20 | end_date = '2024-07-15' 21 | df = api.apis['prices'](codes, start_date, end_date).dropna(how='all', axis=1) 22 | 23 | w_df = winsorize(df, scale=1) 24 | assert all(df.max() >= w_df.max()) 25 | 26 | wm_df = winsorize_med(df, scale=1) 27 | assert not wm_df.equals(w_df) 28 | 29 | s_df = standardlize(df) 30 | assert set(s_df.std(axis=1).round()) == {1.0} 31 | 32 | n_df = neutralize(df, how='sw_l3', date='2024-07-10') 33 | assert n_df.shape == (7, 5111) 34 | 35 | 36 | def test_cache(): 37 | # api1 不开启缓存, api2 开启缓存 38 | api1 = DataApi(weight_method='mktcap', allow_cache=False) 39 | api2 = DataApi(weight_method='mktcap') 40 | codes = api1._api.get_all_securities('stock').index.tolist() 41 | start_date = '2024-07-01' 42 | end_date = '2024-07-10' 43 | 44 | df1 = api1.apis['weights'](codes, start_date, end_date) 45 | df2 = api2.apis['weights'](codes, start_date, end_date) 46 | for code in codes: 47 | assert (df1[code] - df2[code]).abs().sum() < 1e-3 48 | 49 | api1.weight_method = api2.weight_method = 'cmktcap' 50 | df1 = api1.apis['weights'](codes, start_date, end_date) 51 | df2 = api2.apis['weights'](codes, start_date, end_date) 52 | for code in codes: 53 | assert (df1[code] - df2[code]).abs().sum() < 1e-3 54 | 55 | df1 = api1.apis['prices'](codes, start_date, end_date) 56 | df2 = api2.apis['prices'](codes, start_date, end_date) 57 | assert df1.equals(df2) 58 | 59 | # 非后复权的 price 存在微量差异 60 | api1.fq = 'pre' 61 | api2.fq = 'pre' 62 | df1 = api1.apis['prices'](codes, start_date, end_date) # 无缓存 63 | df2 = api2.apis['prices'](codes, start_date, end_date) # 有缓存 64 | for code in codes: 65 | diff = (df1[code] - df2[code]).abs().sum() 66 | assert diff < 1e-12 67 | 68 | api1.fq = None 69 | api1.price = 'open' 70 | api2.fq = None 71 | api2.price = 'open' 72 | df1 = api1.apis['prices'](codes, start_date, end_date) 73 | df2 = api2.apis['prices'](codes, start_date, end_date) 74 | for code in codes: 75 | diff = (df1[code] - df2[code]).abs().sum() 76 | assert diff < 1e-12 77 | 78 | df1 = api1.apis['groupby'](codes, start_date, end_date) 79 | df2 = api2.apis['groupby'](codes, start_date, end_date) 80 | assert df1.equals(df2) 81 | 82 | # 删除缓存文件 83 | cache_path = get_cache_dir() 84 | if os.path.exists(cache_path): 85 | shutil.rmtree(cache_path) 86 | -------------------------------------------------------------------------------- 
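# 附: preprocess 接口的最小用法示意 (假设性示例, 不依赖 jqdatasdk; 宽表为 日期 x 股票, 默认 axis=1 按行做横截面处理):
#
#   import numpy as np
#   import pandas as pd
#   from jqfactor_analyzer.preprocess import winsorize_med, standardlize
#
#   demo = pd.DataFrame(np.random.randn(5, 100))  # 5 个交易日 x 100 只股票的随机"因子值"
#   clean = standardlize(winsorize_med(demo, scale=3), axis=1)  # 先按中位数去极值, 再做横截面标准化
#   assert (clean.std(axis=1, ddof=0) - 1).abs().max() < 1e-6  # standardlize 内部用 np.nanstd (ddof=0)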
/tests/test_performance.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | 
3 | 
4 | import pytest
5 | import pandas as pd
6 | from numpy import nan, float64
7 | 
8 | from jqfactor_analyzer.prepare import get_clean_factor_and_forward_returns
9 | from jqfactor_analyzer.performance import (
10 |     factor_information_coefficient,
11 |     factor_autocorrelation,
12 |     mean_information_coefficient,
13 |     quantile_turnover,
14 |     factor_returns, factor_alpha_beta,
15 |     average_cumulative_return_by_quantile
16 | )
17 | from jqfactor_analyzer.utils import get_forward_returns_columns
18 | 
19 | 
20 | dr = pd.date_range(start='2015-1-1', end='2015-1-2')
21 | dr.name = 'date'
22 | tickers = ['A', 'B', 'C', 'D']
23 | factor = pd.DataFrame(index=dr,
24 |                       columns=tickers,
25 |                       data=[[1, 2, 3, 4],
26 |                             [4, 3, 2, 1]]).stack()
27 | factor.index = factor.index.set_names(['date', 'asset'])
28 | factor.name = 'factor'
29 | factor_data = pd.DataFrame()
30 | factor_data['factor'] = factor
31 | factor_data['group'] = pd.Series(index=factor.index,
32 |                                  data=[1, 1, 2, 2, 1, 1, 2, 2],)
33 | factor_data['weights'] = pd.Series(range(8), index=factor.index,
34 |                                    dtype=float64) + 1
35 | 
36 | 
37 | @pytest.mark.parametrize(
38 |     ('factor_data', 'forward_returns', 'group_adjust',
39 |      'by_group', 'expected_ix', 'expected_ic_val'),
40 |     [(factor_data, [4, 3, 2, 1, 1, 2, 3, 4], False, False, dr, [-1., -1.]),
41 |      (factor_data, [1, 2, 3, 4, 4, 3, 2, 1], False, False, dr, [1., 1.]),
42 |      (factor_data, [1, 2, 3, 4, 4, 3, 2, 1], False, True,
43 |       pd.MultiIndex.from_product([dr, [1, 2]], names=['date', 'group']),
44 |       [1., 1., 1., 1.]),
45 |      (factor_data, [1, 2, 3, 4, 4, 3, 2, 1], True, True,
46 |       pd.MultiIndex.from_product([dr, [1, 2]], names=['date', 'group']),
47 |       [1., 1., 1., 1.])]
48 | )
49 | def test_information_coefficient(factor_data,
50 |                                  forward_returns,
51 |                                  group_adjust,
52 |                                  by_group,
53 |                                  expected_ix,
54 |                                  expected_ic_val):
55 | 
56 |     factor_data = factor_data.copy()
57 |     factor_data['period_1'] = pd.Series(index=factor_data.index,
58 |                                         data=forward_returns)
59 | 
60 |     ic = factor_information_coefficient(factor_data=factor_data,
61 |                                         group_adjust=group_adjust,
62 |                                         by_group=by_group)
63 | 
64 |     expected_ic_df = pd.DataFrame(index=expected_ix,
65 |                                   columns=pd.Index(['period_1'], dtype='object'),
66 |                                   data=expected_ic_val)
67 | 
68 |     pd.testing.assert_frame_equal(ic, expected_ic_df)
69 | 
70 | @pytest.mark.parametrize(
71 |     (
72 |         'factor_data', 'forward_returns', 'group_adjust',
73 |         'by_group', 'by_time', 'expected_ix', 'expected_ic_val'
74 |     ), [
75 |         (factor_data, [4, 3, 2, 1, 1, 2, 3, 4], False, False, 'D',
76 |          dr, [-1., -1.]),
77 |         (factor_data, [1, 2, 3, 4, 4, 3, 2, 1], False, False, 'W',
78 |          pd.DatetimeIndex(['2015-01-04'], name='date', freq='W-SUN'), [1.]),
79 |         (factor_data, [1, 2, 3, 4, 4, 3, 2, 1], False, True, None,
80 |          pd.Index([1, 2], name='group'), [1., 1.]),  # pd.Int64Index was removed in pandas 2.x
81 |         (factor_data, [1, 2, 3, 4, 4, 3, 2, 1], False, True, 'W',
82 |          pd.MultiIndex.from_product(
83 |              [pd.DatetimeIndex(['2015-01-04'], name='date', freq='W-SUN'),
84 |               [1, 2]],
85 |              names=['date', 'group']
86 |          ),
87 |          [1., 1.])
88 |     ]
89 | )
90 | def test_mean_information_coefficient(factor_data,
91 |                                       forward_returns,
92 |                                       group_adjust,
93 |                                       by_group,
94 |                                       by_time,
95 |                                       expected_ix,
96 |                                       expected_ic_val):
97 | 
98 |     factor_data = factor_data.copy()
99 |     factor_data['period_1'] = pd.Series(index=factor_data.index,
100 |                                         data=forward_returns)
101 | 
102 |     ic = mean_information_coefficient(factor_data,
103 |                                       group_adjust=group_adjust,
104 |                                       by_group=by_group,
105 |                                       by_time=by_time)
106 | 
107 |     expected_ic_df = pd.DataFrame(index=expected_ix,
108 |                                   columns=pd.Index(['period_1']),
109 |                                   data=expected_ic_val)
110 | 
111 |     pd.testing.assert_frame_equal(ic, expected_ic_df,
112 |                                   check_index_type=False,
113 |                                   check_column_type=False)
114 | 
115 | 
116 | @pytest.mark.parametrize(
117 |     ('quantile_values', 'test_quantile', 'expected_vals'),
118 |     [([[1.0, 2.0, 3.0, 4.0],
119 |        [4.0, 3.0, 2.0, 1.0],
120 |        [1.0, 2.0, 3.0, 4.0],
121 |        [1.0, 2.0, 3.0, 4.0]],
122 |       4.0,
123 |       [nan, 1.0, 1.0, 0.0]),
124 |      ([[1.0, 2.0, 3.0, 4.0],
125 |        [1.0, 2.0, 3.0, 4.0],
126 |        [1.0, 2.0, 3.0, 4.0],
127 |        [1.0, 2.0, 3.0, 4.0]],
128 |       3.0,
129 |       [nan, 0.0, 0.0, 0.0]),
130 |      ([[1.0, 2.0, 3.0, 4.0],
131 |        [4.0, 3.0, 2.0, 1.0],
132 |        [1.0, 2.0, 3.0, 4.0],
133 |        [4.0, 3.0, 2.0, 1.0]],
134 |       2.0,
135 |       [nan, 1.0, 1.0, 1.0])]
136 | )
137 | def test_quantile_turnover(quantile_values, test_quantile,
138 |                            expected_vals):
139 | 
140 |     dr = pd.date_range(start='2015-1-1', end='2015-1-4')
141 |     dr.name = 'date'
142 |     tickers = ['A', 'B', 'C', 'D']
143 | 
144 |     quantized_test_factor = pd.Series(
145 |         pd.DataFrame(index=dr, columns=tickers, data=quantile_values).stack()
146 |     )
147 |     quantized_test_factor.index = quantized_test_factor.index.set_names(
148 |         ['date', 'asset']
149 |     )
150 | 
151 |     to = quantile_turnover(quantized_test_factor, test_quantile)
152 | 
153 |     expected = pd.Series(
154 |         index=quantized_test_factor.index.levels[0], data=expected_vals)
155 |     expected.name = test_quantile
156 | 
157 |     pd.testing.assert_series_equal(to, expected)
158 | 
159 | 
160 | @pytest.mark.parametrize(
161 |     ('factor_data', 'factor_vals', 'fwd_return_vals',
162 |      'group_adjust', 'expected_vals'),
163 |     [(factor_data, [1, 2, 3, 4, 4, 3, 2, 1], [4, 3, 2, 1, 1, 2, 3, 4],
164 |       False, [-1.25000, -1.25000]),
165 |      (factor_data, [1, 1, 1, 1, 1, 1, 1, 1], [4, 3, 2, 1, 1, 2, 3, 4],
166 |       False, [0.0, 0.0]),
167 |      (factor_data, [1, 2, 3, 4, 4, 3, 2, 1], [4, 3, 2, 1, 1, 2, 3, 4],
168 |       True, [-0.5, -0.5]),
169 |      (factor_data, [1, 2, 3, 4, 1, 2, 3, 4], [1, 4, 1, 2, 1, 2, 2, 1],
170 |       True, [1.0, 0.0]),
171 |      (factor_data, [1, 1, 1, 1, 1, 1, 1, 1], [4, 3, 2, 1, 1, 2, 3, 4],
172 |       True, [0.0, 0.0])]
173 | )
174 | def test_factor_returns(factor_data,
175 |                         factor_vals,
176 |                         fwd_return_vals,
177 |                         group_adjust,
178 |                         expected_vals):
179 | 
180 |     factor_data = factor_data.copy()
181 |     factor_data['period_1'] = fwd_return_vals
182 |     factor_data['factor'] = factor_vals
183 | 
184 |     factor_returns_s = factor_returns(factor_data=factor_data,
185 |                                       demeaned=True,
186 |                                       group_adjust=group_adjust)
187 | 
188 |     expected = pd.DataFrame(
189 |         index=dr,
190 |         data=expected_vals,
191 |         columns=get_forward_returns_columns(factor_data.columns)
192 |     )
193 | 
194 |     pd.testing.assert_frame_equal(factor_returns_s, expected)
195 | 
196 | 
197 | @pytest.mark.parametrize(
198 |     ('factor_data', 'fwd_return_vals', 'alpha', 'beta'),
199 |     [(factor_data, [1, 2, 3, 4, 1, 1, 1, 1], -1, 5. / 6.)]
200 | )
201 | def test_factor_alpha_beta(factor_data, fwd_return_vals, alpha, beta):
202 | 
203 |     factor_data = factor_data.copy()
204 |     factor_data['period_1'] = fwd_return_vals
205 | 
206 |     ab = factor_alpha_beta(factor_data=factor_data)
207 | 
208 |     expected = pd.DataFrame(columns=['period_1'],
209 |                             index=['Ann. alpha', 'beta'],
210 |                             data=[alpha, beta])
211 | 
212 |     pd.testing.assert_frame_equal(ab, expected)
213 | 
214 | @pytest.mark.parametrize(
215 |     ('factor_values', 'end_date', 'period', 'expected_vals'),
216 |     [([[1.0, 2.0, 3.0, 4.0],
217 |        [1.0, 2.0, 3.0, 4.0],
218 |        [1.0, 2.0, 3.0, 4.0],
219 |        [1.0, 2.0, 3.0, 4.0]],
220 |       '2015-1-4', 1,
221 |       [nan, 1.0, 1.0, 1.0]),
222 |      ([[4.0, 3.0, 2.0, 1.0],
223 |        [1.0, 2.0, 3.0, 4.0],
224 |        [4.0, 3.0, 2.0, 1.0],
225 |        [1.0, 2.0, 3.0, 4.0]],
226 |       '2015-1-4', 1,
227 |       [nan, -1.0, -1.0, -1.0]),
228 |      ([[1.0, 2.0, 3.0, 4.0],
229 |        [2.0, 1.0, 4.0, 3.0],
230 |        [4.0, 3.0, 2.0, 1.0],
231 |        [1.0, 2.0, 3.0, 4.0],
232 |        [2.0, 1.0, 4.0, 3.0],
233 |        [4.0, 3.0, 2.0, 1.0],
234 |        [2.0, 1.0, 4.0, 3.0],
235 |        [4.0, 3.0, 2.0, 1.0],
236 |        [1.0, 2.0, 3.0, 4.0],
237 |        [2.0, 1.0, 4.0, 3.0],
238 |        [2.0, 1.0, 4.0, 3.0],
239 |        [4.0, 3.0, 2.0, 1.0]],
240 |       '2015-1-12', 3,
241 |       [nan, nan, nan, 1.0, 1.0, 1.0, 0.6, -0.6, -1.0, 1.0, -0.6, -1.0])]
242 | )
243 | def test_factor_autocorrelation(factor_values,
244 |                                 end_date,
245 |                                 period,
246 |                                 expected_vals):
247 |     dr = pd.date_range(start='2015-1-1', end=end_date)
248 |     dr.name = 'date'
249 |     tickers = ['A', 'B', 'C', 'D']
250 |     factor = pd.DataFrame(index=dr,
251 |                           columns=tickers,
252 |                           data=factor_values).stack()
253 |     factor.index = factor.index.set_names(['date', 'asset'])
254 | 
255 |     factor_df = pd.DataFrame()
256 |     factor_df['factor'] = factor
257 | 
258 |     fa = factor_autocorrelation(factor_df, period)
259 |     expected = pd.Series(index=dr, data=expected_vals)
260 |     expected.name = period
261 | 
262 |     pd.testing.assert_series_equal(fa, expected)
263 | 
264 | @pytest.mark.parametrize(
265 |     ('before', 'after', 'demeaned', 'quantiles', 'expected_vals'),
266 |     [(1, 2, False, 4,
267 |       [[1.00, 0.0, -0.50, -0.75],
268 |        [0.0, 0.0, 0.0, 0.0],
269 |        [0.00, 0.00, 0.00, 0.00],
270 |        [0.0, 0.0, 0.0, 0.0],
271 |        [-0.20, 0.0, 0.25, 0.5625],
272 |        [0.0, 0.0, 0.0, 0.0],
273 |        [-0.3333333, 0.0, 0.50, 1.25],
274 |        [0.0, 0.0, 0.0, 0.0]]),
275 |      (1, 2, True, 4,
276 |       [[0.8833333, 0.0, -0.5625, -1.015625],
277 |        [0.0, 0.0, 0.0, 0.0],
278 |        [-0.1166667, 0.0, -0.0625, -0.265625],
279 |        [0.0, 0.0, 0.0, 0.0],
280 |        [-0.3166667, 0.0, 0.1875, 0.296875],
281 |        [0.0, 0.0, 0.0, 0.0],
282 |        [-0.4500000, 0.0, 0.4375, 0.984375],
283 |        [0.0, 0.0, 0.0, 0.0]]),
284 |      (3, 0, False, 4,
285 |       [[7.0, 3.0, 1.0, 0.0],
286 |        [0.0, 0.0, 0.0, 0.0],
287 |        [0.0, 0.0, 0.0, 0.0],
288 |        [0.0, 0.0, 0.0, 0.0],
289 |        [-0.488, -0.36, -0.2, 0.0],
290 |        [0.0, 0.0, 0.0, 0.0],
291 |        [-0.703704, -0.55555555, -0.333333333, 0.0],
292 |        [0.0, 0.0, 0.0, 0.0]]),
293 |      (0, 3, True, 4,
294 |       [[0.0, -0.5625, -1.015625, -1.488281],
295 |        [0.0, 0.0, 0.0, 0.0],
296 |        [0.0, -0.0625, -0.265625, -0.613281],
297 |        [0.0, 0.0, 0.0, 0.0],
298 |        [0.0, 0.1875, 0.296875, 0.339844],
299 |        [0.0, 0.0, 0.0, 0.0],
300 |        [0.0, 0.4375, 0.984375, 1.761719],
301 |        [0.0, 0.0, 0.0, 0.0]]),
302 |      (3, 3, False, 2,
303 |       [[3.5, 1.5, 0.5, 0.0, -0.25, -0.375, -0.4375],
304 |        [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
305 |        [-0.595852, -0.457778, -0.266667, 0.0, 0.375, 0.90625, 1.664062],
306 |        [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]]),
307 |      (3, 3, True, 2,
308 |       [[2.047926, 0.978888, 0.383333, 0.0, -0.3125, -0.640625, -1.050781],
309 |        [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
310 |        [-2.047926, -0.978888, -0.383333, 0.0, 0.3125, 0.640625, 1.050781],
311 |        [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]])]
312 | )
313 | def test_average_cumulative_return_by_quantile(before, after,
314 |                                                demeaned, quantiles,
315 |                                                expected_vals):
316 |     dr = pd.date_range(start='2015-1-15', end='2015-2-1')
317 |     dr.name = 'date'
318 |     tickers = ['A', 'B', 'C', 'D']
319 |     r1, r2, r3, r4 = (1.25, 1.50, 1.00, 0.50)
320 |     data = [[r1**i, r2**i, r3**i, r4**i] for i in range(1, 19)]
321 |     prices = pd.DataFrame(index=dr, columns=tickers, data=data)
322 |     dr2 = pd.date_range(start='2015-1-21', end='2015-1-26')
323 |     dr2.name = 'date'
324 |     factor = pd.DataFrame(
325 |         index=dr2, columns=tickers, data=[
326 |             [3, 4, 2, 1],
327 |             [3, 4, 2, 1],
328 |             [3, 4, 2, 1],
329 |             [3, 4, 2, 1],
330 |             [3, 4, 2, 1],
331 |             [3, 4, 2, 1]]).stack()
332 | 
333 |     factor_data = get_clean_factor_and_forward_returns(
334 |         factor, prices, quantiles=quantiles, periods=range(0, after + 1)
335 |     )
336 | 
337 |     avgrt = average_cumulative_return_by_quantile(
338 |         factor_data, prices, before, after, demeaned)
339 |     arrays = []
340 |     for q in range(1, quantiles + 1):
341 |         arrays.append((q, 'mean'))
342 |         arrays.append((q, 'std'))
343 |     index = pd.MultiIndex.from_tuples(arrays, names=['factor_quantile', None])
344 |     expected = pd.DataFrame(
345 |         index=index, columns=range(-before, after + 1), data=expected_vals)
346 |     pd.testing.assert_frame_equal(avgrt, expected)
347 | 
348 | @pytest.mark.parametrize(
349 |     ('before', 'after', 'demeaned', 'quantiles', 'expected_vals'),
350 |     [(0, 2, False, 4,
351 |       [[0.0, -0.50, -0.75],
352 |        [0.0, 0.0, 0.0],
353 |        [0.0, 0.0, 0.0],
354 |        [0.0, 0.0, 0.0],
355 |        [0.0, 0.25, 0.5625],
356 |        [0.0, 0.0, 0.0],
357 |        [0.0, 0.50, 1.25],
358 |        [0.0, 0.0, 0.0]]),
359 |      (0, 3, True, 4,
360 |       [[0.0, -0.5625, -1.015625, -1.488281],
361 |        [0.0, 0.0, 0.0, 0.0],
362 |        [0.0, -0.0625, -0.265625, -0.613281],
363 |        [0.0, 0.0, 0.0, 0.0],
364 |        [0.0, 0.1875, 0.296875, 0.339844],
365 |        [0.0, 0.0, 0.0, 0.0],
366 |        [0.0, 0.4375, 0.984375, 1.761719],
367 |        [0.0, 0.0, 0.0, 0.0]]),
368 |      (0, 3, False, 2,
369 |       [[0.0, -0.25, -0.375, -0.4375],
370 |        [0.0, 0.0, 0.0, 0.0],
371 |        [0.0, 0.375, 0.90625, 1.664062],
372 |        [0.0, 0.0, 0.0, 0.0]]),
373 |      (0, 3, True, 2,
374 |       [[0.0, -0.3125, -0.640625, -1.050781],
375 |        [0.0, 0.0, 0.0, 0.0],
376 |        [0.0, 0.3125, 0.640625, 1.050781],
377 |        [0.0, 0.0, 0.0, 0.0]])]
378 | )
379 | def test_average_cumulative_return_by_quantile_2(before, after,
380 |                                                  demeaned, quantiles,
381 |                                                  expected_vals):
382 |     """Test a varying factor asset universe:
383 | 
384 |     at different dates there may be different assets.
385 |     """
386 |     dr = pd.date_range(start='2015-1-15', end='2015-1-25')
387 |     dr.name = 'date'
388 |     tickers = ['A', 'B', 'C', 'D', 'E', 'F']
389 |     r1, r2, r3, r4 = (1.25, 1.50, 1.00, 0.50)
390 |     data = [[r1**i, r2**i, r3**i, r4**i, r2**i, r3**i]
391 |             for i in range(1, 12)]
392 |     prices = pd.DataFrame(index=dr, columns=tickers, data=data)
393 |     dr2 = pd.date_range(start='2015-1-18', end='2015-1-21')
394 |     dr2.name = 'date'
395 |     factor = pd.DataFrame(index=dr2, columns=tickers,
396 |                           data=[[3, 4, 2, 1, nan, nan],
397 |                                 [3, 4, 2, 1, nan, nan],
398 |                                 [3, nan, nan, 1, 4, 2],
399 |                                 [3, nan, nan, 1, 4, 2]]).stack()
400 | 
401 |     factor_data = get_clean_factor_and_forward_returns(
402 |         factor, prices, quantiles=quantiles, periods=range(0, after + 1),
403 |     )
404 | 
405 |     avgrt = average_cumulative_return_by_quantile(
406 |         factor_data, prices, before, after, demeaned
407 |     )
408 |     arrays = []
409 |     for q in range(1, quantiles + 1):
410 |         arrays.append((q, 'mean'))
411 |         arrays.append((q, 'std'))
412 |     index = pd.MultiIndex.from_tuples(arrays, names=['factor_quantile', None])
413 |     expected = pd.DataFrame(
414 |         index=index, columns=range(-before, after + 1), data=expected_vals
415 |     )
416 |     pd.testing.assert_frame_equal(avgrt, expected)
417 | 
--------------------------------------------------------------------------------
/tests/test_prepare.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | 
3 | 
4 | import pytest
5 | import pandas as pd
6 | from numpy import nan
7 | 
8 | from jqfactor_analyzer.prepare import (
9 |     quantize_factor, compute_forward_returns, common_start_returns
10 | )
11 | 
12 | 
13 | dr = pd.date_range(start='2015-1-1', end='2015-1-2')
14 | dr.name = 'date'
15 | tickers = ['A', 'B', 'C', 'D']
16 | factor = pd.DataFrame(
17 |     index=dr, columns=tickers, data=[[1, 2, 3, 4], [4, 3, 2, 1]]
18 | ).stack()
19 | factor.index = factor.index.set_names(['date', 'asset'])
20 | factor.name = 'factor'
21 | factor_data = pd.DataFrame()
22 | factor_data['factor'] = factor
23 | factor_data['group'] = pd.Series(
24 |     index=factor.index,
25 |     data=[1, 1, 2, 2, 1, 1, 2, 2],
26 | )
27 | 
28 | 
29 | def test_compute_forward_returns():
30 |     dr = pd.date_range(start='2015-1-1', end='2015-1-3')
31 |     prices = pd.DataFrame(
32 |         index=dr, columns=['A', 'B'], data=[[1, 1], [1, 2], [2, 1]]
33 |     )
34 | 
35 |     fp = compute_forward_returns(factor, prices, periods=[1, 2])
36 | 
37 |     ix = pd.MultiIndex.from_product([dr, ['A', 'B']], names=['date', 'asset'])
38 |     expected = pd.DataFrame(index=ix, columns=['period_1', 'period_2'])
39 |     expected['period_1'] = [0., 1., 1., -0.5, nan, nan]
40 |     expected['period_2'] = [1., 0., nan, nan, nan, nan]
41 | 
42 |     pd.testing.assert_frame_equal(fp, expected)
43 | 
44 | 
45 | @pytest.mark.parametrize(
46 |     ('factor', 'quantiles', 'bins', 'by_group', 'expected_vals'), [
47 |         (factor_data, 4, None, False, [1, 2, 3, 4, 4, 3, 2, 1]),
48 |         (factor_data, 2, None, False, [1, 1, 2, 2, 2, 2, 1, 1]),
49 |         (factor_data, 2, None, True, [1, 2, 1, 2, 2, 1, 2, 1]),
50 |         (
51 |             factor_data, [0, .25, .5, .75, 1.], None, False,
52 |             [1, 2, 3, 4, 4, 3, 2, 1]
53 |         ),
54 |         (factor_data, [0, .5, .75, 1.], None, False, [1, 1, 2, 3, 3, 2, 1, 1]),
55 |         (factor_data, [0, .25, .5, 1.], None, False, [1, 2, 3, 3, 3, 3, 2, 1]),
56 |         (factor_data, [0, .5, 1.], None, False, [1, 1, 2, 2, 2, 2, 1, 1]),
57 |         (
58 |             factor_data, [.25, .5, .75], None, False,
59 |             [nan, 1, 2, nan, nan, 2, 1, nan]
60 |         ), (factor_data, [0, .5, 1.], None, True, [1, 2, 1, 2, 2, 1, 2, 1]),
61 |         (factor_data, [.5, 1.], None, True, [nan, 1, nan, 1, 1, nan, 1, nan]),
62 |         (factor_data, [0, 1.], None, True, [1, 1, 1, 1, 1, 1, 1, 1]),
63 |         (factor_data, None, 4, False, [1, 2, 3, 4, 4, 3, 2, 1]),
64 |         (factor_data, None, 2, False, [1, 1, 2, 2, 2, 2, 1, 1]),
65 |         (factor_data, None, 3, False, [1, 1, 2, 3, 3, 2, 1, 1]),
66 |         (factor_data, None, 8, False, [1, 3, 6, 8, 8, 6, 3, 1]),
67 |         (factor_data, None, [0, 1, 2, 3, 5], False, [1, 2, 3, 4, 4, 3, 2, 1]),
68 |         (factor_data, None, [1, 2, 3], False, [nan, 1, 2, nan, nan, 2, 1, nan]),
69 |         (factor_data, None, [0, 2, 5], False, [1, 1, 2, 2, 2, 2, 1, 1]),
70 |         (factor_data, None, [0.5, 2.5, 4.5], False, [1, 1, 2, 2, 2, 2, 1, 1]),
71 |         (factor_data, None, [0.5, 2.5], True, [1, 1, nan, nan, nan, nan, 1, 1]),
72 |         (factor_data, None, 2, True, [1, 2, 1, 2, 2, 1, 2, 1])
73 |     ]
74 | )
75 | def test_quantize_factor(factor, quantiles, bins, by_group, expected_vals):
76 |     quantized_factor = quantize_factor(
77 |         factor, quantiles=quantiles, bins=bins, by_group=by_group
78 |     )
79 |     expected = pd.Series(
80 |         index=factor.index, data=expected_vals, name='factor_quantile'
81 |     ).dropna()
82 |     pd.testing.assert_series_equal(quantized_factor, expected)
83 | 
84 | 
85 | @pytest.mark.parametrize(
86 |     ('before', 'after', 'mean_by_date', 'demeaned', 'expected_vals'), [
87 |         (
88 |             2, 3, False, False, [
89 |                 [0.075, 0.241868], [0.075, 0.241868], [0.075, 0.241868],
90 |                 [0.075, 0.241868], [0.075, 0.241868], [0.075, 0.241868]
91 |             ]
92 |         ),
93 |         (
94 |             3, 2, False, True, [
95 |                 [0.0, 0.241868], [0.0, 0.241868], [0.0, 0.241868],
96 |                 [0.0, 0.241868], [0.0, 0.241868], [0.0, 0.241868]
97 |             ]
98 |         ),
99 |         (
100 |             3, 5, True, False, [
101 |                 [0.075, 0.0], [0.075, 0.0], [0.075, 0.0], [0.075, 0.0],
102 |                 [0.075, 0.0], [0.075, 0.0], [0.075, 0.0], [0.075, 0.0],
103 |                 [0.075, 0.0]
104 |             ]
105 |         ),
106 |         (
107 |             1, 4, True, True,
108 |             [[0., 0.], [0., 0.], [0., 0.], [0., 0.], [0., 0.], [0., 0.]]
109 |         ),
110 |         (
111 |             6, 6, False, False, [
112 |                 [0.075, 0.243614], [0.075, 0.242861], [0.075, 0.242301],
113 |                 [0.075, 0.241868], [0.075, 0.241868], [0.075, 0.241868],
114 |                 [0.075, 0.241868], [0.075, 0.241868], [0.075, 0.241868],
115 |                 [0.075, 0.241868], [0.075, 0.241868], [0.075, 0.242301],
116 |                 [0.075, 0.242861]
117 |             ]
118 |         ),
119 |         (
120 |             6, 6, False, True, [
121 |                 [0.0, 0.243614], [0.0, 0.242861], [0.0, 0.242301],
122 |                 [0.0, 0.241868], [0.0, 0.241868], [0.0, 0.241868],
123 |                 [0.0, 0.241868], [0.0, 0.241868], [0.0, 0.241868],
124 |                 [0.0, 0.241868], [0.0, 0.241868], [0.0, 0.242301],
125 |                 [0.0, 0.242861]
126 |             ]
127 |         ),
128 |         (
129 |             6, 6, True, False, [
130 |                 [0.075, 0.0], [0.075, 0.0], [0.075, 0.0], [0.075, 0.0],
131 |                 [0.075, 0.0], [0.075, 0.0], [0.075, 0.0], [0.075, 0.0],
132 |                 [0.075, 0.0], [0.075, 0.0], [0.075, 0.0], [0.075, 0.0],
133 |                 [0.075, 0.0]
134 |             ]
135 |         ),
136 |         (
137 |             6, 6, True, True, [
138 |                 [0., 0.], [0., 0.], [0., 0.], [0., 0.], [0., 0.], [0., 0.],
139 |                 [0., 0.], [0., 0.], [0., 0.], [0., 0.], [0., 0.], [0., 0.],
140 |                 [0., 0.]
141 |             ]
142 |         )
143 |     ]
144 | )
145 | def test_common_start_returns(
146 |     before, after, mean_by_date, demeaned, expected_vals
147 | ):
148 |     dr = pd.date_range(start='2015-1-17', end='2015-2-2')
149 |     dr.name = 'date'
150 |     tickers = ['A', 'B', 'C', 'D']
151 |     r1, r2, r3, r4 = (1.20, 1.40, 0.90, 0.80)
152 |     prices = pd.DataFrame(
153 |         index=dr,
154 |         columns=tickers,
155 |         data=[
156 |             [r1**1, r2**1, r3**1, r4**1], [r1**2, r2**2, r3**2, r4**2],
157 |             [r1**3, r2**3, r3**3, r4**3], [r1**4, r2**4, r3**4, r4**4],
158 |             [r1**5, r2**5, r3**5, r4**5], [r1**6, r2**6, r3**6, r4**6],
159 |             [r1**7, r2**7, r3**7, r4**7], [r1**8, r2**8, r3**8, r4**8],
160 |             [r1**9, r2**9, r3**9, r4**9], [r1**10, r2**10, r3**10, r4**10],
161 |             [r1**11, r2**11, r3**11, r4**11], [r1**12, r2**12, r3**12, r4**12],
162 |             [r1**13, r2**13, r3**13, r4**13], [r1**14, r2**14, r3**14, r4**14],
163 |             [r1**15, r2**15, r3**15, r4**15], [r1**16, r2**16, r3**16, r4**16],
164 |             [r1**17, r2**17, r3**17, r4**17]
165 |         ]
166 |     )
167 |     dr2 = pd.date_range(start='2015-1-21', end='2015-1-29')
168 |     factor = pd.DataFrame(
169 |         index=dr2,
170 |         columns=tickers,
171 |         data=[
172 |             [3, 4, 2, 1], [3, 4, 2, 1], [3, 4, 2, 1], [3, 4, 2, 1],
173 |             [3, 4, 2, 1], [3, 4, 2, 1], [3, 4, 2, 1], [3, 4, 2, 1],
174 |             [3, 4, 2, 1]
175 |         ]
176 |     ).stack()
177 |     factor.index = factor.index.set_names(['date', 'asset'])
178 |     factor.name = 'factor'
179 | 
180 |     cmrt = common_start_returns(
181 |         factor, prices, before, after, False, mean_by_date,
182 |         factor if demeaned else None
183 |     )
184 |     cmrt = pd.DataFrame({'mean': cmrt.mean(axis=1), 'std': cmrt.std(axis=1)})
185 |     expected = pd.DataFrame(
186 |         index=range(-before, after + 1),
187 |         columns=['mean', 'std'],
188 |         data=expected_vals
189 |     )
190 |     pd.testing.assert_frame_equal(cmrt, expected)
191 | 
--------------------------------------------------------------------------------
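As a closing aside, the expected frame in `test_compute_forward_returns` can be verified by hand: for asset `A` the price path is 1 → 1 → 2, so the 1-day forward returns are (1 - 1) / 1 = 0.0 and (2 - 1) / 1 = 1.0; for `B` the path 1 → 2 → 1 gives (2 - 1) / 1 = 1.0 and (1 - 2) / 2 = -0.5, with `nan` on the last date because no following price exists. A short reproduction using the same inputs as the test (only the `print` is new):

```python
import pandas as pd

from jqfactor_analyzer.prepare import compute_forward_returns

dr = pd.date_range(start='2015-1-1', end='2015-1-2')
dr.name = 'date'
factor = pd.DataFrame(index=dr, columns=['A', 'B', 'C', 'D'],
                      data=[[1, 2, 3, 4], [4, 3, 2, 1]]).stack()
factor.index = factor.index.set_names(['date', 'asset'])
factor.name = 'factor'

prices = pd.DataFrame(index=pd.date_range(start='2015-1-1', end='2015-1-3'),
                      columns=['A', 'B'], data=[[1, 1], [1, 2], [2, 1]])

# period_1 should come out as [0., 1., 1., -0.5, nan, nan] and
# period_2 as [1., 0., nan, nan, nan, nan], matching the test.
print(compute_forward_returns(factor, prices, periods=[1, 2]))
```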