├── .gitignore
├── LICENSE
├── MANIFEST.in
├── README.md
├── docs
│   └── API文档.md
├── jqfactor_analyzer
│   ├── __init__.py
│   ├── analyze.py
│   ├── attribution.py
│   ├── compat.py
│   ├── config.json
│   ├── data.py
│   ├── exceptions.py
│   ├── factor_cache.py
│   ├── performance.py
│   ├── plot_utils.py
│   ├── plotting.py
│   ├── prepare.py
│   ├── preprocess.py
│   ├── sample.py
│   ├── sample_data
│   │   ├── VOL5.csv
│   │   ├── index_weight_info.csv
│   │   └── weight_info.csv
│   ├── utils.py
│   ├── version.py
│   └── when.py
├── requirements.txt
├── setup.cfg
├── setup.py
└── tests
    ├── __init__.py
    ├── test_attribution.py
    ├── test_data.py
    ├── test_performance.py
    └── test_prepare.py

/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | *.egg-info/
24 | .installed.cfg
25 | *.egg
26 | MANIFEST
27 |
28 | # PyInstaller
29 | # Usually these files are written by a python script from a template
30 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
31 | *.manifest
32 | *.spec
33 |
34 | # Installer logs
35 | pip-log.txt
36 | pip-delete-this-directory.txt
37 |
38 | # Unit test / coverage reports
39 | htmlcov/
40 | .tox/
41 | .coverage
42 | .coverage.*
43 | .cache
44 | nosetests.xml
45 | coverage.xml
46 | *.cover
47 | .hypothesis/
48 | .pytest_cache/
49 |
50 | # Translations
51 | *.mo
52 | *.pot
53 |
54 | # Django stuff:
55 | *.log
56 | local_settings.py
57 | db.sqlite3
58 |
59 | # Flask stuff:
60 | instance/
61 | .webassets-cache
62 |
63 | # Scrapy stuff:
64 | .scrapy
65 |
66 | # Sphinx documentation
67 | docs/_build/
68 |
69 | # PyBuilder
70 | target/
71 |
72 | # Jupyter Notebook
73 | .ipynb_checkpoints
74 |
75 | # pyenv
76 | .python-version
77 |
78 | # celery beat schedule file
79 | celerybeat-schedule
80 |
81 | # SageMath parsed files
82 | *.sage.py
83 |
84 | # Environments
85 | .env
86 | .venv
87 | env/
88 | venv/
89 | ENV/
90 | env.bak/
91 | venv.bak/
92 |
93 | # Spyder project settings
94 | .spyderproject
95 | .spyproject
96 |
97 | # Rope project settings
98 | .ropeproject
99 |
100 | # mkdocs documentation
101 | /site
102 |
103 | # mypy
104 | .mypy_cache/
105 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2019 JoinQuant
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include LICENSE
2 | include *.txt
3 | include jqfactor_analyzer/sample_data/*.csv
4 | include jqfactor_analyzer/config.json
5 |
--------------------------------------------------------------------------------
/docs/API文档.md:
--------------------------------------------------------------------------------
1 | # **API文档**
2 |
3 | ## 一、因子缓存factor_cache模块
4 |
5 | 在本地进行分析时,为了提高数据获取的速度并避免反复从服务端获取数据,增加了本地数据缓存的方法。
6 |
7 | 注意缓存格式为pyarrow.feather格式,pyarrow库不同版本之间可能存在兼容问题,建议不要随意修改pyarrow库的版本,如果修改后产生大量缓存文件无法读取(提示已损坏)的情况,建议删除整个缓存目录后重新缓存。
8 |
9 | ### 1. 设置缓存目录
10 |
11 | 对于单因子分析和归因分析中使用到的市值/价格和风格因子等数据,默认会缓存到用户的主目录( `os.path.expanduser('~/jqfactor_datacache/bundle')` )。一般地,在 Unix 系统上可能是 `/home/username/jqfactor_datacache/bundle`,而在 Windows 系统上可能是 `C:\Users\username\jqfactor_datacache\bundle`。
12 |
13 | 您可以通过以下代码修改配置信息来设置为其他路径,设置过一次后,后续都将沿用这个路径,不用重复设置。
14 |
15 | ```python
16 | from jqfactor_analyzer.factor_cache import set_cache_dir, get_cache_dir
17 | set_cache_dir(my_path)   # 设置缓存目录为my_path
18 | print(get_cache_dir())   # 输出缓存目录
19 | ```
20 |
21 | ### 2. 缓存/检查缓存和读取已缓存数据
22 |
23 | 除了对单因子分析及归因分析依赖的数据进行缓存外,factor_cache还可以缓存自定义的因子组(仅限聚宽因子库中支持的因子)。
24 |
25 | ```python
26 | def save_factor_values_by_group(start_date, end_date, factor_names='prices',
27 |                                 group_name=None, overwrite=False, cache_dir=None, show_progress=True):
28 |     """将因子库数据按因子组储存到本地,根据factor_names因子列表(顺序无关)自动生成因子组的名称
29 |     start_date : 开始时间
30 |     end_date : 结束时间
31 |     factor_names : 因子组所含因子的名称,除了因子库中支持的因子外,还支持指定为'prices'缓存价格数据
32 |     group_name : 因子组名称,不指定时使用get_factor_folder自动生成因子组名(即缓存文件夹名),如果指定则按照指定的名称生成文件夹名(使用get_factor_values_by_cache时,需要自行指定factor_path)
33 |     overwrite : 文件已存在时是否覆盖更新,默认为False即增量更新,文件已存在时跳过
34 |     cache_dir : 缓存的路径,如果没有指定则使用配置信息中的路径,一般不用指定
35 |     show_progress : 是否展示缓存进度,默认为True
36 |     返回 : 因子组储存的路径 , 文件以天为单位储存为feather文件,每天一个feather文件,每月一个文件夹,columns为因子名称, index为当天在市的所有标的代码
37 |     """
38 | def get_factor_values_by_cache(date, codes=None, factor_names=None, group_name=None,
39 |                                factor_path=None):
40 |     """从缓存的文件读取因子数据,文件不存在时返回空的dataframe
41 |     date : 日期
42 |     codes : 标的代码,默认为None获取当天在市的所有标的
43 |     factor_names : 因子列表(顺序无关),当指定factor_path/group_name时失效
44 |     group_name : 因子组名,如果缓存时指定了group_name,则获取时必须也指定group_name或factor_path
45 |     factor_path : 可选参数,因子组的路径,一般不用指定
46 |     返回:
47 |     如果缓存文件存在,则返回当天的因子数据,index是标的代码,columns是因子名
48 |     如果缓存文件不存在,则返回空的dataframe, 建议在使用get_factor_values_by_cache前,先运行save_factor_values_by_group检查时间区间内的缓存文件是否完整
49 |     """
50 | def get_factor_folder(factor_names, group_name=None):
51 |     """获取因子组的文件夹名(文件夹位于get_cache_dir()获取的缓存目录下)
52 |     factor_names : 因子储存时,如果未指定group_name,则根据因子列表(顺序无关)获取md5值生成因子组名(即储存的文件夹名),使用此方法可以获取生成的文件夹名称
53 |     group_name : 如果储存时指定了因子组名,则直接返回此因子组名
54 |     """
55 |
56 | ```
57 |
58 | **示例**
59 |
60 | ```python
61 | from jqfactor_analyzer.factor_cache import save_factor_values_by_group,get_factor_values_by_cache,get_factor_folder,get_cache_dir
62 | # import jqdatasdk as jq
63 | # jq.auth("账号",'密码') #登录jqdatasdk来从服务端缓存数据
64 | import pandas as pd
65 | all_factors = jq.get_all_factors()
66 | factor_names = all_factors[all_factors.category=='growth'].factor.tolist() #将聚宽因子库中的成长类因子作为一组因子
67 | group_name = 'growth_factors' #因子组名定义为'growth_factors'
68 | start_date = '2021-01-01'
69 | end_date = '2021-06-01'
70 | # 检查/缓存因子数据
71 | factor_path = save_factor_values_by_group(start_date,end_date,factor_names=factor_names,group_name=group_name,overwrite=False,show_progress=True)
72 | # factor_path = os.path.join(get_cache_dir(), get_factor_folder(factor_names,group_name=group_name)) #等同于save_factor_values_by_group返回的路径
73 |
74 | # 循环获取缓存的因子数据,并拼接
75 | trade_days = jq.get_trade_days(start_date,end_date)
76 | factor_values = {}
77 | for date in trade_days:
78 |     factor_values[date] = get_factor_values_by_cache(date,codes=None,factor_names=factor_names,group_name=group_name, factor_path=factor_path) #这里实际只需要指定group_name或factor_path其中一个;缓存时指定了group_name时,factor_names不生效
79 | factor_values = pd.concat(factor_values)
80 | ```
81 |
82 | ## 二、归因分析模块
83 |
84 | ```python
85 | from jqfactor_analyzer import AttributionAnalysis
86 | AttributionAnalysis(weights, daily_return, style_type='style_pro', industry='sw_l1', use_cn=True, show_data_progress=True)
87 | ```
88 |
89 | **参数 :**
90 |
91 | - `weights`:持仓权重信息,index是日期,columns是标的代码, value对应的是组合当天的仓位占比(单日仓位占比总和不为1时,剩余部分认为是当天的现金)
92 | - `daily_return`:Series,index是日期,values为当天组合的收益率
93 | - `style_type`:归因分析所使用的风格因子类型,可选'style'和'style_pro'中的一个
94 | - `industry`:归因分析所使用的行业分类,可选'sw_l1'和'jq_l1'中的一个
95 | - `use_cn`:绘图时是否使用中文
96 | - `show_data_progress`:是否展示数据获取进度(使用本地缓存,第一次运行时速度较慢,后续对于本地不存在的数据将增量缓存)
97 |
98 | **示例**
99 |
100 | ```python
101 | import pandas as pd
102 | # position_weights.csv 是一个储存了组合权重信息的csv文件,index是日期,columns是股票代码
103 | # position_daily_return.csv 是一个储存了组合日收益率的csv文件,index是日期,daily_return列是日收益
104 | weights = pd.read_csv("position_weights.csv", index_col=0)
105 | returns = pd.read_csv("position_daily_return.csv", index_col=0)['daily_return']
106 |
107 | An = AttributionAnalysis(weights, returns, style_type='style_pro', industry='sw_l1', show_data_progress=True)
108 | ```
109 |
110 |
111 |
112 | ### 1. 属性
113 |
114 | - `style_exposure` : 组合的风格暴露
115 | - `industry_exposure` : 组合的行业暴露
116 | - `exposure_portfolio` : 组合的风格+行业及country暴露
117 | - `attr_daily_returns` : 组合的风格+行业及country日度归因收益率
118 | - `attr_returns` : 组合的风格+行业及country累积归因收益率
119 |
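下面是一个访问这些属性的最小示例(基于上文示例中创建的 `An` 对象,假设其已成功初始化并已连接数据源;其中暴露在初始化时即已计算,归因收益相关属性为惰性属性,首次访问时才会计算):

```python
# 组合每个交易日收盘后的风格+行业+country 暴露
print(An.exposure_portfolio.tail())

# 日度归因收益率与累积归因收益率
daily_attr = An.attr_daily_returns
cum_attr = An.attr_returns
print(cum_attr[['common_return', 'specific_return', 'total_return']].tail())
```
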
120 | ### 2. 方法
121 |
122 | #### (1) 获取组合相对于指数的暴露
123 |
124 | ```python
125 | get_exposure2bench(index_symbol)
126 | ```
127 |
128 | **参数 :**
129 |
130 | - `index_symbol` : 基准指数, 可选`['000300.XSHG','000905.XSHG','000906.XSHG','000852.XSHG','932000.CSI','000985.XSHG']`中的一个
131 |
132 | **返回 :**
133 |
134 | - 一个dataframe,index为日期,columns为风格因子+行业因子+country , 其中country为股票总持仓占比
135 |
136 | #### (2) 获取组合相对于指数的日度归因收益率
137 |
138 | ```python
139 | get_attr_daily_returns2bench(index_symbol)
140 | ```
141 |
142 | 假设组合相对于指数的收益由以下部分构成 : 风格+行业暴露收益(common_return) , 现金闲置收益(cash) , 策略本身的超额收益(specific_return)
143 | **参数 :**
144 |
145 | - `index_symbol` : 基准指数, 可选`['000300.XSHG','000905.XSHG','000906.XSHG','000852.XSHG','932000.CSI','000985.XSHG']`中的一个
146 |
147 | **返回 :**
148 |
149 | - 一个dataframe,index为日期,columns为`风格因子+行业因子+cash+common_return,specific_return,total_return`
150 |
151 | 其中:
152 | cash是假设现金收益(0)相对指数带来的收益率
153 | common_return 为风格+行业总收益率
154 | specific_return 为特异收益率
155 | total_return 为组合相对于指数的总收益
156 |
157 | #### (3) 获取相对于指数的累积归因收益率
158 |
159 | ```python
160 | get_attr_returns2bench(index_symbol)
161 | ```
162 |
163 | 假设组合相对于指数的收益由以下部分构成 : 风格+行业暴露收益(common_return) , 现金闲置收益(cash) , 策略本身的超额收益(specific_return)
164 |
165 | **参数 :**
166 |
167 | - `index_symbol` : 基准指数, 可选`['000300.XSHG','000905.XSHG','000906.XSHG','000852.XSHG','932000.CSI','000985.XSHG']`中的一个
168 |
169 | **返回 :**
170 |
171 | - 一个dataframe,index为日期,columns为`风格因子+行业因子+cash+common_return,specific_return,total_return`
172 |
173 | 其中:
174 | cash是假设现金收益(0)相对指数带来的收益率
175 | common_return 为风格+行业总收益率
176 | specific_return 为特异收益率
177 | total_return 为组合相对于指数的总收益(减法超额)
178 |
179 | ### 3. 绘图方法
180 |
181 | #### (1) 绘制风格暴露时序图
182 |
183 | ```python
184 | plot_exposure(factors='style',index_symbol=None,figsize=(15,7))
185 | ```
186 |
187 | 绘制风格暴露时序
188 |
189 | **参数**
190 |
191 | - factors : 绘制的暴露类型 , 可选 'style'(所有风格因子) , 'industry'(所有行业因子),也可以传递一个list,list为exposure_portfolio中columns的一个或者多个
192 | - index_symbol : 基准指数代码,指定时绘制相对于指数的暴露 , 默认None为组合本身的暴露
193 | - figsize : 画布大小
194 |
195 | #### (2) 绘制归因分析收益时序图
196 |
197 | ```python
198 | plot_returns(factors='style',index_symbol=None,figsize=(15,7))
199 | ```
200 |
201 | 绘制归因分析收益时序
202 |
203 | **参数**
204 |
205 | - factors : 绘制的暴露类型 , 可选 'style'(所有风格因子) , 'industry'(所有行业因子),也可以传递一个list,list为exposure_portfolio中columns的一个或者多个
206 |   同时也支持指定['common_return'(风格总收益),'specific_return'(特异收益),'total_return'(总收益),'country'(国家因子收益,当指定index_symbol时会用现金相对于指数的收益替代)]
207 | - index_symbol : 基准指数代码,指定时绘制相对于指数的暴露 , 默认None为组合本身的暴露
208 | - figsize : 画布大小
209 |
210 | #### (3) 绘制暴露与收益对照图
211 |
212 | ```python
213 | plot_exposure_and_returns(factors='style',index_symbol=None,show_factor_perf=False,figsize=(12,6))
214 | ```
215 |
216 | 将因子暴露与收益同时绘制在多个子图上
217 |
218 | **参数**
219 |
220 | - factors : 绘制的暴露类型 , 可选 'style'(所有风格因子) , 'industry'(所有行业因子),也可以传递一个list,list为exposure_portfolio中columns的一个或者多个
221 |   (当指定index_symbol时,country会用现金相对于指数的收益替代)
222 | - index_symbol : 基准指数代码,指定时绘制相对于指数的暴露及收益 , 默认None为组合本身的暴露和收益
223 | - show_factor_perf : 是否同时绘制因子表现
224 | - figsize : 画布大小,这里第一个参数是画布的宽度, 第二个参数为单个子图的高度
225 |
226 | #### (4) 关闭中文图例显示
227 |
228 | ```python
229 | plot_disable_chinese_label()
230 | ```
231 |
232 | 画图时默认会从系统中查找中文字体以显示中文图例
233 | 如果找不到中文字体则默认使用英文图例
234 | 当找到中文字体但中文显示乱码时, 可调用此 API 关闭中文图例显示而使用英文
235 |
236 |
237 |
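这些接口可以组合起来对归因结果做快速校验。下面是一个最小示例(仍基于上文的 `An` 对象,基准指数以 000300.XSHG 为例):按上文的定义,`total_return` 应等于 `common_return + specific_return + cash`,因此残差应接近 0:

```python
rets = An.get_attr_daily_returns2bench('000300.XSHG')
residual = rets['total_return'] - (
    rets['common_return'] + rets['specific_return'] + rets['cash']
)
print(residual.abs().max())  # 按定义应接近 0(仅剩浮点误差)
```
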
238 | ## 三、单因子分析模块
239 |
240 | ```python
241 | from jqfactor_analyzer import analyze_factor
242 | analyze_factor(factor, industry='jq_l1', quantiles=5, periods=(1, 5, 10), weight_method='avg', max_loss=0.25, allow_cache=True, show_data_progress=True)
243 | ```
244 |
245 | 单因子分析函数
246 |
247 |
248 |
249 | **参数**
250 |
251 | * `factor`: 因子值,
252 |
253 |   pandas.DataFrame格式的数据
254 |
255 |   - index为日期,格式为pandas日期通用的DatetimeIndex,转换方法见[将自有因子值转换成 DataFrame 格式的数据](#将自有因子值转换成-dataframe-格式的数据)
256 |   - columns为股票代码,格式要求符合聚宽的代码定义规则(如:平安银行的股票代码为000001.XSHE)
257 |     - 如果是深交所上市的股票,在股票代码后面需要加入.XSHE
258 |     - 如果是上交所上市的股票,在股票代码后面需要加入.XSHG
259 |
260 |   或 pd.Series格式的数据
261 |   - index为日期和股票代码组成的MultiIndex
262 |
263 | * `industry`: 行业分类, 默认为 `'jq_l1'`
264 |
265 |   * `'sw_l1'`: 申万一级行业
266 |   * `'sw_l2'`: 申万二级行业
267 |   * `'sw_l3'`: 申万三级行业
268 |   * `'jq_l1'`: 聚宽一级行业
269 |   * `'jq_l2'`: 聚宽二级行业
270 |   * `'zjw'`: 证监会行业
271 |
272 | * `quantiles`: 分位数数量, 默认为 `5`
273 |
274 |   `int`
275 |
276 |   在因子分组中按照因子值大小平均分组的组数.
277 |
278 | * `periods`: 调仓周期, 默认为 (1, 5, 10)
279 |
280 |   `int` or `list[int]`
281 |
282 | * `weight_method`: 计算分位数收益时的加权方法, 默认为 `'avg'`
283 |
284 |   * `'avg'`: 等权重
285 |   * `'mktcap'`: 按总市值加权
286 |   * `'ln_mktcap'`: 按总市值的对数加权
287 |   * `'cmktcap'`: 按流通市值加权
288 |   * `'ln_cmktcap'`: 按流通市值的对数加权
289 |
290 | * `max_loss`: 因重复值或nan值太多而无效的因子值的最大占比, 默认为 0.25
291 |
292 |   `float`
293 |
294 |   允许丢弃的因子数据的最大百分比 (0.00 到 1.00),
295 |
296 |   该比例通过比较输入因子索引中的条目数与输出 DataFrame 索引中的条目数计算得到.
297 |
298 |   因子数据本身可能存在缺陷 (例如 NaN),
299 |
300 |   或没有提供足够的价格数据来计算所有因子值的远期收益,
301 |
302 |   或者因为分组失败, 因此允许部分丢弃因子数据
303 |
304 | * `allow_cache` : 是否允许对价格,市值等信息进行本地缓存(按天缓存,初次运行可能比较慢,但后续重新获取对应区间的数据将非常快,且分析时仅消耗较小的jqdatasdk流量)
305 |
306 | * `show_data_progress`: 是否展示数据获取的进度信息
307 |
308 |
309 |
310 | **示例**
311 |
312 | ```python
313 | # 载入函数库
314 | import pandas as pd
315 | import jqfactor_analyzer as ja
316 |
317 | # 获取 jqdatasdk 授权
318 | # 输入用户名、密码,申请地址:http://t.cn/EINDOxE
319 | # 聚宽官网及金融终端,使用方法参见:http://t.cn/EINcS4j
320 | import jqdatasdk
321 | jqdatasdk.auth('username', 'password')
322 |
323 | # 对因子进行分析
324 | far = ja.analyze_factor(
325 |     factor_data,  # factor_data 为因子值的 pandas.DataFrame
326 |     quantiles=10,
327 |     periods=(1, 10),
328 |     industry='jq_l1',
329 |     weight_method='avg',
330 |     max_loss=0.1
331 | )
332 |
333 | # 生成统计图表
334 | far.create_full_tear_sheet(
335 |     demeaned=False, group_adjust=False, by_group=False,
336 |     turnover_periods=None, avgretplot=(5, 15), std_bar=False
337 | )
338 | ```
339 |
340 |
341 |
342 |
343 |
344 |
345 |
346 | ### 1. 
绘制结果 347 | 348 | #### 展示全部分析 349 | 350 | ``` 351 | far.create_full_tear_sheet(demeaned=False, group_adjust=False, by_group=False, 352 | turnover_periods=None, avgretplot=(5, 15), std_bar=False) 353 | ``` 354 | 355 | **参数:** 356 | 357 | - demeaned: 358 | - True: 使用超额收益计算 (基准收益被认为是每日所有股票收益按照weight列中权重加权的均值) 359 | - False: 不使用超额收益 360 | - group_adjust: 361 | - True: 使用行业中性化后的收益计算 (行业收益被认为是每日各个行业股票收益按照weight列中权重加权的均值) 362 | - False: 不使用行业中性化后的收益 363 | - by_group: 364 | - True: 按行业展示 365 | - False: 不按行业展示 366 | - turnover_periods: 调仓周期 367 | - avgretplot: tuple 因子预测的天数-(计算过去的天数, 计算未来的天数) 368 | - std_bar: 369 | - True: 显示标准差 370 | - False: 不显示标准差 371 | 372 | #### 因子值特征分析 373 | 374 | ``` 375 | far.create_summary_tear_sheet(demeaned=False, group_adjust=False) 376 | ``` 377 | 378 | **参数:** 379 | 380 | - demeaned: 381 | - True: 对每日因子收益去均值求得因子收益表 382 | - False: 因子收益表 383 | - group_adjust: 384 | - True: 按行业对因子收益去均值后求得因子收益表 385 | - False: 因子收益表 386 | 387 | #### 因子收益分析 388 | 389 | ``` 390 | far.create_returns_tear_sheet(demeaned=False, group_adjust=False, by_group=False) 391 | 392 | ``` 393 | 394 | **参数:** 395 | 396 | - demeaned: 397 | - True: 使用超额收益计算 (基准收益被认为是每日所有股票收益按照weight列中权重加权的均值) 398 | - False: 不使用超额收益 399 | - group_adjust: 400 | - True: 使用行业中性化后的收益计算 (行业收益被认为是每日各个行业股票收益按照weight列中权重加权的均值) 401 | - False: 不使用行业中性化后的收益 402 | - by_group: 403 | - True: 画各行业的各分位数平均收益图 404 | - False: 不画各行业的各分位数平均收益图 405 | 406 | #### 因子 IC 分析 407 | 408 | ``` 409 | far.create_information_tear_sheet(group_adjust=False, by_group=False) 410 | 411 | ``` 412 | 413 | **参数:** 414 | 415 | - group_adjust: 416 | - True: 使用行业中性收益 (行业收益被认为是每日各个行业股票收益按照weight列中权重的加权的均值) 417 | - False: 不使用行业中性收益 418 | - by_group: 419 | - True: 画按行业分组信息比率(IC)图 420 | - False: 画月度信息比率(IC)图 421 | 422 | #### 因子换手率分析 423 | 424 | ``` 425 | far.create_turnover_tear_sheet(turnover_periods=None) 426 | 427 | ``` 428 | 429 | **参数:** 430 | 431 | - turnover_periods: 调仓周期 432 | 433 | #### 因子预测能力分析 434 | 435 | ``` 436 | far.create_event_returns_tear_sheet(avgretplot=(5, 15),demeaned=False, group_adjust=False,std_bar=False) 437 | 438 | ``` 439 | 440 | **参数:** 441 | 442 | - avgretplot: tuple 因子预测的天数-(计算过去的天数, 计算未来的天数) 443 | - demeaned: 444 | - True: 使用超额收益计算累积收益 (基准收益被认为是每日所有股票收益按照weight列中权重加权的均值) 445 | - False: 不使用超额收益 446 | - group_adjust: 447 | - True: 使用行业中性化后的收益计算累积收益 (行业收益被认为是每日各个行业股票收益按照weight列中权重加权的均值) 448 | - False: 不使用行业中性化后的收益 449 | - std_bar: 450 | - True: 显示标准差 451 | - False: 不显示标准差 452 | 453 | #### 打印因子收益表 454 | 455 | ``` 456 | far.plot_returns_table(demeaned=False, group_adjust=False) 457 | 458 | ``` 459 | 460 | **参数:** 461 | 462 | - demeaned: 463 | - True:使用超额收益计算 (基准收益被认为是每日所有股票收益按照weight列中权重的加权的均值) 464 | - False:不使用超额收益 465 | - group_adjust: 466 | - True:使用行业中性收益 (行业收益被认为是每日各个行业股票收益按照weight列中权重的加权的均值) 467 | - False:不使用行业中性收益 468 | 469 | #### 打印换手率表 470 | 471 | ``` 472 | far.plot_turnover_table() 473 | 474 | ``` 475 | 476 | #### 打印信息比率(IC)相关表 477 | 478 | ``` 479 | far.plot_information_table(group_adjust=False, method='rank') 480 | 481 | ``` 482 | 483 | **参数:** 484 | 485 | - group_adjust: 486 | - True:使用行业中性收益 (行业收益被认为是每日各个行业股票收益按照weight列中权重的加权的均值) 487 | - False:不使用行业中性收益 488 | - method: 489 | - 'rank':用秩相关系数计算IC值 490 | - 'normal': 用相关系数计算IC值 491 | 492 | #### 打印个分位数统计表 493 | 494 | ``` 495 | far.plot_quantile_statistics_table() 496 | 497 | ``` 498 | 499 | #### 画信息比率(IC)时间序列图 500 | 501 | ``` 502 | far.plot_ic_ts(group_adjust=False, method='rank') 503 | 504 | ``` 505 | 506 | **参数:** 507 | 508 | - group_adjust: 509 | - True:使用行业中性收益 
(行业收益被认为是每日各个行业股票收益按照weight列中权重的加权的均值) 510 | - False:不使用行业中性收益 511 | - method: 512 | - 'rank':用秩相关系数计算IC值 513 | - 'normal': 用相关系数计算IC值 514 | 515 | #### 画信息比率分布直方图 516 | 517 | ``` 518 | far.plot_ic_hist(group_adjust=False, method='rank') 519 | 520 | ``` 521 | 522 | **参数:** 523 | 524 | - group_adjust: 525 | - True:使用行业中性收益 (行业收益被认为是每日各个行业股票收益按照weight列中权重的加权的均值) 526 | - False:不使用行业中性收益 527 | - method: 528 | - 'rank':用秩相关系数计算IC值 529 | - 'normal': 用相关系数计算IC值 530 | 531 | #### 画信息比率 qq 图 532 | 533 | ``` 534 | far.plot_ic_qq(group_adjust=False, method='rank', theoretical_dist='norm') 535 | 536 | ``` 537 | 538 | **参数:** 539 | 540 | - group_adjust: 541 | - True:使用行业中性收益 (行业收益被认为是每日各个行业股票收益按照weight列中权重的加权的均值) 542 | - False:不使用行业中性收益 543 | - method: 544 | - 'rank':用秩相关系数计算IC值 545 | - 'normal': 用相关系数计算IC值 546 | - theoretical_dist: 547 | - 'norm':正态分布 548 | - 't':t分布 549 | 550 | #### 画各分位数平均收益图 551 | 552 | ``` 553 | far.plot_quantile_returns_bar(by_group=False, demeaned=False, group_adjust=False) 554 | 555 | ``` 556 | 557 | **参数:** 558 | 559 | - by_group: 560 | - True:各行业的各分位数平均收益图 561 | - False:各分位数平均收益图 562 | - demeaned: 563 | - True:使用超额收益计算累积收益 (基准收益被认为是每日所有股票收益按照weight列中权重加权的均值) 564 | - False:不使用超额收益 565 | - group_adjust: 566 | - True:使用行业中性化后的收益计算累积收益 (行业收益被认为是每日各个行业股票收益按照weight列中权重加权的均值) 567 | - False:不使用行业中性化后的收益 568 | 569 | #### 画最高分位减最低分位收益图 570 | 571 | ``` 572 | far.plot_mean_quantile_returns_spread_time_series(demeaned=False, group_adjust=False, bandwidth=1) 573 | 574 | ``` 575 | 576 | **参数:** 577 | 578 | - demeaned: 579 | - True:使用超额收益计算累积收益 (基准收益被认为是每日所有股票收益按照weight列中权重加权的均值) 580 | - False:不使用超额收益 581 | - group_adjust: 582 | - True:使用行业中性化后的收益计算累积收益 (行业收益被认为是每日各个行业股票收益按照weight列中权重加权的均值) 583 | - False:不使用行业中性化后的收益 584 | - bandwidth:n,加减n倍当日标准差 585 | 586 | #### 画按行业分组信息比率(IC)图 587 | 588 | ``` 589 | far.plot_ic_by_group(group_adjust=False, method='rank') 590 | 591 | ``` 592 | 593 | **参数:** 594 | 595 | - group_adjust: 596 | - True:使用行业中性收益 (行业收益被认为是每日各个行业股票收益按照weight列中权重的加权的均值) 597 | - False:不使用行业中性收益 598 | - method: 599 | - 'rank':用秩相关系数计算IC值 600 | - 'normal': 用相关系数计算IC值 601 | 602 | #### 画因子自相关图 603 | 604 | ``` 605 | far.plot_factor_auto_correlation(rank=True) 606 | 607 | ``` 608 | 609 | **参数:** 610 | 611 | - rank: 612 | - True:用秩相关系数 613 | - False:用相关系数 614 | 615 | #### 画最高最低分位换手率图 616 | 617 | ``` 618 | far.plot_top_bottom_quantile_turnover(periods=(1, 3, 9)) 619 | 620 | ``` 621 | 622 | **参数:** 623 | 624 | - periods:调仓周期 625 | 626 | #### 画月度信息比率(IC)图 627 | 628 | ``` 629 | far.plot_monthly_ic_heatmap(group_adjust=False) 630 | 631 | ``` 632 | 633 | **参数:** 634 | 635 | - group_adjust: 636 | - True:使用行业中性收益 (行业收益被认为是每日各个行业股票收益按照weight列中权重的加权的均值) 637 | - False:不使用行业中性收益 638 | 639 | #### 画按因子值加权多空组合每日累积收益图 640 | 641 | ``` 642 | far.plot_cumulative_returns(period=1, demeaned=False, group_adjust=False) 643 | 644 | ``` 645 | 646 | **参数:** 647 | 648 | - periods:调仓周期 649 | - demeaned: 650 | - True:对因子值加权组合每日收益的权重去均值 (每日权重 = 每日权重 - 每日权重的均值),使组合转换为cash-neutral多空组合 651 | - False:不对权重去均值 652 | - group_adjust: 653 | - True:对权重分行业去均值 (每日权重 = 每日权重 - 每日各行业权重的均值),使组合转换为 industry-neutral 多空组合 654 | - False:不对权重分行业去均值 655 | 656 | #### 画做多最大分位数做空最小分位数组合每日累积收益图 657 | 658 | ``` 659 | far.plot_top_down_cumulative_returns(period=1, demeaned=False, group_adjust=False) 660 | 661 | ``` 662 | 663 | **参数:** 664 | 665 | - periods:指定调仓周期 666 | - demeaned: 667 | - True:使用超额收益计算累积收益 (基准收益被认为是每日所有股票收益按照weight列中权重加权的均值) 668 | - False:不使用超额收益 669 | - group_adjust: 670 | - True:使用行业中性化后的收益计算累积收益 
(行业收益被认为是每日各个行业股票收益按照weight列中权重加权的均值) 671 | - False:不使用行业中性化后的收益 672 | 673 | #### 画各分位数每日累积收益图 674 | 675 | ``` 676 | far.plot_cumulative_returns_by_quantile(period=(1, 3, 9), demeaned=False, group_adjust=False) 677 | 678 | ``` 679 | 680 | **参数:** 681 | 682 | - periods:调仓周期 683 | - demeaned: 684 | - True:使用超额收益计算累积收益 (基准收益被认为是每日所有股票收益按照weight列中权重加权的均值) 685 | - False:不使用超额收益 686 | - group_adjust: 687 | - True:使用行业中性化后的收益计算累积收益 (行业收益被认为是每日各个行业股票收益按照weight列中权重加权的均值) 688 | - False:不使用行业中性化后的收益 689 | 690 | #### 因子预测能力平均累计收益图 691 | 692 | ``` 693 | far.plot_quantile_average_cumulative_return(periods_before=5, periods_after=10, by_quantile=False, std_bar=False, demeaned=False, group_adjust=False) 694 | 695 | ``` 696 | 697 | **参数:** 698 | 699 | - periods_before: 计算过去的天数 700 | - periods_after: 计算未来的天数 701 | - by_quantile: 702 | - True:各分位数分别显示因子预测能力平均累计收益图 703 | - False:不用各分位数分别显示因子预测能力平均累计收益图 704 | - std_bar: 705 | - True:显示标准差 706 | - False:不显示标准差 707 | - demeaned: 708 | - True: 使用超额收益计算累积收益 (基准收益被认为是每日所有股票收益按照weight列中权重加权的均值) 709 | - False: 不使用超额收益 710 | - group_adjust: 711 | - True: 使用行业中性化后的收益计算累积收益 (行业收益被认为是每日各个行业股票收益按照weight列中权重加权的均值) 712 | - False: 不使用行业中性化后的收益 713 | 714 | #### 画有效因子数量统计图 715 | 716 | ``` 717 | far.plot_events_distribution(num_days=1) 718 | 719 | ``` 720 | 721 | **参数:** 722 | 723 | - num_days:统计间隔天数 724 | 725 | #### 关闭中文图例显示 726 | 727 | ``` 728 | far.plot_disable_chinese_label() 729 | 730 | ``` 731 | 732 | 733 | 734 | ### 2. 属性列表 735 | 736 | 用于访问因子分析的结果,大部分为惰性属性,在访问才会计算结果并返回 737 | 738 | 739 | 740 | #### 查看因子值 741 | 742 | ``` 743 | far.factor_data 744 | ``` 745 | 746 | - 类型:pandas.Series 747 | - index:为日期和股票代码的MultiIndex 748 | 749 | #### 去除 nan/inf,整理后的因子值、forward_return 和分位数 750 | 751 | ``` 752 | far.clean_factor_data 753 | ``` 754 | 755 | - 类型:pandas.DataFrame index:为日期和股票代码的MultiIndex 756 | - columns:根据period选择后的forward_return(如果调仓周期为1天,那么forward_return为[第二天的收盘价-今天的收盘价]/今天的收盘价)、因子值、行业分组、分位数数组、权重 757 | 758 | #### 按分位数分组加权平均因子收益 759 | 760 | ``` 761 | far.mean_return_by_quantile 762 | ``` 763 | 764 | - 类型:pandas.DataFrame 765 | - index:分位数分组 766 | - columns:调仓周期 767 | 768 | #### 按分位数分组加权因子收益标准差 769 | 770 | ``` 771 | far.mean_return_std_by_quantile 772 | ``` 773 | 774 | - 类型:pandas.DataFrame 775 | - index:分位数分组 776 | - columns:调仓周期 777 | 778 | #### 按分位数及日期分组加权平均因子收益 779 | 780 | ``` 781 | far.mean_return_by_date 782 | ``` 783 | 784 | - 类型:pandas.DataFrame 785 | - index:为日期和分位数的MultiIndex 786 | - columns:调仓周期 787 | 788 | #### 按分位数及日期分组加权因子收益标准差 789 | 790 | ``` 791 | far.mean_return_std_by_date 792 | ``` 793 | 794 | - 类型:pandas.DataFrame 795 | - index:为日期和分位数的MultiIndex 796 | - columns:调仓周期 797 | 798 | #### 按分位数及行业分组加权平均因子收益 799 | 800 | ``` 801 | far.mean_return_by_group 802 | ``` 803 | 804 | - 类型:pandas.DataFrame 805 | - index:为行业和分位数的MultiIndex 806 | - columns:调仓周期 807 | 808 | #### 按分位数及行业分组加权因子收益标准差 809 | 810 | ``` 811 | far.mean_return_std_by_group 812 | ``` 813 | 814 | - 类型:pandas.DataFrame 815 | - index:为行业和分位数的MultiIndex 816 | - columns:调仓周期 817 | 818 | #### 最高分位数因子收益减最低分位数因子收益每日均值 819 | 820 | ``` 821 | far.mean_return_spread_by_quantile 822 | ``` 823 | 824 | - 类型:pandas.DataFrame 825 | - index:日期 826 | - columns:调仓周期 827 | 828 | #### 最高分位数因子收益减最低分位数因子收益每日标准差 829 | 830 | ``` 831 | far.mean_return_spread_std_by_quantile 832 | ``` 833 | 834 | - 类型:pandas.DataFrame 835 | - index:日期 836 | - columns:调仓周期 837 | 838 | #### 信息比率 839 | 840 | ``` 841 | far.ic 842 | ``` 843 | 844 | - 类型:pandas.DataFrame 845 | - index:日期 846 | - columns:调仓周期 847 | 848 | #### 分行业信息比率 849 | 850 | ``` 851 
| far.ic_by_group 852 | ``` 853 | 854 | - 类型:pandas.DataFrame 855 | - index:行业 856 | - columns:调仓周期 857 | 858 | #### 月度信息比率 859 | 860 | ``` 861 | far.ic_monthly 862 | ``` 863 | 864 | - 类型:pandas.DataFrame 865 | - index:月度 866 | - columns:调仓周期表 867 | 868 | #### 换手率 869 | 870 | ``` 871 | far.quantile_turnover 872 | ``` 873 | 874 | - 键:调仓周期 875 | - 值: pandas.DataFrame 换手率 876 | - index:日期 877 | - columns:分位数分组 878 | 879 | #### 计算按分位数分组加权因子收益和标准差 880 | 881 | ``` 882 | mean, std = far.calc_mean_return_by_quantile(by_date=True, by_group=False, demeaned=False, group_adjust=False) 883 | ``` 884 | 885 | **参数:** 886 | 887 | - by_date: 888 | - True:按天计算收益 889 | - False:不按天计算收益 890 | - by_group: 891 | - True: 按行业计算收益 892 | - False:不按行业计算收益 893 | - demeaned: 894 | - True:使用超额收益计算各分位数收益,超额收益=收益-基准收益 (基准收益被认为是每日所有股票收益按照weight列中权重的加权的均值) 895 | - False:不使用超额收益 896 | - group_adjust: 897 | - True:使用行业中性收益计算各分位数收益,行业中性收益=收益-行业收益 (行业收益被认为是每日各个行业股票收益按照weight列中权重的加权的均值) 898 | - False:不使用行业中性收益 899 | 900 | #### 计算按因子值加权多空组合每日收益 901 | 902 | ``` 903 | far.calc_factor_returns(demeaned=True, group_adjust=False) 904 | ``` 905 | 906 | 权重 = 每日因子值 / 每日因子值的绝对值的和 907 | 正的权重代表买入, 负的权重代表卖出 908 | 909 | **参数:** 910 | 911 | - demeaned: 912 | - True: 对权重去均值 (每日权重 = 每日权重 - 每日权重的均值), 使组合转换为 cash-neutral 多空组合 913 | - False:不对权重去均值 914 | - group_adjust: 915 | - True:对权重分行业去均值 (每日权重 = 每日权重 - 每日各行业权重的均值),使组合转换为 industry-neutral 多空组合 916 | - False:不对权重分行业去均值 917 | 918 | #### 计算两个分位数相减的因子收益和标准差 919 | 920 | ``` 921 | mean, std = far.compute_mean_returns_spread(upper_quant=None, lower_quant=None, by_date=False, by_group=False, demeaned=False, group_adjust=False) 922 | ``` 923 | 924 | **参数:** 925 | 926 | - upper_quant:用upper_quant选择的分位数减去lower_quant选择的分位数,只能在已有的范围内选择 927 | - lower_quant:用upper_quant选择的分位数减去lower_quant选择的分位数,只能在已有的范围内选择 928 | - by_date: 929 | - True:按天计算两个分位数相减的因子收益和标准差 930 | - False:不按天计算两个分位数相减的因子收益和标准差 931 | - by_group: 932 | - True: 分行业计算两个分位数相减的因子收益和标准差 933 | - False:不分行业计算两个分位数相减的因子收益和标准差 934 | - demeaned: 935 | - True:使用超额收益 (基准收益被认为是每日所有股票收益按照weight列中权重加权的均值) 936 | - False:不使用超额收益 937 | - group_adjust: 938 | - True:使用行业中性收益 (行业收益被认为是每日各个行业股票收益按照weight列中权重加权的均值) 939 | - False:不使用行业中性收益 940 | 941 | 942 | #### 计算因子的 alpha 和 beta 943 | 944 | ``` 945 | far.calc_factor_alpha_beta(demeaned=True, group_adjust=False) 946 | ``` 947 | 948 | 因子值加权组合每日收益 = beta * 市场组合每日收益 + alpha 949 | 950 | 因子值加权组合每日收益计算方法见 calc_factor_returns 函数 951 | 952 | 市场组合每日收益是每日所有股票收益按照weight列中权重加权的均值 953 | 954 | 结果中的 alpha 是年化 alpha 955 | 956 | **参数:** 957 | 958 | - demeaned: 959 | - True: 对因子值加权组合每日收益的权重去均值 (每日权重 = 每日权重 - 每日权重的均值),使组合转换为cash-neutral多空组合 960 | - False:不对权重去均值 961 | - group_adjust: 962 | - True:对权重分行业去均值 (每日权重 = 每日权重 - 每日各行业权重的均值),使组合转换为 industry-neutral 多空组合 963 | - False:不对权重分行业去均值 964 | 965 | #### 计算每日因子信息比率(IC值) 966 | 967 | ``` 968 | far.calc_factor_information_coefficient(group_adjust=False, by_group=False, method='rank') 969 | ``` 970 | 971 | **参数:** 972 | 973 | - group_adjust: 974 | - True:使用行业中性收益计算 IC (行业收益被认为是每日各个行业股票收益按照weight列中权重加权的均值) 975 | - False:不使用行业中性收益 976 | - by_group: 977 | - True:分行业计算 IC 978 | - False:不分行业计算 IC 979 | - method: 980 | - 'rank':用秩相关系数计算IC值 981 | - 'normal':用普通相关系数计算IC值 982 | 983 | #### 计算因子信息比率均值(IC值均值) 984 | 985 | ``` 986 | far.calc_mean_information_coefficient(group_adjust=False, by_group=False, by_time=None, method='rank') 987 | ``` 988 | 989 | **参数:** 990 | 991 | - group_adjust: 992 | - True:使用行业中性收益计算 IC (行业收益被认为是每日各个行业股票收益按照weight列中权重加权的均值) 993 | - False:不使用行业中性收益 994 | - by_group: 995 | - 
True:分行业计算 IC 996 | - False:不分行业计算 IC 997 | - by_time: 998 | - 'Y':按年求均值 999 | - 'M':按月求均值 1000 | - None:对所有日期求均值 1001 | - method: 1002 | - 'rank':用秩相关系数计算IC值 1003 | - 'normal':用普通相关系数计算IC值 1004 | 1005 | #### 按照当天的分位数算分位数未来和过去的收益均值和标准差 1006 | 1007 | ``` 1008 | far.calc_average_cumulative_return_by_quantile(periods_before=5, periods_after=15, demeaned=False, group_adjust=False) 1009 | ``` 1010 | 1011 | **参数:** 1012 | 1013 | - periods_before:计算过去的天数 1014 | - periods_after:计算未来的天数 1015 | - demeaned: 1016 | - True:使用超额收益计算累积收益 (基准收益被认为是每日所有股票收益按照weight列中权重加权的均值) 1017 | - False:不使用超额收益 1018 | - group_adjust: 1019 | - True:使用行业中性化后的收益计算累积收益 1020 | - False:不使用行业中性化后的收益 1021 | 1022 | #### 计算指定调仓周期的各分位数每日累积收益 1023 | 1024 | ``` 1025 | far.calc_cumulative_return_by_quantile(period=None, demeaned=False, group_adjust=False) 1026 | ``` 1027 | 1028 | **参数:** 1029 | 1030 | - period:指定调仓周期 1031 | - demeaned: 1032 | - True:使用超额收益计算累积收益 (基准收益被认为是每日所有股票收益按照weight列中权重加权的均值) 1033 | - False:不使用超额收益 1034 | - group_adjust: 1035 | - True:使用行业中性化后的收益计算累积收益 1036 | - False:不使用行业中性化后的收益 1037 | 1038 | #### 计算指定调仓周期的按因子值加权多空组合每日累积收益 1039 | 1040 | ``` 1041 | far.calc_cumulative_returns(period=5, demeaned=False, group_adjust=False) 1042 | ``` 1043 | 1044 | 当 period > 1 时,组合的累积收益计算方法为: 1045 | 1046 | 组合每日收益 = (从第0天开始每period天一调仓的组合每日收益 + 从第1天开始每period天一调仓的组合每日收益 + ... + 从第period-1天开始每period天一调仓的组合每日收益) / period 1047 | 1048 | 组合累积收益 = 组合每日收益的累积 1049 | 1050 | **参数:** 1051 | 1052 | - period:指定调仓周期 1053 | - demeaned: 1054 | - True:对权重去均值 (每日权重 = 每日权重 - 每日权重的均值),使组合转换为 cash-neutral 多空组合 1055 | - False:不对权重去均值 1056 | - group_adjust: 1057 | - True:对权重分行业去均值 (每日权重 = 每日权重 - 每日各行业权重的均值),使组合转换为 industry-neutral 多空组合 1058 | - False:不对权重分行业去均值 1059 | 1060 | #### 计算指定调仓周期和前面定义好的加权方式计算多空组合每日累计收益 1061 | 1062 | ``` 1063 | far.calc_top_down_cumulative_returns(period=5, demeaned=False, group_adjust=False) 1064 | ``` 1065 | 1066 | **参数:** 1067 | 1068 | - period:指定调仓周期 1069 | - demeaned: 1070 | - True:使用超额收益计算累积收益 (基准收益被认为是每日所有股票收益按照weight列中权重加权的均值) 1071 | - False:不使用超额收益 1072 | - group_adjust: 1073 | - True:使用行业中性化后的收益计算累积收益 (行业收益被认为是每日各个行业股票收益按照weight列中权重加权的均值) 1074 | - False:不使用行业中性化后的收益 1075 | 1076 | #### 根据调仓周期确定滞后期的每天计算因子自相关性 1077 | 1078 | ``` 1079 | far.calc_autocorrelation(rank=True) 1080 | ``` 1081 | 1082 | **参数:** 1083 | 1084 | - rank: 1085 | - True:秩相关系数 1086 | - False:普通相关系数 1087 | 1088 | #### 滞后n天因子值自相关性 1089 | 1090 | ``` 1091 | far.calc_autocorrelation_n_days_lag(n=9,rank=True) 1092 | ``` 1093 | 1094 | **参数:** 1095 | 1096 | - n:滞后n天到1天的因子值自相关性 1097 | - rank: 1098 | - True:秩相关系数 1099 | - False:普通相关系数 1100 | 1101 | #### 各分位数滞后1天到n天的换手率均值 1102 | 1103 | ``` 1104 | far.calc_quantile_turnover_mean_n_days_lag(n=10) 1105 | ``` 1106 | 1107 | **参数:** 1108 | 1109 | - n:滞后 1 天到 n 天的换手率 1110 | 1111 | #### 滞后 0 - n 天因子收益信息比率(IC)的移动平均 1112 | 1113 | ``` 1114 | far.calc_ic_mean_n_days_lag(n=10,group_adjust=False,by_group=False,method=None) 1115 | ``` 1116 | 1117 | **参数:** 1118 | 1119 | - n:滞后0-n天因子收益的信息比率(IC)的移动平均 1120 | - group_adjust: 1121 | - True:使用行业中性收益计算 IC (行业收益被认为是每日各个行业股票收益按照weight列中权重加权的均值) 1122 | - False:不使用行业中性收益 1123 | - by_group: 1124 | - True:分行业计算 IC 1125 | - False:不分行业计算 IC 1126 | - method: 1127 | - 'rank':用秩相关系数计算IC值 1128 | - 'normal':用普通相关系数计算IC值 1129 | 1130 | 1131 | 1132 | ### 3. 获取聚宽因子库数据的方法 1133 | 1134 | 1. [聚宽因子库](https://www.joinquant.com/help/api/help?name=factor_values)包含数百个质量、情绪、风险等其他类目的因子 1135 | 1136 | 2. 
连接jqdatasdk获取数据:需调用聚宽 [`jqdatasdk`](https://github.com/JoinQuant/jqdatasdk/blob/master/README.md) 接口获取金融数据([试用注册地址](http://t.cn/EINDOxE))
1137 |
1138 | ```python
1139 | # 获取因子数据:以5日平均换手率为例,该数据可以直接用于因子分析
1140 | # 具体使用方法可以参照jqdatasdk的API文档
1141 | import jqdatasdk
1142 | jqdatasdk.auth('username', 'password')
1143 | # 获取聚宽因子库中的VOL5数据
1144 | factor_data = jqdatasdk.get_factor_values(
1145 |     securities=jqdatasdk.get_index_stocks('000300.XSHG'),
1146 |     factors=['VOL5'],
1147 |     start_date='2018-01-01',
1148 |     end_date='2018-12-31')['VOL5']
1149 | ```
1150 |
1151 |
1152 |
1153 | ### 4. 将自有因子值转换成 DataFrame 格式的数据
1154 |
1155 | - index 为日期,格式为 pandas 日期通用的 DatetimeIndex
1156 |
1157 | - columns 为股票代码,格式要求符合聚宽的代码定义规则(如:平安银行的股票代码为 000001.XSHE)
1158 |
1159 |   - 如果是深交所上市的股票,在股票代码后面需要加入.XSHE
1160 |   - 如果是上交所上市的股票,在股票代码后面需要加入.XSHG
1161 |
1162 | - 将 pandas.DataFrame 转换成满足格式要求的数据格式
1163 |
1164 |   首先要保证 index 为 `DatetimeIndex` 格式
1165 |
1166 |   一般是通过 pandas 提供的 [`pandas.to_datetime`](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.to_datetime.html) 函数进行转换, 在转换前应确保 index 中的值都为合理的日期格式, 如 `'2018-01-01'` / `'20180101'`, 之后再调用 `pandas.to_datetime` 进行转换
1167 |
1168 |   另外应确保 index 的日期是按照从小到大的顺序排列的, 可以通过 [`sort_index`](https://pandas.pydata.org/pandas-docs/version/0.23.3/generated/pandas.DataFrame.sort_index.html) 进行排序
1169 |
1170 |   最后请检查 columns 中的股票代码是否都满足聚宽的代码定义
1171 |
1172 | ```python
1173 | import pandas as pd
1174 |
1175 | sample_data = pd.DataFrame(
1176 |     [[0.84, 0.43, 2.33, 0.86, 0.96],
1177 |      [1.06, 0.51, 2.60, 0.90, 1.09],
1178 |      [1.12, 0.54, 2.68, 0.94, 1.12],
1179 |      [1.07, 0.64, 2.65, 1.33, 1.15],
1180 |      [1.21, 0.73, 2.97, 1.65, 1.19]],
1181 |     index=['2018-01-02', '2018-01-03', '2018-01-04', '2018-01-05', '2018-01-08'],
1182 |     columns=['000001.XSHE', '000002.XSHE', '000063.XSHE', '000069.XSHE', '000100.XSHE']
1183 | )
1184 |
1185 | print(sample_data)
1186 |
1187 | factor_data = sample_data.copy()
1188 | # 将 index 转换为 DatetimeIndex
1189 | factor_data.index = pd.to_datetime(factor_data.index)
1190 | # 将 DataFrame 按照日期顺序排列
1191 | factor_data = factor_data.sort_index()
1192 | # 检查 columns 是否满足聚宽股票代码格式
1193 | if not factor_data.columns.astype(str).str.match(r'\d{6}\.XSH[EG]').all():
1194 |     print("有不满足聚宽股票代码格式的股票")
1195 |     print(factor_data.columns[~factor_data.columns.astype(str).str.match(r'\d{6}\.XSH[EG]')])
1196 |
1197 | print(factor_data)
1198 | ```
1199 |
1200 | - 将键为日期, 值为各股票因子值的 `Series` 的 `dict` 转换成 `pandas.DataFrame`
1201 |
1202 |   可以直接利用 `pandas.DataFrame` 生成
1203 |
1204 | ```python
1205 | import pandas as pd
1206 | sample_data = \
1207 |     {'2018-01-02': pd.Series([0.84, 0.43, 2.33, 0.86, 0.96],
1208 |                              index=['000001.XSHE', '000002.XSHE', '000063.XSHE', '000069.XSHE', '000100.XSHE']),
1209 |      '2018-01-03': pd.Series([1.06, 0.51, 2.60, 0.90, 1.09],
1210 |                              index=['000001.XSHE', '000002.XSHE', '000063.XSHE', '000069.XSHE', '000100.XSHE']),
1211 |      '2018-01-04': pd.Series([1.12, 0.54, 2.68, 0.94, 1.12],
1212 |                              index=['000001.XSHE', '000002.XSHE', '000063.XSHE', '000069.XSHE', '000100.XSHE']),
1213 |      '2018-01-05': pd.Series([1.07, 0.64, 2.65, 1.33, 1.15],
1214 |                              index=['000001.XSHE', '000002.XSHE', '000063.XSHE', '000069.XSHE', '000100.XSHE']),
1215 |      '2018-01-08': pd.Series([1.21, 0.73, 2.97, 1.65, 1.19],
1216 |                              index=['000001.XSHE', '000002.XSHE', '000063.XSHE', '000069.XSHE', '000100.XSHE'])}
1217 |
1218 | # 直接调用 pd.DataFrame 将 dict 转换为 DataFrame
1219 | factor_data = pd.DataFrame(sample_data).T
1220 |
1221 | print(factor_data)
1222 |
1223 | # 之后请按照上文 DataFrame 的方法转换成满足格式要求的数据格式
1224 | ```
1225 |
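补充一个格式互转的小示例:`analyze_factor` 既接受上述宽表 DataFrame,也接受 index 为日期和股票代码 MultiIndex 的 `pd.Series`(见前文参数说明)。两种格式可以用 pandas 标准的 `stack` / `unstack` 互相转换(以上面构造好的 `factor_data` 为例):

```python
# 宽表 DataFrame -> MultiIndex Series(index 为 (日期, 股票代码))
factor_series = factor_data.stack()
print(factor_series.head())

# MultiIndex Series -> 宽表 DataFrame(逆向还原)
factor_wide = factor_series.unstack()
print(factor_wide.head())
```
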
1226 | ## 四、数据处理函数
1227 |
1228 | #### 1. 中性化
1229 |
1230 | ```python
1231 | from jqfactor_analyzer import neutralize
1232 | neutralize(data, how=None, date=None, axis=1, fillna=None, add_constant=False)
1233 | ```
1234 |
1235 | **参数 :**
1236 |
1237 | - data: pd.Series/pd.DataFrame, 待中性化的序列, 序列的 index/columns 为股票的 code
1238 | - how: str 或 list, 中性化使用的因子名称列表.
1239 |   默认为 ['jq_l1', 'market_cap'], 支持的中性化方法有:
1240 |   (1) 行业: sw_l1, sw_l2, sw_l3, jq_l1, jq_l2
1241 |   (2) mktcap(总市值), ln_mktcap(对数总市值), cmktcap(流通市值), ln_cmktcap(对数流通市值)
1242 |   (3) 自定义的中性化数据: 支持同时传入额外的 Series 或者 DataFrame 用来进行中性化, index 必须是标的代码
1243 |
1244 | - date: 日期, 将用 date 这天的相关变量数据对 series 进行中性化 (注意依赖数据的实际可用时间, 如市值数据当天盘中是无法获取到的)
1245 | - axis: 默认为 1. 仅在 data 为 pd.DataFrame 时生效. 表示沿哪个方向做中性化, 0 为对每列做中性化, 1 为对每行做中性化
1246 | - fillna: 缺失值填充方式, 默认为None, 表示不填充. 支持的值:
1247 |   'jq_l1': 聚宽一级行业
1248 |   'jq_l2': 聚宽二级行业
1249 |   'sw_l1': 申万一级行业
1250 |   'sw_l2': 申万二级行业
1251 |   'sw_l3': 申万三级行业,表示使用对应行业分类的均值进行填充.
1252 | - add_constant: 中性化时是否添加常数项, 默认为 False
1253 |
1254 | **返回 :**
1255 |
1256 | - 中性化后的因子数据
1257 |
1258 |
1259 |
1260 | #### 2. 去极值
1261 |
1262 | ```python
1263 | from jqfactor_analyzer import winsorize
1264 | winsorize(data, scale=None, range=None, qrange=None, inclusive=True, inf2nan=True, axis=1)
1265 | ```
1266 |
1267 | **参数 :**
1268 |
1269 | - data: pd.Series/pd.DataFrame/np.array, 待缩尾的序列
1270 | - scale: 标准差倍数,与 range,qrange 三选一,不可同时使用。会将位于 [mu - scale * sigma, mu + scale * sigma] 边界之外的值替换为边界值
1271 | - range: 列表, 缩尾的上下边界。与 scale,qrange 三选一,不可同时使用。
1272 | - qrange: 列表,缩尾的上下分位数边界,值应在 0 到 1 之间,如 [0.05, 0.95]。与 scale,range 三选一,不可同时使用。
1273 | - inclusive: 是否将位于边界之外的值替换为边界值,默认为 True。如果为 True,则将边界之外的值替换为边界值,否则替换为 np.nan
1274 | - inf2nan: 是否将 np.inf 和 -np.inf 替换成 np.nan,默认为 True。如果为 True,在缩尾之前会先将 np.inf 和 -np.inf 替换成 np.nan,缩尾的时候不会考虑 np.nan,否则 inf 被认为是在上界之上,-inf 被认为在下界之下
1275 | - axis: 在 data 为 pd.DataFrame 时使用,沿哪个方向做缩尾,默认为 1。0 为对每列做缩尾,1 为对每行做缩尾。
1276 |
1277 | **返回 :**
1278 |
1279 | - 去极值处理之后的因子数据
1280 |
1281 |
1282 |
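下面是一个可独立运行的去极值小示例(数据为随意构造,仅用于演示 `scale` 与 `qrange` 两种用法;`winsorize` 默认按行 (axis=1) 处理):

```python
import pandas as pd
from jqfactor_analyzer import winsorize

# 构造一行带有极端值的因子数据(股票代码仅为示意)
data = pd.DataFrame(
    [[1.0, 1.2, 0.9, 1.1, 50.0]],
    columns=['000001.XSHE', '000002.XSHE', '000063.XSHE', '000069.XSHE', '000100.XSHE']
)

# 按 3 倍标准差缩尾:位于 [mu - 3*sigma, mu + 3*sigma] 之外的值被替换为边界值
print(winsorize(data, scale=3))

# 按分位数缩尾:位于 5% / 95% 分位数之外的值被替换为边界值
print(winsorize(data, qrange=[0.05, 0.95]))
```
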
1283 | #### 3. 中位数去极值
1284 |
1285 | ```python
1286 | from jqfactor_analyzer import winsorize_med
1287 | winsorize_med(data, scale=1, inclusive=True, inf2nan=True, axis=1)
1288 | ```
1289 |
1290 | **参数 :**
1291 |
1292 | - data: pd.Series/pd.DataFrame/np.array, 待缩尾的序列
1293 | - scale: 倍数,默认为 1.0。会将位于 [med - scale * distance, med + scale * distance] 边界之外的值替换为边界值/np.nan
1294 | - inclusive: 是否将位于边界之外的值替换为边界值,默认为 True。如果为 True,则将边界之外的值替换为边界值,否则替换为 np.nan
1295 | - inf2nan: 是否将 np.inf 和 -np.inf 替换成 np.nan,默认为 True。如果为 True,在缩尾之前会先将 np.inf 和 -np.inf 替换成 np.nan,缩尾的时候不会考虑 np.nan,否则 inf 被认为是在上界之上,-inf 被认为在下界之下
1296 | - axis: 在 data 为 pd.DataFrame 时使用,沿哪个方向做缩尾,默认为 1。0 为对每列做缩尾,1 为对每行做缩尾
1297 |
1298 | **返回 :**
1299 |
1300 | - 中位数去极值之后的因子数据
1301 |
1302 |
1303 |
1304 | #### 4. 标准化(z-score)
1305 |
1306 | ```python
1307 | from jqfactor_analyzer import standardlize
1308 | standardlize(data, inf2nan=True, axis=1)
1309 | ```
1310 |
1311 | **参数 :**
1312 |
1313 | - data: pd.Series/pd.DataFrame/np.array, 待标准化的序列
1314 | - inf2nan: 是否将 np.inf 和 -np.inf 替换成 np.nan。默认为 True
1315 | - axis: 在 data 为 pd.DataFrame 时使用,沿哪个方向做标准化,默认为 1。0 为对每列做标准化,1 为对每行做标准化
1316 |
1317 | **返回 :**
1318 |
1319 | - 标准化后的因子数据
1320 |
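上述函数可以串联成一个简单的因子预处理流程。下面是一个最小示例(数据为随机构造;`neutralize` 需要连接 jqdatasdk 获取行业与市值数据,此处从略):

```python
import numpy as np
import pandas as pd
from jqfactor_analyzer import winsorize_med, standardlize

raw = pd.DataFrame(
    np.random.randn(3, 5),
    index=pd.to_datetime(['2018-01-02', '2018-01-03', '2018-01-04']),
    columns=['000001.XSHE', '000002.XSHE', '000063.XSHE', '000069.XSHE', '000100.XSHE']
)

# 先做中位数去极值,再做横截面标准化(两者默认都按行 (axis=1) 处理)
clean = standardlize(winsorize_med(raw, scale=3), axis=1)
print(clean)
```
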
--------------------------------------------------------------------------------
/jqfactor_analyzer/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | from .version import __version__
4 | from .analyze import FactorAnalyzer
5 | from .attribution import AttributionAnalysis
6 | from .data import DataApi
7 | from .preprocess import winsorize, winsorize_med, standardlize, neutralize
8 | from .factor_cache import save_factor_values_by_group, get_factor_values_by_cache, get_cache_dir
9 |
10 |
11 | def analyze_factor(
12 |     factor, industry='jq_l1', quantiles=5, periods=(1, 5, 10),
13 |     weight_method='avg', max_loss=0.25, allow_cache=True, show_data_progress=True
14 | ):
15 |     """单因子分析
16 |
17 |     输入:
18 |         factor: pandas.DataFrame: 因子值, columns 为股票代码 (如 '000001.XSHE'),
19 |                 index 为日期的 DatetimeIndex
20 |                 或 pandas.Series: 因子值, index 为日期和股票代码的 MultiIndex
21 |         industry: 行业分类, 默认为 'jq_l1'
22 |             - 'jq_l1': 聚宽一级行业
23 |             - 'jq_l2': 聚宽二级行业
24 |             - 'sw_l1': 申万一级行业
25 |             - 'sw_l2': 申万二级行业
26 |             - 'sw_l3': 申万三级行业
27 |             - 'zjw': 证监会行业
28 |         quantiles: 分位数数量, 默认为 5
29 |         periods: 调仓周期, int 或 int 的列表, 默认为 (1, 5, 10)
30 |         weight_method: 计算分位数收益时的加权方法, 默认为 'avg'
31 |             - 'avg': 等权重
32 |             - 'mktcap': 按总市值加权
33 |             - 'ln_mktcap': 按总市值的对数加权
34 |             - 'cmktcap': 按流通市值加权
35 |             - 'ln_cmktcap': 按流通市值的对数加权
36 |         max_loss: 因重复值或nan值太多而无效的因子值的最大占比, 默认为 0.25
37 |         allow_cache: 是否允许对价格,市值等信息进行本地缓存(按天缓存,初次运行可能比较慢,但后续重新获取对应区间的数据将非常快,且分析时仅消耗较小的jqdatasdk流量)
38 |         show_data_progress: 是否展示数据获取的进度信息
39 |
40 |     """
41 |
42 |     dataapi = DataApi(industry=industry, weight_method=weight_method,
43 |                       allow_cache=allow_cache, show_progress=show_data_progress)
44 |     return FactorAnalyzer(factor,
45 |                           quantiles=quantiles,
46 |                           periods=periods,
47 |                           max_loss=max_loss,
48 |                           **dataapi.apis)
49 |
50 |
51 | def attribution_analysis(
52 |     weights, daily_return, style_type='style_pro', industry='sw_l1',
53 |     use_cn=True, show_data_progress=True
54 | ):
55 |     """归因分析
56 |
57 |     用户需要提供的数据:
58 |     1. 日度股票持仓权重 (加总不为 1 的剩余部分视为现金)
59 |     2. 组合的日度收益率 (使用 T 日持仓盘后的因子暴露与 T+1 日的收益进行归因分析)
60 |
61 |     组合风格因子暴露 (含行业, country) = sum(组合权重 * 个股因子值), country 暴露为总的股票持仓权重
62 |     组合风格收益率 (含行业, country) = sum(组合风格因子暴露 * factor_return)
63 |     组合特异收益率 = 组合总收益率 - 组合风格收益率(含行业, country 或 cash)
64 |     """
65 |     return AttributionAnalysis(weights,
66 |                                daily_return=daily_return,
67 |                                style_type=style_type,
68 |                                industry=industry,
69 |                                use_cn=use_cn,
70 |                                show_data_progress=show_data_progress)
71 |
--------------------------------------------------------------------------------
/jqfactor_analyzer/attribution.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pandas as pd
3 | import datetime
4 | from tqdm import tqdm
5 | from functools import partial
6 | import matplotlib.pyplot as plt
7 | from matplotlib.lines import Line2D
8 |
9 | from jqfactor_analyzer.data import DataApi
10 | from jqfactor_analyzer.factor_cache import save_factor_values_by_group, get_factor_values_by_cache
11 | from jqfactor_analyzer.plot_utils import _use_chinese
12 | from functools import lru_cache
13 |
14 |
15 | dataapi = DataApi(allow_cache=True, show_progress=True)
16 |
17 |
18 | def get_factor_style_returns(factors=None, start_date=None, end_date=None,
19 |                              count=None, universe=None, industry='sw_l1'):
20 |     if dataapi._api_name == 'jqdatasdk':
21 |         func = dataapi.api.get_factor_style_returns
22 |     else:
23 |         import jqfactor
24 |         func = jqfactor.get_factor_style_returns
25 |     return func(factors=factors, start_date=start_date, end_date=end_date,
26 |                 count=count, universe=universe, industry=industry)
27 |
28 |
29 | def get_price(security, start_date, end_date, fields):
30 |     func = partial(dataapi.api.get_price, security=security,
31 |                    start_date=start_date, end_date=end_date, fields=fields)
32 |     if dataapi._api_name == 'jqdatasdk':
33 |         return func()
34 |     else:
35 |         return func(pre_factor_ref_date=datetime.date.today())
36 |
37 |
38 | def get_index_style_exposure(index, factors=None,
39 |                              start_date=None, end_date=None, count=None):
40 |     if dataapi._api_name == 'jqdatasdk':
41 |         func = dataapi.api.get_index_style_exposure
42 |     else:
43 |         import jqfactor
44 |         func = jqfactor.get_index_style_exposure
45 |     return func(index=index, factors=factors,
46 |                 start_date=start_date, end_date=end_date, count=count)
47 |
48 |
49 | class AttributionAnalysis():
50 |     """归因分析
51 |
52 |     用户需要提供的数据:
53 |     1. 日度股票持仓权重 (加总不为 1 的剩余部分视为现金)
54 |     2. 组合的日度收益率 (使用 T 日持仓盘后的因子暴露与 T+1 日的收益进行归因分析)
55 |
56 |     组合风格因子暴露 (含行业, country) = sum(组合权重 * 个股因子值), country 暴露为总的股票持仓权重
57 |     组合风格收益率 (含行业, country) = sum(组合风格因子暴露 * factor_return)
58 |     组合特异收益率 = 组合总收益率 - 组合风格收益率(含行业, country 或 cash)
59 |     """
60 |
61 |     def __init__(self, weights, daily_return,
62 |                  style_type='style_pro', industry='sw_l1',
63 |                  use_cn=True, show_data_progress=True):
64 |         """
65 |         参数
66 |         ----------
67 |         weights:
68 |             持仓权重信息, index 是日期, columns 是标的代码, value 对应的是当天的仓位占比 (单日仓位占比总和不为 1 时, 剩余部分认为是当天的现金)
69 |         daily_return:
70 |             Series, index 是日期, values 为当天账户的收益率
71 |         style_type:
72 |             所选的风格因子类型, 'style' 和 'style_pro' 中的一个
73 |         industry:
74 |             行业分类, 可选: 'sw_l1' 或 'jq_l1'
75 |         use_cn:
76 |             绘图时是否使用中文
77 |         show_data_progress:
78 |             是否展示数据获取进度 (使用本地缓存, 第一次运行时速度较慢, 后续对于本地不存在的数据将增量缓存)
79 |
80 |         所有属性列表
81 |         ----------
82 |         style_exposure:
83 |             组合风格因子暴露
84 |         industry_exposure:
85 |             组合行业因子暴露
86 |         exposure_portfolio:
87 |             组合风格 / 行业及 country 的暴露
88 |         attr_daily_returns:
89 |             组合归因日收益率
90 |         attr_returns:
91 |             组合归因累积收益汇总
92 |
93 |         所有方法列表
94 |         ----------
95 |         get_exposure2bench(index_symbol):
96 |             获取相对于指数的暴露
97 |         get_attr_daily_returns2bench(index_symbol):
98 |             获取相对于指数的日归因收益
99 |         get_attr_returns2bench(index_symbol):
100 |             获取相对于指数的累积归因收益
101 |
102 |         plot_exposure(factors='style', index_symbol=None, figsize=(15, 7))
103 |             绘制风格或行业暴露, 当指定 index_symbol 时, 返回的是相对指数的暴露, 否则为组合自身的暴露
104 |         plot_returns(factors='style', index_symbol=None, figsize=(15, 7))
105 |             绘制风格或者行业的暴露收益, 当指定 index_symbol 时, 返回的是相对指数的暴露收益, 否则为组合自身的暴露收益
106 |         plot_exposure_and_returns(factors='style', index_symbol=None, show_factor_perf=False, figsize=(12, 6))
107 |             同时绘制暴露和收益信息
108 |         """
109 |
110 |         self.STYLE_TYPE_DICT = {
111 |             'style': ['size', 'beta', 'momentum', 'residual_volatility', 'non_linear_size',
112 |                       'book_to_price_ratio', 'liquidity', 'earnings_yield', 'growth', 'leverage'],
113 |             'style_pro': ['btop', 'divyild', 'earnqlty', 'earnvar', 'earnyild', 'financial_leverage',
114 |                           'invsqlty', 'liquidty', 'long_growth', 'ltrevrsl', 'market_beta', 'market_size',
115 |                           'midcap', 'profit', 'relative_momentum', 'resvol']
116 |         }
117 |         weights.index = pd.to_datetime(weights.index)
118 |         daily_return.index = pd.to_datetime(daily_return.index)
119 |         weights.loc[weights.sum(axis=1) > 1] = weights.div(weights.sum(axis=1), axis=0)
120 |         self.weights = weights.replace(0, np.nan)
121 |         self.daily_return = daily_return
122 |         self.style_factor_names = self.STYLE_TYPE_DICT[style_type]
123 |         self.industry = industry
124 |         self.industry_code = list(
125 |             set(dataapi.api.get_industries(industry, date=weights.index[0]).index) |
126 |             set(dataapi.api.get_industries(industry, date=weights.index[-1]).index)
127 |         )
128 |         self.style_type = style_type
129 |         self.show_progress = show_data_progress
130 |         self.factor_cache_directory = self.check_factor_values()
131 |
132 |         # 当日收盘后的风格因子暴露
133 |         self.style_exposure = self.calc_style_exposure()
134 |         # 当日收盘后的行业因子暴露
135 |         self.industry_exposure = self.calc_industry_exposure()
136 |         # 当日收盘后的风格+行业+country 合并暴露
137 |         self.exposure_portfolio = pd.concat([self.style_exposure, self.industry_exposure], axis=1)
138 |         self.exposure_portfolio['country'] = self.weights.sum(axis=1)
139 |         self.use_cn = use_cn
140 |         if use_cn:
141 |             _use_chinese(True)
142 |
143 |         self._attr_daily_returns = None
144 |         self._attr_returns = None
145 |         self._factor_returns = None
146 |         self._factor_cn_name = None
147 |
148 |     def _get_factor_cn_name(self):
149 |         """获取行业及风格因子的中文名称"""
150 |         industry_info = dataapi.api.get_industries(self.industry).name
151 | 
factor_info = dataapi.api.get_all_factors() 152 | factor_info = factor_info[factor_info.category == 153 | self.style_type].set_index("factor").factor_intro 154 | factor_info = pd.concat([industry_info, factor_info]) 155 | factor_info['common_return'] = '因子收益' 156 | factor_info['specific_return'] = '特异收益' 157 | factor_info['total_return'] = '总收益' 158 | factor_info['cash'] = '现金' 159 | factor_info['country'] = 'country' 160 | self._factor_cn_name = factor_info 161 | return factor_info 162 | 163 | @property 164 | def factor_cn_name(self): 165 | if self._factor_cn_name is None: 166 | return self._get_factor_cn_name() 167 | else: 168 | return self._factor_cn_name 169 | 170 | def check_factor_values(self): 171 | """检查并缓存因子数据到本地""" 172 | start_date = self.weights.index[0] 173 | end_date = self.weights.index[-1] 174 | return save_factor_values_by_group(start_date, end_date, 175 | self.style_factor_names, 176 | show_progress=self.show_progress) 177 | 178 | def _get_style_exposure_daily(self, date, weight): 179 | weight = weight.dropna() 180 | resdaily = get_factor_values_by_cache( 181 | date, 182 | codes=weight.index, 183 | factor_names=self.style_factor_names, 184 | factor_path=self.factor_cache_directory).T 185 | resdaily = resdaily.mul(weight).sum(axis=1, min_count=1) 186 | resdaily.name = date 187 | return resdaily 188 | 189 | def calc_style_exposure(self): 190 | """计算组合的风格因子暴露 191 | 返回: 一个 dataframe, index 为日期, columns 为风格因子名, values 为暴露值""" 192 | 193 | iters = self.weights.iterrows() 194 | 195 | if self.show_progress: 196 | iters = tqdm(iters, total=self.weights.shape[0], desc='calc_style_exposure ') 197 | results = [] 198 | for date, weight in iters: 199 | results.append(self._get_style_exposure_daily(date, weight)) 200 | return pd.DataFrame(results) 201 | 202 | def _get_industry_exposure_daily(self, date, weight): 203 | weight = weight.dropna() 204 | resdaily = pd.get_dummies(dataapi._get_cached_industry_one_day( 205 | str(date.date()), securities=weight.index, industry=self.industry)) 206 | resdaily = resdaily.mul(weight, axis=0).sum(axis=0, min_count=1) 207 | resdaily.name = date 208 | return resdaily 209 | 210 | def calc_industry_exposure(self): 211 | """计算组合的行业因子暴露 212 | 返回: 一个 dataframe, index 为日期, columns为风格因子名, values为暴露值""" 213 | iters = self.weights.iterrows() 214 | if self.show_progress: 215 | iters = tqdm(iters, total=self.weights.shape[0], desc='calc_industry_exposure ') 216 | results = [] 217 | for date, weight in iters: 218 | results.append(self._get_industry_exposure_daily(date, weight)) 219 | return pd.DataFrame(results).reindex(columns=self.industry_code, fill_value=0) 220 | 221 | @property 222 | def attr_daily_returns(self): 223 | if self._attr_daily_returns is None: 224 | return self.calc_attr_returns()[0] 225 | else: 226 | return self._attr_daily_returns 227 | 228 | @property 229 | def attr_returns(self): 230 | if self._attr_returns is None: 231 | return self.calc_attr_returns()[1] 232 | else: 233 | return self._attr_returns 234 | 235 | @property 236 | def factor_returns(self): 237 | if self._factor_returns is None: 238 | exposure_portfolio = self.exposure_portfolio.copy() 239 | self._factor_returns = get_factor_style_returns( 240 | exposure_portfolio.columns.tolist(), 241 | self.exposure_portfolio.index[0], 242 | dataapi.api.get_trade_days(self.exposure_portfolio.index[-1], count=2)[-1], 243 | industry=self.industry, 244 | universe='zzqz') 245 | return self._factor_returns 246 | else: 247 | return self._factor_returns 248 | 249 | @lru_cache() 250 | def 
_get_index_returns(self, index_symbol, start_date, end_date):
251 |         index_return = get_price(index_symbol,
252 |                                  start_date=start_date,
253 |                                  end_date=end_date,
254 |                                  fields='close')['close'].pct_change()
255 |         return index_return
256 |
257 |     @lru_cache()
258 |     def _get_index_exposure(self, index_symbol):
259 |         index_exposure = get_index_style_exposure(
260 |             index_symbol,
261 |             factors=self.style_exposure.columns.tolist() + self.industry_exposure.columns.tolist(),
262 |             start_date=str(self.weights.index[0]),
263 |             end_date=str(self.weights.index[-1]))
264 |         index_exposure = index_exposure.mul(self.weights.sum(axis=1), axis=0)
265 |         index_exposure['country'] = 1
266 |         return index_exposure
267 |
268 |     @lru_cache()
269 |     def get_exposure2bench(self, index_symbol):
270 |         """获取相对于指数的暴露"""
271 |         index_exposure = self._get_index_exposure(index_symbol)
272 |         return self.exposure_portfolio - index_exposure
273 |
274 |     @lru_cache()
275 |     def get_attr_daily_returns2bench(self, index_symbol):
276 |         """获取相对于指数的日归因收益率
277 |         返回: 一个 dataframe, index 是日期, value 为对应日期的收益率值
278 |         columns 为风格因子/行业因子/现金cash/因子总收益common_return(含风格,行业)/特异收益率 specific_return 及组合总收益率 total_return
279 |         注意: 日收益率直接加总, 可能和实际的最终收益率不一致, 因为没考虑到资产的变动情况
280 |         """
281 |         exposure2bench = self.get_exposure2bench(index_symbol)
282 |         exposure2bench = exposure2bench.reindex(self.factor_returns.index)
283 |
284 |         index_return = self._get_index_returns(index_symbol,
285 |                                                start_date=exposure2bench.index[0],
286 |                                                end_date=exposure2bench.index[-1])
287 |         daily_return = self.daily_return - index_return
288 |
289 |         attr_daily_returns2bench = exposure2bench.shift()[1:].mul(self.factor_returns)
290 |         # country 收益为 0, 无意义
291 |         del attr_daily_returns2bench['country']
292 |         attr_daily_returns2bench['common_return'] = attr_daily_returns2bench[self.style_exposure.columns.tolist() +
293 |                                                                              self.industry_exposure.columns.tolist()].sum(axis=1)
294 |         attr_daily_returns2bench['cash'] = index_return * exposure2bench.country.shift()
295 |         attr_daily_returns2bench['specific_return'] = daily_return - \
296 |             attr_daily_returns2bench['common_return'] - \
297 |             attr_daily_returns2bench['cash']
298 |         attr_daily_returns2bench['total_return'] = daily_return
299 |         return attr_daily_returns2bench
300 |
301 |     @lru_cache()
302 |     def get_attr_returns2bench(self, index_symbol):
303 |         """获取相对于指数的累积归因收益
304 |         将超额收益分解成了:
305 |         1.common_return (因子收益, 又可进一步拆分成风格和行业);
306 |         2.cash (现金收益, 假设组合本身现金部分的收益为0, 则相对于指数的超额收益为"-1 * 指数收益");
307 |           累积算法: (组合收益率 + 1).cumprod() - (日现金收益率+组合收益率 + 1).cumprod()
308 |         3.specific_return: 残差, 无法被风格和行业因子解释的部分, 即为主动收益, 现金收益实际也可划分到主动收益中
309 |         """
310 |         index_return = self._get_index_returns(index_symbol,
311 |                                                start_date=self.factor_returns.index[0],
312 |                                                end_date=self.factor_returns.index[-1])
313 |
314 |         attr_daily_returns2bench = self.get_attr_daily_returns2bench(index_symbol)
315 |         # 假设持仓的现金用于购买指数时的净值
316 |         position_with_cash_net = ((-attr_daily_returns2bench.cash + self.daily_return).fillna(0) + 1).cumprod()
317 |         # 持仓本身的净值
318 |         position_net = (self.daily_return.fillna(0) + 1).cumprod()
319 |         # 假设指数满仓时的超额
320 |         t_net = position_net - (index_return + 1).fillna(1).cumprod()
321 |         # 假设指数调整仓位到和组合一致(风格暴露)的超额
322 |         net = position_net - (index_return * self.weights.sum(axis=1).shift() + 1).fillna(1).cumprod()
323 |         # 超额的暴露收益
324 |         attr_returns2bench2 = attr_daily_returns2bench.mul(net.shift() + 1, axis=0).cumsum()
325 |         # 现金的收益 = 持仓本身的净值 - 假设持仓的现金用于购买指数的净值
326 |         attr_returns2bench2['cash'] = position_net - position_with_cash_net
327 |         # 超额收益
328 | 
attr_returns2bench2['total_return'] = t_net
329 | # 风格 + 行业因子收益, 不含现金
330 | attr_returns2bench2['common_return'] = attr_returns2bench2[self.style_exposure.columns.tolist() +
331 | self.industry_exposure.columns.tolist()].sum(axis=1)
332 | attr_returns2bench2.loc[attr_returns2bench2.cash.isna(), 'common_return'] = np.nan
333 | # 除风格,现金以外的无法解释的收益
334 | attr_returns2bench2['specific_return'] = (
335 | attr_returns2bench2['total_return'] - attr_returns2bench2['common_return'] - attr_returns2bench2['cash']
336 | )
337 | return attr_returns2bench2
338 |
339 | def calc_attr_returns(self):
340 | """计算风格归因收益, country 收益率为国家收益 (这里的国家收益是用均衡大小市值后 (根号市值) 回归得到的)"""
341 | self._attr_daily_returns = self.exposure_portfolio.reindex(
342 | self.factor_returns.index).shift(1).mul(self.factor_returns)
343 | self._attr_daily_returns['common_return'] = self._attr_daily_returns.sum(axis=1)
344 | self._attr_daily_returns['specific_return'] = self.daily_return - self._attr_daily_returns['common_return']
345 | self._attr_daily_returns['total_return'] = self.daily_return
346 |
347 | cum_return = (self._attr_daily_returns.total_return.fillna(0) + 1).cumprod()
348 | self._attr_returns = self._attr_daily_returns.mul(cum_return.shift(1), axis=0).cumsum()
349 |
350 | return self._attr_daily_returns, self._attr_returns
351 |
352 | def plot_data(self, data, title=None, figsize=(15, 8)):
353 | ax = data.plot(figsize=figsize, title=title)
354 | ax.legend(loc='upper left', bbox_to_anchor=(1, 1))
355 | plt.tight_layout(rect=[0, 0, 0.85, 1])
356 | plt.show()
357 |
358 | def plot_exposure(self, factors='style', index_symbol=None, figsize=(15, 7)):
359 | """绘制风格暴露
360 | factors: 绘制的暴露类型, 可选 'style'(所有风格因子), 'industry'(所有行业因子), 也可以传递一个 list, list 为 exposure_portfolio 中 columns 的一个或者多个
361 | index_symbol: 基准指数代码, 指定时绘制相对于指数的暴露, 默认 None 为组合本身的暴露
362 | figsize: 画布大小
363 | """
364 | exposure = self.exposure_portfolio if index_symbol is None else self.get_exposure2bench(index_symbol)
365 | if isinstance(factors, str):
366 | if factors == 'style':
367 | exposure = exposure[self.style_exposure.columns]
368 | elif factors == 'industry':
369 | exposure = exposure[self.industry_exposure.columns]
370 | else:
371 | exposure = exposure[[factors]]
372 | else:
373 | exposure = exposure[factors]
374 |
375 | if self.use_cn:
376 | exposure = exposure.rename(columns=self.factor_cn_name)
377 | title = '组合相对{}暴露'.format(index_symbol) if index_symbol else '组合暴露'
378 | else:
379 | title = 'exposure of {}'.format(index_symbol) if index_symbol else 'exposure'
380 | self.plot_data(exposure, title=title, figsize=figsize)
381 |
382 | def plot_returns(self, factors='style', index_symbol=None, figsize=(15, 7)):
383 | """绘制归因分析收益信息
384 | factors: 绘制的收益类型, 可选 'style'(所有风格因子), 'industry'(所有行业因子), 也可以传递一个 list, list 为 exposure_portfolio 中 columns 的一个或者多个
385 | 同时也支持指定 ['common_return'(风格总收益), 'specific_return'(特异收益), 'total_return'(总收益),
386 | 'country'(国家因子收益, 当指定 index_symbol 时会用现金相对于指数的收益替代)]
387 | index_symbol: 基准指数代码, 指定时绘制相对于指数的收益, 默认 None 为组合本身的收益
388 | figsize: 画布大小
389 | """
390 | returns = self.attr_returns if index_symbol is None else self.get_attr_returns2bench(index_symbol)
391 | if isinstance(factors, str):
392 | if factors == 'style':
393 | returns = returns[self.style_exposure.columns]
394 | elif factors == 'industry':
395 | returns = returns[self.industry_exposure.columns]
396 | else:
397 | if index_symbol and factors == 'country':
398 | factors = 'cash'
399 | if factors not in returns.columns:
400 | raise ValueError("错误的因子名称: {}".format(factors))
401 | returns = returns[[factors]]
402 | else:
403 | if index_symbol and 'country' in factors:
404 | factors = [x if x != 'country' else 'cash' for x in factors]
405 | wrong_factors = [x for x in factors if x not in returns.columns]
406 | if wrong_factors:
407 | raise ValueError("错误的因子名称: {}".format(wrong_factors))
408 | returns = returns[factors]
409 |
410 | if self.use_cn:
411 | returns = returns.rename(columns=self.factor_cn_name)
412 | title = "累积归因收益 (相对{})".format(
413 | index_symbol) if index_symbol else "累积归因收益"
414 | else:
415 | title = 'cum return to {}'.format(
416 | index_symbol) if index_symbol else "cum return"
417 | self.plot_data(returns, title=title, figsize=figsize)
418 |
419 | def plot_exposure_and_returns(self, factors='style', index_symbol=None, show_factor_perf=False, figsize=(12, 6)):
420 | """将因子暴露与收益同时绘制在多个子图上
421 | factors: 绘制的暴露类型, 可选 'style'(所有风格因子), 'industry'(所有行业因子), 也可以传递一个 list, list 为 exposure_portfolio 中 columns 的一个或者多个
422 | (当指定 index_symbol 时, country 会用现金相对于指数的收益替代)
423 | index_symbol: 基准指数代码, 指定时绘制相对于指数的暴露及收益, 默认 None 为组合本身的暴露和收益
424 | show_factor_perf: 是否同时绘制因子表现
425 | figsize: 画布大小, 这里第一个参数是画布的宽度, 第二个参数为单个子图的高度
426 | """
427 | if isinstance(factors, str):
428 | if factors == 'style':
429 | factors = self.style_exposure.columns
430 | elif factors == 'industry':
431 | factors = self.industry_exposure.columns
432 | else:
433 | factors = [factors]
434 | if index_symbol:
435 | exposure = self.get_exposure2bench(index_symbol).rename(columns={"country": "cash"})
436 | returns = self.get_attr_returns2bench(index_symbol)
437 | else:
438 | exposure = self.exposure_portfolio
439 | returns = self.attr_returns
440 | exposure, returns = exposure.align(returns, join='outer')
441 | if show_factor_perf:
442 | factor_performance = self.factor_returns.cumsum().reindex(exposure.index)
443 |
444 | num_factors = len(factors)
445 | # 每行最多两个子图
446 | ncols = 2 if num_factors > 1 else 1
447 | nrows = (num_factors + 1) // ncols if num_factors > 1 else 1
448 |
449 | fixed_width, base_height_per_row = figsize
450 | height_per_row = base_height_per_row if ncols == 1 else base_height_per_row / 2
451 | total_height = max(1, nrows) * height_per_row
452 |
453 | fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(fixed_width, total_height))
454 | axes = axes.flatten() if num_factors > 1 else [axes]
455 |
456 | # 删除多余的子图
457 | for j in range(len(factors), len(axes)):
458 | fig.delaxes(axes[j])
459 |
460 | for i, factor_name in enumerate(factors):
461 | if index_symbol and factor_name == 'country':
462 | factor_name = 'cash'
463 | if factor_name not in exposure.columns:
464 | raise ValueError("错误的因子名称: {}".format(factor_name))
465 | e = exposure[factor_name]
466 | r = returns[factor_name]
467 |
468 | ax1 = axes[i]
469 | e.plot(kind='area', stacked=False, alpha=0.5, ax=ax1, color='skyblue')
470 |
471 | ax2 = ax1.twinx()
472 | r.plot(ax=ax2, color='red')
473 | if factor_name != 'cash' and show_factor_perf:
474 | factor_performance[factor_name].plot(ax=ax2, color='blue')
475 | ax1.set_title(factor_name if not self.use_cn else self.factor_cn_name.get(factor_name))
476 | labels = ['暴露', '因子收益', '因子表现'] if self.use_cn else ['exposure', 'return', 'factor performance']
477 | fig.legend(labels[:1], loc='upper left')
478 |
479 | # 手动创建图例条目
480 | custom_lines = [Line2D([0], [0], color='red', lw=2),
481 | Line2D([0], [0], color='blue', lw=2)]
482 | # 创建自定义图例
483 | fig.legend(custom_lines, labels[1:], loc='upper right',
484 | bbox_to_anchor=(1, 1.02), bbox_transform=plt.gcf().transFigure)
485 |
fig.suptitle('因子暴露与收益图' if self.use_cn else 'factor exposure and return', y=1.02) 486 | plt.tight_layout() 487 | plt.show() 488 | 489 | def plot_disable_chinese_label(self): 490 | """关闭中文图例显示 491 | 492 | 画图时默认会从系统中查找中文字体显示以中文图例 493 | 如果找不到中文字体则默认使用英文图例 494 | 当找到中文字体但中文显示乱码时, 可调用此 API 关闭中文图例显示而使用英文 495 | """ 496 | _use_chinese(False) 497 | self.use_cn = False 498 | -------------------------------------------------------------------------------- /jqfactor_analyzer/compat.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """pandas库版本兼容模块""" 4 | 5 | import warnings 6 | 7 | import pandas as pd 8 | 9 | 10 | # pandas 11 | PD_VERSION = pd.__version__ 12 | 13 | 14 | def rolling_apply( 15 | x, 16 | window, 17 | func, 18 | min_periods=None, 19 | freq=None, 20 | center=False, 21 | args=None, 22 | kwargs=None 23 | ): 24 | if args is None: 25 | args = tuple() 26 | if kwargs is None: 27 | kwargs = dict() 28 | 29 | if PD_VERSION >= '0.23.0': 30 | return x.rolling( 31 | window, min_periods=min_periods, center=center 32 | ).apply( 33 | func, False, args=args, kwargs=kwargs 34 | ) 35 | elif PD_VERSION >= '0.18.0': 36 | return x.rolling( 37 | window, min_periods=min_periods, center=center 38 | ).apply( 39 | func, args=args, kwargs=kwargs 40 | ) 41 | else: 42 | return pd.rolling_apply( 43 | x, 44 | window, 45 | func, 46 | min_periods=min_periods, 47 | freq=freq, 48 | center=center, 49 | args=args, 50 | kwargs=kwargs 51 | ) 52 | 53 | 54 | def rolling_mean(x, window, min_periods=None, center=False): 55 | if PD_VERSION >= '0.18.0': 56 | return x.rolling(window, min_periods=min_periods, center=center).mean() 57 | else: 58 | return pd.rolling_mean( 59 | x, window, min_periods=min_periods, center=center 60 | ) 61 | 62 | 63 | def rolling_std(x, window, min_periods=None, center=False, ddof=1): 64 | if PD_VERSION >= '0.18.0': 65 | return x.rolling( 66 | window, min_periods=min_periods, center=center 67 | ).std(ddof=ddof) 68 | else: 69 | return pd.rolling_std( 70 | x, window, min_periods=min_periods, center=center, ddof=ddof 71 | ) 72 | 73 | 74 | # statsmodels 75 | with warnings.catch_warnings(): 76 | # 有的版本依赖的 pandas 库会有 deprecated warning 77 | warnings.simplefilter("ignore") 78 | import statsmodels 79 | from statsmodels.api import OLS, qqplot, ProbPlot 80 | from statsmodels.tools.tools import add_constant 81 | -------------------------------------------------------------------------------- /jqfactor_analyzer/config.json: -------------------------------------------------------------------------------- 1 | {"default_dir": "~/jqfactor_datacache/bundle", "user_dir": ""} 2 | -------------------------------------------------------------------------------- /jqfactor_analyzer/data.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import os 3 | import numpy as np 4 | import pandas as pd 5 | from tqdm import tqdm 6 | from fastcache import lru_cache 7 | from functools import partial 8 | import pyarrow.feather as feather 9 | 10 | from .when import date2str, convert_date, today, now, Time, Date 11 | from .factor_cache import save_factor_values_by_group, get_factor_values_by_cache, get_cache_dir 12 | 13 | 14 | class DataApi(object): 15 | 16 | def __init__(self, price='close', fq='post', 17 | industry='jq_l1', weight_method='avg', allow_cache=True, show_progress=True): 18 | """数据接口, 用于因子分析获取数据 19 | 20 | 参数 21 | ---------- 22 | price : 使用开盘价/收盘价计算收益 (请注意避免未来函数), 默认为 'close' 23 | - 'close': 
使用当日收盘价和次日收盘价计算当日因子的远期收益
24 | - 'open' : 使用当日开盘价和次日开盘价计算当日因子的远期收益
25 | fq : 价格数据的复权方式, 默认为 'post'
26 | - 'post': 后复权
27 | - 'pre': 前复权
28 | - None: 不复权
29 | industry : 行业分类, 默认为 'jq_l1'
30 | - 'jq_l1': 聚宽一级行业
31 | - 'jq_l2': 聚宽二级行业
32 | - 'sw_l1': 申万一级行业
33 | - 'sw_l2': 申万二级行业
34 | - 'sw_l3': 申万三级行业
35 | - 'zjw': 证监会行业
36 | weight_method : 计算各分位收益时, 每只股票权重, 默认为 'avg'
37 | - 'avg': 等权重
38 | - 'mktcap': 按总市值加权
39 | - 'ln_mktcap': 按总市值的对数加权
40 | - 'cmktcap': 按流通市值加权
41 | - 'ln_cmktcap': 按流通市值的对数加权
42 | allow_cache : 是否允许将分析必需的数据以文件的形式缓存至本地, 默认允许, 缓存开启时, 首次加载耗时较长
43 | show_progress : 是否展示数据获取进度
44 |
45 | 使用示例
46 | ----------
47 | from jqfactor_analyzer import DataApi, FactorAnalyzer
48 |
49 | api = DataApi(fq='pre', industry='sw_l1', weight_method='ln_mktcap')
50 | api.auth('username', 'password')
51 |
52 | factor = FactorAnalyzer(factor_data,
53 | price=api.get_prices,
54 | groupby=api.get_groupby,
55 | weights=api.get_weights)
56 | # 或者
57 | # factor = FactorAnalyzer(factor_data, **api.apis)
58 |
59 |
60 | 方法列表
61 | ----------
62 | auth : 登录 jqdatasdk
63 | 参数 :
64 | username : jqdatasdk 用户名
65 | password : jqdatasdk 密码
66 | 返回值 :
67 | None
68 |
69 | get_prices : 价格数据获取接口
70 | 参数 :
71 | securities : 股票代码列表
72 | start_date : 开始日期
73 | end_date : 结束日期
74 | count : 交易日长度
75 | (可与 start_date 搭配使用, 代替 end_date)
76 | 返回值 :
77 | pd.DataFrame
78 | 价格数据, columns 为股票代码, index 为日期
79 |
80 | get_groupby : 行业分类数据获取接口
81 | 参数 :
82 | securities : 股票代码列表
83 | start_date : 开始日期
84 | end_date : 结束日期
85 | 返回值 :
86 | dict
87 | 行业分类, {股票代码 -> 行业分类名称}
88 |
89 | get_weights : 股票权重获取接口
90 | 参数 :
91 | securities : 股票代码列表
92 | start_date : 开始日期
93 | end_date : 结束日期
94 | 返回值 :
95 | pd.DataFrame
96 | 权重数据, columns 为股票代码, index 为日期
97 |
98 |
99 | 属性列表
100 | ----------
101 | apis : dict, {'prices': get_prices, 'groupby': get_groupby,
102 | 'weights': get_weights}
103 |
104 | """
105 | try:
106 | import jqdata
107 | self._api = jqdata.apis
108 | self._api_name = 'jqdata'
109 | except ImportError:
110 | import jqdatasdk
111 | self._api = jqdatasdk
112 | self._api_name = 'jqdatasdk'
113 |
114 | self.show_progress = show_progress
115 | valid_price = ('close', 'open')
116 | if price in valid_price:
117 | self.price = price
118 | else:
119 | raise ValueError("invalid 'price' parameter, "
120 | "should be one of %s" % str(valid_price))
121 |
122 | valid_fq = ('post', 'pre', None)
123 | if fq in valid_fq:
124 | self.fq = fq
125 | else:
126 | raise ValueError("invalid 'fq' parameter, "
127 | "should be one of %s" % str(valid_fq))
128 |
129 | valid_industry = ('sw_l1', 'sw_l2', 'sw_l3', 'jq_l1', 'jq_l2', 'zjw')
130 | if industry in valid_industry:
131 | self.industry = industry
132 | else:
133 | raise ValueError("invalid 'industry' parameter, "
134 | "should be one of %s" % str(valid_industry))
135 |
136 | valid_weight_method = ('avg', 'mktcap', 'ln_mktcap', 'cmktcap', 'ln_cmktcap')
137 | if weight_method in valid_weight_method:
138 | self.weight_method = weight_method
139 | else:
140 | raise ValueError("invalid 'weight_method' parameter, "
141 | "should be one of %s" % str(valid_weight_method))
142 | self.ini_cache_cfg(allow_cache)
143 |
144 | @lru_cache(10)
145 | def get_ind_record(self, industry):
146 | mapping = self.api.get_industries(industry).to_dict()['name']
147 | ind_record = self.api.get_history_industry(industry).set_index('stock')
148 | ind_record['industry_name'] = ind_record['code'].map(mapping)
149 | ind_record.end_date = ind_record.end_date.fillna(Date(2040, 1, 1))
150 | return ind_record
151 |
152 | def ini_cache_cfg(self, allow_cache):
153 | self.allow_cache = allow_cache
154 |
155 | if self._api_name != 'jqdatasdk':
156 | self.allow_cache = False
157 | self.allow_industry_cache = False
158 |
159 | def auth(self, username='', password=''):
160 | if self._api_name == 'jqdata':
161 | return
162 | if username:
163 | import jqdatasdk
164 | jqdatasdk.auth(username, password)
165 |
166 | @property
167 | def api(self):
168 | if not hasattr(self, "_api"):
169 | raise NotImplementedError('api not specified')
170 | if self.allow_cache:
171 | if not self._api.is_auth():
172 | raise Exception("Please run jqdatasdk.auth first")
173 | privilege = self._api.get_privilege()
174 | if 'GET_HISTORY_INDUSTRY' in privilege:
175 | self.allow_industry_cache = True
176 | else:
177 | self.allow_industry_cache = False
178 | if 'FACTOR_BASICS' in privilege or 'GET_FACTOR_VALUES' in privilege:
179 | self.mkt_cache_api = 'factor'
180 | else:
181 | self.mkt_cache_api = 'valuation'
182 | return self._api
183 |
184 | @lru_cache(2)
185 | def _get_trade_days(self, start_date=None, end_date=None):
186 | if start_date is not None:
187 | start_date = date2str(start_date)
188 | if end_date is not None:
189 | end_date = date2str(end_date)
190 | return list(self.api.get_trade_days(start_date=start_date,
191 | end_date=end_date))
192 |
193 | def _get_price(self, securities, start_date=None, end_date=None, count=None,
194 | fields=None, skip_paused=False, fq='post', round=False):
195 | start_date = date2str(start_date) if start_date is not None else None
196 | end_date = date2str(end_date) if end_date is not None else None
197 | if self._api_name == 'jqdata':
198 | if 'panel' in self.api.get_price.__code__.co_varnames:
199 | get_price = partial(self.api.get_price,
200 | panel=False,
201 | pre_factor_ref_date=end_date)
202 | else:
203 | get_price = partial(self.api.get_price,
204 | pre_factor_ref_date=end_date)
205 | else:
206 | get_price = self.api.get_price
207 | p = get_price(
208 | securities, start_date=start_date, end_date=end_date, count=count,
209 | fields=fields, skip_paused=skip_paused, fq=fq, round=round
210 | )
211 | if hasattr(p, 'to_frame'):
212 | p = p.to_frame()
213 | p.index.names = ['time', 'code']
214 | p.reset_index(inplace=True)
215 |
216 | return p
217 |
218 | def _get_cached_price(self, securities, start_date=None, end_date=None, fq=None, overwrite=False):
219 | """获取缓存价格数据, 缓存文件中存储的为未复权价格和后复权因子"""
220 | save_factor_values_by_group(start_date, end_date,
221 | factor_names='prices',
222 | overwrite=overwrite,
223 | show_progress=self.show_progress)
224 | trade_days = pd.to_datetime(self._get_trade_days(start_date, end_date))
225 |
226 | ret = []
227 | if self.show_progress:
228 | trade_days = tqdm(trade_days, desc="load price info : ")
229 | for day in trade_days:
230 | if day < today():
231 | p = get_factor_values_by_cache(
232 | day, securities, factor_names='prices').reset_index()
233 | else:
234 | p = self.api.get_price(securities, start_date=day, end_date=day,
235 | skip_paused=False, round=False,
236 | fields=['open', 'close', 'factor'],
237 | fq='post', panel=False)
238 | p[['open', 'close']] = p[['open', 'close']].div(p['factor'], axis=0)
239 | p['time'] = day
240 | ret.append(p)
241 | ret = pd.concat(ret, ignore_index=True).sort_values(['code', 'time']).reset_index(drop=True)
242 | if fq == 'pre':
243 | # 前复权基准日期为最新一天
244 | latest_factor = self.api.get_price(securities,
245 | end_date=today(),
246 | count=1,
247 | skip_paused=False,
248 | round=False,
249 | fields=['factor'],
250 | fq='post',
251 |
panel=False).set_index('code') 252 | ret = ret.set_index('code') 253 | ret.factor = ret.factor / latest_factor.factor 254 | ret = ret.reset_index().reindex(columns=['time', 'code', 'open', 'close', 'factor']) 255 | elif fq is None: 256 | ret.loc[ret['factor'].notna(), 'factor'] = 1.0 257 | ret[['open', 'close']] = ret[['open', 'close']].mul(ret['factor'], axis=0) 258 | return ret 259 | 260 | def get_prices(self, securities, start_date=None, end_date=None, 261 | period=None): 262 | if period is not None: 263 | trade_days = self._get_trade_days(start_date=end_date) 264 | if len(trade_days): 265 | end_date = trade_days[:period + 1][-1] 266 | if self.allow_cache: 267 | p = self._get_cached_price( 268 | securities, start_date, end_date, fq=self.fq) 269 | else: 270 | p = self._get_price( 271 | fields=[self.price], securities=securities, 272 | start_date=start_date, end_date=end_date, round=False, 273 | fq=self.fq 274 | ) 275 | p = p.set_index(['time', 'code'])[self.price].unstack('code').sort_index() 276 | return p 277 | 278 | def _get_industry(self, securities, start_date, end_date, industry='jq_l1'): 279 | trade_days = self._get_trade_days(start_date, end_date) 280 | industries = map(partial(self.api.get_industry, securities), trade_days) 281 | day_ind = zip(trade_days, industries) 282 | if self.show_progress: 283 | day_ind = tqdm(day_ind, desc='load industry info : ', 284 | total=len(trade_days)) 285 | industries = { 286 | d: { 287 | s: ind.get(s).get(industry, dict()).get('industry_name', 'NA') 288 | for s in securities 289 | } 290 | for d, ind in day_ind 291 | } 292 | return pd.DataFrame(industries).T.sort_index() 293 | 294 | def _get_cached_industry_one_day(self, date, securities=None, industry=None): 295 | date = convert_date(date) 296 | if self.allow_industry_cache: 297 | ind_record = self.get_ind_record(industry) 298 | if securities is not None: 299 | ind_record = ind_record[ind_record.index.isin(securities)] 300 | return ind_record[(ind_record.start_date <= date) & (date <= ind_record.end_date)].code 301 | else: 302 | ind_record = self.api.get_industry(securities, date, df=True) 303 | ind_record = ind_record[ind_record['type'] == 304 | industry].set_index("code").industry_code 305 | return ind_record 306 | 307 | def _get_cached_industry(self, securities, start_date, end_date): 308 | ind_record = self.get_ind_record(self.industry) 309 | start_date = convert_date(start_date) 310 | end_date = convert_date(end_date) 311 | trade_days = self._get_trade_days(start_date, end_date) 312 | ind_record = ind_record[(ind_record.index.isin(securities))] 313 | if self.show_progress: 314 | trade_days = tqdm(trade_days, desc="load industry info : ") 315 | df_list = [] 316 | for d in trade_days: 317 | rec = ind_record[(ind_record.start_date <= d) & ( 318 | d <= ind_record.end_date)].industry_name 319 | rec.name = d 320 | df_list.append(rec) 321 | df = pd.DataFrame(df_list).reindex(columns=securities) 322 | return df.fillna('NA') 323 | 324 | def get_groupby(self, securities, start_date, end_date): 325 | if self.allow_industry_cache: 326 | return self._get_cached_industry(securities, start_date, end_date) 327 | else: 328 | return self._get_industry(securities=securities, 329 | start_date=start_date, end_date=end_date, 330 | industry=self.industry) 331 | 332 | def _get_cached_mkt_cap_by_valuation(self, securities, date, field, overwrite=False): 333 | """市值处理函数, 将获取的市值数据缓存到本地""" 334 | if not securities: 335 | return pd.Series(dtype='float64', name=date) 336 | 337 | query = self.api.query 338 | valuation = 
self.api.valuation
339 | cache_dir = os.path.join(get_cache_dir(), 'mkt_cap', date.strftime("%Y%m"))
340 | fp = os.path.join(cache_dir, date.strftime("%Y%m%d")) + '.feather'
341 |
342 | if os.path.exists(fp) and not overwrite:
343 | data = feather.read_feather(fp)
344 | else:
345 | if not os.path.exists(cache_dir):
346 | os.makedirs(cache_dir)
347 | codes = self.api.get_all_securities('stock').index.tolist()
348 | q = query(valuation.code,
349 | valuation.market_cap,
350 | valuation.circulating_market_cap).filter(
351 | valuation.code.in_(codes))
352 | data = self.api.get_fundamentals(q, date=date2str(date))
353 | data[['market_cap', 'circulating_market_cap']] = data[
354 | ['market_cap', 'circulating_market_cap']] * (10 ** 8)
355 | if date < today() or (date == today() and now().time() >= Time(16, 30)):
356 | data.to_feather(fp)
357 |
358 | return data[data.code.isin(securities)].set_index('code')[field]
359 |
360 | def _get_market_cap(self, securities, start_date, end_date, ln=False, field='market_cap'):
361 | trade_days = self._get_trade_days(start_date, end_date)
362 |
363 | def get_mkt_cap(s, date, field):
364 | if not s:
365 | return pd.Series(dtype='float64', name=date)
366 | data = self.api.get_fundamentals(
367 | q, date=date2str(date)
368 | ).set_index('code')[field] * (10 ** 8)
369 | return data
370 |
371 | def get_mkt_cap_cache(s, date, field):
372 | cap = get_factor_values_by_cache(
373 | date, securities, factor_path=cache_dir).reindex(columns=[field])
374 | return cap[field]
375 |
376 | if self.allow_cache and len(trade_days) > 5:
377 | if self.mkt_cache_api == 'factor':
378 | desc = 'check/save cap cache :' if self.show_progress else False
379 | cache_dir = save_factor_values_by_group(start_date,
380 | end_date,
381 | factor_names=['market_cap', 'circulating_market_cap'],
382 | group_name='mkt_cap',
383 | show_progress=desc)
384 | market_api = get_mkt_cap_cache
385 | else:
386 | market_api = self._get_cached_mkt_cap_by_valuation
387 | else:
388 | market_api = get_mkt_cap
389 | query = self.api.query
390 | valuation = self.api.valuation
391 |
392 | if field == 'market_cap':
393 | q = query(valuation.code, valuation.market_cap).filter(
394 | valuation.code.in_(securities))
395 | elif field == 'circulating_market_cap':
396 | q = query(valuation.code, valuation.circulating_market_cap).filter(
397 | valuation.code.in_(securities))
398 | else:
399 | raise ValueError("不支持的字段 : {}".format(field))
400 |
401 | if self.show_progress:
402 | trade_days = tqdm(trade_days, desc="load cap info : ")
403 |
404 | market_cap = []
405 | for date in trade_days:
406 | cap = market_api(securities, date, field)
407 | cap.name = date
408 | market_cap.append(cap)
409 | market_cap = pd.concat(market_cap, axis=1).astype(float).reindex(index=securities)
410 |
411 | if ln:
412 | market_cap = np.log(market_cap)
413 |
414 | return market_cap.T
415 |
416 | def _get_circulating_market_cap(self, securities, start_date, end_date,
417 | ln=False):
418 | return self._get_market_cap(securities, start_date, end_date,
419 | ln=ln, field='circulating_market_cap')
420 |
421 | def _get_average_weights(self, securities, start_date, end_date):
422 | return {sec: 1.0 for sec in securities}
423 |
424 | def get_weights(self, securities, start_date, end_date):
425 | start_date = date2str(start_date)
426 | end_date = date2str(end_date)
427 |
428 | if self.weight_method == 'avg':
429 | weight_api = self._get_average_weights
430 | elif self.weight_method == 'mktcap':
431 | weight_api = partial(self._get_market_cap, ln=False)
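# 编者注: 以下注释为补充的示意用法, 不属于原仓库代码.
# weight_method 的五种取值最终分别落到等权函数 _get_average_weights
# 和上面的两个市值辅助函数上. 一个最小的调用示意 (假设已通过
# jqdatasdk.auth 登录, 其中的股票代码与日期仅为举例):
#
#   api = DataApi(weight_method='ln_mktcap')
#   w = api.get_weights(['000001.XSHE', '600000.XSHG'],
#                       start_date='2023-01-04', end_date='2023-01-06')
#   # w 为 pd.DataFrame: index 为交易日, columns 为股票代码, 值为 ln(总市值)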
432 | elif self.weight_method == 'ln_mktcap': 433 | weight_api = partial(self._get_market_cap, ln=True) 434 | elif self.weight_method == 'cmktcap': 435 | weight_api = partial(self._get_circulating_market_cap, ln=False) 436 | elif self.weight_method == 'ln_cmktcap': 437 | weight_api = partial(self._get_circulating_market_cap, ln=True) 438 | else: 439 | raise ValueError('invalid weight_method') 440 | 441 | return weight_api(securities=securities, start_date=start_date, 442 | end_date=end_date) 443 | 444 | @property 445 | def apis(self): 446 | return dict(prices=self.get_prices, 447 | groupby=self.get_groupby, 448 | weights=self.get_weights) 449 | -------------------------------------------------------------------------------- /jqfactor_analyzer/exceptions.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | 4 | from functools import wraps 5 | 6 | 7 | def rethrow(exception, additional_message): 8 | """ 9 | 重新抛出当前作用域中的最后一个异常, 保留堆栈信息, 并且在报错信息中添加其他内容 10 | """ 11 | e = exception 12 | m = additional_message 13 | if not e.args: 14 | e.args = (m,) 15 | else: 16 | e.args = (e.args[0] + m,) + e.args[1:] 17 | raise e 18 | 19 | 20 | def non_unique_bin_edges_error(func): 21 | """ 22 | 捕获 pd.qcut 的异常, 添加提示信息并报错 23 | """ 24 | message = u""" 25 | 根据输入的 quantiles 计算时发生错误. 26 |     这通常发生在输入包含太多相同值, 使得它们跨越多个分位. 27 | 每天的因子值是按照分位数平均分组的, 相同的值不能跨越多个分位数. 28 |     可能的解决方法: 29 | 1. 减少分位数 30 | 2. 调整因子减少重复值 31 | 3. 尝试不同的股票池 32 | """ 33 | 34 | @wraps(func) 35 | def dec(*args, **kwargs): 36 | try: 37 | return func(*args, **kwargs) 38 | except ValueError as e: 39 | if 'Bin edges must be unique' in str(e): 40 | rethrow(e, message) 41 | raise 42 | 43 | return dec 44 | 45 | 46 | class MaxLossExceededError(Exception): 47 | pass 48 | -------------------------------------------------------------------------------- /jqfactor_analyzer/factor_cache.py: -------------------------------------------------------------------------------- 1 | import hashlib 2 | from itertools import groupby 3 | import pandas as pd 4 | import os 5 | import json 6 | import functools 7 | import logging 8 | from .when import today, now, TimeDelta 9 | from tqdm import tqdm 10 | 11 | 12 | try: 13 | import jqdata 14 | api = jqdata.apis 15 | api_name = 'jqdata' 16 | except ImportError: 17 | import jqdatasdk 18 | api = jqdatasdk 19 | api_name = 'jqdatasdk' 20 | 21 | 22 | def get_cache_config(): 23 | """获取缓存目录""" 24 | config_path = os.path.join( 25 | os.path.dirname(os.path.abspath(__file__)), 'config.json' 26 | ) 27 | if not os.path.exists(config_path): 28 | return set_cache_dir("") 29 | else: 30 | with open(config_path, 'r') as conf: 31 | return json.load(conf) 32 | 33 | 34 | def set_cache_dir(path): 35 | """设置缓存目录""" 36 | cfg = {'default_dir': '~/jqfactor_datacache/bundle', 37 | 'user_dir': os.path.expanduser(path)} 38 | config_path = os.path.join( 39 | os.path.dirname(os.path.abspath(__file__)), 'config.json' 40 | ) 41 | with open(config_path, 'w') as conf: 42 | json.dump(cfg, conf) 43 | get_cache_dir.cache_clear() 44 | return cfg 45 | 46 | 47 | def get_factor_values(securities, factors=None, start_date=None, end_date=None, count=None): 48 | if api_name == 'jqdatasdk': 49 | func = api.get_factor_values 50 | else: 51 | from jqfactor import get_factor_values 52 | func = get_factor_values 53 | return func(securities, factors, start_date, end_date, count) 54 | 55 | 56 | @functools.lru_cache() 57 | def get_cache_dir(): 58 | # 优先获取用户配置的缓存目录, 若无, 则使用默认目录 59 | cfg = get_cache_config() 60 | user_path = 
cfg.get('user_dir', "") 61 | if user_path != "": 62 | return os.path.expanduser(user_path) 63 | return os.path.expanduser(cfg['default_dir']) 64 | 65 | 66 | def list_to_tuple_converter(func): 67 | @functools.wraps(func) 68 | def wrapper(*args, **kwargs): 69 | # 将所有位置参数中的 list 转换为 tuple 70 | args = tuple(tuple(arg) if isinstance( 71 | arg, list) else arg for arg in args) 72 | 73 | # 将关键字参数中的 list 转换为 tuple 74 | kwargs = {k: tuple(v) if isinstance(v, list) 75 | else v for k, v in kwargs.items()} 76 | 77 | return func(*args, **kwargs) 78 | return wrapper 79 | 80 | 81 | @list_to_tuple_converter 82 | @functools.lru_cache() 83 | def get_factor_folder(factor_names, group_name=None): 84 | """获取因子组的文件夹 85 | factor_names : 因子名列表 86 | group_name : 因子组的名称, 如果指定则使用指定的名称作为文件夹名 87 | 否则用 jqfactor_cache_ + 因子名的 md5 值 (顺序无关) 作为文件夹名 88 | """ 89 | if group_name: 90 | return group_name 91 | else: 92 | if factor_names == 'prices': 93 | return 'jqprice_cache' 94 | if isinstance(factor_names, str): 95 | factor_names = [factor_names] 96 | factor_names = sorted(factor_names) 97 | factor_names = ''.join(factor_names) 98 | hash_object = hashlib.md5(factor_names.encode()) 99 | hash_hex = hash_object.hexdigest() 100 | return f"jqfactor_cache_{hash_hex}" 101 | 102 | 103 | def get_date_miss_group(A, B): 104 | '''将A相比B缺失的部分按连续性进行分组''' 105 | group_values = [] 106 | masks = [(x not in A) for x in B] 107 | for key, group in groupby(zip(B, masks), lambda x: x[1]): 108 | if key: 109 | group_values.append([item[0] for item in group]) 110 | return group_values 111 | 112 | 113 | def save_data_by_month(factor_names, start, end, month_path): 114 | """按时间段获取储存数据(不要跨月) 115 | """ 116 | start = pd.to_datetime(start) 117 | end = pd.to_datetime(end) 118 | security_info = api.get_all_securities() 119 | security_info.start_date = pd.to_datetime(security_info.start_date) 120 | security_info.end_date = pd.to_datetime(security_info.end_date) 121 | 122 | month_value = {} 123 | stocks = security_info[(security_info.start_date <= end) & ( 124 | security_info.end_date >= start)].index.tolist() 125 | if factor_names == 'prices': 126 | month_value = api.get_price(stocks, start_date=start, end_date=end, 127 | skip_paused=False, round=False, 128 | fields=['open', 'close', 'factor'], 129 | fq='post', panel=False) 130 | if month_value.empty: 131 | return 0 132 | month_value.set_index(['code', 'time'], inplace=True) 133 | month_value[['open', 'close']] = month_value[[ 134 | 'open', 'close']].div(month_value['factor'], axis=0) 135 | else: 136 | for factor in factor_names: 137 | month_value.update(get_factor_values(stocks, 138 | start_date=start, 139 | end_date=end, 140 | factors=factor)) 141 | if not month_value: 142 | return 0 143 | month_value = pd.concat(month_value).unstack(level=1).T 144 | month_value.index.names = ('code', 'date') 145 | 146 | for date, data in month_value.groupby(month_value.index.get_level_values(1)): 147 | data = data.reset_index(level=1, drop=True) 148 | data = data.reindex(security_info[(security_info.start_date <= date) & ( 149 | security_info.end_date >= date)].index.tolist()) 150 | # 数据未产生, 或者已经生产了但是全为 nan 151 | if data.isna().values.all(): 152 | continue 153 | path = os.path.join(month_path, date.strftime("%Y%m%d") + ".feather") 154 | data.reset_index().to_feather(path) 155 | return month_value 156 | 157 | 158 | def save_factor_values_by_group(start_date, end_date, 159 | factor_names='prices', group_name=None, 160 | overwrite=False, cache_dir=None, show_progress=True): 161 | """将因子库数据按因子组储存到本地 162 | start_date : 开始时间 163 | 
end_date : 结束时间
164 | factor_names : 因子组所含因子的名称,除过因子库中支持的因子外,还支持指定为'prices'缓存价格数据
165 | overwrite : 文件已存在时是否覆盖更新
166 | 返回 : 因子组储存的路径, 文件以天为单位储存, 每天一个 feather 文件, 每月一个文件夹; 文件第一列为标的代码 (code), 其后各列为因子值, 每行对应当天在市的一个标的
167 | """
168 | if cache_dir is None:
169 | cache_dir = get_cache_dir()
170 |
171 | start_date = pd.to_datetime(start_date).date()
172 | last_day = today() - TimeDelta(days=1) if now().hour > 8 else today() - TimeDelta(days=2)
173 | end_date = min(pd.to_datetime(end_date).date(), last_day)
174 | date_range = pd.date_range(start_date, end_date, freq='1M')
175 | _date = pd.to_datetime(end_date)
176 | if len(date_range) == 0 or date_range[-1] < _date:
177 | date_range = date_range.append(pd.Index([_date]))
178 |
179 | if show_progress:
180 | if isinstance(show_progress, str):
181 | desc = show_progress
182 | elif factor_names == 'prices':
183 | desc = 'check/save price cache '
184 | else:
185 | desc = 'check/save factor cache '
186 | date_range = tqdm(date_range, total=len(date_range), desc=desc)
187 | root_path = os.path.join(
188 | cache_dir, get_factor_folder(factor_names, group_name))
189 |
190 | for end in date_range:
191 | start = max(end.replace(day=1).date(), start_date)
192 | month_path = os.path.join(root_path, end.strftime("%Y%m"))
193 | if not os.path.exists(month_path):
194 | os.makedirs(month_path)
195 | elif not overwrite:
196 | dates = [x.split(".")[0] for x in os.listdir(month_path)]
197 | dates = pd.to_datetime(dates).date
198 | trade_days = api.get_trade_days(start, end)
199 | miss_group = get_date_miss_group(dates, trade_days)
200 | if miss_group:
201 | for group in miss_group:
202 | save_data_by_month(
203 | factor_names, group[0], group[-1], month_path)
204 | continue
205 | save_data_by_month(factor_names, start, end, month_path)
206 |
207 | return root_path
208 |
209 |
210 | def get_factor_values_by_cache(date, codes=None, factor_names=None, group_name=None, factor_path=None):
211 | """从缓存的文件读取因子数据, 文件不存在时返回空的 DataFrame"""
212 | date = pd.to_datetime(date)
213 | if factor_path:
214 | path = os.path.join(factor_path,
215 | date.strftime("%Y%m"),
216 | date.strftime("%Y%m%d") + ".feather")
217 | elif group_name:
218 | path = os.path.join(get_cache_dir(),
219 | group_name,
220 | date.strftime("%Y%m"),
221 | date.strftime("%Y%m%d") + ".feather")
222 | elif factor_names:
223 | path = os.path.join(get_cache_dir(),
224 | get_factor_folder(factor_names),
225 | date.strftime("%Y%m"),
226 | date.strftime("%Y%m%d") + ".feather")
227 | else:
228 | raise ValueError("factor_names, factor_path 和 group_name 至少指定其中一个")
229 | # 数据未产生, 或者已经产生了但是全为 nan
230 | if not os.path.exists(path):
231 | factor_names = factor_names if factor_names != 'prices' else [
232 | 'open', 'close', 'factor']
233 | data = pd.DataFrame(index=codes, columns=factor_names)
234 | data.index.name = 'code'
235 | return data
236 |
237 | try:
238 | data = pd.read_feather(path, use_threads=False).set_index('code')
239 | except Exception as e:
240 | if factor_names:
241 | logging.error("\n{} 缓存文件可能已损坏, 正在重新下载".format(date))
242 | save_data_by_month(factor_names,
243 | date, date,
244 | os.path.dirname(path))
245 | data = get_factor_values_by_cache(
246 | date, codes, factor_names, factor_path=factor_path)
247 | else:
248 | raise ValueError(
249 | "\n{} 缓存文件可能已损坏, 请重新下载 (指定 factor_names 时会自动下载) {} ".format(date, e))
250 |
251 | if codes is not None:
252 | data = data.reindex(codes)
253 |
254 | return data
255 |
--------------------------------------------------------------------------------
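编者注: 下面补充一个简短示例 (非仓库原文件), 演示上面 factor_cache 模块中两个纯函数的行为:
get_factor_folder 生成的缓存文件夹名与因子列表顺序无关 (排序后取 md5), get_date_miss_group
则把缺失日期按连续区间分组, 供增量更新时按区间补拉数据. 示例假设已安装 jqfactor_analyzer
及其依赖 jqdatasdk (导入该模块时需要); 其中的因子名与日期字符串仅为举例:

    from jqfactor_analyzer.factor_cache import get_factor_folder, get_date_miss_group

    # 因子列表顺序无关: 两种顺序得到同一个文件夹名
    assert (get_factor_folder(['roe', 'total_asset_turnover_rate'])
            == get_factor_folder(['total_asset_turnover_rate', 'roe']))
    # 价格数据使用固定的文件夹名
    assert get_factor_folder('prices') == 'jqprice_cache'

    # 已缓存日期相对交易日列表的缺失部分, 按连续性分为两组
    cached = ['0102', '0103']
    trade_days = ['0101', '0102', '0103', '0104', '0105']
    print(get_date_miss_group(cached, trade_days))
    # 输出: [['0101'], ['0104', '0105']]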
/jqfactor_analyzer/performance.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | 4 | import numpy as np 5 | from scipy import stats 6 | import pandas as pd 7 | from statsmodels.api import OLS, add_constant 8 | 9 | from .compat import rolling_apply 10 | from .prepare import demean_forward_returns, common_start_returns 11 | from .utils import get_forward_returns_columns 12 | 13 | 14 | def factor_information_coefficient( 15 | factor_data, group_adjust=False, by_group=False, method=stats.spearmanr 16 | ): 17 | """ 18 | 通过因子值与因子远期收益计算信息系数(IC). 19 | 20 | 参数 21 | ---------- 22 | factor_data : pd.DataFrame - MultiIndex 23 | 一个 DataFrame, index 为日期 (level 0) 和资产(level 1) 的 MultiIndex, 24 | values 包括因子的值, 各期因子远期收益, 因子分位数, 25 | 因子分组(可选), 因子权重(可选) 26 | group_adjust : bool 27 | 是否使用分组去均值后的因子远期收益计算 IC. 28 | by_group : bool 29 | 是否分组计算 IC. 30 | Returns 31 | ------- 32 | ic : pd.DataFrame 33 | 因子信息系数(IC). 34 | """ 35 | 36 | def src_ic(group): 37 | f = group['factor'] 38 | _ic = group[get_forward_returns_columns(factor_data.columns)] \ 39 | .apply(lambda x: method(x, f)[0]) 40 | return _ic 41 | 42 | factor_data = factor_data.copy() 43 | 44 | grouper = [factor_data.index.get_level_values('date')] 45 | 46 | if group_adjust: 47 | factor_data = demean_forward_returns(factor_data, grouper + ['group']) 48 | if by_group: 49 | grouper.append('group') 50 | 51 | with np.errstate(divide='ignore', invalid='ignore'): 52 | ic = factor_data.groupby(grouper).apply(src_ic) 53 | 54 | return ic 55 | 56 | 57 | def mean_information_coefficient( 58 | factor_data, 59 | group_adjust=False, 60 | by_group=False, 61 | by_time=None, 62 | method=stats.spearmanr 63 | ): 64 | """ 65 | 根据不同分组求因子 IC 均值. 66 | 67 | 参数 68 | ---------- 69 | factor_data : pd.DataFrame - MultiIndex 70 | 一个 DataFrame, index 为日期 (level 0) 和资产(level 1) 的 MultiIndex, 71 | values 包括因子的值, 各期因子远期收益, 因子分位数, 72 | 因子分组(可选), 因子权重(可选) 73 | group_adjust : bool 74 | 是否使用分组去均值后的因子远期收益计算 IC. 75 | by_group : bool 76 | 是否分组计算 IC. 77 | by_time : str (pd time_rule), optional 78 | 根据相应的时间频率计算 IC 均值 79 | 时间频率参见 http://pandas.pydata.org/pandas-docs/stable/timeseries.html 80 | 81 | 返回值 82 | ------- 83 | ic : pd.DataFrame 84 | 根据不同分组求出的因子 IC 均值序列 85 | """ 86 | 87 | ic = factor_information_coefficient( 88 | factor_data, group_adjust, by_group, method=method 89 | ) 90 | 91 | grouper = [] 92 | if by_time is not None: 93 | grouper.append(pd.Grouper(freq=by_time)) 94 | if by_group: 95 | grouper.append('group') 96 | 97 | if len(grouper) == 0: 98 | ic = ic.mean() 99 | 100 | else: 101 | ic = (ic.reset_index().set_index('date').groupby(grouper).mean()) 102 | 103 | return ic 104 | 105 | 106 | def factor_returns(factor_data, demeaned=True, group_adjust=False): 107 | """ 108 | 计算按因子值加权的投资组合的收益 109 | 权重为去均值的因子除以其绝对值之和 (实现总杠杆率为1). 110 | 111 | 参数 112 | ---------- 113 | factor_data : pd.DataFrame - MultiIndex 114 | 一个 DataFrame, index 为日期 (level 0) 和资产(level 1) 的 MultiIndex, 115 | values 包括因子的值, 各期因子远期收益, 因子分位数, 116 | 因子分组(可选), 因子权重(可选) 117 | demeaned : bool 118 | 因子分析是否基于一个多空组合? 如果是 True, 则计算权重时因子值需要去均值 119 | group_adjust : bool 120 | 因子分析是否基于一个分组(行业)中性的组合? 
121 | 如果是 True, 则计算权重时因子值需要根据分组和日期去均值 122 | 123 | 返回值 124 | ------- 125 | returns : pd.DataFrame 126 | 每期零风险暴露的多空组合收益 127 | """ 128 | 129 | def to_weights(group, is_long_short): 130 | if is_long_short: 131 | demeaned_vals = group - group.mean() 132 | return demeaned_vals / demeaned_vals.abs().sum() 133 | else: 134 | return group / group.abs().sum() 135 | 136 | grouper = [factor_data.index.get_level_values('date')] 137 | if group_adjust: 138 | grouper.append('group') 139 | 140 | weights = factor_data.groupby(grouper)['factor'] \ 141 | .apply(to_weights, demeaned) 142 | 143 | if group_adjust: 144 | weights = weights.groupby(level='date').apply(to_weights, False) 145 | 146 | weighted_returns = \ 147 | factor_data[get_forward_returns_columns(factor_data.columns)] \ 148 | .multiply(weights, axis=0) 149 | 150 | returns = weighted_returns.groupby(level='date').sum() 151 | 152 | return returns 153 | 154 | 155 | def factor_alpha_beta(factor_data, demeaned=True, group_adjust=False): 156 | """ 157 | 计算因子的alpha(超额收益), 158 | alpha t-统计量 (alpha 显著性)和 beta(市场暴露). 159 | 使用每期平均远期收益作为自变量(视为市场组合收益) 160 | 因子值加权平均的远期收益作为因变量(视为因子收益), 进行回归. 161 | 162 | Parameters 163 | ---------- 164 | factor_data : pd.DataFrame - MultiIndex 165 | 一个 DataFrame, index 为日期 (level 0) 和资产(level 1) 的 MultiIndex, 166 | values 包括因子的值, 各期因子远期收益, 因子分位数, 167 | 因子分组(可选), 因子权重(可选) 168 | demeaned : bool 169 | 因子分析是否基于一个多空组合? 如果是 True, 则计算权重时因子值需要去均值 170 | group_adjust : bool 171 | 因子分析是否基于一个分组(行业)中性的组合? 172 | 如果是 True, 则计算权重时因子值需要根据分组和日期去均值 173 | Returns 174 | ------- 175 | alpha_beta : pd.Series 176 | 一个包含 alpha, beta, a t-统计量(alpha) 的序列 177 | """ 178 | 179 | returns = factor_returns(factor_data, demeaned, group_adjust) 180 | 181 | universe_ret = factor_data.groupby(level='date')[ 182 | get_forward_returns_columns(factor_data.columns)] \ 183 | .mean().loc[returns.index] 184 | 185 | if isinstance(returns, pd.Series): 186 | returns.name = universe_ret.columns.values[0] 187 | returns = pd.DataFrame(returns) 188 | 189 | alpha_beta = pd.DataFrame() 190 | for period in returns.columns.values: 191 | x = universe_ret[period].values 192 | y = returns[period].values 193 | x = add_constant(x) 194 | period_int = int(period.replace('period_', '')) 195 | 196 | reg_fit = OLS(y, x).fit() 197 | alpha, beta = reg_fit.params 198 | 199 | alpha_beta.loc['Ann. alpha', period] = \ 200 | (1 + alpha) ** (250.0 / period_int) - 1 201 | alpha_beta.loc['beta', period] = beta 202 | 203 | return alpha_beta 204 | 205 | 206 | def cumulative_returns(returns, period): 207 | """ 208 | 从'N 期'因子远期收益率构建累积收益 209 | 当 'period' N 大于 1 时, 建立平均 N 个交错的投资组合 (在随后的时段 1,2,3,...,N 开始), 210 | 每个 N 个周期重新调仓, 最后计算 N 个投资组合累积收益的均值。 211 | 212 | 参数 213 | ---------- 214 | returns: pd.Series 215 | N 期因子远期收益序列 216 | period: integer 217 | 对应的因子远期收益时间跨度 218 | 219 | 返回值 220 | ------- 221 | pd.Series 222 | 累积收益序列 223 | """ 224 | 225 | returns = returns.fillna(0) 226 | 227 | if period == 1: 228 | return returns.add(1).cumprod() 229 | # 230 | # 构建 N 个交错的投资组合 231 | # 232 | 233 | def split_portfolio(ret, period): 234 | return pd.DataFrame(np.diag(ret)) 235 | 236 | sub_portfolios = returns.groupby( 237 | np.arange(len(returns.index)) // period, axis=0 238 | ).apply(split_portfolio, period) 239 | sub_portfolios.index = returns.index 240 | 241 | # 242 | # 将 N 期收益转换为 1 期收益, 方便计算累积收益 243 | # 244 | 245 | def rate_of_returns(ret, period): 246 | return ((np.nansum(ret) + 1)**(1. 
/ period)) - 1 247 | 248 | sub_portfolios = rolling_apply( 249 | sub_portfolios, 250 | window=period, 251 | func=rate_of_returns, 252 | min_periods=1, 253 | args=(period,) 254 | ) 255 | sub_portfolios = sub_portfolios.add(1).cumprod() 256 | 257 | # 258 | # 求 N 个投资组合累积收益均值 259 | # 260 | return sub_portfolios.mean(axis=1) 261 | 262 | 263 | def weighted_mean_return(factor_data, grouper): 264 | """计算(年化)加权平均/标准差""" 265 | forward_returns_columns = get_forward_returns_columns(factor_data.columns) 266 | 267 | def agg(values, weights): 268 | count = len(values) 269 | average = np.average(values, weights=weights, axis=0) 270 | # Fast and numerically precise 271 | variance = np.average( 272 | (values - average)**2, weights=weights, axis=0 273 | ) * count / max((count - 1), 1) 274 | return pd.Series( 275 | [average, np.sqrt(variance), count], index=['mean', 'std', 'count'] 276 | ) 277 | 278 | group_stats = factor_data.groupby(grouper)[ 279 | forward_returns_columns.append(pd.Index(['weights']))] \ 280 | .apply(lambda x: x[forward_returns_columns].apply( 281 | agg, weights=x['weights'].fillna(0.0).values 282 | )) 283 | 284 | mean_ret = group_stats.xs('mean', level=-1) 285 | 286 | std_error_ret = group_stats.xs('std', level=-1) \ 287 | / np.sqrt(group_stats.xs('count', level=-1)) 288 | 289 | return mean_ret, std_error_ret 290 | 291 | 292 | def mean_return_by_quantile( 293 | factor_data, 294 | by_date=False, 295 | by_group=False, 296 | demeaned=True, 297 | group_adjust=False 298 | ): 299 | """ 300 | 计算各分位数的因子远期收益均值和标准差 301 | 302 | 参数 303 | ---------- 304 | factor_data : pd.DataFrame - MultiIndex 305 | 一个 DataFrame, index 为日期 (level 0) 和资产(level 1) 的 MultiIndex, 306 | values 包括因子的值, 各期因子远期收益, 因子分位数, 307 | 因子分组(可选), 因子权重(可选) 308 | by_date : bool 309 | 如果为 True, 则按日期计算各分位数的因子远期收益均值 310 | by_group : bool 311 | 如果为 True, 则分组计算各分位数的因子远期收益均值 312 | demeaned : bool 313 | 是否按日期对因子远期收益去均值 314 | group_adjust : bool 315 | 是否按日期和分组对因子远期收益去均值 316 | Returns 317 | ------- 318 | mean_ret : pd.DataFrame 319 | 各分位数因子远期收益均值 320 | std_error_ret : pd.DataFrame 321 | 各分位数因子远期收益标准差 322 | """ 323 | 324 | if group_adjust: 325 | grouper = [factor_data.index.get_level_values('date')] + ['group'] 326 | factor_data = demean_forward_returns(factor_data, grouper) 327 | elif demeaned: 328 | factor_data = demean_forward_returns(factor_data) 329 | else: 330 | factor_data = factor_data.copy() 331 | 332 | grouper = ['factor_quantile'] 333 | if by_date: 334 | grouper.append(factor_data.index.get_level_values('date')) 335 | 336 | if by_group: 337 | grouper.append('group') 338 | 339 | mean_ret, std_error_ret = weighted_mean_return(factor_data, grouper=grouper) 340 | 341 | return mean_ret, std_error_ret 342 | 343 | 344 | def compute_mean_returns_spread( 345 | mean_returns, upper_quant, lower_quant, std_err=None 346 | ): 347 | """ 348 | 计算两个分位数的平均收益之差, 和(可选)计算此差异的标准差 349 | 350 | 参数 351 | ---------- 352 | mean_returns : pd.DataFrame 353 | 各分位数因子远期收益均值 354 | upper_quant : int 355 | 作为被减数的因子分位数 356 | lower_quant : int 357 | 作为减数的因子分位数 358 | std_err : pd.DataFrame 359 | 各分位数因子远期收益标准差 360 | 361 | 返回值 362 | ------- 363 | mean_return_difference : pd.Series 364 | 每期两个分位数的平均收益之差 365 | joint_std_err : pd.Series 366 | 每期两个分位数的平均收益标准差之差 367 | """ 368 | if isinstance(mean_returns.index, pd.MultiIndex): 369 | mean_return_difference = mean_returns.xs(upper_quant, 370 | level='factor_quantile') \ 371 | - mean_returns.xs(lower_quant, level='factor_quantile') 372 | else: 373 | mean_return_difference = mean_returns.loc[ 374 | upper_quant] - 
mean_returns.loc[lower_quant]
375 |
376 | if std_err is not None and isinstance(std_err.index, pd.MultiIndex):
377 | std1 = std_err.xs(upper_quant, level='factor_quantile')
378 | std2 = std_err.xs(lower_quant, level='factor_quantile')
379 | elif std_err is not None:
380 | std1 = std_err.loc[upper_quant]
381 | std2 = std_err.loc[lower_quant]
382 | joint_std_err = np.sqrt(std1**2 + std2**2) if std_err is not None else None
383 |
384 | return mean_return_difference, joint_std_err
385 |
386 |
387 | def quantile_turnover(quantile_factor, quantile, period=1):
388 | """
389 | 计算当期在分位数中的因子不在上一期分位数中的比例
390 |
391 | Parameters
392 | ----------
393 | quantile_factor : pd.Series
394 | 以日期和资产为 index 的因子分位数序列.
395 | quantile : int
396 | 对应的分位数
397 | period: int, optional
398 | 对应的因子远期收益时间跨度
399 | Returns
400 | -------
401 | quant_turnover : pd.Series
402 | 每期对应分位数因子的换手率
403 | """
404 |
405 | quant_names = quantile_factor[quantile_factor == quantile]
406 | quant_name_sets = quant_names.groupby(
407 | level=['date']
408 | ).apply(lambda x: set(x.index.get_level_values('asset')))
409 | new_names = (quant_name_sets - quant_name_sets.shift(period)).dropna()
410 | quant_turnover = new_names.apply(lambda x: len(x)) / quant_name_sets.apply(
411 | lambda x: len(x)
412 | )
413 | quant_turnover.name = quantile
414 | return quant_turnover
415 |
416 |
417 | def factor_autocorrelation(factor_data, period=1, rank=True):
418 | """
419 | 计算指定时间跨度内平均因子排名/因子值的自相关性.
420 | 该指标对于衡量因子的换手率非常有用.
421 | 如果每个因子值在一个周期内随机变化, 我们预计自相关为 0.
422 |
423 | 参数
424 | ----------
425 | factor_data : pd.DataFrame - MultiIndex
426 | 一个 DataFrame, index 为日期 (level 0) 和资产(level 1) 的 MultiIndex,
427 | values 包括因子的值, 各期因子远期收益, 因子分位数,
428 | 因子分组(可选), 因子权重(可选)
429 | period: int, optional
430 | 对应的因子远期收益时间跨度
431 | Returns
432 | -------
433 | autocorr : pd.Series
434 | 滞后 period 期的因子自相关性
435 | """
436 |
437 | grouper = [factor_data.index.get_level_values('date')]
438 |
439 | if rank:
440 | ranks = factor_data.groupby(grouper)[['factor']].rank()
441 | else:
442 | ranks = factor_data[['factor']]
443 | asset_factor_rank = ranks.reset_index().pivot(
444 | index='date', columns='asset', values='factor'
445 | )
446 |
447 | autocorr = asset_factor_rank.corrwith(
448 | asset_factor_rank.shift(period), axis=1
449 | )
450 | autocorr.name = period
451 | return autocorr
452 |
453 |
454 | def average_cumulative_return_by_quantile(
455 | factor_data,
456 | prices,
457 | periods_before=10,
458 | periods_after=15,
459 | demeaned=True,
460 | group_adjust=False,
461 | by_group=False
462 | ):
463 | """
464 | 计算由 periods_before 到 periods_after 定义的周期范围内的因子分位数的平均累积收益率
465 |
466 | 参数
467 | ----------
468 | factor_data : pd.DataFrame - MultiIndex
469 | 一个 DataFrame, index 为日期 (level 0) 和资产(level 1) 的 MultiIndex,
470 | values 包括因子的值, 各期因子远期收益, 因子分位数,
471 | 因子分组(可选), 因子权重(可选)
472 | prices : pd.DataFrame
473 | 用于计算因子远期收益的价格数据
474 | columns 为资产, index 为日期.
475 | 价格数据必须覆盖因子分析时间段以及额外远期收益计算中的最大预期期数.
476 | periods_before : int, optional 477 | 之前多少期 478 | periods_after : int, optional 479 | 之后多少期 480 | demeaned : bool, optional 481 | 是否按日期对因子远期收益去均值 482 | group_adjust : bool 483 | 是否按日期和分组对因子远期收益去均值 484 | by_group : bool 485 | 如果为 True, 则分组计算各分位数的因子远期累积收益 486 | Returns 487 | ------- 488 | cumulative returns and std deviation : pd.DataFrame 489 | 一个 DataFrame, index 为分位数 (level 0) 和 'mean'/'std' (level 1) 的 MultiIndex 490 | columns 为取值范围从 -periods_before 到 periods_after 的整数 491 | 如果 by_group=True, 则 index 会多出一个 'group' level 492 | """ 493 | 494 | def cumulative_return(q_fact, demean_by): 495 | return common_start_returns( 496 | q_fact, prices, periods_before, periods_after, True, True, demean_by 497 | ) 498 | 499 | def average_cumulative_return(q_fact, demean_by): 500 | q_returns = cumulative_return(q_fact, demean_by) 501 | return pd.DataFrame( 502 | { 503 | 'mean': q_returns.mean(axis=1), 504 | 'std': q_returns.std(axis=1) 505 | } 506 | ).T 507 | 508 | if by_group: 509 | 510 | returns_bygroup = [] 511 | 512 | for group, g_data in factor_data.groupby('group'): 513 | g_fq = g_data['factor_quantile'] 514 | if group_adjust: 515 | demean_by = g_fq # demeans at group level 516 | elif demeaned: 517 | demean_by = factor_data['factor_quantile'] # demean by all 518 | else: 519 | demean_by = None 520 | # 521 | # Align cumulative return from different dates to the same index 522 | # then compute mean and std 523 | # 524 | avgcumret = g_fq.groupby(g_fq).apply( 525 | average_cumulative_return, demean_by 526 | ) 527 | avgcumret['group'] = group 528 | avgcumret.set_index('group', append=True, inplace=True) 529 | returns_bygroup.append(avgcumret) 530 | 531 | return pd.concat(returns_bygroup, axis=0) 532 | 533 | else: 534 | 535 | if group_adjust: 536 | all_returns = [] 537 | for group, g_data in factor_data.groupby('group'): 538 | g_fq = g_data['factor_quantile'] 539 | avgcumret = g_fq.groupby(g_fq).apply(cumulative_return, g_fq) 540 | all_returns.append(avgcumret) 541 | q_returns = pd.concat(all_returns, axis=1) 542 | q_returns = pd.DataFrame( 543 | { 544 | 'mean': q_returns.mean(axis=1), 545 | 'std': q_returns.std(axis=1) 546 | } 547 | ) 548 | return q_returns.unstack(level=1).stack(level=0) 549 | elif demeaned: 550 | fq = factor_data['factor_quantile'] 551 | return fq.groupby(fq).apply(average_cumulative_return, fq) 552 | else: 553 | fq = factor_data['factor_quantile'] 554 | return fq.groupby(fq).apply(average_cumulative_return, None) 555 | -------------------------------------------------------------------------------- /jqfactor_analyzer/plot_utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | 4 | import sys 5 | import subprocess 6 | from functools import wraps 7 | 8 | import matplotlib as mpl 9 | import seaborn as sns 10 | import pandas as pd 11 | 12 | 13 | def customize(func): 14 | 15 | @wraps(func) 16 | def call_w_context(*args, **kwargs): 17 | 18 | if not PlotConfig.FONT_SETTED: 19 | _use_chinese(True) 20 | 21 | set_context = kwargs.pop('set_context', True) 22 | if set_context: 23 | with plotting_context(), axes_style(): 24 | sns.despine(left=True) 25 | return func(*args, **kwargs) 26 | else: 27 | return func(*args, **kwargs) 28 | 29 | return call_w_context 30 | 31 | 32 | def plotting_context(context='notebook', font_scale=1.5, rc=None): 33 | 34 | if rc is None: 35 | rc = {} 36 | 37 | rc_default = {'lines.linewidth': 1.5} 38 | 39 | for name, val in rc_default.items(): 40 | rc.setdefault(name, val) 41 | 42 | return 
sns.plotting_context(context=context, font_scale=font_scale, rc=rc) 43 | 44 | 45 | def axes_style(style='darkgrid', rc=None): 46 | 47 | if rc is None: 48 | rc = {} 49 | 50 | rc_default = {} 51 | 52 | for name, val in rc_default.items(): 53 | rc.setdefault(name, val) 54 | 55 | return sns.axes_style(style=style, rc=rc) 56 | 57 | 58 | def print_table(table, name=None, fmt=None): 59 | 60 | from IPython.display import display 61 | 62 | if isinstance(table, pd.Series): 63 | table = pd.DataFrame(table) 64 | 65 | if isinstance(table, pd.DataFrame): 66 | table.columns.name = name 67 | 68 | prev_option = pd.get_option('display.float_format') 69 | if fmt is not None: 70 | pd.set_option('display.float_format', lambda x: fmt.format(x)) 71 | 72 | display(table) 73 | 74 | if fmt is not None: 75 | pd.set_option('display.float_format', prev_option) 76 | 77 | 78 | class PlotConfig(object): 79 | FONT_SETTED = False 80 | USE_CHINESE_LABEL = False 81 | MPL_FONT_FAMILY = mpl.rcParams["font.family"] 82 | MPL_FONT = mpl.rcParams["font.sans-serif"] 83 | MPL_UNICODE_MINUS = mpl.rcParams["axes.unicode_minus"] 84 | 85 | 86 | def get_chinese_font(): 87 | if sys.platform.startswith('linux'): 88 | cmd = 'fc-list :lang=zh -f "%{family}\n"' 89 | output = subprocess.check_output(cmd, shell=True) 90 | if isinstance(output, bytes): 91 | output = output.decode("utf-8") 92 | zh_fonts = [ 93 | f.split(',', 1)[0] for f in output.split('\n') if f.split(',', 1)[0] 94 | ] 95 | return zh_fonts 96 | 97 | return [] 98 | 99 | 100 | def _use_chinese(use=None): 101 | if use is None: 102 | return PlotConfig.USE_CHINESE_LABEL 103 | elif use: 104 | PlotConfig.USE_CHINESE_LABEL = use 105 | PlotConfig.FONT_SETTED = True 106 | _set_chinese_fonts() 107 | else: 108 | PlotConfig.USE_CHINESE_LABEL = use 109 | PlotConfig.FONT_SETTED = True 110 | _set_default_fonts() 111 | 112 | 113 | def _set_chinese_fonts(): 114 | default_chinese_font = ['SimHei', 'FangSong', 'STXihei', 'Hiragino Sans GB', 115 | 'Heiti SC', 'WenQuanYi Micro Hei'] 116 | chinese_font = default_chinese_font + get_chinese_font() 117 | # 设置中文字体 118 | mpl.rc( 119 | "font", **{ 120 | # seaborn 需要设置 sans-serif 121 | "sans-serif": chinese_font, 122 | "family": ','.join(chinese_font) + ',sans-serif' 123 | } 124 | ) 125 | # 防止负号乱码 126 | mpl.rcParams["axes.unicode_minus"] = False 127 | 128 | 129 | def _set_default_fonts(): 130 | mpl.rc( 131 | "font", **{ 132 | "sans-serif": PlotConfig.MPL_FONT, 133 | "family": PlotConfig.MPL_FONT_FAMILY 134 | } 135 | ) 136 | mpl.rcParams["axes.unicode_minus"] = PlotConfig.MPL_UNICODE_MINUS 137 | 138 | 139 | class _PlotLabels(object): 140 | 141 | def get(self, v): 142 | if _use_chinese(): 143 | return getattr(self, v + "_CN") 144 | else: 145 | return getattr(self, v + "_EN") 146 | 147 | 148 | class ICTS(_PlotLabels): 149 | TITLE_CN = "{} 天 IC" 150 | TITLE_EN = "{} Period Forward Return Information Coefficient (IC)" 151 | LEGEND_CN = ["IC", "1个月移动平均"] 152 | LEGEND_EN = ["IC", "1 month moving avg"] 153 | TEXT_CN = "均值 {:.3f} \n方差 {:.3f}" 154 | TEXT_EN = "Mean {:.3f} \nStd. {:.3f}" 155 | 156 | 157 | ICTS = ICTS() 158 | 159 | 160 | class ICHIST(_PlotLabels): 161 | TITLE_CN = "%s 天 IC 分布直方图" 162 | TITLE_EN = "%s Period IC" 163 | LEGEND_CN = "均值 {:.3f} \n方差 {:.3f}" 164 | LEGEND_EN = "Mean {:.3f} \nStd. 
{:.3f}" 165 | 166 | 167 | ICHIST = ICHIST() 168 | 169 | 170 | class ICQQ(_PlotLabels): 171 | NORM_CN = "正态" 172 | NORM_EN = "Normal" 173 | T_CN = "T" 174 | T_EN = "T" 175 | CUSTOM_CN = "自定义" 176 | CUSTOM_EN = "Theoretical" 177 | TITLE_CN = "{} 天 IC {}分布 Q-Q 图" 178 | TITLE_EN = "{} Period IC {} Dist. Q-Q" 179 | XLABEL_CN = "{} 分布分位数" 180 | XLABEL_EN = "{} Distribution Quantile" 181 | YLABEL_CN = "Observed Quantile" 182 | YLABEL_EN = "Observed Quantile" 183 | 184 | 185 | ICQQ = ICQQ() 186 | 187 | 188 | class QRETURNBAR(_PlotLabels): 189 | COLUMN_CN = "{} 天" 190 | COLUMN_EN = "{} Day" 191 | TITLE_CN = "各分位数平均收益" 192 | TITLE_EN = "Mean Period Wise Return By Factor Quantile" 193 | YLABEL_CN = "平均收益 (bps)" 194 | YLABEL_EN = "Mean Return (bps)" 195 | 196 | 197 | QRETURNBAR = QRETURNBAR() 198 | 199 | 200 | class QRETURNVIOLIN(_PlotLabels): 201 | LEGENDNAME_CN = "滞后天数" 202 | LEGENDNAME_EN = "forward periods" 203 | TITLE_CN = "各分位数收益分布图" 204 | TITLE_EN = "Period Wise Return By Factor Quantile" 205 | YLABEL_CN = "收益 (bps)" 206 | YLABEL_EN = "Return (bps)" 207 | 208 | 209 | QRETURNVIOLIN = QRETURNVIOLIN() 210 | 211 | 212 | class QRETURNTS(_PlotLabels): 213 | TITLE_CN = "最大分位收益减最小分位收益 ({} 天)" 214 | TITLE_EN = "Top Minus Bottom Quantile Mean Return ({} Period Forward Return)" 215 | LEGEND0_CN = "当日收益 (加减 {:.2f} 倍当日标准差)" 216 | LEGEND0_EN = "mean returns spread (+/- {:.2f} std)" 217 | LEGEND1_CN = "1 个月移动平均" 218 | LEGEND1_EN = "1 month moving avg" 219 | YLABEL_CN = "分位数平均收益差 (bps)" 220 | YLABEL_EN = "Difference In Quantile Mean Return (bps)" 221 | 222 | 223 | QRETURNTS = QRETURNTS() 224 | 225 | 226 | class ICGROUP(_PlotLabels): 227 | TITLE_CN = "分组 IC" 228 | TITLE_EN = "Information Coefficient By Group" 229 | 230 | 231 | ICGROUP = ICGROUP() 232 | 233 | 234 | class AUTOCORR(_PlotLabels): 235 | TITLE_CN = "因子自相关性 (滞后 {} 天)" 236 | TITLE_EN = "{} Period Factor Autocorrelation" 237 | YLABEL_CN = "自相关性" 238 | YLABEL_EN = "Autocorrelation Coefficient" 239 | TEXT_CN = "均值 {:.3f}" 240 | TEXT_EN = "Mean {:.3f}" 241 | 242 | 243 | AUTOCORR = AUTOCORR() 244 | 245 | 246 | class TBTURNOVER(_PlotLabels): 247 | TURNOVER_CN = "{:d} 分位换手率" 248 | TURNOVER_EN = "quantile {:d} turnover" 249 | TITLE_CN = "{} 天换手率" 250 | TITLE_EN = "{} Period Top and Bottom Quantile Turnover" 251 | YLABEL_CN = "分位数换手率" 252 | YLABEL_EN = "Proportion Of Names New To Quantile" 253 | 254 | 255 | TBTURNOVER = TBTURNOVER() 256 | 257 | 258 | class ICHEATMAP(_PlotLabels): 259 | TITLE_CN = "{} 天 IC 月度均值" 260 | TITLE_EN = "Monthly Mean {} Period IC" 261 | 262 | 263 | ICHEATMAP = ICHEATMAP() 264 | 265 | 266 | class CUMRET(_PlotLabels): 267 | YLABEL_CN = "累积收益" 268 | YLABEL_EN = "Cumulative Returns" 269 | TITLE_CN = "因子值加权多空组合累积收益 ({} 天平均)" 270 | TITLE_EN = """Factor Weighted Long/Short Portfolio Cumulative Return 271 | ({} Fwd Period)""" 272 | 273 | 274 | CUMRET = CUMRET() 275 | 276 | 277 | class TDCUMRET(_PlotLabels): 278 | YLABEL_CN = "累积收益" 279 | YLABEL_EN = "Cumulative Returns" 280 | TITLE_CN = "做多最大分位做空最小分位组合累积收益 ({} 天平均)" 281 | TITLE_EN = """Long Top/Short Bottom Factor Portfolio Cumulative Return 282 | ({} Fwd Period)""" 283 | 284 | 285 | TDCUMRET = TDCUMRET() 286 | 287 | 288 | class CUMRETQ(_PlotLabels): 289 | YLABEL_CN = "累积收益(对数轴)" 290 | YLABEL_EN = "Log Cumulative Returns" 291 | TITLE_CN = "分位数 {} 天 Forward Return 累积收益 (对数轴)" 292 | TITLE_EN = """Cumulative Return by Quantile 293 | ({} Period Forward Return)""" 294 | 295 | 296 | CUMRETQ = CUMRETQ() 297 | 298 | 299 | class AVGCUMRET(_PlotLabels): 300 | TITLE_CN = "因子预测能力 (前 {} 天, 后 {} 天)" 301 | 
TITLE_EN = "Average Cumulative Returns by Quantile ({} days backword, {} days forward)" 302 | COLUMN_CN = "{} 分位" 303 | COLUMN_EN = "Quantile {}" 304 | XLABEL_CN = "天数" 305 | XLABEL_EN = "Periods" 306 | YLABEL_CN = "平均累积收益 (bps)" 307 | YLABEL_EN = "Mean Return (bps)" 308 | 309 | 310 | AVGCUMRET = AVGCUMRET() 311 | 312 | 313 | class EVENTSDIST(_PlotLabels): 314 | TITLE_CN = "因子数量随时间分布" 315 | TITLE_EN = "Distribution of events in time" 316 | XLABEL_CN = "日期" 317 | XLABEL_EN = "Date" 318 | YLABEL_CN = "因子数量" 319 | YLABEL_EN = "Number of events" 320 | 321 | 322 | EVENTSDIST = EVENTSDIST() 323 | 324 | 325 | class MISSIINGEVENTSDIST(_PlotLabels): 326 | TITLE_CN = "因子数量随时间分布" 327 | TITLE_EN = "Distribution of missing events in time" 328 | XLABEL_CN = "日期" 329 | XLABEL_EN = "Date" 330 | YLABEL_CN = "因子缺失率" 331 | YLABEL_EN = "Rate of missing events" 332 | 333 | 334 | MISSIINGEVENTSDIST = MISSIINGEVENTSDIST() 335 | -------------------------------------------------------------------------------- /jqfactor_analyzer/plotting.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | 4 | from __future__ import division, print_function 5 | 6 | import pandas as pd 7 | import numpy as np 8 | from scipy import stats 9 | from statsmodels.api import qqplot 10 | import matplotlib.pyplot as plt 11 | import matplotlib.cm as cm 12 | from matplotlib.ticker import ScalarFormatter 13 | import seaborn as sns 14 | 15 | from .compat import rolling_mean 16 | from .plot_utils import ( 17 | print_table, customize, ICTS, ICHIST, ICQQ, QRETURNBAR, QRETURNVIOLIN, 18 | QRETURNTS, ICGROUP, AUTOCORR, TBTURNOVER, ICHEATMAP, CUMRET, TDCUMRET, 19 | CUMRETQ, AVGCUMRET, EVENTSDIST, MISSIINGEVENTSDIST 20 | ) 21 | from .performance import cumulative_returns 22 | from .utils import (ignore_warning, convert_to_forward_returns_columns) 23 | 24 | 25 | DECIMAL_TO_BPS = 10000 26 | 27 | 28 | def plot_returns_table(alpha_beta, mean_ret_quantile, mean_ret_spread_quantile): 29 | returns_table = pd.DataFrame() 30 | returns_table = returns_table.append(alpha_beta) 31 | returns_table.loc["Mean Period Wise Return Top Quantile (bps)"] = \ 32 | mean_ret_quantile.iloc[-1] * DECIMAL_TO_BPS 33 | returns_table.loc["Mean Period Wise Return Bottom Quantile (bps)"] = \ 34 | mean_ret_quantile.iloc[0] * DECIMAL_TO_BPS 35 | returns_table.loc["Mean Period Wise Spread (bps)"] = \ 36 | mean_ret_spread_quantile.mean() * DECIMAL_TO_BPS 37 | 38 | print("收益分析") 39 | print_table(returns_table.apply(lambda x: x.round(3))) 40 | 41 | 42 | def plot_turnover_table(autocorrelation_data, quantile_turnover, return_df=False): 43 | turnover_table = pd.DataFrame() 44 | for period in sorted(quantile_turnover.keys()): 45 | for quantile, p_data in quantile_turnover[period].iteritems(): 46 | turnover_table.loc["Quantile {} Mean Turnover ".format(quantile), 47 | "{}".format(period)] = p_data.mean() 48 | auto_corr = pd.DataFrame() 49 | for period, p_data in autocorrelation_data.iteritems(): 50 | auto_corr.loc["Mean Factor Rank Autocorrelation", "{}" 51 | .format(period)] = p_data.mean() 52 | 53 | if return_df: 54 | return turnover_table.apply(lambda x: x.round(3)), auto_corr.apply(lambda x: x.round(3)) 55 | else: 56 | print("换手率分析") 57 | print_table(turnover_table.apply(lambda x: x.round(3))) 58 | print_table(auto_corr.apply(lambda x: x.round(3))) 59 | 60 | 61 | def plot_information_table(ic_data, return_df=False): 62 | ic_summary_table = pd.DataFrame() 63 | ic_summary_table["IC Mean"] = ic_data.mean() 64 | 
ic_summary_table["IC Std."] = ic_data.std() 65 | ic_summary_table["IR"] = ic_data.mean() / ic_data.std() 66 | t_stat, p_value = stats.ttest_1samp(ic_data, 0) 67 | ic_summary_table["t-stat(IC)"] = t_stat 68 | ic_summary_table["p-value(IC)"] = p_value 69 | ic_summary_table["IC Skew"] = stats.skew(ic_data) 70 | ic_summary_table["IC Kurtosis"] = stats.kurtosis(ic_data) 71 | 72 | if return_df: 73 | return ic_summary_table.apply(lambda x: x.round(3)).T 74 | else: 75 | print("IC 分析") 76 | print_table(ic_summary_table.apply(lambda x: x.round(3)).T) 77 | 78 | 79 | def plot_quantile_statistics_table(factor_data, return_df=False): 80 | quantile_stats = factor_data.groupby('factor_quantile') \ 81 | .agg(['min', 'max', 'mean', 'std', 'count'])['factor'] 82 | quantile_stats['count %'] = quantile_stats['count'] \ 83 | / quantile_stats['count'].sum() * 100. 84 | 85 | if return_df: 86 | return quantile_stats 87 | else: 88 | print("分位数统计") 89 | print_table(quantile_stats) 90 | 91 | 92 | @customize 93 | def plot_ic_ts(ic, ax=None): 94 | 95 | ic = ic.copy() 96 | 97 | num_plots = len(ic.columns) 98 | if ax is None: 99 | f, ax = plt.subplots(num_plots, 1, figsize=(18, num_plots * 7)) 100 | ax = np.asarray([ax]).flatten() 101 | 102 | ymin, ymax = (None, None) 103 | for a, (period, ic) in zip(ax, ic.iteritems()): 104 | period_num = period.replace('period_', '') 105 | ic.plot(alpha=0.7, ax=a, lw=0.7, color='steelblue') 106 | rolling_mean( 107 | ic, window=22 108 | ).plot( 109 | ax=a, color='forestgreen', lw=2, alpha=0.8 110 | ) 111 | 112 | a.axhline(0.0, linestyle='-', color='black', lw=1, alpha=0.8) 113 | a.set(ylabel='IC', xlabel="") 114 | a.set_title(ICTS.get("TITLE").format(period_num)) 115 | a.legend(ICTS.get("LEGEND"), loc='upper right') 116 | a.text( 117 | .05, 118 | .95, 119 | ICTS.get("TEXT").format(ic.mean(), ic.std()), 120 | fontsize=16, 121 | bbox={ 122 | 'facecolor': 'white', 123 | 'alpha': 1, 124 | 'pad': 5 125 | }, 126 | transform=a.transAxes, 127 | verticalalignment='top' 128 | ) 129 | 130 | curr_ymin, curr_ymax = a.get_ylim() 131 | ymin = curr_ymin if ymin is None else min(ymin, curr_ymin) 132 | ymax = curr_ymax if ymax is None else max(ymax, curr_ymax) 133 | 134 | for a in ax: 135 | a.set_ylim([ymin, ymax]) 136 | 137 | return ax 138 | 139 | 140 | @ignore_warning(message='Using a non-tuple sequence for multidimensional indexing is deprecated', 141 | category=FutureWarning) 142 | @customize 143 | def plot_ic_hist(ic, ax=None): 144 | 145 | ic = ic.copy() 146 | 147 | num_plots = len(ic.columns) 148 | 149 | v_spaces = ((num_plots - 1) // 3) + 1 150 | 151 | if ax is None: 152 | f, ax = plt.subplots(v_spaces, 3, figsize=(18, v_spaces * 6)) 153 | ax = ax.flatten() 154 | 155 | for a, (period, ic) in zip(ax, ic.iteritems()): 156 | period_num = period.replace('period_', '') 157 | sns.distplot(ic.replace(np.nan, 0.), norm_hist=True, ax=a) 158 | a.set_xlim([-1, 1]) 159 | a.set(title=ICHIST.get("TITLE") % period_num, xlabel='IC') 160 | a.text( 161 | .05, 162 | .95, 163 | ICHIST.get("LEGEND").format(ic.mean(), ic.std()), 164 | fontsize=16, 165 | bbox={ 166 | 'facecolor': 'white', 167 | 'alpha': 1, 168 | 'pad': 5 169 | }, 170 | transform=a.transAxes, 171 | verticalalignment='top' 172 | ) 173 | a.axvline(ic.mean(), color='w', linestyle='dashed', linewidth=2) 174 | 175 | if num_plots < len(ax): 176 | for a in ax[num_plots:]: 177 | a.set_visible(False) 178 | 179 | return ax 180 | 181 | 182 | @customize 183 | def plot_ic_qq(ic, theoretical_dist=stats.norm, ax=None): 184 | 185 | ic = ic.copy() 186 | 187 | num_plots = 
len(ic.columns) 188 | 189 | v_spaces = ((num_plots - 1) // 3) + 1 190 | 191 | if ax is None: 192 | f, ax = plt.subplots(v_spaces, 3, figsize=(18, v_spaces * 6)) 193 | ax = ax.flatten() 194 | 195 | if isinstance(theoretical_dist, stats.norm.__class__): 196 | dist_name = ICQQ.get("NORM") 197 | elif isinstance(theoretical_dist, stats.t.__class__): 198 | dist_name = ICQQ.get("T") 199 | else: 200 | dist_name = ICQQ.get("CUSTOM") 201 | 202 | for a, (period, ic) in zip(ax, ic.iteritems()): 203 | period_num = period.replace('period_', '') 204 | qqplot( 205 | ic.replace(np.nan, 0.).values, 206 | theoretical_dist, 207 | fit=True, 208 | line='45', 209 | ax=a 210 | ) 211 | a.set( 212 | title=ICQQ.get("TITLE").format(period_num, dist_name), 213 | xlabel=ICQQ.get("XLABEL").format(dist_name), 214 | ylabel=ICQQ.get("YLABEL"), 215 | ) 216 | 217 | if num_plots < len(ax): 218 | for a in ax[num_plots:]: 219 | a.set_visible(False) 220 | 221 | return ax 222 | 223 | 224 | @customize 225 | def plot_quantile_returns_bar( 226 | mean_ret_by_q, by_group=False, ylim_percentiles=None, ax=None 227 | ): 228 | mean_ret_by_q = mean_ret_by_q.copy() 229 | mean_ret_by_q.columns = mean_ret_by_q.columns.map( 230 | lambda x: QRETURNBAR.get("COLUMN").format(x.replace("period_", "")) 231 | ) 232 | 233 | if ylim_percentiles is not None: 234 | ymin = ( 235 | np.nanpercentile(mean_ret_by_q.values, ylim_percentiles[0]) * 236 | DECIMAL_TO_BPS 237 | ) 238 | ymax = ( 239 | np.nanpercentile(mean_ret_by_q.values, ylim_percentiles[1]) * 240 | DECIMAL_TO_BPS 241 | ) 242 | else: 243 | ymin = None 244 | ymax = None 245 | 246 | if by_group: 247 | num_group = len(mean_ret_by_q.index.get_level_values('group').unique()) 248 | 249 | if ax is None: 250 | v_spaces = ((num_group - 1) // 2) + 1 251 | f, ax = plt.subplots( 252 | v_spaces, 253 | 2, 254 | sharex=False, 255 | sharey=True, 256 | figsize=( 257 | max( 258 | 18, 259 | mean_ret_by_q.index.get_level_values('factor_quantile') 260 | .max() 261 | ), 6 * v_spaces 262 | ) 263 | ) 264 | ax = ax.flatten() 265 | 266 | for a, (sc, cor) in zip(ax, mean_ret_by_q.groupby(level='group')): 267 | ( 268 | cor.xs(sc, level='group').multiply(DECIMAL_TO_BPS).plot( 269 | kind='bar', title=sc, ax=a 270 | ) 271 | ) 272 | 273 | a.set(xlabel='', ylabel=QRETURNBAR.get("YLABEL"), ylim=(ymin, ymax)) 274 | 275 | if num_group < len(ax): 276 | for a in ax[num_group:]: 277 | a.set_visible(False) 278 | 279 | return ax 280 | 281 | else: 282 | if ax is None: 283 | f, ax = plt.subplots( 284 | 1, 285 | 1, 286 | figsize=( 287 | max( 288 | 18, 289 | mean_ret_by_q.index.get_level_values( 290 | 'factor_quantile' 291 | ).max() // 2 292 | ), 6 293 | ) 294 | ) 295 | 296 | mean_ret_by_q.multiply(DECIMAL_TO_BPS).plot( 297 | kind='bar', title=QRETURNBAR.get("TITLE"), ax=ax 298 | ) 299 | ax.set(xlabel="", ylabel=QRETURNBAR.get("YLABEL"), ylim=(ymin, ymax)) 300 | 301 | return ax 302 | 303 | 304 | @customize 305 | def plot_quantile_returns_violin(return_by_q, ylim_percentiles=None, ax=None): 306 | 307 | return_by_q = return_by_q.copy() 308 | 309 | if ylim_percentiles is not None: 310 | ymin = ( 311 | np.nanpercentile(return_by_q.values, ylim_percentiles[0]) * 312 | DECIMAL_TO_BPS 313 | ) 314 | ymax = ( 315 | np.nanpercentile(return_by_q.values, ylim_percentiles[1]) * 316 | DECIMAL_TO_BPS 317 | ) 318 | else: 319 | ymin = None 320 | ymax = None 321 | 322 | if ax is None: 323 | f, ax = plt.subplots(1, 1, figsize=(18, 6)) 324 | 325 | unstacked_dr = (return_by_q.multiply(DECIMAL_TO_BPS)) 326 | unstacked_dr.columns = 
unstacked_dr.columns.astype(str).str.replace( 327 | 'period_', '' 328 | ).astype(int).set_names(QRETURNVIOLIN.get("LEGENDNAME")) 329 | unstacked_dr = unstacked_dr.stack() 330 | unstacked_dr.name = 'return' 331 | unstacked_dr = unstacked_dr.reset_index() 332 | 333 | sns.violinplot( 334 | data=unstacked_dr, 335 | x='factor_quantile', 336 | hue=QRETURNVIOLIN.get("LEGENDNAME"), 337 | y='return', 338 | orient='v', 339 | cut=0, 340 | inner='quartile', 341 | ax=ax 342 | ) 343 | ax.set( 344 | xlabel='', 345 | ylabel=QRETURNVIOLIN.get("YLABEL"), 346 | title=QRETURNVIOLIN.get("TITLE"), 347 | ylim=(ymin, ymax) 348 | ) 349 | 350 | ax.axhline(0.0, linestyle='-', color='black', lw=0.7, alpha=0.6) 351 | 352 | return ax 353 | 354 | 355 | @customize 356 | def plot_mean_quantile_returns_spread_time_series( 357 | mean_returns_spread, std_err=None, bandwidth=1, ax=None 358 | ): 359 | if isinstance(mean_returns_spread, pd.DataFrame): 360 | if ax is None: 361 | ax = [None for a in mean_returns_spread.columns] 362 | 363 | ymin, ymax = (None, None) 364 | for (i, a), (name, fr_column 365 | ) in zip(enumerate(ax), mean_returns_spread.iteritems()): 366 | stdn = None if std_err is None else std_err[name] 367 | a = plot_mean_quantile_returns_spread_time_series( 368 | fr_column, std_err=stdn, bandwidth=bandwidth, ax=a 369 | ) 370 | ax[i] = a 371 | curr_ymin, curr_ymax = a.get_ylim() 372 | ymin = curr_ymin if ymin is None else min(ymin, curr_ymin) 373 | ymax = curr_ymax if ymax is None else max(ymax, curr_ymax) 374 | 375 | for a in ax: 376 | a.set_ylim([ymin, ymax]) 377 | 378 | return ax 379 | 380 | periods = mean_returns_spread.name 381 | title = QRETURNTS.get( 382 | "TITLE" 383 | ).format(periods.replace('period_', '') if periods is not None else '') 384 | 385 | if ax is None: 386 | f, ax = plt.subplots(figsize=(18, 6)) 387 | 388 | mean_returns_spread_bps = mean_returns_spread * DECIMAL_TO_BPS 389 | 390 | mean_returns_spread_bps.plot(alpha=0.4, ax=ax, lw=0.7, color='forestgreen') 391 | rolling_mean( 392 | mean_returns_spread_bps, window=22 393 | ).plot( 394 | color='orangered', alpha=0.7, ax=ax 395 | ) 396 | ax.legend( 397 | [QRETURNTS.get("LEGEND0").format(bandwidth), 398 | QRETURNTS.get("LEGEND1")], 399 | loc='upper right' 400 | ) 401 | 402 | if std_err is not None: 403 | std_err_bps = std_err * DECIMAL_TO_BPS 404 | upper = mean_returns_spread_bps.values + (std_err_bps * bandwidth) 405 | lower = mean_returns_spread_bps.values - (std_err_bps * bandwidth) 406 | ax.fill_between( 407 | mean_returns_spread.index, 408 | lower, 409 | upper, 410 | alpha=0.3, 411 | color='steelblue' 412 | ) 413 | 414 | ylim = np.nanpercentile(abs(mean_returns_spread_bps.values), 95) 415 | ax.set( 416 | ylabel=QRETURNTS.get("YLABEL"), 417 | xlabel="", 418 | title=title, 419 | ylim=(-ylim, ylim) 420 | ) 421 | ax.axhline(0.0, linestyle='-', color='black', lw=1, alpha=0.8) 422 | 423 | return ax 424 | 425 | 426 | @customize 427 | def plot_ic_by_group(ic_group, ax=None): 428 | 429 | ic_group = ic_group.copy() 430 | ic_group.columns = ic_group.columns.astype(str).str.replace('period_', '') 431 | if ax is None: 432 | f, ax = plt.subplots(1, 1, figsize=(max(18, len(ic_group)), 6)) 433 | ic_group.plot(kind='bar', ax=ax) 434 | ax.set(title=ICGROUP.get("TITLE"), xlabel="") 435 | ax.set_xticklabels(ic_group.index, rotation=45) 436 | 437 | return ax 438 | 439 | 440 | @customize 441 | def plot_factor_rank_auto_correlation( 442 | factor_autocorrelation, period=1, ax=None 443 | ): 444 | 445 | if ax is None: 446 | f, ax = plt.subplots(1, 1, figsize=(18, 6)) 
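    # 因子排名自相关性衡量相邻期因子排名的稳定程度:
    # 自相关越接近 1, 说明因子排名越稳定, 按该因子构建的组合换手率越低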
447 | 448 | factor_autocorrelation.plot( 449 | title=AUTOCORR.get("TITLE").format(period), ax=ax 450 | ) 451 | ax.set(ylabel=AUTOCORR.get("YLABEL").format(period), xlabel="") 452 | ax.axhline(0.0, linestyle='-', color='black', lw=1) 453 | ax.text( 454 | .05, 455 | .95, 456 | AUTOCORR.get("TEXT").format(factor_autocorrelation.mean()), 457 | fontsize=16, 458 | bbox={ 459 | 'facecolor': 'white', 460 | 'alpha': 1, 461 | 'pad': 5 462 | }, 463 | transform=ax.transAxes, 464 | verticalalignment='top' 465 | ) 466 | 467 | return ax 468 | 469 | 470 | @customize 471 | def plot_top_bottom_quantile_turnover(quantile_turnover, period=1, ax=None): 472 | 473 | if ax is None: 474 | f, ax = plt.subplots(1, 1, figsize=(18, 6)) 475 | 476 | max_quantile = quantile_turnover.columns.max() 477 | min_quantile = quantile_turnover.columns.min() 478 | turnover = pd.DataFrame() 479 | 480 | turnover[TBTURNOVER.get("TURNOVER").format(max_quantile) 481 | ] = quantile_turnover[max_quantile] 482 | turnover[TBTURNOVER.get("TURNOVER").format(min_quantile) 483 | ] = quantile_turnover[min_quantile] 484 | turnover.plot( 485 | title=TBTURNOVER.get("TITLE").format(period), ax=ax, alpha=0.6, lw=0.8 486 | ) 487 | 488 | ax.set(ylabel=TBTURNOVER.get("YLABEL"), xlabel="") 489 | 490 | return ax 491 | 492 | 493 | @customize 494 | def plot_monthly_ic_heatmap(mean_monthly_ic, ax=None): 495 | 496 | mean_monthly_ic = mean_monthly_ic.copy() 497 | 498 | num_plots = len(mean_monthly_ic.columns) 499 | 500 | v_spaces = ((num_plots - 1) // 3) + 1 501 | 502 | if ax is None: 503 | f, ax = plt.subplots(v_spaces, 3, figsize=(18, v_spaces * 6)) 504 | ax = ax.flatten() 505 | 506 | new_index_year = [] 507 | new_index_month = [] 508 | for date in mean_monthly_ic.index: 509 | new_index_year.append(date.year) 510 | new_index_month.append(date.month) 511 | 512 | mean_monthly_ic.index = pd.MultiIndex.from_arrays( 513 | [new_index_year, new_index_month], names=["year", "month"] 514 | ) 515 | 516 | for a, (period, ic) in zip(ax, mean_monthly_ic.iteritems()): 517 | periods_num = period.replace('period_', '') 518 | 519 | sns.heatmap( 520 | ic.unstack(), 521 | annot=True, 522 | alpha=1.0, 523 | center=0.0, 524 | annot_kws={"size": 15}, 525 | linewidths=0.01, 526 | linecolor='white', 527 | cmap=cm.RdYlGn, 528 | cbar=False, 529 | ax=a 530 | ) 531 | a.set(ylabel='', xlabel='') 532 | a.set_title(ICHEATMAP.get("TITLE").format(periods_num)) 533 | 534 | if num_plots < len(ax): 535 | for a in ax[num_plots:]: 536 | a.set_visible(False) 537 | 538 | return ax 539 | 540 | 541 | @customize 542 | def plot_cumulative_returns(factor_returns, period=1, overlap=True, ax=None): 543 | 544 | if ax is None: 545 | f, ax = plt.subplots(1, 1, figsize=(18, 6)) 546 | 547 | overlapping_period = period if overlap else 1 548 | factor_returns = cumulative_returns(factor_returns, overlapping_period) 549 | 550 | factor_returns.plot(ax=ax, lw=3, color='forestgreen', alpha=0.6) 551 | ax.set( 552 | ylabel=CUMRET.get("YLABEL"), 553 | title=CUMRET.get("TITLE").format(period), 554 | xlabel="" 555 | ) 556 | 557 | ax.axhline(1.0, linestyle='-', color='black', lw=1) 558 | 559 | return ax 560 | 561 | 562 | @customize 563 | def plot_top_down_cumulative_returns(factor_returns, period=1, ax=None): 564 | 565 | if ax is None: 566 | f, ax = plt.subplots(1, 1, figsize=(18, 6)) 567 | 568 | factor_returns.plot(ax=ax, lw=3, color='forestgreen', alpha=0.6) 569 | 570 | ax.set( 571 | ylabel=TDCUMRET.get("YLABEL"), 572 | title=TDCUMRET.get("TITLE").format(period), 573 | xlabel="" 574 | ) 575 | 576 | ax.axhline(1.0, 
linestyle='-', color='black', lw=1)
577 | 
578 |     return ax
579 | 
580 | 
581 | @customize
582 | def plot_cumulative_returns_by_quantile(
583 |     quantile_returns, period=1, overlap=True, ax=None
584 | ):
585 | 
586 |     if ax is None:
587 |         f, ax = plt.subplots(1, 1, figsize=(18, 6))
588 | 
589 |     ret_wide = quantile_returns.reset_index()\
590 |         .pivot(index='date', columns='factor_quantile',
591 |                values=convert_to_forward_returns_columns(period))
592 | 
593 |     overlapping_period = period if overlap else 1
594 |     cum_ret = ret_wide.apply(cumulative_returns, args=(overlapping_period,))
595 |     cum_ret = cum_ret.loc[:, ::-1]
596 | 
597 |     cum_ret.plot(lw=2, ax=ax, cmap=cm.RdYlGn_r)
598 |     ax.legend()
599 |     ymin, ymax = cum_ret.min().min(), cum_ret.max().max()
600 |     ax.set(
601 |         ylabel=CUMRETQ.get("YLABEL"),
602 |         title=CUMRETQ.get("TITLE").format(period),
603 |         xlabel='',
604 |         ylim=(ymin, ymax)
605 |     )
606 |     ax.set_yscale('symlog', linthresh=1)
607 |     ax.set_yticks(np.linspace(ymin, ymax, 8))
608 |     ax.yaxis.set_major_formatter(ScalarFormatter())
609 |     ax.axhline(1.0, linestyle='-', color='black', lw=1)
610 | 
611 |     return ax
612 | 
613 | 
614 | @customize
615 | def plot_quantile_average_cumulative_return(
616 |     avg_cumulative_returns,
617 |     by_quantile=False,
618 |     std_bar=False,
619 |     ax=None,
620 |     periods_before='',
621 |     periods_after=''
622 | ):
623 | 
624 |     avg_cumulative_returns = avg_cumulative_returns.multiply(DECIMAL_TO_BPS)
625 |     quantiles = len(avg_cumulative_returns.index.levels[0].unique())
626 |     palette = [cm.RdYlGn_r(i) for i in np.linspace(0, 1, quantiles)]
627 | 
628 |     if by_quantile:
629 | 
630 |         if ax is None:
631 |             v_spaces = ((quantiles - 1) // 2) + 1
632 |             f, ax = plt.subplots(
633 |                 v_spaces,
634 |                 2,
635 |                 sharex=False,
636 |                 sharey=False,
637 |                 figsize=(18, 6 * v_spaces)
638 |             )
639 |             ax = ax.flatten()
640 | 
641 |         for i, (quantile, q_ret) in enumerate(
642 |             avg_cumulative_returns.groupby(level='factor_quantile')
643 |         ):
644 | 
645 |             mean = q_ret.loc[(quantile, 'mean')]
646 |             mean.name = AVGCUMRET.get("COLUMN").format(quantile)
647 |             mean.plot(ax=ax[i], color=palette[i])
648 |             ax[i].set_ylabel(AVGCUMRET.get("YLABEL"))
649 | 
650 |             if std_bar:
651 |                 std = q_ret.loc[(quantile, 'std')]
652 |                 ax[i].errorbar(
653 |                     std.index,
654 |                     mean,
655 |                     yerr=std,
656 |                     fmt='none',
657 |                     ecolor=palette[i],
658 |                     label=None
659 |                 )
660 | 
661 |             ax[i].axvline(x=0, color='k', linestyle='--')
662 |             ax[i].legend()
663 |             i += 1
664 | 
665 |     else:
666 | 
667 |         if ax is None:
668 |             f, ax = plt.subplots(1, 1, figsize=(18, 6))
669 | 
670 |         for i, (quantile, q_ret) in enumerate(
671 |             avg_cumulative_returns.groupby(level='factor_quantile')
672 |         ):
673 | 
674 |             mean = q_ret.loc[(quantile, 'mean')]
675 |             mean.name = AVGCUMRET.get("COLUMN").format(quantile)
676 |             mean.plot(ax=ax, color=palette[i])
677 | 
678 |             if std_bar:
679 |                 std = q_ret.loc[(quantile, 'std')]
680 |                 ax.errorbar(
681 |                     std.index,
682 |                     mean,
683 |                     yerr=std,
684 |                     fmt='none',
685 |                     ecolor=palette[i],
686 |                     label=None
687 |                 )
688 |             i += 1
689 | 
690 |         ax.axvline(x=0, color='k', linestyle='--')
691 |         ax.legend()
692 |         ax.set(
693 |             title=AVGCUMRET.get("TITLE").format(periods_before, periods_after),
694 |             xlabel=AVGCUMRET.get("XLABEL"),
695 |             ylabel=AVGCUMRET.get("YLABEL"),
696 |         )
697 | 
698 |     return ax
699 | 
700 | 
701 | @customize
702 | def plot_events_distribution(events, num_days=5, full_dates=None, ax=None):
703 | 
704 |     if ax is None:
705 |         f, ax = plt.subplots(1, 1, figsize=(18, 6))
706 | 
707 |     if full_dates is None:
708 |         full_dates = 
events.index.get_level_values('date').unique() 709 | 710 | group = pd.Series(range(len(full_dates)), index=full_dates) // num_days 711 | grouper_label = group.drop_duplicates() 712 | grouper = group.reindex(events.index.get_level_values('date')) 713 | 714 | count = events.groupby(grouper.values).count() 715 | count = count.reindex(grouper_label.values, fill_value=0) 716 | count.index = grouper_label.index.map(lambda x: x.strftime('%Y-%m-%d')) 717 | count.plot(kind="bar", grid=False, ax=ax) 718 | 719 | def annotateBars(x, dt, ax=ax): 720 | color = 'black' 721 | vertalign = 'top' 722 | ax.text( 723 | x, 724 | count.loc[dt], 725 | "{:d}".format(count.loc[dt]), 726 | rotation=45, 727 | color=color, 728 | horizontalalignment='center', 729 | verticalalignment=vertalign, 730 | fontsize=15, 731 | weight='heavy' 732 | ) 733 | 734 | [annotateBars(x, dt, ax=ax) for x, dt in enumerate(list(count.index))] 735 | ax.set( 736 | ylabel=EVENTSDIST.get("YLABEL"), 737 | title=EVENTSDIST.get("TITLE"), 738 | xlabel=EVENTSDIST.get("XLABEL"), 739 | ) 740 | return ax 741 | 742 | 743 | @customize 744 | def plot_missing_events_distribution( 745 | events, num_days=5, full_dates=None, ax=None 746 | ): 747 | 748 | if ax is None: 749 | f, ax = plt.subplots(1, 1, figsize=(18, 6)) 750 | 751 | if full_dates is None: 752 | full_dates = events.index.get_level_values('date').unique() 753 | 754 | daily_count = events.groupby(level='date').count() 755 | most_common_count = np.argmax(np.bincount(daily_count)) 756 | daily_missing = daily_count / most_common_count - 1 757 | daily_missing = daily_missing.reindex(full_dates, fill_value=-1.0) 758 | 759 | grouper = pd.Series(range(len(full_dates)), index=full_dates) // num_days 760 | grouper_label = grouper.drop_duplicates() 761 | 762 | missing = daily_missing.groupby(grouper.values).mean() 763 | missing = missing.reindex(grouper_label.values, fill_value=-1.0) 764 | missing.index = grouper_label.index.map(lambda x: x.strftime('%Y-%m-%d')) 765 | missing.plot(kind="bar", grid=False, ax=ax) 766 | 767 | def annotateBars(x, dt, ax=ax): 768 | color = 'black' 769 | vertalign = 'top' 770 | ax.text( 771 | x, 772 | missing.loc[dt], 773 | "{:+.1f}%".format(missing.loc[dt] * 100), 774 | rotation=45, 775 | color=color, 776 | horizontalalignment='center', 777 | verticalalignment=vertalign, 778 | fontsize=15, 779 | weight='heavy' 780 | ) 781 | 782 | [annotateBars(x, dt, ax=ax) for x, dt in enumerate(list(missing.index))] 783 | ax.set( 784 | ylabel=MISSIINGEVENTSDIST.get("YLABEL"), 785 | title=MISSIINGEVENTSDIST.get("TITLE"), 786 | xlabel=MISSIINGEVENTSDIST.get("XLABEL") 787 | ) 788 | 789 | return ax 790 | -------------------------------------------------------------------------------- /jqfactor_analyzer/prepare.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | 4 | from __future__ import division 5 | 6 | import pandas as pd 7 | import numpy as np 8 | 9 | from .exceptions import MaxLossExceededError, non_unique_bin_edges_error 10 | from .utils import get_forward_returns_columns 11 | 12 | 13 | @non_unique_bin_edges_error 14 | def quantize_factor( 15 | factor_data, quantiles=5, bins=None, by_group=False, no_raise=False, zero_aware=False, 16 | ): 17 | """ 18 | 计算每期因子分位数 19 | 20 | 参数 21 | ---------- 22 | factor_data : pd.DataFrame - MultiIndex 23 | 一个 DataFrame, index 为日期 (level 0) 和资产(level 1) 的 MultiIndex, 24 | values 包括因子的值, 各期因子远期收益, 因子分位数, 25 | 因子分组(可选), 因子权重(可选) 26 | quantiles : int or sequence[float] 27 | 在因子分组中按照因子值大小平均分组的组数。 
28 |          或分位数序列, 允许不均匀分组
29 |         例如 [0, .10, .5, .90, 1.] 或 [.05, .5, .95]
30 |         'quantiles' 和 'bins' 有且只能有一个不为 None
31 |     bins : int or sequence[float]
32 |         在因子分组中使用的等宽 (按照因子值) 区间的数量
33 |         或边界值序列, 允许不均匀的区间宽度
34 |         例如 [-4, -2, -0.5, 0, 10]
35 |         'quantiles' 和 'bins' 有且只能有一个不为 None
36 |     by_group : bool
37 |         如果是 True, 按照 group 分别计算分位数
38 |     no_raise: bool, optional
39 |         如果为 True,则不抛出任何异常,并且将抛出异常的值设置为 np.NaN
40 |     zero_aware : bool, optional
41 |         如果为True,则分别为正负因子值计算分位数。
42 |         适用于您的信号聚集并且零是正值和负值的分界线的情况.
43 | 
44 |     返回值
45 |     -------
46 |     factor_quantile : pd.Series
47 |         index 为日期 (level 0) 和资产(level 1) 的因子分位数
48 |     """
49 |     if not ((quantiles is not None and bins is None) or
50 |             (quantiles is None and bins is not None)):
51 |         raise ValueError('quantiles 和 bins 有且只能有一个不为 None')
52 | 
53 |     if zero_aware and not (isinstance(quantiles, int)
54 |                            or isinstance(bins, int)):
55 |         msg = ("只有 quantiles 或 bins 为 int 类型时, 'zero_aware' 才能为 True")
56 |         raise ValueError(msg)
57 | 
58 |     def quantile_calc(x, _quantiles, _bins, _zero_aware, _no_raise):
59 |         try:
60 |             if _quantiles is not None and _bins is None and not _zero_aware:
61 |                 return pd.qcut(x, _quantiles, labels=False) + 1
62 |             elif _quantiles is not None and _bins is None and _zero_aware:
63 |                 pos_quantiles = pd.qcut(x[x >= 0], _quantiles // 2,
64 |                                         labels=False) + _quantiles // 2 + 1
65 |                 neg_quantiles = pd.qcut(x[x < 0], _quantiles // 2,
66 |                                         labels=False) + 1
67 |                 return pd.concat([pos_quantiles, neg_quantiles]).sort_index()
68 |             elif _bins is not None and _quantiles is None and not _zero_aware:
69 |                 return pd.cut(x, _bins, labels=False) + 1
70 |             elif _bins is not None and _quantiles is None and _zero_aware:
71 |                 pos_bins = pd.cut(x[x >= 0], _bins // 2,
72 |                                   labels=False) + _bins // 2 + 1
73 |                 neg_bins = pd.cut(x[x < 0], _bins // 2,
74 |                                   labels=False) + 1
75 |                 return pd.concat([pos_bins, neg_bins]).sort_index()
76 |         except Exception as e:
77 |             if _no_raise:
78 |                 return pd.Series(index=x.index)
79 |             raise e
80 | 
81 |     grouper = [factor_data.index.get_level_values('date')]
82 |     if by_group:
83 |         if 'group' not in factor_data.columns:
84 |             raise ValueError('只有输入了 groupby 参数时 binning_by_group 才能为 True')
85 |         grouper.append('group')
86 | 
87 |     factor_quantile = factor_data.groupby(grouper)['factor'] \
88 |         .apply(quantile_calc, quantiles, bins, zero_aware, no_raise)
89 |     factor_quantile.name = 'factor_quantile'
90 | 
91 |     return factor_quantile.dropna()
92 | 
93 | 
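# 用法示意 (假设性示例, 仅演示 quantize_factor 的输入输出约定):
#   idx = pd.MultiIndex.from_product(
#       [pd.date_range('2024-01-02', periods=3), list('ABCDE')],
#       names=['date', 'asset'])
#   fd = pd.DataFrame({'factor': np.random.randn(15)}, index=idx)
#   quantize_factor(fd, quantiles=5)  # 每天按因子值从小到大等分为 5 组, 返回取值 1~5 的分位标签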
94 | def compute_forward_returns(factor,
95 |                             prices,
96 |                             periods=(1, 5, 10)):
97 |     """
98 |     计算每个因子值对应的 N 期因子远期收益
99 | 
100 |     参数
101 |     ----------
102 |     factor : pd.Series - MultiIndex
103 |         一个 Series, index 为日期 (level 0) 和资产(level 1) 的 MultiIndex,
104 |         values 为因子值
105 |     prices : pd.DataFrame
106 |         用于计算因子远期收益的价格数据
107 |         columns 为资产, index 为 日期.
108 |         价格数据必须覆盖因子分析时间段以及额外远期收益计算中的最大预期期数.
109 |     periods : sequence[int]
110 |         远期收益的期数
111 |     返回值
112 |     -------
113 |     forward_returns : pd.DataFrame - MultiIndex
114 |         因子远期收益
115 |         index 为日期 (level 0) 和资产(level 1) 的 MultiIndex
116 |         column 为远期收益的期数
117 |     """
118 | 
119 |     factor_dateindex = factor.index.levels[0]
120 |     factor_dateindex = factor_dateindex.intersection(prices.index)
121 | 
122 |     if len(factor_dateindex) == 0:
123 |         raise ValueError("Factor and prices indices don't match: make sure "
124 |                          "they have the same convention in terms of datetimes "
125 |                          "and symbol-names")
126 | 
127 |     prices = prices.filter(items=factor.index.levels[1])
128 | 
129 |     forward_returns = pd.DataFrame(
130 |         index=pd.MultiIndex
131 |         .from_product([prices.index, prices.columns], names=['date', 'asset'])
132 |     )
133 | 
134 |     for period in periods:
135 |         delta = prices.pct_change(period).shift(-period).reindex(factor_dateindex)
136 |         forward_returns['period_{p}'.format(p=period)] = delta.stack()
137 | 
138 |     forward_returns.index = forward_returns.index.rename(['date', 'asset'])
139 | 
140 |     return forward_returns
141 | 
142 | 
143 | def demean_forward_returns(factor_data, grouper=None):
144 |     """
145 |     根据相关分组为因子远期收益去均值.
146 |     分组去均值包含了投资组合分组中性化约束的假设,因此允许跨组评估因子.
147 | 
148 |     参数
149 |     ----------
150 |     factor_data : pd.DataFrame - MultiIndex
151 |         因子远期收益
152 |         index 为日期 (level 0) 和资产(level 1) 的 MultiIndex
153 |         column 为远期收益的期数
154 |     grouper : list
155 |         如果为 None, 则只根据日期去均值
156 |         否则根据列表中提供的组分组去均值
157 | 
158 |     返回值
159 |     -------
160 |     adjusted_forward_returns : pd.DataFrame - MultiIndex
161 |         和 factor_data 相同形状的 DataFrame, 但每个收益都被分组去均值了
162 |     """
163 | 
164 |     factor_data = factor_data.copy()
165 | 
166 |     if not grouper:
167 |         grouper = factor_data.index.get_level_values('date')
168 | 
169 |     cols = get_forward_returns_columns(factor_data.columns)
170 |     factor_data[cols] = factor_data.groupby(
171 |         grouper, as_index=False
172 |     )[cols.append(pd.Index(['weights']))].apply(
173 |         lambda x: x[cols].subtract(
174 |             np.average(x[cols], axis=0, weights=x['weights'].fillna(0.0).values),
175 |             axis=1
176 |         )
177 |     )
178 | 
179 |     return factor_data
180 | 
181 | 
182 | def get_clean_factor(factor,
183 |                      forward_returns,
184 |                      groupby=None,
185 |                      weights=None,
186 |                      binning_by_group=False,
187 |                      quantiles=5,
188 |                      bins=None,
189 |                      max_loss=0.35,
190 |                      zero_aware=False):
191 |     """
192 |     将因子值, 因子远期收益, 因子分组数据, 因子权重数据
193 |     格式化为以时间和资产的 MultiIndex 作为索引的 DataFrame.
194 | 
195 |     参数
196 |     ----------
197 |     factor : pd.Series - MultiIndex
198 |         一个 Series, index 为日期 (level 0) 和资产(level 1) 的 MultiIndex,
199 |         values 为因子的值
200 |     forward_returns : pd.DataFrame - MultiIndex
201 |         一个 DataFrame, index 为日期 (level 0) 和资产(level 1) 的 MultiIndex,
202 |         values 为因子的远期收益, columns 为因子远期收益的期数.
203 |     groupby : pd.Series - MultiIndex or dict
204 |         index 为日期和资产的 Series,为每个资产每天的分组,或资产-分组映射的字典.
205 |         如果传递了dict,则假定分组映射在整个时间段内保持不变.
206 |     weights : pd.Series - MultiIndex or dict
207 |         index 为日期和资产的 Series,为每个资产每天的权重,或资产-权重映射的字典.
208 |         如果传递了dict,则假定权重映射在整个时间段内保持不变.
209 |     binning_by_group : bool
210 |         如果为 True, 则对每个组分别计算分位数.
211 |         适用于因子值范围在各个组上变化很大的情况.
212 |         如果要分析分组(行业)中性的组合, 您最好设置为 True
213 |     quantiles : int or sequence[float]
214 |         在因子分组中按照因子值大小平均分组的组数。
215 |          或分位数序列, 允许不均匀分组
216 |         例如 [0, .10, .5, .90, 1.] 或 [.05, .5, .95]
217 |         'quantiles' 和 'bins' 有且只能有一个不为 None
218 |     bins : int or sequence[float]
219 |         在因子分组中使用的等宽 (按照因子值) 区间的数量
220 |         或边界值序列, 允许不均匀的区间宽度
221 |         例如 [-4, -2, -0.5, 0, 10]
222 |         'quantiles' 和 'bins' 有且只能有一个不为 None
223 |     max_loss : float, optional
224 |         允许的丢弃因子数据的最大百分比 (0.00 到 1.00),
225 |         计算比较输入因子索引中的项目数和输出 DataFrame 索引中的项目数.
226 |         因子数据本身存在缺陷 (例如 NaN),
227 |         没有提供足够的价格数据来计算所有因子值的远期收益,
228 |         或者因为分组失败, 因此可以部分地丢弃因子数据
229 |         设置 max_loss = 0 以停止异常捕获.
230 |     zero_aware : bool, optional
231 |         如果为True,则分别为正负因子值计算分位数。
232 |         适用于您的信号聚集并且零是正值和负值的分界线的情况.
233 | 
234 |     返回值
235 |     -------
236 |     merged_data : pd.DataFrame - MultiIndex
237 |         一个 DataFrame, index 为日期 (level 0) 和资产(level 1) 的 MultiIndex,
238 |         values 包括因子的值, 各期因子远期收益, 因子分位数,
239 |         因子分组(可选), 因子权重(可选)
240 |         - 各期因子远期收益的列名满足 'period_1', 'period_5' 的格式
241 |     """
242 | 
243 |     initial_amount = float(len(factor.index))
244 | 
245 |     factor_copy = factor.copy()
246 |     factor_copy.index = factor_copy.index.rename(['date', 'asset'])
247 | 
248 |     merged_data = forward_returns.copy()
249 |     merged_data['factor'] = factor_copy
250 | 
251 |     if groupby is not None:
252 |         if isinstance(groupby, dict):
253 |             diff = set(factor_copy.index.get_level_values(
254 |                 'asset')) - set(groupby.keys())
255 |             if len(diff) > 0:
256 |                 raise KeyError(
257 |                     "Assets {} not in group mapping".format(
258 |                         list(diff)))
259 | 
260 |             ss = pd.Series(groupby)
261 |             groupby = pd.Series(index=factor_copy.index,
262 |                                 data=ss[factor_copy.index.get_level_values(
263 |                                     'asset')].values)
264 |         elif isinstance(groupby, pd.DataFrame):
265 |             groupby = groupby.stack()
266 |         merged_data['group'] = groupby
267 | 
268 |     if weights is not None:
269 |         if isinstance(weights, dict):
270 |             diff = set(factor_copy.index.get_level_values(
271 |                 'asset')) - set(weights.keys())
272 |             if len(diff) > 0:
273 |                 raise KeyError(
274 |                     "Assets {} not in weights mapping".format(
275 |                         list(diff)))
276 | 
277 |             ww = pd.Series(weights)
278 |             weights = pd.Series(index=factor_copy.index,
279 |                                 data=ww[factor_copy.index.get_level_values(
280 |                                     'asset')].values)
281 |         elif isinstance(weights, pd.DataFrame):
282 |             weights = weights.stack()
283 |         merged_data['weights'] = weights
284 | 
285 |     merged_data = merged_data.dropna()
286 | 
287 |     quantile_data = quantize_factor(
288 |         merged_data,
289 |         quantiles,
290 |         bins,
291 |         binning_by_group,
292 |         True,
293 |         zero_aware
294 |     )
295 | 
296 |     merged_data['factor_quantile'] = quantile_data
297 |     merged_data = merged_data.dropna()
298 |     merged_data['factor_quantile'] = merged_data['factor_quantile'].astype(int)
299 | 
300 |     if 'weights' in merged_data.columns:
301 |         merged_data['weights'] = merged_data.set_index(
302 |             'factor_quantile', append=True
303 |         ).groupby(level=['date', 'factor_quantile'])['weights'].apply(
304 |             lambda s: s.divide(s.sum())
305 |         ).reset_index('factor_quantile', drop=True)
306 | 
307 |     binning_amount = float(len(merged_data.index))
308 | 
309 |     tot_loss = (initial_amount - binning_amount) / initial_amount
310 | 
311 |     no_raise = True if max_loss == 0 else False
312 |     if tot_loss > max_loss and not no_raise:
313 |         message = ("丢弃的因子数据比例 (%.1f%%) 超过 max_loss (%.1f%%)"
314 |                    % (tot_loss * 100, max_loss * 100))
315 |         raise MaxLossExceededError(message)
316 | 
317 |     return merged_data
318 | 
319 | 
320 | def get_clean_factor_and_forward_returns(factor,
321 |                                          prices,
322 |                                          groupby=None,
323 |                                          weights=None,
324 |                                          binning_by_group=False,
325 |                                          quantiles=5,
326 |                                          bins=None,
327 |                                          periods=(1, 5, 10),
328 |                                          max_loss=0.35,
329 |                                          zero_aware=False):
330 |     """
331 |     将因子数据, 价格数据, 
分组映射和权重映射格式化为 332 | 由包含时间和资产的 MultiIndex 作为索引的 DataFrame 333 | 334 | 参数 335 | ---------- 336 | factor : pd.Series - MultiIndex 337 | 一个 Series, index 为日期 (level 0) 和资产(level 1) 的 MultiIndex, 338 | values 为因子的值 339 | prices : pd.DataFrame 340 | 用于计算因子远期收益的价格数据 341 | columns 为资产, index 为 日期. 342 | 价格数据必须覆盖因子分析时间段以及额外远期收益计算中的最大预期期数. 343 | groupby : pd.Series - MultiIndex or dict 344 | index 为日期和资产的 Series,为每个资产每天的分组,或资产-分组映射的字典. 345 | 如果传递了dict,则假定分组映射在整个时间段内保持不变. 346 | weights : pd.Series - MultiIndex or dict 347 | index 为日期和资产的 Series,为每个资产每天的权重,或资产-权重映射的字典. 348 | 如果传递了dict,则假定权重映射在整个时间段内保持不变. 349 | binning_by_group : bool 350 | 如果为 True, 则对每个组分别计算分位数. 351 | 适用于因子值范围在各个组上变化很大的情况. 352 | 如果要分析分组(行业)中性的组合, 您最好设置为 True 353 | quantiles : int or sequence[float] 354 | 在因子分组中按照因子值大小平均分组的组数。 355 |          或分位数序列, 允许不均匀分组 356 | 例如 [0, .10, .5, .90, 1.] 或 [.05, .5, .95] 357 | 'quantiles' 和 'bins' 有且只能有一个不为 None 358 | bins : int or sequence[float] 359 | 在因子分组中使用的等宽 (按照因子值) 区间的数量 360 | 或边界值序列, 允许不均匀的区间宽度 361 | 例如 [-4, -2, -0.5, 0, 10] 362 | 'quantiles' 和 'bins' 有且只能有一个不为 None 363 | periods : sequence[int] 364 | 远期收益的期数 365 | max_loss : float, optional 366 | 允许的丢弃因子数据的最大百分比 (0.00 到 1.00), 367 | 计算比较输入因子索引中的项目数和输出 DataFrame 索引中的项目数. 368 | 因子数据本身存在缺陷 (例如 NaN), 369 | 没有提供足够的价格数据来计算所有因子值的远期收益, 370 | 或者因为分组失败, 因此可以部分地丢弃因子数据 371 | 设置 max_loss = 0 以停止异常捕获. 372 | zero_aware : bool, optional 373 | 如果为True,则分别为正负因子值计算分位数。 374 | 适用于您的信号聚集并且零是正值和负值的分界线的情况. 375 | 376 | 返回值 377 | ------- 378 | merged_data : pd.DataFrame - MultiIndex 379 | 一个 DataFrame, index 为日期 (level 0) 和资产(level 1) 的 MultiIndex, 380 | values 包括因子的值, 各期因子远期收益, 因子分位数, 381 | 因子分组(可选), 因子权重(可选) 382 | - 各期因子远期收益的列名满足 'period_1', 'period_5' 的格式 383 | """ 384 | 385 | forward_returns = compute_forward_returns(factor, prices, periods) 386 | 387 | factor_data = get_clean_factor(factor, forward_returns, groupby=groupby, 388 | weights=weights, 389 | quantiles=quantiles, bins=bins, 390 | binning_by_group=binning_by_group, 391 | max_loss=max_loss, zero_aware=zero_aware) 392 | 393 | return factor_data 394 | 395 | 396 | def common_start_returns( 397 | factor, 398 | prices, 399 | before, 400 | after, 401 | cumulative=False, 402 | mean_by_date=False, 403 | demean_by=None 404 | ): 405 | 406 | if cumulative: 407 | returns = prices 408 | else: 409 | returns = prices.pct_change(axis=0) 410 | 411 | all_returns = [] 412 | 413 | for timestamp, df in factor.groupby(level='date'): 414 | 415 | equities = df.index.get_level_values('asset') 416 | 417 | try: 418 | day_zero_index = returns.index.get_loc(timestamp) 419 | except KeyError: 420 | continue 421 | 422 | starting_index = max(day_zero_index - before, 0) 423 | ending_index = min(day_zero_index + after + 1, len(returns.index)) 424 | 425 | equities_slice = set(equities) 426 | if demean_by is not None: 427 | demean_equities = demean_by.loc[timestamp] \ 428 | .index.get_level_values('asset') 429 | equities_slice |= set(demean_equities) 430 | 431 | series = returns.loc[returns. 
432 | index[starting_index:ending_index], equities_slice] 433 | series.index = range( 434 | starting_index - day_zero_index, ending_index - day_zero_index 435 | ) 436 | 437 | if cumulative: 438 | series = (series / series.loc[0, :]) - 1 439 | 440 | if demean_by is not None: 441 | mean = series.loc[:, demean_equities].mean(axis=1) 442 | series = series.loc[:, equities] 443 | series = series.sub(mean, axis=0) 444 | 445 | if mean_by_date: 446 | series = series.mean(axis=1) 447 | 448 | all_returns.append(series) 449 | 450 | return pd.concat(all_returns, axis=1) 451 | 452 | 453 | def rate_of_return(period_ret): 454 | """ 455 | 转换回报率为"每期"回报率:如果收益以稳定的速度增长, 则相当于每期的回报率 456 | """ 457 | period = int(period_ret.name.replace('period_', '')) 458 | return period_ret.add(1).pow(1. / period).sub(1) 459 | 460 | 461 | def std_conversion(period_std): 462 | """ 463 | 转换回报率标准差为"每期"回报率标准差 464 | """ 465 | period_len = int(period_std.name.replace('period_', '')) 466 | return period_std / np.sqrt(period_len) 467 | -------------------------------------------------------------------------------- /jqfactor_analyzer/preprocess.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | import warnings 4 | 5 | import pandas as pd 6 | import numpy as np 7 | from scipy.stats.mstats import winsorize as spwinsorize 8 | from decimal import Decimal 9 | from .utils import ignore_warning 10 | 11 | from .data import DataApi,convert_date 12 | from fastcache import lru_cache 13 | from functools import partial 14 | from statsmodels.api import OLS, add_constant as sm_add_constant 15 | 16 | 17 | 18 | def winsorize(data, scale=None, range=None, qrange=None, inclusive=True, inf2nan=True, axis=1): 19 | 20 | if isinstance(data, pd.DataFrame): 21 | return data.apply( 22 | winsorize, 23 | axis, 24 | scale=scale, 25 | range=range, 26 | qrange=qrange, 27 | inclusive=inclusive, 28 | inf2nan=inf2nan 29 | ) 30 | elif (isinstance(data, np.ndarray) and data.ndim > 1): 31 | return np.apply_along_axis( 32 | winsorize, 33 | axis, 34 | arr=data, 35 | scale=scale, 36 | range=range, 37 | qrange=qrange, 38 | inclusive=inclusive, 39 | inf2nan=inf2nan 40 | ) 41 | 42 | if isinstance(data, pd.Series): 43 | v = data.values 44 | else: 45 | v = data 46 | 47 | if not np.isfinite(v).any(): 48 | return data 49 | 50 | # 如果v是int arrary,无法给 array 赋值 np.nan,因为 np.nan 是个 float 51 | v = v.astype(float) 52 | 53 | if inf2nan: 54 | v[~np.isfinite(v)] = np.nan 55 | 56 | if qrange: 57 | if not ((0 <= qrange[0] <= 1) and (0 <= qrange[1] <= 1)): 58 | raise Exception(u'qrange 值应在 0 到 1 之间,如 [0.05, 0.95]') 59 | qrange = (Decimal(str(qrange[0])), 1 - Decimal(str(qrange[1]))) 60 | 61 | if inclusive: 62 | v[~np.isnan(v)] = spwinsorize(v[~np.isnan(v)], qrange, inclusive=[True, True]) 63 | else: 64 | # 如果v是int arrary,无法给 array 赋值 np.nan,因为 np.nan 是个 float 65 | v = v.astype(float) 66 | not_nan = v[~np.isnan(v)] 67 | not_nan[not_nan != spwinsorize(not_nan, qrange, inclusive=[True, True])] = np.nan 68 | v[~np.isnan(v)] = not_nan 69 | 70 | else: 71 | if range: 72 | range_ = (Decimal(str(range[0])) if not np.isnan(range[0]) else np.nan, 73 | Decimal(str(range[1])) if not np.isnan(range[1]) else np.nan) 74 | else: 75 | mu = np.mean(data[np.isfinite(data)]) 76 | sigma = np.std(data[np.isfinite(data)]) 77 | range_ = (np.nanmin(v[v > mu - scale * sigma]), 78 | np.nanmax(v[v < mu + scale * sigma])) 79 | 80 | if inclusive: 81 | not_nan = ~np.isnan(v) 82 | v[not_nan] = np.where(v[not_nan] < range_[0], range_[0], v[not_nan]) 83 | not_nan = 
~np.isnan(v) 84 | v[not_nan] = np.where(v[not_nan] > range_[1], range_[1], v[not_nan]) 85 | else: 86 | not_nan = ~np.isnan(v) 87 | v_not_nan = v[not_nan] 88 | v[not_nan] = np.where( 89 | np.logical_and(v_not_nan >= range_[0], v_not_nan <= range_[1]), v_not_nan, np.nan 90 | ) 91 | 92 | if isinstance(data, pd.Series): 93 | return pd.Series(v, index=data.index) 94 | else: 95 | return v 96 | 97 | 98 | def winsorize_med(data, scale=1, inclusive=True, inf2nan=True, axis=1): 99 | 100 | if isinstance(data, pd.DataFrame): 101 | return data.apply(winsorize_med, axis, scale=scale, inclusive=inclusive, inf2nan=inf2nan) 102 | elif (isinstance(data, np.ndarray) and data.ndim > 1): 103 | return np.apply_along_axis( 104 | winsorize_med, axis, arr=data, scale=scale, inclusive=inclusive, inf2nan=inf2nan 105 | ) 106 | 107 | if isinstance(data, pd.Series): 108 | v = data.values 109 | else: 110 | v = data 111 | 112 | if not np.isfinite(v).any(): 113 | return data 114 | 115 | # 如果v是int arrary,无法给 array 赋值 np.nan,因为 np.nan 是个 float 116 | v = v.astype(float) 117 | 118 | if inf2nan: 119 | v[~np.isfinite(v)] = np.nan 120 | 121 | med = np.median(v[~np.isnan(v)]) 122 | 123 | data_minus_med = v[~np.isnan(v)] - med 124 | median_absolute = np.median(np.abs(data_minus_med)) 125 | 126 | if inclusive: 127 | not_nan = ~np.isnan(v) 128 | v[not_nan] = np.where( 129 | v[not_nan] > med + scale * median_absolute, med + scale * median_absolute, v[not_nan] 130 | ) 131 | not_nan = ~np.isnan(v) 132 | v[not_nan] = np.where( 133 | v[not_nan] < med - scale * median_absolute, med - scale * median_absolute, v[not_nan] 134 | ) 135 | else: 136 | # 如果v是int arrary,np.nan 会被转换成一个极小的数,比如 -2147483648 137 | v = v.astype(float) 138 | not_nan = ~np.isnan(v) 139 | v_not_nan = v[not_nan] 140 | v[not_nan] = np.where( 141 | np.logical_and( 142 | v_not_nan <= med + scale * median_absolute, 143 | v_not_nan >= med - scale * median_absolute 144 | ), v_not_nan, np.nan 145 | ) 146 | 147 | if isinstance(data, pd.Series): 148 | return pd.Series(v, index=data.index) 149 | else: 150 | return v 151 | 152 | 153 | @ignore_warning(message='Mean of empty slice', category=RuntimeWarning) 154 | @ignore_warning(message='Degrees of freedom <= 0 for slice', 155 | category=RuntimeWarning) 156 | @ignore_warning(message='invalid value encountered in true_divide', 157 | category=RuntimeWarning) 158 | def standardlize(data, inf2nan=True, axis=1): 159 | if inf2nan: 160 | data = data.astype('float64') 161 | data[np.isinf(data)] = np.nan 162 | 163 | axis = min(data.ndim - 1, axis) 164 | 165 | if not np.any(np.isfinite(data)): 166 | return data 167 | 168 | mu = np.nanmean(np.where(~np.isinf(data), data, np.nan), axis=axis) 169 | std = np.nanstd(np.where(~np.isinf(data), data, np.nan), axis=axis) 170 | 171 | rep = np.tile if axis == 0 else np.repeat 172 | mu = np.asarray(rep(mu, data.shape[axis])).reshape(data.shape) 173 | std = np.asarray(rep(std, data.shape[axis])).reshape(data.shape) 174 | 175 | if isinstance(data, (pd.Series, pd.DataFrame)): 176 | data = data.where(np.isinf(data), (data - mu) / std) 177 | else: 178 | data = np.where(np.isinf(data), data, (data - mu) / std) 179 | return data 180 | 181 | 182 | @lru_cache(3) 183 | def cache_dataapi(allow_cache=True, show_progress=False): 184 | return DataApi(allow_cache=allow_cache, show_progress=show_progress) 185 | 186 | 187 | def get_neu_basicdata(how, securities, date=None): 188 | """获取中性化的依赖数据 189 | 返回: 一个 DataFrame, index 是股票代码 190 | """ 191 | if isinstance(how, str): 192 | how = [how] 193 | 194 | if isinstance(how, 
(pd.Series, pd.DataFrame)):
195 |         return how
196 |     elif isinstance(how, (list, tuple)):
197 |         how_datas = []
198 |     else:
199 |         raise ValueError("错误的 how 参数格式 : {}".format(how))
200 | 
201 |     dataapi = cache_dataapi()
202 |     for how_name in how:
203 |         if isinstance(how_name, pd.Series):
204 |             how_datas.append(how_name.to_frame())
205 |         elif isinstance(how_name, pd.DataFrame):
206 |             how_datas.append(how_name)
207 |         elif how_name in ['jq_l1', 'jq_l2', 'sw_l1', 'sw_l2', 'sw_l3', 'zjw']:
208 |             industry_info = pd.get_dummies(dataapi._get_cached_industry_one_day(
209 |                 date, securities, industry=how_name)).reindex(securities, fill_value=0)
210 |             how_datas.append(industry_info)
211 |         elif how_name in ['mktcap', 'ln_mktcap', 'cmktcap', 'ln_cmktcap']:
212 |             if how_name == 'mktcap':
213 |                 mkt_api = partial(dataapi._get_market_cap, ln=False)
214 |             elif how_name == 'ln_mktcap':
215 |                 mkt_api = partial(dataapi._get_market_cap, ln=True)
216 |             elif how_name == 'cmktcap':
217 |                 mkt_api = partial(dataapi._get_circulating_market_cap, ln=False)
218 |             elif how_name == 'ln_cmktcap':
219 |                 mkt_api = partial(dataapi._get_circulating_market_cap, ln=True)
220 | 
221 |             market_info = mkt_api(securities=securities, start_date=date, end_date=date).T
222 |             market_info.columns = [how_name]
223 |             how_datas.append(market_info)
224 |         else:
225 |             raise ValueError("不支持的因子名称 : {} ".format(how_name))
226 | 
227 |     return pd.concat(how_datas, axis=1)
228 | 
229 | 
230 | def neutralize(data, how=None, date=None, axis=1, fillna=None, add_constant=False):
231 |     """中性化
232 |     data: pd.Series/pd.DataFrame, 待中性化的序列, 序列的 index/columns 为股票的 code
233 |     how: str list. 中性化使用的因子名称列表. 默认为 ['jq_l1', 'mktcap'], 支持的中性化方法有:
234 |         1. 行业: sw_l1, sw_l2, sw_l3, jq_l1, jq_l2
235 |         2. 市值因子: mktcap(总市值), ln_mktcap(对数总市值), cmktcap(流通市值), ln_cmktcap(对数流通市值)
236 |         3. 自定义的中性化数据: 支持同时传入额外的 Series 或者 DataFrame 用来进行中性化, index 必须是标的代码
237 |         以上三类参数可同时传入参数列表
238 |     date: 日期, 将用 date 这天的相关变量数据对 series 进行中性化 (注意依赖数据的实际可用时间, 如市值数据当天盘中是无法获取到的)
239 |     axis: 默认为 1. 仅在 data 为 pd.DataFrame 时生效. 表示沿哪个方向做中性化, 0 为对每列做中性化, 1 为对每行做中性化
240 |     fillna: 缺失值填充方式, 默认为None, 表示不填充. 支持的值:
241 |         'jq_l1': 聚宽一级行业
242 |         'jq_l2': 聚宽二级行业
243 |         'sw_l1': 申万一级行业
244 |         'sw_l2': 申万二级行业
245 |         'sw_l3': 申万三级行业 表示使用某行业分类的均值进行填充. 
246 | add_constant: 中性化时是否添加常数项, 默认为 False 247 | """ 248 | if data.dropna(how='all').empty: 249 | return data 250 | 251 | if how is None: 252 | how = ['jq_l1', 'mktcap'] 253 | elif isinstance(how, str): 254 | how = [how] 255 | 256 | if isinstance(data, pd.Series) or axis == 0: 257 | securities = data.index.astype(str) 258 | else: 259 | securities = data.columns.astype(str) 260 | invalid_securities = securities[~(securities.str.endswith("XSHG") | securities.str.endswith("XSHE"))].tolist() 261 | if invalid_securities: 262 | raise ValueError('neutralize: 找不到股票: {sym:s}'.format(sym=str(invalid_securities))) 263 | 264 | exposure = get_neu_basicdata(how, securities.tolist(), date=date) 265 | 266 | with pd.option_context('mode.use_inf_as_null', True): 267 | exposure.dropna(axis=1, how='all', inplace=True) 268 | exposure.dropna(inplace=True) 269 | exposure = exposure.astype(np.float64) 270 | 271 | if exposure.empty: 272 | return data 273 | 274 | if fillna is not None: 275 | dataapi = cache_dataapi() 276 | ind = dataapi._get_cached_industry_one_day(date, securities) 277 | 278 | def valid_index(s): 279 | return s[np.isfinite(s)].index.intersection(exposure.index) 280 | 281 | def get_resid(s): 282 | valid_index_ = valid_index(s) 283 | if len(valid_index_) > 1: 284 | resid = OLS( 285 | s.loc[valid_index_].values, 286 | (sm_add_constant(exposure.loc[valid_index_].values) if add_constant 287 | else exposure.loc[valid_index_].values), 288 | missing='drop' 289 | ).fit().resid 290 | resid = pd.Series(resid, index=valid_index_) 291 | resid = resid.reindex(s.index, fill_value=np.nan) 292 | if fillna is not None: 293 | resid = resid.groupby(ind.loc[s.index]).apply(lambda x: x.fillna(x.mean())) 294 | else: 295 | resid = pd.Series(np.nan, index=s.index) 296 | return resid 297 | 298 | if isinstance(data, pd.Series): 299 | return get_resid(data) 300 | else: 301 | return data.apply(get_resid, axis) 302 | 303 | 304 | __all__ = [ 305 | 'neutralize', 306 | 'winsorize', 307 | 'winsorize_med', 308 | 'standardlize', 309 | ] 310 | -------------------------------------------------------------------------------- /jqfactor_analyzer/sample.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import os 4 | import pandas as pd 5 | 6 | 7 | VOL5 = pd.read_csv( 8 | os.path.abspath(os.path.join(os.path.dirname(__file__), 9 | 'sample_data', 10 | 'VOL5.csv')), 11 | header=0, index_col=0, encoding='utf-8' 12 | ) 13 | 14 | VOL5.index = pd.to_datetime(VOL5.index) 15 | VOL5.index.set_names(['date'], inplace=True) 16 | VOL5.columns.set_names(['asset'], inplace=True) 17 | -------------------------------------------------------------------------------- /jqfactor_analyzer/utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | 4 | import re 5 | import six 6 | import warnings 7 | from functools import wraps 8 | try: 9 | from collections import Iterable 10 | except ImportError: 11 | from collections.abc import Iterable 12 | 13 | import pandas as pd 14 | 15 | 16 | def get_forward_returns_columns(columns): 17 | syntax = re.compile("^period_\\d+$") 18 | return columns[columns.astype('str').str.contains(syntax, regex=True)] 19 | 20 | 21 | def convert_to_forward_returns_columns(period): 22 | try: 23 | return 'period_{:d}'.format(period) 24 | except ValueError: 25 | return period 26 | 27 | 28 | def ignore_warning(message='', category=Warning, module='', lineno=0, append=False): 29 | """过滤 warnings""" 30 | def 
decorator(func): 31 | @wraps(func) 32 | def func_wrapper(*args, **kwargs): 33 | with warnings.catch_warnings(): 34 | warnings.filterwarnings('ignore', message=message, category=category, 35 | module=module, lineno=lineno, append=append) 36 | return func(*args, **kwargs) 37 | return func_wrapper 38 | 39 | return decorator 40 | 41 | 42 | def ensure_tuple(x): 43 | if isinstance(x, six.string_types) or not isinstance(x, Iterable): 44 | return (x,) 45 | else: 46 | return tuple(x) 47 | -------------------------------------------------------------------------------- /jqfactor_analyzer/version.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | 4 | __version__ = '1.1.0' 5 | -------------------------------------------------------------------------------- /jqfactor_analyzer/when.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import six 4 | import datetime 5 | 6 | import pandas as pd 7 | 8 | 9 | DateTime = datetime.datetime 10 | Date = datetime.date 11 | Time = datetime.time 12 | TimeDelta = datetime.timedelta 13 | 14 | today = datetime.date.today 15 | now = datetime.datetime.now 16 | 17 | 18 | def date2str(date, format='%Y-%m-%d'): 19 | return pd.to_datetime(date).strftime(format) 20 | 21 | 22 | def convert_date(date): 23 | if isinstance(date, six.string_types): 24 | if ':' in date: 25 | date = date[:10] 26 | return datetime.datetime.strptime(date, '%Y-%m-%d').date() 27 | elif isinstance(date, datetime.datetime): 28 | return date.date() 29 | elif isinstance(date, datetime.date): 30 | return date 31 | raise Exception("date 必须是datetime.date, datetime.datetime或者如下格式的字符串:'2015-01-05'") 32 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | six 2 | fastcache>=1.0.2 3 | SQLAlchemy>=1.2.8 4 | cached_property>=1.5.1 5 | statsmodels 6 | scipy 7 | numpy>=1.15.0 8 | pandas>=1.0.0 9 | matplotlib 10 | seaborn 11 | jqdatasdk 12 | pyarrow 13 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [bdist_wheel] 2 | universal=1 3 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | from __future__ import print_function 5 | 6 | from os.path import join as path_join, dirname as path_dirname 7 | 8 | from setuptools import setup, find_packages 9 | 10 | try: 11 | # for pip >= 10 12 | from pip._internal.req import parse_requirements 13 | except ImportError: 14 | # for pip <= 9.0.3 15 | from pip.req import parse_requirements 16 | 17 | try: 18 | requirements = [str(ir.req) for ir in parse_requirements("requirements.txt", session=False)] 19 | except AttributeError: 20 | requirements = [str(ir.requirement) for ir in parse_requirements("requirements.txt", session=False)] 21 | 22 | 23 | def get_version(): 24 | scope = {} 25 | with open(path_join(path_dirname(__file__), "jqfactor_analyzer", "version.py")) as fp: 26 | exec(fp.read(), scope) 27 | return scope.get('__version__', '1.0') 28 | 29 | 30 | def get_long_description(): 31 | with open(path_join(path_dirname(__file__), 'README.md'), 'rb') as fp: 32 | long_desc = fp.read() 33 | 34 | long_desc = 
long_desc.replace( 35 | u'docs/API文档.md'.encode('utf-8'), 36 | u'https://github.com/JoinQuant/jqfactor_analyzer/blob/master/docs/API%E6%96%87%E6%A1%A3.md'.encode('utf-8'), 37 | ) 38 | 39 | return long_desc.decode('utf-8') 40 | 41 | 42 | setup_args = dict( 43 | name='jqfactor_analyzer', 44 | version=get_version(), 45 | packages=find_packages(exclude=("tests", "tests.*")), 46 | author='JoinQuant', 47 | author_email='xlx@joinquant.com', 48 | maintainer="", 49 | maintainer_email="", 50 | url='https://www.joinquant.com', 51 | description='JoinQuant single factor analyzer', 52 | long_description=get_long_description(), 53 | long_description_content_type='text/markdown', 54 | zip_safe=False, 55 | platforms=["all"], 56 | license='Apache License v2', 57 | classifiers=[ 58 | 'Programming Language :: Python', 59 | 'Operating System :: Microsoft :: Windows', 60 | 'Operating System :: Unix', 61 | 'Programming Language :: Python :: 2.7', 62 | 'Programming Language :: Python :: 3.4', 63 | 'Programming Language :: Python :: 3.5', 64 | 'Programming Language :: Python :: 3.6', 65 | 'Programming Language :: Python :: 3.7', 66 | ], 67 | install_requires=requirements, 68 | include_package_data=True, 69 | package_data={'jqfactor_analyzer': ['jqfactor_analyzer/sample_data/*.csv', 'jqfactor_analyzer/config.json']}, 70 | ) 71 | 72 | 73 | def main(): 74 | setup(**setup_args) 75 | 76 | 77 | if __name__ == "__main__": 78 | main() 79 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JoinQuant/jqfactor_analyzer/69e677dc0dd9bed9fece02a70b9c81ce3d0afc53/tests/__init__.py -------------------------------------------------------------------------------- /tests/test_attribution.py: -------------------------------------------------------------------------------- 1 | import os 2 | import datetime 3 | import pandas as pd 4 | from functools import partial 5 | 6 | from jqfactor_analyzer import AttributionAnalysis, DataApi 7 | 8 | try: 9 | import jqdata 10 | except: 11 | # 使用 sdk 进行测试时可能需要先登陆 12 | import jqdatasdk 13 | 14 | weights = pd.read_csv( 15 | os.path.join(os.getcwd(), "jqfactor_analyzer/sample_data/weight_info.csv"), index_col=0) 16 | returns = weights.pop("return") 17 | index_weights = pd.read_csv( 18 | os.path.join(os.getcwd(), "jqfactor_analyzer/sample_data/index_weight_info.csv"), index_col=0) 19 | index_returns = index_weights.pop("return") 20 | 21 | dataapi = DataApi(allow_cache=True, show_progress=True) 22 | w2 = index_weights.div(index_weights.sum(axis=1), axis=0) * 0.1 23 | r2 = dataapi.api.get_price('000905.XSHG', 24 | start_date='2020-01-01', 25 | end_date='2024-07-01', 26 | fields='close', 27 | fq=None)['close'].pct_change() * 0.1 28 | An = AttributionAnalysis(w2, r2, style_type='style' ) 29 | df = An.get_attr_returns2bench("000905.XSHG") 30 | 31 | 32 | def test_get_attr_returns2bench(): 33 | assert df.shape == (1088, 46) 34 | assert set(df.columns) == set([ 35 | 'beta', 'book_to_price_ratio', 'earnings_yield', 'growth', 'leverage', 36 | 'liquidity', 'momentum', 'non_linear_size', 'residual_volatility', 37 | 'size', '801750', '801160', '801200', '801780', '801050', '801040', 38 | '801960', '801170', '801760', '801790', '801720', '801130', '801080', 39 | '801110', '801890', '801140', '801120', '801180', '801880', '801030', 40 | '801770', '801740', '801730', '801950', '801010', '801230', '801710', 41 | '801970', '801210', '801150', '801020', 
'801980', 'common_return', 42 | 'cash', 'specific_return', 'total_return'] 43 | ) 44 | 45 | 46 | def test_net(): 47 | func = partial(dataapi.api.get_price, 48 | '000905.XSHG', 49 | start_date='2020-01-01', 50 | end_date='2024-07-01', 51 | fields='close') 52 | if dataapi._api_name == 'jqdata': 53 | index_return = func(pre_factor_ref_date=datetime.date.today())['close'].pct_change()[1:] 54 | else: 55 | index_return = func()['close'].pct_change()[1:] 56 | index_net = (index_return.fillna(0) + 1).cumprod() 57 | assert len(index_net) == 1087 58 | -------------------------------------------------------------------------------- /tests/test_data.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | 4 | from jqfactor_analyzer.data import DataApi 5 | from jqfactor_analyzer.preprocess import * 6 | from jqfactor_analyzer.factor_cache import * 7 | 8 | 9 | try: 10 | import jqdata 11 | except: 12 | # 使用 sdk 进行测试时可能需要先登陆 13 | import jqdatasdk 14 | 15 | 16 | def test_preprocess(): 17 | api = DataApi(weight_method='mktcap') 18 | codes = api._api.get_all_securities('stock').index.tolist() 19 | start_date = '2024-07-05' 20 | end_date = '2024-07-15' 21 | df = api.apis['prices'](codes, start_date, end_date).dropna(how='all', axis=1) 22 | 23 | w_df = winsorize(df, scale=1) 24 | assert all(df.max() >= w_df.max()) 25 | 26 | wm_df = winsorize_med(df, scale=1) 27 | assert not wm_df.equals(w_df) 28 | 29 | s_df = standardlize(df) 30 | assert set(s_df.std(axis=1).round()) == {1.0} 31 | 32 | n_df = neutralize(df, how='sw_l3', date='2024-07-10') 33 | assert n_df.shape == (7, 5111) 34 | 35 | 36 | def test_cache(): 37 | # api1 不开启缓存, api2 开启缓存 38 | api1 = DataApi(weight_method='mktcap', allow_cache=False) 39 | api2 = DataApi(weight_method='mktcap') 40 | codes = api1._api.get_all_securities('stock').index.tolist() 41 | start_date = '2024-07-01' 42 | end_date = '2024-07-10' 43 | 44 | df1 = api1.apis['weights'](codes, start_date, end_date) 45 | df2 = api2.apis['weights'](codes, start_date, end_date) 46 | for code in codes: 47 | assert (df1[code] - df2[code]).abs().sum() < 1e-3 48 | 49 | api1.weight_method = api2.weight_method = 'cmktcap' 50 | df1 = api1.apis['weights'](codes, start_date, end_date) 51 | df2 = api2.apis['weights'](codes, start_date, end_date) 52 | for code in codes: 53 | assert (df1[code] - df2[code]).abs().sum() < 1e-3 54 | 55 | df1 = api1.apis['prices'](codes, start_date, end_date) 56 | df2 = api2.apis['prices'](codes, start_date, end_date) 57 | assert df1.equals(df2) 58 | 59 | # 非后复权的 price 存在微量差异 60 | api1.fq = 'pre' 61 | api2.fq = 'pre' 62 | df1 = api1.apis['prices'](codes, start_date, end_date) # 无缓存 63 | df2 = api2.apis['prices'](codes, start_date, end_date) # 有缓存 64 | for code in codes: 65 | diff = (df1[code] - df2[code]).abs().sum() 66 | assert diff < 1e-12 67 | 68 | api1.fq = None 69 | api1.price = 'open' 70 | api2.fq = None 71 | api2.price = 'open' 72 | df1 = api1.apis['prices'](codes, start_date, end_date) 73 | df2 = api2.apis['prices'](codes, start_date, end_date) 74 | for code in codes: 75 | diff = (df1[code] - df2[code]).abs().sum() 76 | assert diff < 1e-12 77 | 78 | df1 = api1.apis['groupby'](codes, start_date, end_date) 79 | df2 = api2.apis['groupby'](codes, start_date, end_date) 80 | assert df1.equals(df2) 81 | 82 | # 删除缓存文件 83 | cache_path = get_cache_dir() 84 | if os.path.exists(cache_path): 85 | shutil.rmtree(cache_path) 86 | -------------------------------------------------------------------------------- 
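# 附: preprocess 接口的最小用法示意 (假设性示例, 不依赖 jqdatasdk; 宽表为 日期 x 股票, 默认 axis=1 按行做横截面处理):
#
#   import numpy as np
#   import pandas as pd
#   from jqfactor_analyzer.preprocess import winsorize_med, standardlize
#
#   demo = pd.DataFrame(np.random.randn(5, 100))  # 5 个交易日 x 100 只股票的随机"因子值"
#   clean = standardlize(winsorize_med(demo, scale=3), axis=1)  # 先按中位数去极值, 再做横截面标准化
#   assert (clean.std(axis=1, ddof=0) - 1).abs().max() < 1e-6  # standardlize 内部用 np.nanstd (ddof=0)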
/tests/test_performance.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | 
3 | 
4 | import pytest
5 | import pandas as pd
6 | from numpy import nan, float64
7 | 
8 | from jqfactor_analyzer.prepare import get_clean_factor_and_forward_returns
9 | from jqfactor_analyzer.performance import (
10 |     factor_information_coefficient,
11 |     factor_autocorrelation,
12 |     mean_information_coefficient,
13 |     quantile_turnover,
14 |     factor_returns, factor_alpha_beta,
15 |     average_cumulative_return_by_quantile
16 | )
17 | from jqfactor_analyzer.utils import get_forward_returns_columns
18 | 
19 | 
20 | dr = pd.date_range(start='2015-1-1', end='2015-1-2')
21 | dr.name = 'date'
22 | tickers = ['A', 'B', 'C', 'D']
23 | factor = pd.DataFrame(index=dr,
24 |                       columns=tickers,
25 |                       data=[[1, 2, 3, 4],
26 |                             [4, 3, 2, 1]]).stack()
27 | factor.index = factor.index.set_names(['date', 'asset'])
28 | factor.name = 'factor'
29 | factor_data = pd.DataFrame()
30 | factor_data['factor'] = factor
31 | factor_data['group'] = pd.Series(index=factor.index,
32 |                                  data=[1, 1, 2, 2, 1, 1, 2, 2],)
33 | factor_data['weights'] = pd.Series(range(8), index=factor.index,
34 |                                    dtype=float64) + 1
35 | 
36 | 
37 | @pytest.mark.parametrize(
38 |     ('factor_data', 'forward_returns', 'group_adjust',
39 |      'by_group', 'expected_ix', 'expected_ic_val'),
40 |     [(factor_data, [4, 3, 2, 1, 1, 2, 3, 4], False, False, dr, [-1., -1.]),
41 |      (factor_data, [1, 2, 3, 4, 4, 3, 2, 1], False, False, dr, [1., 1.]),
42 |      (factor_data, [1, 2, 3, 4, 4, 3, 2, 1], False, True,
43 |       pd.MultiIndex.from_product([dr, [1, 2]], names=['date', 'group']),
44 |       [1., 1., 1., 1.]),
45 |      (factor_data, [1, 2, 3, 4, 4, 3, 2, 1], True, True,
46 |       pd.MultiIndex.from_product([dr, [1, 2]], names=['date', 'group']),
47 |       [1., 1., 1., 1.])]
48 | )
49 | def test_information_coefficient(factor_data,
50 |                                  forward_returns,
51 |                                  group_adjust,
52 |                                  by_group,
53 |                                  expected_ix,
54 |                                  expected_ic_val):
55 | 
56 |     factor_data = factor_data.copy()
57 |     factor_data['period_1'] = pd.Series(index=factor_data.index,
58 |                                         data=forward_returns)
59 | 
60 |     ic = factor_information_coefficient(factor_data=factor_data,
61 |                                         group_adjust=group_adjust,
62 |                                         by_group=by_group)
63 | 
64 |     expected_ic_df = pd.DataFrame(index=expected_ix,
65 |                                   columns=pd.Index(['period_1'], dtype='object'),
66 |                                   data=expected_ic_val)
67 | 
68 |     pd.testing.assert_frame_equal(ic, expected_ic_df)
69 | 
70 | @pytest.mark.parametrize(
71 |     (
72 |         'factor_data', 'forward_returns', 'group_adjust',
73 |         'by_group', 'by_time', 'expected_ix', 'expected_ic_val'
74 |     ), [
75 |         (factor_data, [4, 3, 2, 1, 1, 2, 3, 4], False, False, 'D',
76 |          dr, [-1., -1.]),
77 |         (factor_data, [1, 2, 3, 4, 4, 3, 2, 1], False, False, 'W',
78 |          pd.DatetimeIndex(['2015-01-04'], name='date', freq='W-SUN'), [1.]),
79 |         (factor_data, [1, 2, 3, 4, 4, 3, 2, 1], False, True, None,
80 |          pd.Index([1, 2], name='group'), [1., 1.]),  # pd.Int64Index was removed in pandas 2.x
81 |         (factor_data, [1, 2, 3, 4, 4, 3, 2, 1], False, True, 'W',
82 |          pd.MultiIndex.from_product(
83 |              [pd.DatetimeIndex(['2015-01-04'], name='date', freq='W-SUN'),
84 |               [1, 2]],
85 |              names=['date', 'group']
86 |          ),
87 |          [1., 1.])
88 |     ]
89 | )
90 | def test_mean_information_coefficient(factor_data,
91 |                                       forward_returns,
92 |                                       group_adjust,
93 |                                       by_group,
94 |                                       by_time,
95 |                                       expected_ix,
96 |                                       expected_ic_val):
97 | 
98 |     factor_data = factor_data.copy()
99 |     factor_data['period_1'] = pd.Series(index=factor_data.index,
100 |                                         data=forward_returns)
101 | 
102 |     ic = mean_information_coefficient(factor_data,
103 |                                       group_adjust=group_adjust,
104 |                                       by_group=by_group,
105 |                                       by_time=by_time)
106 | 
107 |     expected_ic_df = pd.DataFrame(index=expected_ix,
108 |                                   columns=pd.Index(['period_1']),
109 |                                   data=expected_ic_val)
110 | 
111 |     pd.testing.assert_frame_equal(ic, expected_ic_df,
112 |                                   check_index_type=False,
113 |                                   check_column_type=False)
114 | 
115 | 
116 | @pytest.mark.parametrize(
117 |     ('quantile_values', 'test_quantile', 'expected_vals'),
118 |     [([[1.0, 2.0, 3.0, 4.0],
119 |        [4.0, 3.0, 2.0, 1.0],
120 |        [1.0, 2.0, 3.0, 4.0],
121 |        [1.0, 2.0, 3.0, 4.0]],
122 |       4.0,
123 |       [nan, 1.0, 1.0, 0.0]),
124 |      ([[1.0, 2.0, 3.0, 4.0],
125 |        [1.0, 2.0, 3.0, 4.0],
126 |        [1.0, 2.0, 3.0, 4.0],
127 |        [1.0, 2.0, 3.0, 4.0]],
128 |       3.0,
129 |       [nan, 0.0, 0.0, 0.0]),
130 |      ([[1.0, 2.0, 3.0, 4.0],
131 |        [4.0, 3.0, 2.0, 1.0],
132 |        [1.0, 2.0, 3.0, 4.0],
133 |        [4.0, 3.0, 2.0, 1.0]],
134 |       2.0,
135 |       [nan, 1.0, 1.0, 1.0])]
136 | )
137 | def test_quantile_turnover(quantile_values, test_quantile,
138 |                            expected_vals):
139 | 
140 |     dr = pd.date_range(start='2015-1-1', end='2015-1-4')
141 |     dr.name = 'date'
142 |     tickers = ['A', 'B', 'C', 'D']
143 | 
144 |     quantized_test_factor = pd.Series(
145 |         pd.DataFrame(index=dr, columns=tickers, data=quantile_values).stack()
146 |     )
147 |     quantized_test_factor.index = quantized_test_factor.index.set_names(
148 |         ['date', 'asset']
149 |     )
150 | 
151 |     to = quantile_turnover(quantized_test_factor, test_quantile)
152 | 
153 |     expected = pd.Series(
154 |         index=quantized_test_factor.index.levels[0], data=expected_vals)
155 |     expected.name = test_quantile
156 | 
157 |     pd.testing.assert_series_equal(to, expected)
158 | 
159 | 
160 | @pytest.mark.parametrize(
161 |     ('factor_data', 'factor_vals', 'fwd_return_vals',
162 |      'group_adjust', 'expected_vals'),
163 |     [(factor_data, [1, 2, 3, 4, 4, 3, 2, 1], [4, 3, 2, 1, 1, 2, 3, 4],
164 |       False, [-1.25000, -1.25000]),
165 |      (factor_data, [1, 1, 1, 1, 1, 1, 1, 1], [4, 3, 2, 1, 1, 2, 3, 4],
166 |       False, [0.0, 0.0]),
167 |      (factor_data, [1, 2, 3, 4, 4, 3, 2, 1], [4, 3, 2, 1, 1, 2, 3, 4],
168 |       True, [-0.5, -0.5]),
169 |      (factor_data, [1, 2, 3, 4, 1, 2, 3, 4], [1, 4, 1, 2, 1, 2, 2, 1],
170 |       True, [1.0, 0.0]),
171 |      (factor_data, [1, 1, 1, 1, 1, 1, 1, 1], [4, 3, 2, 1, 1, 2, 3, 4],
172 |       True, [0.0, 0.0])]
173 | )
174 | def test_factor_returns(factor_data,
175 |                         factor_vals,
176 |                         fwd_return_vals,
177 |                         group_adjust,
178 |                         expected_vals):
179 | 
180 |     factor_data = factor_data.copy()
181 |     factor_data['period_1'] = fwd_return_vals
182 |     factor_data['factor'] = factor_vals
183 | 
184 |     factor_returns_s = factor_returns(factor_data=factor_data,
185 |                                       demeaned=True,
186 |                                       group_adjust=group_adjust)
187 | 
188 |     expected = pd.DataFrame(
189 |         index=dr,
190 |         data=expected_vals,
191 |         columns=get_forward_returns_columns(factor_data.columns)
192 |     )
193 | 
194 |     pd.testing.assert_frame_equal(factor_returns_s, expected)
195 | 
196 | 
197 | @pytest.mark.parametrize(
198 |     ('factor_data', 'fwd_return_vals', 'alpha', 'beta'),
199 |     [(factor_data, [1, 2, 3, 4, 1, 1, 1, 1], -1, 5. / 6.)]
200 | )
201 | def test_factor_alpha_beta(factor_data, fwd_return_vals, alpha, beta):
202 | 
203 |     factor_data = factor_data.copy()
204 |     factor_data['period_1'] = fwd_return_vals
205 | 
206 |     ab = factor_alpha_beta(factor_data=factor_data)
207 | 
208 |     expected = pd.DataFrame(columns=['period_1'],
209 |                             index=['Ann. alpha', 'beta'],
210 |                             data=[alpha, beta])
211 | 
212 |     pd.testing.assert_frame_equal(ab, expected)
213 | 
214 | @pytest.mark.parametrize(
215 |     ('factor_values', 'end_date', 'period', 'expected_vals'),
216 |     [([[1.0, 2.0, 3.0, 4.0],
217 |        [1.0, 2.0, 3.0, 4.0],
218 |        [1.0, 2.0, 3.0, 4.0],
219 |        [1.0, 2.0, 3.0, 4.0]],
220 |       '2015-1-4', 1,
221 |       [nan, 1.0, 1.0, 1.0]),
222 |      ([[4.0, 3.0, 2.0, 1.0],
223 |        [1.0, 2.0, 3.0, 4.0],
224 |        [4.0, 3.0, 2.0, 1.0],
225 |        [1.0, 2.0, 3.0, 4.0]],
226 |       '2015-1-4', 1,
227 |       [nan, -1.0, -1.0, -1.0]),
228 |      ([[1.0, 2.0, 3.0, 4.0],
229 |        [2.0, 1.0, 4.0, 3.0],
230 |        [4.0, 3.0, 2.0, 1.0],
231 |        [1.0, 2.0, 3.0, 4.0],
232 |        [2.0, 1.0, 4.0, 3.0],
233 |        [4.0, 3.0, 2.0, 1.0],
234 |        [2.0, 1.0, 4.0, 3.0],
235 |        [4.0, 3.0, 2.0, 1.0],
236 |        [1.0, 2.0, 3.0, 4.0],
237 |        [2.0, 1.0, 4.0, 3.0],
238 |        [2.0, 1.0, 4.0, 3.0],
239 |        [4.0, 3.0, 2.0, 1.0]],
240 |       '2015-1-12', 3,
241 |       [nan, nan, nan, 1.0, 1.0, 1.0, 0.6, -0.6, -1.0, 1.0, -0.6, -1.0])]
242 | )
243 | def test_factor_autocorrelation(factor_values,
244 |                                 end_date,
245 |                                 period,
246 |                                 expected_vals):
247 |     dr = pd.date_range(start='2015-1-1', end=end_date)
248 |     dr.name = 'date'
249 |     tickers = ['A', 'B', 'C', 'D']
250 |     factor = pd.DataFrame(index=dr,
251 |                           columns=tickers,
252 |                           data=factor_values).stack()
253 |     factor.index = factor.index.set_names(['date', 'asset'])
254 | 
255 |     factor_df = pd.DataFrame()
256 |     factor_df['factor'] = factor
257 | 
258 |     fa = factor_autocorrelation(factor_df, period)
259 |     expected = pd.Series(index=dr, data=expected_vals)
260 |     expected.name = period
261 | 
262 |     pd.testing.assert_series_equal(fa, expected)
263 | 
264 | @pytest.mark.parametrize(
265 |     ('before', 'after', 'demeaned', 'quantiles', 'expected_vals'),
266 |     [(1, 2, False, 4,
267 |       [[1.00, 0.0, -0.50, -0.75],
268 |        [0.0, 0.0, 0.0, 0.0],
269 |        [0.00, 0.00, 0.00, 0.00],
270 |        [0.0, 0.0, 0.0, 0.0],
271 |        [-0.20, 0.0, 0.25, 0.5625],
272 |        [0.0, 0.0, 0.0, 0.0],
273 |        [-0.3333333, 0.0, 0.50, 1.25],
274 |        [0.0, 0.0, 0.0, 0.0]]),
275 |      (1, 2, True, 4,
276 |       [[0.8833333, 0.0, -0.5625, -1.015625],
277 |        [0.0, 0.0, 0.0, 0.0],
278 |        [-0.1166667, 0.0, -0.0625, -0.265625],
279 |        [0.0, 0.0, 0.0, 0.0],
280 |        [-0.3166667, 0.0, 0.1875, 0.296875],
281 |        [0.0, 0.0, 0.0, 0.0],
282 |        [-0.4500000, 0.0, 0.4375, 0.984375],
283 |        [0.0, 0.0, 0.0, 0.0]]),
284 |      (3, 0, False, 4,
285 |       [[7.0, 3.0, 1.0, 0.0],
286 |        [0.0, 0.0, 0.0, 0.0],
287 |        [0.0, 0.0, 0.0, 0.0],
288 |        [0.0, 0.0, 0.0, 0.0],
289 |        [-0.488, -0.36, -0.2, 0.0],
290 |        [0.0, 0.0, 0.0, 0.0],
291 |        [-0.703704, -0.55555555, -0.333333333, 0.0],
292 |        [0.0, 0.0, 0.0, 0.0]]),
293 |      (0, 3, True, 4,
294 |       [[0.0, -0.5625, -1.015625, -1.488281],
295 |        [0.0, 0.0, 0.0, 0.0],
296 |        [0.0, -0.0625, -0.265625, -0.613281],
297 |        [0.0, 0.0, 0.0, 0.0],
298 |        [0.0, 0.1875, 0.296875, 0.339844],
299 |        [0.0, 0.0, 0.0, 0.0],
300 |        [0.0, 0.4375, 0.984375, 1.761719],
301 |        [0.0, 0.0, 0.0, 0.0]]),
302 |      (3, 3, False, 2,
303 |       [[3.5, 1.5, 0.5, 0.0, -0.25, -0.375, -0.4375],
304 |        [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
305 |        [-0.595852, -0.457778, -0.266667, 0.0, 0.375, 0.90625, 1.664062],
306 |        [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]]),
307 |      (3, 3, True, 2,
308 |       [[2.047926, 0.978888, 0.383333, 0.0, -0.3125, -0.640625, -1.050781],
309 |        [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
310 |        [-2.047926, -0.978888, -0.383333, 0.0, 0.3125, 0.640625, 1.050781],
311 |        [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]])]
312 | )
313 | def test_average_cumulative_return_by_quantile(before, after,
314 |                                                demeaned, quantiles,
315 |                                                expected_vals):
316 |     dr = pd.date_range(start='2015-1-15', end='2015-2-1')
317 |     dr.name = 'date'
318 |     tickers = ['A', 'B', 'C', 'D']
319 |     r1, r2, r3, r4 = (1.25, 1.50, 1.00, 0.50)
320 |     data = [[r1**i, r2**i, r3**i, r4**i] for i in range(1, 19)]
321 |     prices = pd.DataFrame(index=dr, columns=tickers, data=data)
322 |     dr2 = pd.date_range(start='2015-1-21', end='2015-1-26')
323 |     dr2.name = 'date'
324 |     factor = pd.DataFrame(
325 |         index=dr2, columns=tickers, data=[
326 |             [3, 4, 2, 1],
327 |             [3, 4, 2, 1],
328 |             [3, 4, 2, 1],
329 |             [3, 4, 2, 1],
330 |             [3, 4, 2, 1],
331 |             [3, 4, 2, 1]]).stack()
332 | 
333 |     factor_data = get_clean_factor_and_forward_returns(
334 |         factor, prices, quantiles=quantiles, periods=range(0, after + 1)
335 |     )
336 | 
337 |     avgrt = average_cumulative_return_by_quantile(
338 |         factor_data, prices, before, after, demeaned)
339 |     arrays = []
340 |     for q in range(1, quantiles + 1):
341 |         arrays.append((q, 'mean'))
342 |         arrays.append((q, 'std'))
343 |     index = pd.MultiIndex.from_tuples(arrays, names=['factor_quantile', None])
344 |     expected = pd.DataFrame(
345 |         index=index, columns=range(-before, after + 1), data=expected_vals)
346 |     pd.testing.assert_frame_equal(avgrt, expected)
347 | 
348 | @pytest.mark.parametrize(
349 |     ('before', 'after', 'demeaned', 'quantiles', 'expected_vals'),
350 |     [(0, 2, False, 4,
351 |       [[0.0, -0.50, -0.75],
352 |        [0.0, 0.0, 0.0],
353 |        [0.0, 0.0, 0.0],
354 |        [0.0, 0.0, 0.0],
355 |        [0.0, 0.25, 0.5625],
356 |        [0.0, 0.0, 0.0],
357 |        [0.0, 0.50, 1.25],
358 |        [0.0, 0.0, 0.0]]),
359 |      (0, 3, True, 4,
360 |       [[0.0, -0.5625, -1.015625, -1.488281],
361 |        [0.0, 0.0, 0.0, 0.0],
362 |        [0.0, -0.0625, -0.265625, -0.613281],
363 |        [0.0, 0.0, 0.0, 0.0],
364 |        [0.0, 0.1875, 0.296875, 0.339844],
365 |        [0.0, 0.0, 0.0, 0.0],
366 |        [0.0, 0.4375, 0.984375, 1.761719],
367 |        [0.0, 0.0, 0.0, 0.0]]),
368 |      (0, 3, False, 2,
369 |       [[0.0, -0.25, -0.375, -0.4375],
370 |        [0.0, 0.0, 0.0, 0.0],
371 |        [0.0, 0.375, 0.90625, 1.664062],
372 |        [0.0, 0.0, 0.0, 0.0]]),
373 |      (0, 3, True, 2,
374 |       [[0.0, -0.3125, -0.640625, -1.050781],
375 |        [0.0, 0.0, 0.0, 0.0],
376 |        [0.0, 0.3125, 0.640625, 1.050781],
377 |        [0.0, 0.0, 0.0, 0.0]])]
378 | )
379 | def test_average_cumulative_return_by_quantile_2(before, after,
380 |                                                  demeaned, quantiles,
381 |                                                  expected_vals):
382 |     """Test a varying factor asset universe:
383 | 
384 |     at different dates there may be different assets.
385 |     """
386 |     dr = pd.date_range(start='2015-1-15', end='2015-1-25')
387 |     dr.name = 'date'
388 |     tickers = ['A', 'B', 'C', 'D', 'E', 'F']
389 |     r1, r2, r3, r4 = (1.25, 1.50, 1.00, 0.50)
390 |     data = [[r1**i, r2**i, r3**i, r4**i, r2**i, r3**i]
391 |             for i in range(1, 12)]
392 |     prices = pd.DataFrame(index=dr, columns=tickers, data=data)
393 |     dr2 = pd.date_range(start='2015-1-18', end='2015-1-21')
394 |     dr2.name = 'date'
395 |     factor = pd.DataFrame(index=dr2, columns=tickers,
396 |                           data=[[3, 4, 2, 1, nan, nan],
397 |                                 [3, 4, 2, 1, nan, nan],
398 |                                 [3, nan, nan, 1, 4, 2],
399 |                                 [3, nan, nan, 1, 4, 2]]).stack()
400 | 
401 |     factor_data = get_clean_factor_and_forward_returns(
402 |         factor, prices, quantiles=quantiles, periods=range(0, after + 1),
403 |     )
404 | 
405 |     avgrt = average_cumulative_return_by_quantile(
406 |         factor_data, prices, before, after, demeaned
407 |     )
408 |     arrays = []
409 |     for q in range(1, quantiles + 1):
410 |         arrays.append((q, 'mean'))
411 |         arrays.append((q, 'std'))
412 |     index = pd.MultiIndex.from_tuples(arrays, names=['factor_quantile', None])
413 |     expected = pd.DataFrame(
414 |         index=index, columns=range(-before, after + 1), data=expected_vals
415 |     )
416 |     pd.testing.assert_frame_equal(avgrt, expected)
417 | 
--------------------------------------------------------------------------------
/tests/test_prepare.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | 
3 | 
4 | import pytest
5 | import pandas as pd
6 | from numpy import nan
7 | 
8 | from jqfactor_analyzer.prepare import (
9 |     quantize_factor, compute_forward_returns, common_start_returns
10 | )
11 | 
12 | 
13 | dr = pd.date_range(start='2015-1-1', end='2015-1-2')
14 | dr.name = 'date'
15 | tickers = ['A', 'B', 'C', 'D']
16 | factor = pd.DataFrame(
17 |     index=dr, columns=tickers, data=[[1, 2, 3, 4], [4, 3, 2, 1]]
18 | ).stack()
19 | factor.index = factor.index.set_names(['date', 'asset'])
20 | factor.name = 'factor'
21 | factor_data = pd.DataFrame()
22 | factor_data['factor'] = factor
23 | factor_data['group'] = pd.Series(
24 |     index=factor.index,
25 |     data=[1, 1, 2, 2, 1, 1, 2, 2],
26 | )
27 | 
28 | 
29 | def test_compute_forward_returns():
30 |     dr = pd.date_range(start='2015-1-1', end='2015-1-3')
31 |     prices = pd.DataFrame(
32 |         index=dr, columns=['A', 'B'], data=[[1, 1], [1, 2], [2, 1]]
33 |     )
34 | 
35 |     fp = compute_forward_returns(factor, prices, periods=[1, 2])
36 | 
37 |     ix = pd.MultiIndex.from_product([dr, ['A', 'B']], names=['date', 'asset'])
38 |     expected = pd.DataFrame(index=ix, columns=['period_1', 'period_2'])
39 |     expected['period_1'] = [0., 1., 1., -0.5, nan, nan]
40 |     expected['period_2'] = [1., 0., nan, nan, nan, nan]
41 | 
42 |     pd.testing.assert_frame_equal(fp, expected)
43 | 
44 | 
45 | @pytest.mark.parametrize(
46 |     ('factor', 'quantiles', 'bins', 'by_group', 'expected_vals'), [
47 |         (factor_data, 4, None, False, [1, 2, 3, 4, 4, 3, 2, 1]),
48 |         (factor_data, 2, None, False, [1, 1, 2, 2, 2, 2, 1, 1]),
49 |         (factor_data, 2, None, True, [1, 2, 1, 2, 2, 1, 2, 1]),
50 |         (
51 |             factor_data, [0, .25, .5, .75, 1.], None, False,
52 |             [1, 2, 3, 4, 4, 3, 2, 1]
53 |         ),
54 |         (factor_data, [0, .5, .75, 1.], None, False, [1, 1, 2, 3, 3, 2, 1, 1]),
55 |         (factor_data, [0, .25, .5, 1.], None, False, [1, 2, 3, 3, 3, 3, 2, 1]),
56 |         (factor_data, [0, .5, 1.], None, False, [1, 1, 2, 2, 2, 2, 1, 1]),
57 |         (
58 |             factor_data, [.25, .5, .75], None, False,
59 |             [nan, 1, 2, nan, nan, 2, 1, nan]
60 |         ), (factor_data, [0, .5, 1.], None, True, [1, 2, 1, 2, 2, 1, 2, 1]),
61 |         (factor_data, [.5, 1.], None, True, [nan, 1, nan, 1, 1, nan, 1, nan]),
62 |         (factor_data, [0, 1.], None, True, [1, 1, 1, 1, 1, 1, 1, 1]),
63 |         (factor_data, None, 4, False, [1, 2, 3, 4, 4, 3, 2, 1]),
64 |         (factor_data, None, 2, False, [1, 1, 2, 2, 2, 2, 1, 1]),
65 |         (factor_data, None, 3, False, [1, 1, 2, 3, 3, 2, 1, 1]),
66 |         (factor_data, None, 8, False, [1, 3, 6, 8, 8, 6, 3, 1]),
67 |         (factor_data, None, [0, 1, 2, 3, 5], False, [1, 2, 3, 4, 4, 3, 2, 1]),
68 |         (factor_data, None, [1, 2, 3], False, [nan, 1, 2, nan, nan, 2, 1, nan]),
69 |         (factor_data, None, [0, 2, 5], False, [1, 1, 2, 2, 2, 2, 1, 1]),
70 |         (factor_data, None, [0.5, 2.5, 4.5], False, [1, 1, 2, 2, 2, 2, 1, 1]),
71 |         (factor_data, None, [0.5, 2.5], True, [1, 1, nan, nan, nan, nan, 1, 1]),
72 |         (factor_data, None, 2, True, [1, 2, 1, 2, 2, 1, 2, 1])
73 |     ]
74 | )
75 | def test_quantize_factor(factor, quantiles, bins, by_group, expected_vals):
76 |     quantized_factor = quantize_factor(
77 |         factor, quantiles=quantiles, bins=bins, by_group=by_group
78 |     )
79 |     expected = pd.Series(
80 |         index=factor.index, data=expected_vals, name='factor_quantile'
81 |     ).dropna()
82 |     pd.testing.assert_series_equal(quantized_factor, expected)
83 | 
84 | 
85 | @pytest.mark.parametrize(
86 |     ('before', 'after', 'mean_by_date', 'demeaned', 'expected_vals'), [
87 |         (
88 |             2, 3, False, False, [
89 |                 [0.075, 0.241868], [0.075, 0.241868], [0.075, 0.241868],
90 |                 [0.075, 0.241868], [0.075, 0.241868], [0.075, 0.241868]
91 |             ]
92 |         ),
93 |         (
94 |             3, 2, False, True, [
95 |                 [0.0, 0.241868], [0.0, 0.241868], [0.0, 0.241868],
96 |                 [0.0, 0.241868], [0.0, 0.241868], [0.0, 0.241868]
97 |             ]
98 |         ),
99 |         (
100 |             3, 5, True, False, [
101 |                 [0.075, 0.0], [0.075, 0.0], [0.075, 0.0], [0.075, 0.0],
102 |                 [0.075, 0.0], [0.075, 0.0], [0.075, 0.0], [0.075, 0.0],
103 |                 [0.075, 0.0]
104 |             ]
105 |         ),
106 |         (
107 |             1, 4, True, True,
108 |             [[0., 0.], [0., 0.], [0., 0.], [0., 0.], [0., 0.], [0., 0.]]
109 |         ),
110 |         (
111 |             6, 6, False, False, [
112 |                 [0.075, 0.243614], [0.075, 0.242861], [0.075, 0.242301],
113 |                 [0.075, 0.241868], [0.075, 0.241868], [0.075, 0.241868],
114 |                 [0.075, 0.241868], [0.075, 0.241868], [0.075, 0.241868],
115 |                 [0.075, 0.241868], [0.075, 0.241868], [0.075, 0.242301],
116 |                 [0.075, 0.242861]
117 |             ]
118 |         ),
119 |         (
120 |             6, 6, False, True, [
121 |                 [0.0, 0.243614], [0.0, 0.242861], [0.0, 0.242301],
122 |                 [0.0, 0.241868], [0.0, 0.241868], [0.0, 0.241868],
123 |                 [0.0, 0.241868], [0.0, 0.241868], [0.0, 0.241868],
124 |                 [0.0, 0.241868], [0.0, 0.241868], [0.0, 0.242301],
125 |                 [0.0, 0.242861]
126 |             ]
127 |         ),
128 |         (
129 |             6, 6, True, False, [
130 |                 [0.075, 0.0], [0.075, 0.0], [0.075, 0.0], [0.075, 0.0],
131 |                 [0.075, 0.0], [0.075, 0.0], [0.075, 0.0], [0.075, 0.0],
132 |                 [0.075, 0.0], [0.075, 0.0], [0.075, 0.0], [0.075, 0.0],
133 |                 [0.075, 0.0]
134 |             ]
135 |         ),
136 |         (
137 |             6, 6, True, True, [
138 |                 [0., 0.], [0., 0.], [0., 0.], [0., 0.], [0., 0.], [0., 0.],
139 |                 [0., 0.], [0., 0.], [0., 0.], [0., 0.], [0., 0.], [0., 0.],
140 |                 [0., 0.]
141 |             ]
142 |         )
143 |     ]
144 | )
145 | def test_common_start_returns(
146 |     before, after, mean_by_date, demeaned, expected_vals
147 | ):
148 |     dr = pd.date_range(start='2015-1-17', end='2015-2-2')
149 |     dr.name = 'date'
150 |     tickers = ['A', 'B', 'C', 'D']
151 |     r1, r2, r3, r4 = (1.20, 1.40, 0.90, 0.80)
152 |     prices = pd.DataFrame(
153 |         index=dr,
154 |         columns=tickers,
155 |         data=[
156 |             [r1**1, r2**1, r3**1, r4**1], [r1**2, r2**2, r3**2, r4**2],
157 |             [r1**3, r2**3, r3**3, r4**3], [r1**4, r2**4, r3**4, r4**4],
158 |             [r1**5, r2**5, r3**5, r4**5], [r1**6, r2**6, r3**6, r4**6],
159 |             [r1**7, r2**7, r3**7, r4**7], [r1**8, r2**8, r3**8, r4**8],
160 |             [r1**9, r2**9, r3**9, r4**9], [r1**10, r2**10, r3**10, r4**10],
161 |             [r1**11, r2**11, r3**11, r4**11], [r1**12, r2**12, r3**12, r4**12],
162 |             [r1**13, r2**13, r3**13, r4**13], [r1**14, r2**14, r3**14, r4**14],
163 |             [r1**15, r2**15, r3**15, r4**15], [r1**16, r2**16, r3**16, r4**16],
164 |             [r1**17, r2**17, r3**17, r4**17]
165 |         ]
166 |     )
167 |     dr2 = pd.date_range(start='2015-1-21', end='2015-1-29')
168 |     factor = pd.DataFrame(
169 |         index=dr2,
170 |         columns=tickers,
171 |         data=[
172 |             [3, 4, 2, 1], [3, 4, 2, 1], [3, 4, 2, 1], [3, 4, 2, 1],
173 |             [3, 4, 2, 1], [3, 4, 2, 1], [3, 4, 2, 1], [3, 4, 2, 1],
174 |             [3, 4, 2, 1]
175 |         ]
176 |     ).stack()
177 |     factor.index = factor.index.set_names(['date', 'asset'])
178 |     factor.name = 'factor'
179 | 
180 |     cmrt = common_start_returns(
181 |         factor, prices, before, after, False, mean_by_date,
182 |         factor if demeaned else None
183 |     )
184 |     cmrt = pd.DataFrame({'mean': cmrt.mean(axis=1), 'std': cmrt.std(axis=1)})
185 |     expected = pd.DataFrame(
186 |         index=range(-before, after + 1),
187 |         columns=['mean', 'std'],
188 |         data=expected_vals
189 |     )
190 |     pd.testing.assert_frame_equal(cmrt, expected)
191 | 
--------------------------------------------------------------------------------
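As a closing aside, the expected frame in `test_compute_forward_returns` can be verified by hand: for asset `A` the price path is 1 → 1 → 2, so the 1-day forward returns are (1 - 1) / 1 = 0.0 and (2 - 1) / 1 = 1.0; for `B` the path 1 → 2 → 1 gives (2 - 1) / 1 = 1.0 and (1 - 2) / 2 = -0.5, with `nan` on the last date because no following price exists. A short reproduction using the same inputs as the test (only the `print` is new):

```python
import pandas as pd

from jqfactor_analyzer.prepare import compute_forward_returns

dr = pd.date_range(start='2015-1-1', end='2015-1-2')
dr.name = 'date'
factor = pd.DataFrame(index=dr, columns=['A', 'B', 'C', 'D'],
                      data=[[1, 2, 3, 4], [4, 3, 2, 1]]).stack()
factor.index = factor.index.set_names(['date', 'asset'])
factor.name = 'factor'

prices = pd.DataFrame(index=pd.date_range(start='2015-1-1', end='2015-1-3'),
                      columns=['A', 'B'], data=[[1, 1], [1, 2], [2, 1]])

# period_1 should come out as [0., 1., 1., -0.5, nan, nan] and
# period_2 as [1., 0., nan, nan, nan, nan], matching the test.
print(compute_forward_returns(factor, prices, periods=[1, 2]))
```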