├── .gitignore
├── DataBase.py
├── Factor_Test
│   ├── Stratification_Method.py
│   └── __init__.py
├── Framework
│   ├── Factor_Construction.py
│   ├── Factor_Model.py
│   ├── ResultDisplay.py
│   └── Tools
│       ├── DateStatus.py
│       ├── FactorInfoFunc.py
│       ├── FactorSignalFunc.py
│       └── __init__.py
├── README.md
├── __init__.py
└── test.xlsx


/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 | 
6 | # C extensions
7 | *.so
8 | 
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | *.egg-info/
24 | .installed.cfg
25 | *.egg
26 | MANIFEST
27 | 
28 | # PyInstaller
29 | #  Usually these files are written by a python script from a template
30 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
31 | *.manifest
32 | *.spec
33 | 
34 | # Installer logs
35 | pip-log.txt
36 | pip-delete-this-directory.txt
37 | 
38 | # Unit test / coverage reports
39 | htmlcov/
40 | .tox/
41 | .coverage
42 | .coverage.*
43 | .cache
44 | nosetests.xml
45 | coverage.xml
46 | *.cover
47 | .hypothesis/
48 | .pytest_cache/
49 | 
50 | # Translations
51 | *.mo
52 | *.pot
53 | 
54 | # Django stuff:
55 | *.log
56 | local_settings.py
57 | db.sqlite3
58 | 
59 | # Flask stuff:
60 | instance/
61 | .webassets-cache
62 | 
63 | # Scrapy stuff:
64 | .scrapy
65 | 
66 | # Sphinx documentation
67 | docs/_build/
68 | 
69 | # PyBuilder
70 | target/
71 | 
72 | # Jupyter Notebook
73 | .ipynb_checkpoints
74 | 
75 | # pyenv
76 | .python-version
77 | 
78 | # celery beat schedule file
79 | celerybeat-schedule
80 | 
81 | # SageMath parsed files
82 | *.sage.py
83 | 
84 | # Environments
85 | .env
86 | .venv
87 | env/
88 | venv/
89 | ENV/
90 | env.bak/
91 | venv.bak/
92 | 
93 | # Spyder project settings
94 | .spyderproject
95 | .spyproject
96 | 
97 | # Rope project settings
98 | .ropeproject
99 | 
100 | # mkdocs documentation
101 | /site
102 | 
103 | # mypy
104 | .mypy_cache/
105 | 
--------------------------------------------------------------------------------
/DataBase.py:
--------------------------------------------------------------------------------
1 | import cx_Oracle
2 | import pandas as pd
3 | import numpy as np
4 | import sys
5 | import math
6 | import os
7 | os.environ['NLS_LANG'] = 'SIMPLIFIED CHINESE_CHINA.ZHS16GBK'
8 | os.environ['NLS_CHARACTERSET'] = 'ZHS16GBK'
9 | os.environ['NLS_NCHAR_CHARACTERSET'] = 'AL16UTF16'
10 | import pickle
11 | import re
12 | # ----------------------------------------------------------函数(开始)-------------------------------------------------------------------------------
13 | low_level_divided_str = '{0: >{width}}'.format('', width=4) + '{0:~>{width}}'.format('', width=92) + '{0: >{width}}'.format('', width=4)
14 | 
15 | 
16 | def print_seq_line(action_str):
17 |     def decorate(func):
18 |         def wrapper(*args, **kwargs):
19 |             print('\n' + '{0:*>{width}}'.format(action_str, width=50) + '{0:*>{width}}'.format('', width=50 - len(action_str)) + '\n')
20 |             result = func(*args, **kwargs)
21 |             print('\n' + '{0:*>{width}}'.format(action_str, width=50) + '{0:*>{width}}'.format('', width=50 - len(action_str)) + '\n')
22 |             return result
23 |         return wrapper
24 |     return decorate
25 | 
26 | 
27 | def logging():
28 |     def decorate(func):
29 |         def wrapper(*args, **kwargs):
30 |             print('\tlogging')
31 |             return func(*args, **kwargs)
32 |         return wrapper
33 |     return decorate
34 | 
35 | 
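# Usage sketch (added for illustration -- the target table in the second call is a
# placeholder, not necessarily an existing object in this schema):
#
#     df = read_data_from_oracle_db(sql='select * from lyzs_tinysoft.factor_raw_data')
#     insert_data_to_oracle_db(data=df, table_name='lyzs_tinysoft.some_target_table')
#
# Each helper below opens its own connection via connect_oracle_db and closes it
# before returning, so callers never handle connection objects directly.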
36 | def connect_oracle_db(account='lyzs_tinysoft', passport='lyzs@2018'):
37 |     print('\t正在连接数据库…')
38 |     # dsn = cx_Oracle.makedsn('10.1.1.10', '1521', 'ly_orcl')
39 |     connection = cx_Oracle.connect(account, passport, '10.1.1.10:1521/orclly')
40 |     print('\t数据库连接成功')
41 |     print(low_level_divided_str)
42 |     return connection
43 | 
44 | 
45 | @print_seq_line('写入数据')
46 | def insert_data_to_oracle_db(data=None, table_name=None, account='lyzs_tinysoft', passport='lyzs@2018'):
47 |     try:
48 |         connection = connect_oracle_db(account, passport)
49 |     except Exception as e:
50 |         raise ConnectionError('数据库连接出错。') from e
51 | 
52 |     data = data.where(pd.notnull(data), None)
53 |     print('\t开始写入数据表' + table_name + '…')
54 |     cursor = connection.cursor()
55 | 
56 |     # 生成对应类型的格式化输出
57 |     # format_string = ','.join(data.iloc[0, :].apply(lambda s: '\'%s\'' if isinstance(s, str) else(
58 |     #     '%d' if np.issubdtype(s, np.integer) else '%f')).tolist())
59 |     # insert_sql = 'insert into ' + table_name + ' (' + ','.join(data.columns.tolist()) + ') values (' + format_string + ')'
60 |     #
61 |     # for i in range(data.shape[0]):
62 |     #     cursor.execute(insert_sql % tuple(data.iloc[i, :][data.columns].apply(lambda s: s.replace('\'', '\'\'') if isinstance(s, str) else s).values))
63 |     #     # if i % (data.shape[0] // 50) == 0:
64 |     #     percent = '{:.2%}'.format(i / data.shape[0])
65 |     #     sys.stdout.write('\r')
66 |     #     sys.stdout.write("\t数据写入完成进度:[%-50s] %s" % ('#' * int(math.floor(i * 50 / data.shape[0])), percent))
67 |     #     sys.stdout.flush()
68 |     executemany_format_string = ','.join([':' + str(i) for i in range(1, data.shape[1] + 1)])
69 | 
70 |     chunk_num = 100  # number of batches to split the insert into, not rows per batch
71 |     chunk_loc_list = list(range(0, len(data), max(1, len(data) // chunk_num)))  # max(1, ...) keeps the step positive for frames with fewer than 100 rows
72 |     chunk_loc_list.append(len(data))  # append the end boundary instead of overwriting the last start, so no chunk is skipped
73 | 
74 |     insert_sql = 'insert into ' + table_name + ' (' + ','.join(data.columns.tolist()) + ') values (' + executemany_format_string + ')'  # invariant across chunks, so build it once
75 |     for i in range(len(chunk_loc_list) - 1):
76 |         insert_data = data.iloc[chunk_loc_list[i]:chunk_loc_list[i + 1]].apply(lambda s: tuple(s.tolist()), axis=1).values.tolist()
77 | 
78 |         cursor.executemany(insert_sql, insert_data)
79 |         connection.commit()
80 |         percent = '{:.2%}'.format(chunk_loc_list[i + 1] / data.shape[0])
81 |         sys.stdout.write('\r')
82 |         sys.stdout.write("\t数据写入完成进度:[%-50s] %s" % ('>' * int(chunk_loc_list[i + 1] * 50 / data.shape[0]), percent))
83 |         sys.stdout.flush()
84 | 
85 |     # 关闭游标
86 |     cursor.close()
87 |     connection.close()
88 |     print('\n\t数据提交成功,已关闭数据库连接')
89 |     print(low_level_divided_str)
90 | 
91 | 
92 | @print_seq_line('读取数据')
93 | # @logging()
94 | def read_data_from_oracle_db(sql=None, account='lyzs_tinysoft', passport='lyzs@2018', print_context=True):
95 | 
96 |     try:
97 |         connection = connect_oracle_db(account, passport)
98 |     except Exception as e:
99 |         raise ConnectionError('数据库连接出错。') from e
100 | 
101 |     if print_context:
102 |         print('\t开始读取数据…')
103 |         print('\t传入的sql语句为:\n')
104 |         for line in sql.splitlines():
105 |             if re.sub(r'\s', '', line) != '':
106 |                 print('\t\t*> ' + line)
107 |         print()
108 |     cursor = connection.cursor()
109 |     exec_result = pd.DataFrame(cursor.execute(sql).fetchall(), columns=[i[0] for i in cursor.description])
110 |     cursor.close()
111 |     connection.close()  # release the connection before returning; the original left it open
112 |     if print_context:
113 |         print('\t读取数据成功,共' + 'x'.join([str(i) for i in exec_result.shape]) + '条')
114 |     return exec_result
115 | 
116 | 
117 | # ----------------------------------------------------------函数(结束)-------------------------------------------------------------------------------
118 | if __name__ == "__main__":
119 |     # params_data_url = eval(input('请输入需要写入数据库的数据存放文件夹地址:'))
120 |     # account = 
eval(input('请输入连接数据库的账号:')) 121 | # passport = eval(input('请输入连接数据库的密码:')) 122 | # 123 | # raw_data = pickle.load(open(params_data_url + '/raw_data.dat', 'rb')) 124 | # factor_library = pd.read_excel(params_data_url + '/因子列表-初步检测.xlsx') 125 | # factor_list = factor_library['factor_number'].tolist() 126 | # 127 | # # 1. stock_info_data 128 | # # stock_info_data = raw_data[['数据提取日', '财务数据最新报告期', 'stockid', 'stockname', 'sectorid', 'sectorname', '上市天数', '沪深300成分股', 129 | # # '中证500成分股', '中证800成分股', '申万A股成分股', '是否st', '是否pt', '是否停牌']].copy() 130 | # # stock_info_data.columns = pd.read_excel(params_data_url + '/量化FOF研究-数据库表设计.xlsx', sheet_name='Stock_Info_Data')['字段英文名'].tolist() 131 | # # 132 | # # insert_data_to_oracle_db(data=stock_info_data, table_name='lyzs_tinysoft.stock_info_data', account=account, passport=passport) 133 | # 134 | # # 2. factor_raw_data 135 | # factor_raw_data = raw_data[['数据提取日', 'stockid'] + factor_list].rename( 136 | # columns={'数据提取日': 'get_data_date', 'stockid': 'stock_id'}).melt( 137 | # id_vars=['get_data_date', 'stock_id'], var_name=['factor_number'], value_name='raw_value') 138 | # factor_raw_data = factor_raw_data.where(pd.notnull(factor_raw_data), None) 139 | # insert_data_to_oracle_db(data=factor_raw_data, table_name='lyzs_tinysoft.new_factor_raw_data', account=account, passport=passport) 140 | 141 | # # 3. return_data 142 | # return_dict = {'持仓天数': 'holding_period_days', '持仓期停牌天数占比': 'hp_suspension_days_pct', '持仓期收益率': 'holding_period_return', 143 | # '申万行业收益率': 'sw_1st_sector_hpr', '沪深300收益率': 'hs300_hpr', '中证500收益率': 'zz500_hpr', '中证800收益率': 'zz800_hpr', 144 | # '上证综指收益率': 'szzz_hpr', '申万A股收益率': 'swag_hpr', '数据提取日': 'get_data_date', 'stockid': 'stock_id'} 145 | # return_data = raw_data[list(return_dict.keys())].rename(columns=return_dict) 146 | # return_data = return_data.where(pd.notnull(return_data), None) 147 | # insert_data_to_oracle_db(data=return_data, table_name='lyzs_tinysoft.return_data', account=account, passport=passport) 148 | # 149 | # # 4. factor_stratificated_return 150 | # 151 | # factor_stratificated_return = pickle.load(open(params_data_url + '/factor_stratificated_return.dat', 'rb')) 152 | # quantile_name_dict = {'low': '第1档收益率', **{str(i): '第' + str(i) + '档收益率' for i in range(2,10)}, 'high': '第10档收益率', 153 | # '数据提取日': 'get_data_date'} 154 | # factor_stratificated_return = factor_stratificated_return.rename(columns=quantile_name_dict).melt( 155 | # id_vars=['factor_number', 'get_data_date', 'sample_scope'], var_name=['type_name'], value_name='value') 156 | # factor_stratificated_return = factor_stratificated_return.where(pd.notnull(factor_stratificated_return), None) 157 | # insert_data_to_oracle_db(data=factor_stratificated_return, table_name='lyzs_tinysoft.factor_return', account=account, passport=passport) 158 | # 159 | # # 5. factor_return_regression 160 | # 161 | # factor_return_regression = pickle.load(open(params_data_url + '/factor_return_regression.dat', 'rb')) 162 | # factor_return_regression = factor_return_regression.where(pd.notnull(factor_return_regression), None) 163 | # insert_data_to_oracle_db(data=factor_return_regression, table_name='lyzs_tinysoft.factor_return_regression', account=account, passport=passport) 164 | # 165 | # # 6. 
factor_raw_data_describe 166 | # factor_raw_data_describe = pickle.load(open(params_data_url + '/factor_raw_data_describe.dat', 'rb')) 167 | # factor_raw_data_describe = factor_raw_data_describe.where(pd.notnull(factor_raw_data_describe), None) 168 | # insert_data_to_oracle_db(data=factor_raw_data_describe, table_name='lyzs_tinysoft.factor_raw_data_description', account=account, passport=passport) 169 | 170 | data_url = '/Users/yi.deng/Desktop/因子数据0831' 171 | 172 | date_list = os.listdir(data_url) 173 | data_list = [] 174 | 175 | for i, file_name in enumerate(date_list): 176 | if file_name[-4:] == 'xlsx': 177 | # tempo_factor_data = pd.read_excel(fact_data_url + '/' + file_name, dtype=all_type_dict) 178 | # tempo_base_data = pd.read_excel(base_data_url + '/' + file_name, dtype=factor_data_type_dict) 179 | temp_data = pd.read_excel(data_url + '/' + file_name).rename(columns={'数据提取日': 'get_data_date', 'stockid': 'stock_id'}) 180 | data_list.append(temp_data) 181 | 182 | data = pd.concat(data_list, axis=0) 183 | data = data.sort_values(by=['get_data_date', 'stock_id']).reset_index(drop=True) 184 | melt_data = data.melt(id_vars=['get_data_date', 'stock_id'], var_name=['factor_number'], value_name='raw_value') 185 | melt_data = melt_data.dropna() 186 | melt_data = melt_data.drop_duplicates() 187 | melt_data = melt_data.where(pd.notnull(melt_data), None) 188 | insert_data_to_oracle_db(data=melt_data, table_name='lyzs_tinysoft.factor_raw_data') 189 | -------------------------------------------------------------------------------- /Factor_Test/Stratification_Method.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import warnings 3 | warnings.filterwarnings("ignore") 4 | import pandas as pd 5 | pd.set_option('max_columns', 20) 6 | pd.set_option('display.width', 320) 7 | pd.set_option('display.unicode.east_asian_width', True) 8 | pd.set_option('display.unicode.ambiguous_as_wide', True) 9 | import numpy as np 10 | import math 11 | import pickle 12 | import statsmodels.api as sm 13 | from statsmodels.tsa.stattools import adfuller 14 | from decimal import Decimal 15 | import matplotlib.style as matstyle 16 | matstyle.use('ggplot') 17 | import matplotlib.pyplot as plt 18 | plt.rcParams['font.sans-serif'] = ['SimHei'] # 用来正常显示中文标签 19 | plt.rcParams['axes.unicode_minus'] = False 20 | import seaborn as sns 21 | sns.set_context(rc={'figure.figsize': (12, 7)}) 22 | # ----------------------------------------------------------函数(开始)------------------------------------------------------------------------------- 23 | low_level_divided_str = '{0: >{width}}'.format('', width=4) + '{0:~>{width}}'.format('', width=92) + '{0: >{width}}'.format('', width=4) 24 | 25 | 26 | def print_seq_line(action_str): 27 | def decorate(func): 28 | def wrapper(*args, **kwargs): 29 | print('\n' + '{0:*>{width}}'.format(action_str, width=50) + '{0:*>{width}}'.format('', width=50 - len(action_str)) + '\n') 30 | result = func(*args, **kwargs) 31 | print('\n' + '{0:*>{width}}'.format(action_str, width=50) + '{0:*>{width}}'.format('', width=50 - len(action_str)) + '\n') 32 | return result 33 | return wrapper 34 | return decorate 35 | 36 | 37 | def get_outlier_stock_list(factor_data, factor_name, method='winsorize'): 38 | independent_var_outlier_index_list = factor_data[factor_data['持仓期停牌天数占比'] >= 0.1].index.tolist() 39 | independent_var_outlier_stock_list = factor_data.loc[independent_var_outlier_index_list, 'stockid'].tolist() 40 | 41 | if method == 'median': 42 | Dm = 
factor_data[factor_name].median()
43 |         Dm1 = abs(factor_data[factor_name] - Dm).median()
44 |         outlier_stock_list = factor_data[(factor_data[factor_name] > (Dm + 5 * Dm1)) |
45 |                                          (factor_data[factor_name] < (Dm - 5 * Dm1))]['stockid'].tolist()
46 |     elif method == 'winsorize':
47 |         tail_num = factor_data[factor_name].shape[0] // 100  # trim 1% from each tail; the original '-n // 100' top slice took one extra row whenever n is not a multiple of 100
48 |         outlier_index_list = \
49 |             pd.Series(factor_data[factor_name].sort_values().iloc[0:tail_num].index).sort_values().tolist()
50 |         outlier_index_list += \
51 |             pd.Series(factor_data[factor_name].sort_values().iloc[factor_data[factor_name].shape[0] - tail_num:].index).sort_values().tolist()
52 |         outlier_stock_list = factor_data.loc[outlier_index_list, 'stockid'].tolist()
53 |     return independent_var_outlier_stock_list, outlier_stock_list
54 | 
55 | 
56 | def drop_outlier_and_standardization(factor_data, factor_name, drop_outlier_method='winsorize', standardization=False,
57 |                                      fillna_after_standardization=True):
58 |     # (1) 先对被解释变量,即收益率端的异常值进行处理
59 |     independent_var_outlier_index = factor_data[factor_data['持仓期停牌天数占比'] >= 0.1].index.tolist()
60 |     factor_data.loc[independent_var_outlier_index, '持仓期停牌天数占比'] = np.nan
61 | 
62 |     if drop_outlier_method == 'median':  # clip at median ± 5 × median absolute deviation
63 |         Dm = factor_data[factor_name].median()
64 |         Dm1 = abs(factor_data[factor_name] - Dm).median()
65 |         cap_index = factor_data[factor_name] > (Dm + 5 * Dm1)
66 |         floor_index = factor_data[factor_name] < (Dm - 5 * Dm1)
67 |         factor_data.loc[cap_index, factor_name] = np.nan
68 |         factor_data.loc[floor_index, factor_name] = np.nan
69 |     elif drop_outlier_method == 'winsorize':
70 |         tail_num = factor_data[factor_name].shape[0] // 100  # symmetric 1% trim, as in get_outlier_stock_list
71 |         outlier_index_list = \
72 |             pd.Series(factor_data[factor_name].sort_values().iloc[0:tail_num].index).sort_values().tolist()
73 |         outlier_index_list += \
74 |             pd.Series(factor_data[factor_name].sort_values().iloc[factor_data[factor_name].shape[0] - tail_num:].index).sort_values().tolist()
75 |         factor_data.loc[outlier_index_list, factor_name] = np.nan
76 |     if standardization:
77 |         factor_data[factor_name] = \
78 |             (factor_data[factor_name] - factor_data[factor_name].mean()) / factor_data[factor_name].std()
79 |         if fillna_after_standardization:
80 |             factor_data = factor_data.fillna(0)
81 | 
82 |     return factor_data.dropna()
83 | 
84 | 
85 | def get_stock_by_factor_quantile(each_report_deadline_date_data, factor_name, floor_quantile, cap_quantile):
86 | 
87 |     stock_floor_quantile = each_report_deadline_date_data[factor_name].quantile(floor_quantile)
88 |     stock_cap_quantile = each_report_deadline_date_data[factor_name].quantile(cap_quantile)
89 |     if floor_quantile == 0:
90 |         stock_floor_quantile = stock_floor_quantile - 0.1  # nudge the floor below the sample minimum so the lowest bucket keeps its boundary observations
91 | 
92 |     stock_data = each_report_deadline_date_data[(each_report_deadline_date_data[factor_name] > stock_floor_quantile) &
93 |                                                 (each_report_deadline_date_data[factor_name] <= stock_cap_quantile)]
94 | 
95 |     return stock_data
96 | 
97 | 
98 | def get_portfolio_evaluation_indicator(portfolio_data, portfolio_name):
99 |     list_1 = ['算数平均值', '几何平均值', '中位数', '标准差', '向上偏差', '向下偏差', '收益为正的时期占比', '收益为负的时期占比']
100 |     list_2 = ['年化收益率', '年化波动率', '夏普比率(2%)', '最大回撤比例', '最大回撤开始时间', '最大回撤结束时间']
101 | 
102 |     evaluation_indicator = pd.DataFrame(index=['持仓期收益率 - ' + s for s in list_1] + ['组合表现 - ' + s for s in list_2], columns=[portfolio_name])
103 | 
104 |     port_open_period_return_series = portfolio_data.groupby(by=['财务报告截止日']).mean()['持仓期收益率'].div(100).copy()
105 |     evaluation_indicator.loc['持仓期收益率 - 算数平均值', portfolio_name] = format(port_open_period_return_series.mean(), '.2%')
106 |     evaluation_indicator.loc['持仓期收益率 - 几何平均值', portfolio_name] = \
107 |         format(np.power(port_open_period_return_series.add(1).cumprod().iloc[-1], 1 / port_open_period_return_series.shape[0]) - 1, '.2%')
108 |     evaluation_indicator.loc['持仓期收益率 - 中位数', portfolio_name] = format(port_open_period_return_series.median(), '.2%')
109 | 
110 |     evaluation_indicator.loc['持仓期收益率 - 标准差', portfolio_name] = format(port_open_period_return_series.std(), '.2%')
111 |     up_bias = port_open_period_return_series[port_open_period_return_series >= port_open_period_return_series.mean()].copy()
112 |     down_bias = port_open_period_return_series[port_open_period_return_series < port_open_period_return_series.mean()].copy()
113 |     evaluation_indicator.loc['持仓期收益率 - 向上偏差', portfolio_name] = \
114 |         format(np.sqrt((up_bias - port_open_period_return_series.mean()).apply(lambda d: d * d).sum() / up_bias.shape[0]), '.2%')
115 |     evaluation_indicator.loc['持仓期收益率 - 向下偏差', portfolio_name] = \
116 |         format(np.sqrt((down_bias - port_open_period_return_series.mean()).apply(lambda d: d * d).sum() / down_bias.shape[0]), '.2%')
117 | 
118 |     positive = port_open_period_return_series[port_open_period_return_series >= 0]
119 |     negative = port_open_period_return_series[port_open_period_return_series < 0]
120 |     evaluation_indicator.loc['持仓期收益率 - 收益为正的时期占比', portfolio_name] = format(positive.shape[0] / port_open_period_return_series.shape[0], '.2%')
121 |     evaluation_indicator.loc['持仓期收益率 - 收益为负的时期占比', portfolio_name] = format(negative.shape[0] / port_open_period_return_series.shape[0], '.2%')
122 | 
123 |     holding_period_portfolio_return_series = port_open_period_return_series  # identical to the series computed above; reuse it instead of recomputing
124 |     holding_period_trade_days = portfolio_data.groupby(by=['财务报告截止日']).mean()['持仓天数'].sum()
125 |     holding_period_portfolio_return = holding_period_portfolio_return_series.sum()
126 |     evaluation_indicator.loc['组合表现 - 年化收益率', portfolio_name] = format(holding_period_portfolio_return / holding_period_trade_days * 250, '.2%')
127 |     evaluation_indicator.loc['组合表现 - 年化波动率', portfolio_name] = \
128 |         format(holding_period_portfolio_return_series.std() / np.sqrt(holding_period_trade_days) * np.sqrt(250), '.2%')
129 |     sharpe_ratio = (holding_period_portfolio_return / holding_period_trade_days * 250 - 0.02) / \
130 |         (holding_period_portfolio_return_series.std() / np.sqrt(holding_period_trade_days) * np.sqrt(250))
131 |     evaluation_indicator.loc['组合表现 - 夏普比率(2%)', portfolio_name] = format(sharpe_ratio, '.2')
132 | 
133 |     portfolio_net_value = holding_period_portfolio_return_series.add(1).cumprod()
134 |     max_drawdown_end_date = (portfolio_net_value.cummax() - portfolio_net_value).idxmax()  # idxmax returns the index label that .loc below expects; np.argmax would return a position
135 |     max_drawdown_begin_date = portfolio_net_value.loc[:max_drawdown_end_date].idxmax()
136 | 
137 |     evaluation_indicator.loc['组合表现 - 最大回撤比例', portfolio_name] = \
138 |         format((portfolio_net_value.loc[max_drawdown_end_date] - portfolio_net_value.loc[max_drawdown_begin_date]) /
139 |                portfolio_net_value.loc[max_drawdown_begin_date], '.2%')
140 |     evaluation_indicator.loc['组合表现 - 最大回撤开始时间', portfolio_name] = max_drawdown_begin_date
141 |     evaluation_indicator.loc['组合表现 - 最大回撤结束时间', portfolio_name] = max_drawdown_end_date
142 | 
143 |     return evaluation_indicator
144 | 
145 | 
146 | def constant_test(time_series_data):
147 |     result = pd.Series(index=['样本数', 't值', 'p值', '显著性', '最大滞后项'])
148 |     adftest = adfuller(time_series_data.astype(float), regression="ct")
149 |     result.loc['样本数'] = adftest[3]
150 |     result.loc['t值'] = round(adftest[0], 2)
151 |     result.loc['p值'] = Decimal(format(adftest[1], '.3f'))
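    # adfuller returns (t-statistic, p-value, lags used, nobs, critical-values dict, icbest);
    # the chain below compares the t-statistic against the 1%/5%/10% critical values to
    # assign significance stars, mirroring the Alpha/Beta star logic used later in this file.
152 | 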
if adftest[0] <= adftest[4]['1%']: 153 | result.loc['显著性'] = '***' 154 | elif adftest[0] <= adftest[4]['5%']: 155 | result.loc['显著性'] = '**' 156 | elif adftest[0] <= adftest[4]['10%']: 157 | result.loc['显著性'] = '*' 158 | else: 159 | result.loc['显著性'] = '不显著' 160 | 161 | result.loc['最大滞后项'] = adftest[2] 162 | return result 163 | 164 | 165 | def set_linear_regression_result(regression_result, result_content_list, method='OLS'): 166 | if (method == 'OLS') | (method == 'WLS'): 167 | result = pd.Series(index=result_content_list) 168 | result.loc['Adj. R-squared'] = round(regression_result.rsquared_adj, 4) 169 | elif method == 'RLM': 170 | result = pd.Series(index=result_content_list) 171 | 172 | result.loc[['Alpha', 'Beta']] = [round(value, 3) for value in list(regression_result.params)] 173 | result.loc[['Alpha t值', 'Beta t值']] = [round(value, 3) for value in list(regression_result.tvalues)] 174 | result.loc[['Alpha p值', 'Beta p值']] = [round(value, 4) for value in list(regression_result.pvalues)] 175 | result.loc[['Alpha标准误', 'Beta标准误']] = [round(value, 3) for value in list(regression_result.bse)] 176 | if regression_result.pvalues[0] <= 0.01: 177 | result.loc['Alpha显著性'] = '***' 178 | elif regression_result.pvalues[0] <= 0.05: 179 | result.loc['Alpha显著性'] = '**' 180 | elif regression_result.pvalues[0] <= 0.1: 181 | result.loc['Alpha显著性'] = '*' 182 | else: 183 | result.loc['Alpha显著性'] = '' 184 | 185 | if regression_result.pvalues[1] <= 0.01: 186 | result.loc['Beta显著性'] = '***' 187 | elif regression_result.pvalues[1] <= 0.05: 188 | result.loc['Beta显著性'] = '**' 189 | elif regression_result.pvalues[1] <= 0.1: 190 | result.loc['Beta显著性'] = '*' 191 | else: 192 | result.loc['Beta显著性'] = '' 193 | return result 194 | 195 | 196 | def finding_most_significant_factor(result_dict, rolling_window, factor_type_dict, factor_name_dict, quantile_dict, get_factor_data_date_list, 197 | factor_list, method='RLM'): 198 | 199 | significant_factor_df = pd.DataFrame(index=get_factor_data_date_list[rolling_window - 1:], 200 | columns=['因子序号', '因子小类', '因子名称', '档数', 'Alpha显著性', 'Alpha', 'Alpha p值']) 201 | 202 | for str_date in get_factor_data_date_list[rolling_window - 1:]: 203 | date = pd.to_datetime(str_date) 204 | 205 | significant_factor_df.loc[str_date, 'Alpha'] = 0 206 | significant_factor_df.loc[str_date, 'Alpha p值'] = 0.1 207 | for factor_name in factor_list: 208 | 209 | # 先找出每个因子Alpha为正的最显著的层数 210 | for i in range(10): 211 | tempo_data = result_dict[factor_name]['申万A股'][quantile_dict[i]][rolling_window].copy() 212 | if (float(tempo_data.loc[date, 'Alpha p值']) < significant_factor_df.loc[str_date, 'Alpha p值']) & \ 213 | (float(tempo_data.loc[date, 'Alpha']) > significant_factor_df.loc[str_date, 'Alpha']): 214 | significant_factor_df.loc[str_date, '因子序号'] = factor_name 215 | significant_factor_df.loc[str_date, '因子小类'] = factor_type_dict[factor_name] 216 | significant_factor_df.loc[str_date, '因子名称'] = factor_name_dict[factor_name] 217 | significant_factor_df.loc[str_date, '档数'] = '第' + quantile_dict[i] + '组' 218 | significant_factor_df.loc[str_date, 'Alpha显著性'] = tempo_data.loc[date, 'Alpha显著性'] 219 | significant_factor_df.loc[str_date, 'Alpha'] = float(tempo_data.loc[date, 'Alpha']) 220 | significant_factor_df.loc[str_date, 'Alpha p值'] = float(tempo_data.loc[date, 'Alpha p值']) 221 | if (method == 'OLS') | (method == 'WLS'): 222 | significant_factor_df.loc[str_date, 'Adj. R-squared'] = tempo_data.loc[date, 'Adj. 
R-squared'] 223 | print('完成显著因子挑选:' + str_date) 224 | return significant_factor_df 225 | 226 | 227 | def set_factor_info(regression_result, factor_num, factor_category, factor_type, factor_name, quantile_name): 228 | # significant_factor = regression_result[(regression_result['Alpha p值'].astype(float) < 0.1) & 229 | # (regression_result['Alpha'].astype(float) > 0)].copy() 230 | significant_factor = regression_result.copy() 231 | significant_factor['因子序号'] = factor_num 232 | significant_factor['因子大类'] = factor_category 233 | significant_factor['因子小类'] = factor_type 234 | significant_factor['因子名称'] = factor_name 235 | significant_factor['档位'] = quantile_name 236 | 237 | significant_factor.index = range(significant_factor.shape[0]) 238 | return significant_factor 239 | 240 | 241 | def data_cleaning(data, sample_list=None, factor_list=None, kicked_sector_list=None, go_public_days=250, data_processing_base_columns=None, 242 | get_factor_data_date_list=None): 243 | action_str = '数据清洗' 244 | print_high_level_divided_str(action_str) 245 | 246 | print('\t开始数据清洗…') 247 | print('\t原始数据样本各期数量最大/最小值:' + 248 | format(data.groupby(by=['数据提取日']).count()['stockid'].min(), '.0f') + '/' + 249 | format(data.groupby(by=['数据提取日']).count()['stockid'].max(), '.0f')) 250 | 251 | # 1. 数据清洗 252 | # 1.1 剔除某些行业 253 | data = data[data['sectorname'].apply(lambda s: s not in kicked_sector_list)] 254 | 255 | # 1.2 剔除天软的脏数据:报告期为1900-12-31、以及一些莫名的日期 256 | 257 | data = data[data['财务数据最新报告期'] != '1900-12-31'] 258 | data = data[data['数据提取日'].apply(lambda date: date in get_factor_data_date_list)] 259 | 260 | # 1.3 剔除ST、PT股票,以及当期停牌的股票 261 | 262 | data = data[(data['是否st'] == 0) & (data['是否pt'] == 0) & (data['是否停牌'] == 0)] 263 | 264 | # 1.4 剔除每个截面数据量少于95%的因子数据 265 | 266 | # raw_data_step4 = raw_data_step3 267 | 268 | # 1.5 剔除次新股 269 | data = data.groupby(by=['数据提取日']).apply(lambda df: df[df['上市天数'] > go_public_days]) 270 | 271 | # 1.6 取得干净数据 272 | clean_data = data[data_processing_base_columns + [sample_name + '成分股' for sample_name in sample_list] + factor_list] 273 | clean_data.index = range(clean_data.shape[0]) 274 | print('\t数据清洗后样本各期数量最大/最小值:' + 275 | format(clean_data.groupby(by=['数据提取日']).count()['stockid'].min(), '.0f') + '/' + 276 | format(clean_data.groupby(by=['数据提取日']).count()['stockid'].max(), '.0f')) 277 | clean_data = optimize_data_ram(clean_data) 278 | 279 | print_high_level_divided_str(action_str) 280 | return clean_data 281 | 282 | 283 | def data_processing(clean_data, sample_list=None, factor_list=None, factor_name_dict=None): 284 | 285 | action_str = '数据处理' 286 | print_high_level_divided_str(action_str) 287 | 288 | factor_raw_data_describe = {} 289 | factor_clean_data_describe = {} 290 | outlier_data_dict = {} 291 | clean_data_after_outlier = {} 292 | 293 | for sample_name in sample_list: 294 | 295 | tempo_clean_data = clean_data[clean_data[sample_name + '成分股'] == 1].copy() 296 | 297 | for factor_name in factor_list: 298 | 299 | print('\t开始数据处理:' + sample_name + '-' + factor_name + '(' + factor_name_dict[factor_name] + ')') 300 | 301 | # (1) 对原始数据进行描述性统计 302 | factor_raw_data_describe[(sample_name, factor_name)] = \ 303 | tempo_clean_data[['数据提取日'] + [factor_name]].groupby(by=['数据提取日']).apply( 304 | lambda df: df[factor_name].describe([0.05, 0.1, 0.3, 0.5, 0.7, 0.9, 0.95])) 305 | 306 | factor_raw_data_describe[(sample_name, factor_name)]['skewness'] = \ 307 | tempo_clean_data[['数据提取日'] + [factor_name]].groupby(by=['数据提取日']).apply(lambda df: df[factor_name].skew()) 308 | 309 | 
factor_raw_data_describe[(sample_name, factor_name)]['kurtosis'] = \ 310 | tempo_clean_data[['数据提取日'] + [factor_name]].groupby(by=['数据提取日']).apply(lambda df: df[factor_name].kurtosis()) 311 | 312 | factor_raw_data_describe[(sample_name, factor_name)]['non-NAN-data_pct'] = \ 313 | tempo_clean_data[['数据提取日'] + [factor_name]].groupby(by=['数据提取日']).apply( 314 | lambda df: df[factor_name].dropna().shape[0] / df[factor_name].shape[0]) 315 | 316 | print('\t数据处理前样本最大/最小数量:' + 317 | format(factor_raw_data_describe[(sample_name, factor_name)]['count'].min(), '.0f') + '/' + 318 | format(factor_raw_data_describe[(sample_name, factor_name)]['count'].max(), '.0f')) 319 | 320 | # (2) 对数据进行处理:去异常值、标准化及填0(可选) 321 | clean_data_after_outlier[(sample_name, factor_name)] = \ 322 | tempo_clean_data[['数据提取日', 'stockid', '持仓期停牌天数占比'] + [factor_name]].groupby(by=['数据提取日']).apply( 323 | lambda df: drop_outlier_and_standardization(df, factor_name, drop_outlier_method='winsorize', standardization=False))[ 324 | ['数据提取日', 'stockid'] + [factor_name]].reset_index(drop=True) 325 | 326 | # (3) 对缺数据的截面进行剔除,即保留截面数据大于多少的数据集 327 | cross_section_data_coverage = clean_data_after_outlier[(sample_name, factor_name)][['数据提取日'] + [factor_name]].groupby( 328 | by=['数据提取日']).count() 329 | 330 | date_list = cross_section_data_coverage[cross_section_data_coverage[factor_name] >= 100].index.tolist() 331 | clean_data_after_outlier[(sample_name, factor_name)] = clean_data_after_outlier[(sample_name, factor_name)][ 332 | clean_data_after_outlier[(sample_name, factor_name)]['数据提取日'].apply(lambda date: date in date_list)].reset_index(drop=True) 333 | clean_data_after_outlier[(sample_name, factor_name)]['数据提取日'] = \ 334 | clean_data_after_outlier[(sample_name, factor_name)]['数据提取日'].astype(object) 335 | 336 | factor_clean_data_describe[(sample_name, factor_name)] = \ 337 | clean_data_after_outlier[(sample_name, factor_name)].groupby(by=['数据提取日']).apply( 338 | lambda df: df[factor_name].describe([0.05, 0.1, 0.3, 0.5, 0.7, 0.9, 0.95])) 339 | 340 | factor_clean_data_describe[(sample_name, factor_name)]['skewness'] = \ 341 | clean_data_after_outlier[(sample_name, factor_name)][['数据提取日'] + [factor_name]].groupby(by=['数据提取日']).apply( 342 | lambda df: df[factor_name].skew()) 343 | factor_clean_data_describe[(sample_name, factor_name)]['kurtosis'] = \ 344 | clean_data_after_outlier[(sample_name, factor_name)][['数据提取日'] + [factor_name]].groupby(by=['数据提取日']).apply( 345 | lambda df: df[factor_name].kurtosis()) 346 | 347 | print('\t数据处理后样本最大/最小数量:' + 348 | format(factor_clean_data_describe[(sample_name, factor_name)]['count'].min(), '.0f') + '/' + 349 | format(factor_clean_data_describe[(sample_name, factor_name)]['count'].max(), '.0f')) 350 | 351 | # (4) 保存异常值股票池及其数据 352 | 353 | print('\t保存异常值数据…') 354 | outlier_stock_dict = clean_data[['数据提取日', 'stockid', '持仓期停牌天数占比'] + [factor_name]].groupby(by=['数据提取日']).apply( 355 | lambda df: get_outlier_stock_list(df, factor_name, method='winsorize')) 356 | tempo_factor_clean_data = clean_data.pivot(values=factor_name, index='数据提取日', columns='stockid') 357 | tempo_factor_clean_data.columns = tempo_factor_clean_data.columns.astype(object) # categorical columns不能以普通方式添加 358 | 359 | return_outlier = pd.DataFrame(outlier_stock_dict).apply( 360 | lambda t: tempo_factor_clean_data.loc[t.name, t[0][0]], axis=1).reset_index().melt( 361 | id_vars=['数据提取日'], var_name=['stockid'], value_name=factor_name).dropna() 362 | return_outlier['异常值类型'] = '收益端异常值' 363 | 364 | factor_data_outlier = 
pd.DataFrame(outlier_stock_dict).apply( 365 | lambda t: tempo_factor_clean_data.loc[t.name, t[0][1]], axis=1).reset_index().melt( 366 | id_vars=['数据提取日'], var_name=['stockid'], value_name=factor_name).dropna() 367 | factor_data_outlier['异常值类型'] = '因子端异常值' 368 | 369 | outlier_data_dict[(sample_name, factor_name)] = pd.concat([return_outlier, factor_data_outlier]).sort_values( 370 | by=['数据提取日', 'stockid']).reset_index(drop=True) 371 | 372 | print('\t完成数据处理:' + factor_name + '(' + factor_name_dict[factor_name] + ')\n') 373 | print(low_level_divided_str) 374 | 375 | print_high_level_divided_str(action_str) 376 | return clean_data_after_outlier, factor_raw_data_describe, factor_clean_data_describe, outlier_data_dict 377 | 378 | 379 | def get_factor_stratification_data(data, sample_list=None, factor_list=None, stratification_num=10, quantile_dict=None): 380 | 381 | action_str = '生成分档组合' 382 | print_high_level_divided_str(action_str) 383 | 384 | cap_quantile_list = [(quantile + 1) / stratification_num for quantile in range(0, stratification_num)] 385 | floor_quantile_list = [quantile / stratification_num for quantile in range(0, stratification_num)] 386 | 387 | factor_stratification_data = {} 388 | 389 | factor_data_columns = ['数据提取日', 'stockid'] 390 | 391 | for sample_name in sample_list: 392 | 393 | print('\t开始因子构建:' + sample_name) 394 | 395 | for factor_name in factor_list: 396 | 397 | for i in range(stratification_num): 398 | # 此段用来生成每个分位的组合。 399 | factor_stratification_data[(sample_name, factor_name, quantile_dict[i])] = \ 400 | data[(sample_name, factor_name)][factor_data_columns + [factor_name]].groupby(by=['数据提取日']).apply( 401 | lambda df: get_stock_by_factor_quantile(df, factor_name, floor_quantile_list[i], cap_quantile_list[i])) 402 | 403 | factor_stratification_data[(sample_name, factor_name, quantile_dict[i])].index = \ 404 | range(factor_stratification_data[(sample_name, factor_name, quantile_dict[i])].shape[0]) 405 | 406 | min_count = factor_stratification_data[(sample_name, factor_name, quantile_dict[i])].groupby(by=['数据提取日']).count()['stockid'].min() 407 | max_count = factor_stratification_data[(sample_name, factor_name, quantile_dict[i])].groupby(by=['数据提取日']).count()['stockid'].max() 408 | print('\t完成因子构建:' + sample_name + '-' + factor_name + '(' + factor_name_dict[factor_name] + ')' 409 | + '-第' + quantile_dict[i] + '组' + '(' + format(min_count, '.0f') + '/' + format(max_count, '.0f') + ')') 410 | print(low_level_divided_str) 411 | 412 | print_high_level_divided_str(action_str) 413 | return factor_stratification_data 414 | 415 | 416 | def get_factor_stratification_return(factor_stratification_data, stock_return_df, sample_list=None, factor_list=None, startification_num=10, 417 | quantile_dict=None, yield_type_list=None): 418 | 419 | action_str = '计算分档收益率' 420 | print_high_level_divided_str(action_str) 421 | 422 | factor_stratification_return = {} 423 | # 计算每个样本池 424 | for sample_name in sample_list: 425 | 426 | print('\t开始计算分层收益率:' + sample_name) 427 | 428 | for factor_name in factor_list: 429 | 430 | # 计算每层 431 | for i in range(startification_num): 432 | factor_stratification_return[(sample_name, factor_name, quantile_dict[i])] = \ 433 | pd.DataFrame(index=get_factor_data_date_list, columns=['因子均值'] + yield_type_list) 434 | 435 | factor_stratification_return[(sample_name, factor_name, quantile_dict[i])]['因子均值'] = \ 436 | factor_stratification_data[(sample_name, factor_name, quantile_dict[i])].groupby(by=['数据提取日']).apply( 437 | lambda df: 
df[factor_name].mean()).astype('float32') 438 | 439 | for yield_type in yield_type_list: 440 | tempo_yield_data = factor_stratification_data[(sample_name, factor_name, quantile_dict[i])].merge( 441 | stock_return_df[['数据提取日', 'stockid'] + [yield_type]], on=['数据提取日', 'stockid'], how='left').copy() 442 | 443 | factor_stratification_return[(sample_name, factor_name, quantile_dict[i])][yield_type] = \ 444 | tempo_yield_data.groupby(by=['数据提取日']).apply(lambda df: df[yield_type].mean()).astype('float32') 445 | 446 | print('\t完成分组收益率计算:' + sample_name + '-' + 447 | factor_name + '(' + factor_name_dict[factor_name] + ')' + '-第' + quantile_dict[i] + '组-' + yield_type) 448 | 449 | print(low_level_divided_str) 450 | 451 | print_high_level_divided_str(action_str) 452 | return factor_stratification_return 453 | 454 | 455 | def get_factor_test_result(factor_stratification_return, index_return_df, sample_list=None, factor_list=None, 456 | get_factor_data_date_list=None, regression_model_list=None, 457 | quantile_dict=None, rolling_window_list=None, stratification_num=10): 458 | 459 | action_str = '单因子检测' 460 | print_high_level_divided_str(action_str) 461 | 462 | ts_constant_test_result_dict = {} 463 | factor_test_result = {} 464 | result_content_list = ['Alpha显著性', 'Alpha', 'Alpha t值', 'Alpha标准误', 'Alpha p值', 'Beta显著性', 'Beta', 'Beta t值', 'Beta标准误', 'Beta p值'] 465 | result_value_content_list = ['Alpha', 'Alpha t值', 'Alpha标准误', 'Alpha p值', 'Beta', 'Beta t值', 'Beta标准误', 'Beta p值'] 466 | 467 | for sample_name in sample_list: 468 | 469 | print('\t开始单因子回归检测:' + sample_name) 470 | 471 | for factor_name in factor_list: 472 | 473 | factor_return_df = pd.DataFrame(index=get_factor_data_date_list, columns=['因子收益率', 'Alpha', 'Beta']) 474 | factor_return_df['Beta'] = index_return_df[sample_name + '收益率'] 475 | factor_return_df['Alpha'] = 1 476 | 477 | # 1. 时间序列平稳性检测 478 | for i in range(stratification_num): 479 | factor_return_df['因子收益率'] = factor_stratification_return[(sample_name, factor_name, quantile_dict[i])]['持仓期收益率'] 480 | 481 | # (1) 每档都做回归,因为有的基准是从2007年才开始,因此要dropna 482 | ts_regression_df = factor_return_df.dropna() 483 | ts_regression_df.index = pd.Series(ts_regression_df.index).apply(lambda d: pd.to_datetime(d)) 484 | 485 | # (2) 对因子收益率和指数收益率序列进行平稳性检验 486 | ts_constant_test_result_dict[(sample_name, factor_name, quantile_dict[i])] = \ 487 | pd.DataFrame(index=['因子收益率', '指数收益率'], columns=['样本数', 't值', 'p值', '显著性', '最大滞后项']) 488 | ts_constant_test_result_dict[(sample_name, factor_name, quantile_dict[i])].loc['因子收益率'] = constant_test(ts_regression_df['因子收益率']) 489 | ts_constant_test_result_dict[(sample_name, factor_name, quantile_dict[i])].loc['指数收益率'] = constant_test(ts_regression_df['Beta']) 490 | 491 | # 2. 滚动窗口回归 492 | for regression_model in regression_model_list: 493 | # (1) 选择不同的回归模型 494 | 495 | for rolling_window in rolling_window_list: 496 | # (2) 选择不同的滚动窗口长度 497 | rolling_window_end_date_list = get_factor_data_date_list[rolling_window - 1:] 498 | 499 | for i in range(stratification_num): 500 | 501 | # (3) 每档都分别进行滚动窗口回归 502 | if (regression_model == 'WLS') | (regression_model == 'OLS'): 503 | factor_test_result[(sample_name, regression_model, rolling_window, factor_name, quantile_dict[i])] = \ 504 | pd.DataFrame(index=rolling_window_end_date_list, columns=result_content_list + ['Adj. 
R-squared']) 505 | elif regression_model == 'RLM': 506 | factor_test_result[(sample_name, regression_model, rolling_window, factor_name, quantile_dict[i])] = \ 507 | pd.DataFrame(index=rolling_window_end_date_list, columns=result_content_list) 508 | 509 | for date_i, date in enumerate(rolling_window_end_date_list, rolling_window): 510 | # 选择每个滚动窗口的最后一个日期 511 | regression_period = get_factor_data_date_list[date_i - rolling_window:date_i] 512 | regression_data = pd.DataFrame(index=regression_period, columns=['因子收益率', 'Alpha', 'Beta']) 513 | regression_data['因子收益率'] = \ 514 | factor_stratification_return[(sample_name, factor_name, quantile_dict[i])]['持仓期收益率'].loc[regression_period] 515 | regression_data['Alpha'] = 1 516 | regression_data['Beta'] = index_return_df[sample_name + '收益率'].loc[regression_period] 517 | 518 | if regression_model == 'RLM': 519 | regression_result = sm.RLM(regression_data.loc[regression_period, '因子收益率'].astype(float), 520 | regression_data.loc[regression_period, ['Alpha', 'Beta']].astype(float)).fit() 521 | elif regression_model == 'OLS': 522 | regression_result = sm.OLS(regression_data.loc[regression_period, '因子收益率'].astype(float), 523 | regression_data.loc[regression_period, ['Alpha', 'Beta']].astype( 524 | float)).fit().get_robustcov_results() 525 | elif regression_model == 'WLS': 526 | weight_dict = {'cos': [1 / (math.cos(x) / sum([math.cos(x) for x in np.linspace(0, math.pi / 2, rolling_window)])) 527 | for x in np.linspace(0, math.pi / 2, rolling_window)]} 528 | 529 | regression_result = sm.WLS(regression_data.loc[regression_period, '因子收益率'].astype(float), 530 | regression_data.loc[regression_period, ['Alpha', 'Beta']].astype(float), 531 | weights=weight_dict['cos']).fit().get_robustcov_results() 532 | 533 | factor_test_result[(sample_name, regression_model, rolling_window, factor_name, quantile_dict[i])].loc[date] = \ 534 | set_linear_regression_result(regression_result, result_content_list, method=regression_model) 535 | 536 | # 把作为index的日期提出来成为一列,因为这一个dataframe只包括一个因子序号的一个档位回归结果,后期要整合所有档位到一起 537 | factor_test_result[(sample_name, regression_model, rolling_window, factor_name, quantile_dict[i])] = \ 538 | factor_test_result[(sample_name, regression_model, rolling_window, factor_name, quantile_dict[i])].reset_index().rename( 539 | columns={'index': '数据提取日'}) 540 | factor_test_result[(sample_name, regression_model, rolling_window, factor_name, quantile_dict[i])][ 541 | result_value_content_list] = \ 542 | factor_test_result[(sample_name, regression_model, rolling_window, factor_name, quantile_dict[i])][ 543 | result_value_content_list].apply(pd.to_numeric, downcast='float') 544 | if (regression_model == 'OLS') | (regression_model == 'WLS'): 545 | factor_test_result[(sample_name, regression_model, rolling_window, factor_name, quantile_dict[i])]['Adj. R-squared'] = \ 546 | factor_test_result[(sample_name, regression_model, rolling_window, factor_name, quantile_dict[i])][ 547 | 'Adj. 
R-squared'].apply(pd.to_numeric, downcast='float') 548 | 549 | print('\t完成单因子回归检验:' + sample_name + '-' + factor_name + '(' + factor_name_dict[factor_name] + ')' + '-回归方法' + 550 | regression_model + '-第' + quantile_dict[i] + '组-窗口期' + str(rolling_window)) 551 | 552 | print(low_level_divided_str) 553 | 554 | print_high_level_divided_str(action_str) 555 | 556 | return factor_test_result, ts_constant_test_result_dict 557 | 558 | 559 | def get_factor_stratification_hp_return(factor_stratification_return, market_mean_return, sample_list=None, factor_list=None, 560 | stratification_num=10, quantile_dict=None, factor_name_dict=None): 561 | action_str = '因子序号各档收益保存' 562 | print_high_level_divided_str(action_str) 563 | 564 | factor_stratification_hp_return = {} 565 | 566 | for sample_name in sample_list: 567 | 568 | for factor_name in factor_list: 569 | 570 | factor_stratification_hp_return[(sample_name, factor_name)] = \ 571 | pd.DataFrame(index=get_factor_data_date_list, columns=list(quantile_dict.values()) + ['相邻档超额收益差平方和', '相邻档超额收益差平方均值']) 572 | for i in range(stratification_num): 573 | factor_stratification_hp_return[(sample_name, factor_name)][quantile_dict[i]] = \ 574 | factor_stratification_return[(sample_name, factor_name, quantile_dict[i])]['持仓期收益率'] 575 | 576 | factor_stratification_hp_return[(sample_name, factor_name)]['相邻档超额收益差平方和'] = \ 577 | factor_stratification_hp_return[(sample_name, factor_name)][list(quantile_dict.values())].sub( 578 | market_mean_return['持仓期收益率'], axis=0).rolling(2, axis=1).apply(lambda s: s[1] - s[0]).applymap(lambda v: v ** 2).sum(axis=1) 579 | factor_stratification_hp_return[(sample_name, factor_name)]['相邻档超额收益差平方均值'] = \ 580 | factor_stratification_hp_return[(sample_name, factor_name)][list(quantile_dict.values())].sub( 581 | market_mean_return['持仓期收益率'], axis=0).rolling(2, axis=1).apply(lambda s: s[1] - s[0]).applymap(lambda v: v ** 2).mean(axis=1) 582 | 583 | factor_stratification_hp_return[(sample_name, factor_name)] = \ 584 | factor_stratification_hp_return[(sample_name, factor_name)].astype('float32').reset_index().rename(columns={'index': '数据提取日'}) 585 | print('\t保存各因子序号分档收益率,计算各档收益发散性指标:' + sample_name + '-' + factor_name + '(' + factor_name_dict[factor_name] + ')') 586 | 587 | print(low_level_divided_str) 588 | 589 | print_high_level_divided_str(action_str) 590 | return factor_stratification_hp_return 591 | 592 | 593 | def transform_dict_to_df(dict_data, keys_column_name_list): 594 | """ 595 | 用于将处理过程中保存的dict类型转为dataframe,因为在处理过程中用dict更加方便明了,但是在最终结果展示环节可能还需要dataframe的形式导出成excel 596 | :param dict_data: 597 | :param keys_column_name_list: 该参数是dict_data中要转为一列column的key的所属类别,例如dict_data的keys是('申万A股', 'WLS'), 598 | 那么该参数就是['股票池', '回归模型'] 599 | :return: 将dict_data中的keys全部转换为了columns的dataframe 600 | """ 601 | 602 | dataframe_data = pd.DataFrame() 603 | for key_list, df_data in dict_data.items(): 604 | for key_i, key_column_name in enumerate(keys_column_name_list): 605 | df_data[key_column_name] = key_list[key_i] 606 | dataframe_data = pd.concat([dataframe_data, df_data]) 607 | dataframe_data.index = range(dataframe_data.shape[0]) 608 | 609 | return dataframe_data 610 | 611 | 612 | def put_keys_into_df(data, keys_list): 613 | """ 614 | 为了体现transform_dict_to_df函数的愚蠢 615 | :param data: 616 | :param keys_list: 617 | :return: 618 | """ 619 | concat_list = [] 620 | for keys, df in data.items(): 621 | if not pd.Series(df.index).equals(pd.Series(range(df.shape[0]))): 622 | df = df.reset_index() 623 | 624 | df[keys_list] = pd.DataFrame([list(keys)], 
index=range(df.shape[0])) 625 | concat_list.append(df) 626 | 627 | concat_df = pd.concat(concat_list, axis=0) 628 | 629 | return concat_df 630 | 631 | 632 | def get_purified_factor(prior_purified_data, purified_class=None, lower_class=None, sample_name=None, regression_model=None, rolling_window=None, 633 | factor_stratification_return=None, index_return_df=None, get_factor_data_date_list=None): 634 | """ 635 | 通用的因子提纯函数,因子序列提纯为因子小类,因子小类提纯为因子大类都能使用,暂时只用了取解释度最高的因子,因此只能用WLS或者OLS的方法 636 | :param prior_purified_data: 637 | :param purified_class: 638 | :param lower_class: 639 | :param sample_name: 640 | :param regression_model: 641 | :param rolling_window: 642 | :param factor_stratification_return: 643 | :param index_return_df: 644 | :param get_factor_data_date_list: 645 | :return: 646 | """ 647 | # 变量声明 648 | within_purified_class_factor_corr = {} # 提纯后,该类别内所有纯净小类因子的相关系数 649 | 650 | # 变量定义 651 | purified_class_name_list = prior_purified_data[purified_class].unique().tolist() # 得到所有类别名称 652 | factor_info_list = ['因子大类', '因子小类', '因子序号', '档位'] 653 | required_factor_info_list = factor_info_list[factor_info_list.index(lower_class):] 654 | complement_factor_info_list = factor_info_list[:factor_info_list.index(lower_class)] 655 | factor_test_info_list = ['Alpha显著性', 'Alpha', 'Alpha t值', 'Alpha标准误', 'Alpha p值', 'Beta显著性', 'Beta', 'Beta t值', 'Beta标准误', 656 | 'Beta p值', 'Adj. R-squared'] 657 | all_factor_info_list = ['数据提取日'] + factor_info_list + factor_test_info_list 658 | 659 | all_purified_data = pd.DataFrame(columns=all_factor_info_list) 660 | MES_purified_data = pd.DataFrame(columns=all_factor_info_list) 661 | 662 | if regression_model == 'WLS': 663 | weight_dict = {'cos': [1 / (math.cos(x) / sum([math.cos(x) for x in np.linspace(0, math.pi / 2, rolling_window)])) 664 | for x in np.linspace(0, math.pi / 2, rolling_window)]} 665 | 666 | for class_name in purified_class_name_list: 667 | 668 | tempo_all_purified_data = pd.DataFrame(columns=all_factor_info_list) 669 | tempo_MES_purified_data = pd.DataFrame(columns=all_factor_info_list) 670 | 671 | tempo_data = prior_purified_data[prior_purified_data[purified_class] == class_name].copy() 672 | 673 | # 注意:此处直接忽略了没有显著因子的日期,只对存在显著因子的日期进行提纯 674 | tempo_regression_end_date_list = tempo_data['数据提取日'].unique().tolist() 675 | 676 | for regression_end_date in tempo_regression_end_date_list: 677 | 678 | # (1) 根据回归最后的时间点找到该回归显著的因子,如果只有1个那么就不用提纯了 679 | tempo_date_data = tempo_data[tempo_data['数据提取日'] == regression_end_date].copy() 680 | tempo_date_data.index = range(tempo_date_data.shape[0]) 681 | 682 | lower_class_name_list = tempo_date_data[lower_class].tolist() # 由于最底层只取最显著的那一档,所以每个因子序号/因子名称唯一 683 | if len(lower_class_name_list) == 1: 684 | continue 685 | 686 | # (2) 进行Fama-MacBeth两次回归进行提纯 687 | 688 | # (2.1) 根据回归最后的时间点找到滚动窗口时间段 689 | regression_end_date_index = get_factor_data_date_list.index(regression_end_date) # index是31,表示第32个周 690 | # list后不包,所以要+1 691 | regression_period = get_factor_data_date_list[regression_end_date_index - rolling_window + 1:regression_end_date_index + 1] 692 | 693 | # (2.2) 构建第一次回归所需数据 694 | first_regression_data = pd.DataFrame(index=regression_period, columns=lower_class_name_list) 695 | for i in tempo_date_data.index: 696 | factor_num = tempo_date_data.loc[i, '因子序号'] 697 | factor_stratification_num = tempo_date_data.loc[i, '档位'] 698 | first_regression_data[tempo_date_data.loc[i, lower_class]] = \ 699 | factor_stratification_return[(sample_name, factor_num, factor_stratification_num)]['持仓期收益率'].loc[regression_period] 700 | 701 | # 
(2.3) 第一阶段回归:回归取残差项 702 | first_regression_residual = pd.DataFrame(index=regression_period, columns=lower_class_name_list) # 残差数据 703 | purified_factor_data = pd.DataFrame(index=regression_period, columns=lower_class_name_list) # 因子减去残差得到提纯数据 704 | # 保存一阶段回归详细结果,以便后续查看提纯情况 705 | first_regression_result = pd.DataFrame(index=lower_class_name_list, columns=['Adj. R-squared', 'F值', 'F值显著性']) 706 | 707 | for purified_factor in lower_class_name_list: 708 | # 得到因子回归提纯后的残差 709 | other_factors_list = [factor for factor in lower_class_name_list if factor not in [purified_factor]] 710 | if regression_model == 'WLS': 711 | first_regression_model = sm.WLS(first_regression_data[purified_factor].astype(float), 712 | first_regression_data[other_factors_list].astype(float), 713 | weights=weight_dict['cos']).fit().get_robustcov_results() 714 | 715 | first_regression_residual[purified_factor] = first_regression_model.resid 716 | purified_factor_data[purified_factor] = \ 717 | first_regression_data[purified_factor].astype(float) - first_regression_residual[purified_factor] 718 | 719 | first_regression_result.loc[purified_factor, 'Adj. R-squared'] = format(first_regression_model.rsquared_adj, 720 | '.2%') 721 | first_regression_result.loc[purified_factor, 'F值'] = format(first_regression_model.fvalue[0][0], '.2f') 722 | first_regression_result.loc[purified_factor, 'F值显著性'] = format(first_regression_model.f_pvalue, '.3f') 723 | 724 | # (2.4) 得到原收益率减去残差值的相关系数矩阵 725 | within_purified_class_factor_corr[(class_name, regression_end_date)] = purified_factor_data.corr() 726 | 727 | # (2.5) 第二阶段回归:使用残差值再进行一次单因子回归检验,使用原收益率减去残差值再进行一次单因子回归检验 728 | second_regression_data = pd.DataFrame(index=regression_period, columns=['Alpha', 'Beta']) 729 | second_regression_data['Alpha'] = 1 730 | second_regression_data['Beta'] = index_return_df.loc[regression_period, sample_name + '收益率'] 731 | 732 | second_regression_result = pd.DataFrame(index=lower_class_name_list, columns=factor_test_info_list) 733 | 734 | for purified_factor in lower_class_name_list: 735 | if regression_model == 'WLS': 736 | second_regression_model = sm.WLS(purified_factor_data[purified_factor], 737 | second_regression_data[['Alpha', 'Beta']].astype(float), 738 | weights=weight_dict['cos']).fit().get_robustcov_results() 739 | 740 | second_regression_result.loc[purified_factor] = \ 741 | set_linear_regression_result(second_regression_model, factor_test_info_list, method='WLS') 742 | 743 | # (2.6) 得到因子类型内提纯后的结果 744 | 745 | # 挑选提纯后仍然显著的因子 746 | significant_condition = (second_regression_result['Alpha p值'].astype(float) < 0.1) & \ 747 | (second_regression_result['Alpha'].astype(float) > 0) 748 | 749 | significant_result = second_regression_result[significant_condition].sort_values( 750 | by=['Adj. R-squared'], ascending=False).reset_index().rename(columns={'index': lower_class}) 751 | 752 | significant_result = tempo_date_data[required_factor_info_list].merge( 753 | significant_result, on=[lower_class], how='right').sort_values(by=['Adj. 
R-squared'], ascending=False) 754 | for info_name in complement_factor_info_list: 755 | if info_name == '因子大类': 756 | significant_result[info_name] = significant_result['因子序号'].map(factor_category_dict) 757 | elif info_name == '因子小类': 758 | significant_result[info_name] = significant_result['因子序号'].map(factor_type_dict) 759 | significant_result['数据提取日'] = regression_end_date 760 | 761 | tempo_all_purified_data = pd.concat([tempo_all_purified_data, significant_result[all_factor_info_list]]) 762 | 763 | # 挑选解释度最高的那个 764 | most_explaination_result = significant_result.iloc[:1, :].copy() 765 | tempo_MES_purified_data = pd.concat([tempo_MES_purified_data, most_explaination_result[all_factor_info_list]]) 766 | 767 | all_purified_data = pd.concat([all_purified_data, tempo_all_purified_data]) 768 | MES_purified_data = pd.concat([MES_purified_data, tempo_MES_purified_data]) 769 | 770 | print('完成因子提纯:' + purified_class + '(' + class_name + ')') 771 | 772 | return all_purified_data, MES_purified_data, within_purified_class_factor_corr 773 | 774 | 775 | def get_MES_factor_stratification_number_in_factor_number(all_factor_stratification_number_regression_test_result, 776 | sample_list, regression_model_list, rolling_window_list, 777 | factor_category_dict, factor_type_dict): 778 | 779 | action_str = '因子序号内筛选MSE' 780 | print_high_level_divided_str(action_str) 781 | 782 | MES_factor_stratification_number = {} 783 | factor_info_list = ['因子大类', '因子小类', '因子序号', '档位'] 784 | factor_test_info_list = ['Alpha显著性', 'Alpha', 'Alpha t值', 'Alpha标准误', 'Alpha p值', 'Beta显著性', 'Beta', 'Beta t值', 'Beta标准误', 'Beta p值', 785 | 'Adj. R-squared'] 786 | all_info_list = ['数据提取日'] + factor_info_list + factor_test_info_list 787 | 788 | for sample_name in sample_list: 789 | 790 | for regression_model in regression_model_list: 791 | 792 | for rolling_window in rolling_window_list: 793 | 794 | tempo_factor_stratification_number = all_factor_stratification_number_regression_test_result[ 795 | (all_factor_stratification_number_regression_test_result['样本范围'] == sample_name) & 796 | (all_factor_stratification_number_regression_test_result['回归模型'] == regression_model) & 797 | (all_factor_stratification_number_regression_test_result['滚动窗口'] == rolling_window)].copy() 798 | 799 | tempo_significant_factor_stratifaction_number = tempo_factor_stratification_number[ 800 | (tempo_factor_stratification_number['Alpha'] > 0) & (tempo_factor_stratification_number['Alpha p值'] <= 0.1)].copy() 801 | 802 | # (1) 每期每个因子序号下解释度最高的那个Alpha显著为正的档位 803 | MES_factor_stratification_number[(sample_name, regression_model, rolling_window)] = \ 804 | tempo_significant_factor_stratifaction_number.groupby(by=['数据提取日', '因子序号']).apply( 805 | lambda df: df[['档位'] + factor_test_info_list].sort_values(by='Adj. 
R-squared', ascending=False).iloc[:1, :]).reset_index() 806 | 807 | MES_factor_stratification_number[(sample_name, regression_model, rolling_window)]['因子大类'] = \ 808 | MES_factor_stratification_number[(sample_name, regression_model, rolling_window)]['因子序号'].map(factor_category_dict) 809 | MES_factor_stratification_number[(sample_name, regression_model, rolling_window)]['因子小类'] = \ 810 | MES_factor_stratification_number[(sample_name, regression_model, rolling_window)]['因子序号'].map(factor_type_dict) 811 | 812 | # (2) 调换columns顺序方便后续查看 813 | MES_factor_stratification_number[(sample_name, regression_model, rolling_window)] = \ 814 | MES_factor_stratification_number[(sample_name, regression_model, rolling_window)][all_info_list] 815 | print('\t完成因子序号内筛选MSE:' + ','.join([sample_name, regression_model, str(rolling_window)])) 816 | print(low_level_divided_str) 817 | 818 | print_high_level_divided_str(action_str) 819 | return MES_factor_stratification_number 820 | 821 | 822 | def purify_factor_number_in_factor_type(MES_factor_stratification_number, factor_stratification_return, index_return_df, 823 | sample_list, regression_model_list, rolling_window_list, 824 | get_factor_data_date_list, factor_category_dict, factor_type_dict): 825 | """ 暂时用解释度最高的档位作为该因子序号的代表,在每个因子小类中进行提纯 826 | :param MES_factor_stratification_number:在某个因子序号中解释度最高的档位 827 | :param factor_stratification_return: 828 | :param index_return_df: 829 | :param sample_list: 830 | :param regression_model_list: 831 | :param rolling_window_list: 832 | :param get_factor_data_date_list: 833 | :param factor_category_dict: 834 | :param factor_type_dict: 835 | :return:参数1是提纯后仍显著的所有因子序号,参数2是提纯后仍显著的Adj. R-squared最高的那个因子序号 836 | """ 837 | action_str = '因子小类内提纯' 838 | print_high_level_divided_str(action_str) 839 | 840 | factor_info_list = ['数据提取日','因子大类', '因子小类', '因子序号', '档位'] 841 | factor_test_info_list = ['Alpha显著性', 'Alpha', 'Alpha t值', 'Alpha标准误', 'Alpha p值', 'Beta显著性', 'Beta', 'Beta t值', 'Beta标准误', 'Beta p值', 842 | 'Adj. 
R-squared'] 843 | all_info_list = factor_info_list + factor_test_info_list 844 | 845 | all_factor_number_after_purified = {} 846 | MES_factor_number_after_purified = {} # Most Explaination Significant 847 | 848 | for sample_name in sample_list: 849 | 850 | for regression_model in regression_model_list: 851 | 852 | for rolling_window in rolling_window_list: 853 | 854 | # (2) 在同一因子小类中,进行因子序号提纯 855 | all_factor_number_after_purified[(sample_name, regression_model, rolling_window)], \ 856 | MES_factor_number_after_purified[(sample_name, regression_model, rolling_window)], _ = \ 857 | get_purified_factor(MES_factor_stratification_number[(sample_name, regression_model, rolling_window)], 858 | purified_class='因子小类', lower_class='因子序号', sample_name=sample_name, regression_model=regression_model, 859 | rolling_window=rolling_window, factor_stratification_return=factor_stratification_return, 860 | index_return_df=index_return_df, get_factor_data_date_list=get_factor_data_date_list) 861 | 862 | all_factor_number_after_purified[(sample_name, regression_model, rolling_window)]['因子大类'] = \ 863 | all_factor_number_after_purified[(sample_name, regression_model, rolling_window)]['因子序号'].map(factor_category_dict) 864 | all_factor_number_after_purified[(sample_name, regression_model, rolling_window)]['因子小类'] = \ 865 | all_factor_number_after_purified[(sample_name, regression_model, rolling_window)]['因子序号'].map(factor_type_dict) 866 | MES_factor_number_after_purified[(sample_name, regression_model, rolling_window)]['因子大类'] = \ 867 | MES_factor_number_after_purified[(sample_name, regression_model, rolling_window)]['因子序号'].map(factor_category_dict) 868 | MES_factor_number_after_purified[(sample_name, regression_model, rolling_window)]['因子小类'] = \ 869 | MES_factor_number_after_purified[(sample_name, regression_model, rolling_window)]['因子序号'].map(factor_type_dict) 870 | 871 | all_factor_number_after_purified[(sample_name, regression_model, rolling_window)] = \ 872 | all_factor_number_after_purified[(sample_name, regression_model, rolling_window)][all_info_list] 873 | MES_factor_number_after_purified[(sample_name, regression_model, rolling_window)] = \ 874 | MES_factor_number_after_purified[(sample_name, regression_model, rolling_window)][all_info_list] 875 | 876 | print('\t完成因子小类内提纯:' + ','.join([sample_name, regression_model, str(rolling_window)])) 877 | print(low_level_divided_str) 878 | 879 | print_high_level_divided_str(action_str) 880 | return all_factor_number_after_purified, MES_factor_number_after_purified 881 | 882 | 883 | def purify_factor_type_in_factor_category(MES_purified_factor_number, factor_stratification_return, index_return_df, 884 | sample_list, regression_model_list, rolling_window_list, 885 | get_factor_data_date_list, factor_category_dict): 886 | 887 | """ 暂时用解释度最高的因子序号作为该因子小类的代表,在每个因子大类中进行提纯 888 | :param MES_purified_factor_number:因子小类提纯后仍显著的Adj. R-squared最高的那个因子序号 889 | :param factor_stratification_return: 890 | :param index_return_df: 891 | :param sample_list: 892 | :param regression_model_list: 893 | :param rolling_window_list: 894 | :param get_factor_data_date_list: 895 | :param factor_category_dict: 896 | :return: 897 | """ 898 | action_str = '因子大类内提纯' 899 | print_high_level_divided_str(action_str) 900 | 901 | factor_info_list = ['数据提取日', '因子大类', '因子小类', '因子序号', '档位'] 902 | factor_test_info_list = ['Alpha显著性', 'Alpha', 'Alpha t值', 'Alpha标准误', 'Alpha p值', 'Beta显著性', 'Beta', 'Beta t值', 'Beta标准误', 'Beta p值', 903 | 'Adj. 
882 | 
883 | def purify_factor_type_in_factor_category(MES_purified_factor_number, factor_stratification_return, index_return_df,
884 |                                           sample_list, regression_model_list, rolling_window_list,
885 |                                           get_factor_data_date_list, factor_category_dict):
886 | 
887 |     """ For now, take the factor number with the highest explanatory power as the representative of its factor subtype, then purify within each factor category (因子大类)
888 |     :param MES_purified_factor_number: per subtype, the still-significant factor number with the highest Adj. R-squared after subtype purification
889 |     :param factor_stratification_return:
890 |     :param index_return_df:
891 |     :param sample_list:
892 |     :param regression_model_list:
893 |     :param rolling_window_list:
894 |     :param get_factor_data_date_list:
895 |     :param factor_category_dict:
896 |     :return:
897 |     """
898 |     action_str = '因子大类内提纯'
899 |     print_high_level_divided_str(action_str)
900 | 
901 |     factor_info_list = ['数据提取日', '因子大类', '因子小类', '因子序号', '档位']
902 |     factor_test_info_list = ['Alpha显著性', 'Alpha', 'Alpha t值', 'Alpha标准误', 'Alpha p值', 'Beta显著性', 'Beta', 'Beta t值', 'Beta标准误', 'Beta p值',
903 |                              'Adj. R-squared']
904 |     all_info_list = factor_info_list + factor_test_info_list
905 | 
906 |     all_factor_type_after_purified = {}
907 |     MES_factor_after_purified = {}  # MES = Most Explanation Significant
908 | 
909 |     for sample_name in sample_list:
910 | 
911 |         for regression_model in regression_model_list:
912 | 
913 |             for rolling_window in rolling_window_list:
914 | 
915 |                 # (1) Purify the factor subtypes within each factor category
916 |                 all_factor_type_after_purified[(sample_name, regression_model, rolling_window)], \
917 |                 MES_factor_after_purified[(sample_name, regression_model, rolling_window)], _ = \
918 |                     get_purified_factor(MES_purified_factor_number[(sample_name, regression_model, rolling_window)],
919 |                                         purified_class='因子大类', lower_class='因子小类', sample_name=sample_name, regression_model=regression_model,
920 |                                         rolling_window=rolling_window, factor_stratification_return=factor_stratification_return,
921 |                                         index_return_df=index_return_df, get_factor_data_date_list=get_factor_data_date_list)
922 | 
923 |                 # (2) Fill in the factor info columns
924 |                 all_factor_type_after_purified[(sample_name, regression_model, rolling_window)]['因子大类'] = \
925 |                     all_factor_type_after_purified[(sample_name, regression_model, rolling_window)]['因子序号'].map(factor_category_dict)
926 |                 # all_factor_type_after_purified[(sample_name, regression_model, rolling_window)]['因子小类'] = \
927 |                 #     all_factor_type_after_purified[(sample_name, regression_model, rolling_window)]['因子序号'].map(factor_type_dict)
928 |                 MES_factor_after_purified[(sample_name, regression_model, rolling_window)]['因子大类'] = \
929 |                     MES_factor_after_purified[(sample_name, regression_model, rolling_window)]['因子序号'].map(factor_category_dict)
930 |                 # MES_factor_after_purified[(sample_name, regression_model, rolling_window)]['因子小类'] = \
931 |                 #     MES_factor_after_purified[(sample_name, regression_model, rolling_window)]['因子序号'].map(factor_type_dict)
932 | 
933 |                 # (3) Reorder the columns for easier inspection
934 |                 all_factor_type_after_purified[(sample_name, regression_model, rolling_window)] = \
935 |                     all_factor_type_after_purified[(sample_name, regression_model, rolling_window)][all_info_list]
936 |                 MES_factor_after_purified[(sample_name, regression_model, rolling_window)] = \
937 |                     MES_factor_after_purified[(sample_name, regression_model, rolling_window)][all_info_list]
938 | 
939 |                 print('\t完成因子大类内提纯:' + ','.join([sample_name, regression_model, str(rolling_window)]))
940 | 
941 |             print(low_level_divided_str)
942 | 
943 |     print_high_level_divided_str(action_str)
944 | 
945 |     return all_factor_type_after_purified, MES_factor_after_purified
946 | 
947 | 
948 | def optimize_data_ram(data):
949 |     """
950 |     Downcast int, float and object columns to smaller dtypes (integer downcasts, float32, category) to save memory
951 |     :param data: a DataFrame, or a dict of DataFrames
952 |     :return: the same object with downcast columns
953 |     """
954 |     action_str = '优化变量存储结构'
955 |     print('\n' + '{0:*>{width}}'.format(action_str, width=40) + '{0:*>{width}}'.format('', width=40 - len(action_str)) + '\n')
956 | 
957 |     if isinstance(data, pd.DataFrame):
958 | 
959 |         print('\t传入数据:' + ','.join([str(dtype) + '(' + str(num) + '列)' for dtype, num in data.dtypes.value_counts().to_dict().items()]))  # dtypes.value_counts() replaces the deprecated get_dtype_counts()
960 |         before_memory = data.memory_usage(deep=True).sum() / 1024 ** 2
961 |         print('\t传入数据大小:' + "{:03.2f}MB".format(before_memory))
962 |         if before_memory <= 100:
963 |             print('\t数据大小小于100M,暂时不进行优化')
964 |             return data
965 |         print('\t正在优化数据结构及存储空间……')
966 | 
967 |         if not data.select_dtypes(include=['int']).empty:
968 |             data[data.select_dtypes(include=['int']).columns] = \
969 |                 data.select_dtypes(include=['int']).apply(pd.to_numeric, downcast='integer')  # int8 at the smallest
970 | 
971 |         if not data.select_dtypes(include=['float']).empty:
972 |             data[data.select_dtypes(include=['float']).columns] = \
973 |                 data.select_dtypes(include=['float']).apply(pd.to_numeric, downcast='float')  # float32 at the smallest
974 | 
975 |         if not data.select_dtypes(include=['object']).empty:
976 |             for col in data.select_dtypes(include=['object']).columns:
977 |                 num_unique_values = len(data.select_dtypes(include=['object'])[col].unique())
978 |                 num_total_values = len(data.select_dtypes(include=['object'])[col])
979 |                 if num_unique_values / num_total_values < 0.5:  # category stores a value dictionary, so only columns with many repeated values benefit
980 |                     data.loc[:, col] = data.select_dtypes(include=['object'])[col].astype('category')  # convert object to category
981 |         print('\t优化后数据:' + ','.join([str(dtype) + '(' + str(num) + '列)' for dtype, num in data.dtypes.value_counts().to_dict().items()]))
982 |         print('\t优化后数据大小:' + "{:03.2f}MB".format(data.memory_usage(deep=True).sum() / 1024 ** 2))
983 |         change_pct = before_memory / (data.memory_usage(deep=True).sum() / 1024 ** 2) - 1
984 |         print('\t数据存储优化幅度:' + format(change_pct, '.2%'))
985 | 
986 |     elif isinstance(data, dict):
987 |         type_count = {}
988 |         for key, df in data.items():
989 |             for dtype, count in df.dtypes.value_counts().to_dict().items():
990 |                 if dtype in type_count.keys():
991 |                     type_count[dtype] += count
992 |                 else:
993 |                     type_count[dtype] = count
994 |         print('\t传入数据:' + ', '.join([str(dtype) + '(' + str(count) + ')' for dtype, count in type_count.items()]))
995 |         before_memory = sum(df.memory_usage(deep=True).sum() for df in data.values()) / 1024 ** 2  # sys.getsizeof(dict) only measures the dict shell, not the DataFrames inside, so it would always skip the optimization
996 |         print('\t传入数据大小:' + "{:03.2f}MB".format(before_memory))
997 |         if before_memory <= 100:
998 |             print('\t数据大小小于100M,暂时不进行优化')
999 |             return data
1000 | 
1001 |         for key, df in data.items():
1002 | 
1003 |             if not df.select_dtypes(include=['int']).empty:
1004 |                 df[df.select_dtypes(include=['int']).columns] = \
1005 |                     df.select_dtypes(include=['int']).apply(pd.to_numeric, downcast='integer')  # int8 at the smallest
1006 | 
1007 |             if not df.select_dtypes(include=['float']).empty:
1008 |                 df[df.select_dtypes(include=['float']).columns] = \
1009 |                     df.select_dtypes(include=['float']).apply(pd.to_numeric, downcast='float')  # float32 at the smallest
1010 | 
1011 |             if not df.select_dtypes(include=['object']).empty:
1012 |                 for col in df.select_dtypes(include=['object']).columns:
1013 |                     num_unique_values = len(df.select_dtypes(include=['object'])[col].unique())
1014 |                     num_total_values = len(df.select_dtypes(include=['object'])[col])
1015 |                     if num_unique_values / num_total_values < 0.5:  # only columns with many repeated values benefit from category
1016 |                         df.loc[:, col] = df.select_dtypes(include=['object'])[col].astype('category')  # convert object to category
1017 |             data[key] = df
1018 | 
1019 |         type_count = {}
1020 |         for key, df in data.items():
1021 |             for dtype, count in df.dtypes.value_counts().to_dict().items():
1022 |                 if dtype in type_count.keys():
1023 |                     type_count[dtype] += count
1024 |                 else:
1025 |                     type_count[dtype] = count
1026 |         print('\t优化后数据:' + ', '.join([str(dtype) + '(' + str(count) + ')' for dtype, count in type_count.items()]))
1027 |         after_memory = sum(df.memory_usage(deep=True).sum() for df in data.values()) / 1024 ** 2
1028 |         print('\t优化后数据大小:' + "{:03.2f}MB".format(after_memory))
1029 |         change_pct = before_memory / after_memory - 1
1030 |         print('\t数据存储优化幅度:' + format(change_pct, '.2%'))
1031 | 
1032 |     print('\n' + '{0:*>{width}}'.format(action_str, width=40) + '{0:*>{width}}'.format('', width=40 - len(action_str)) + '\n')
1033 |     return data
1034 | 
1035 | 
1036 | def pickle_dump_data(data, output_file_url, data_name):
1037 |     pickle.dump(data, open(output_file_url + '/' + data_name + '.dat', 'wb'), pickle.HIGHEST_PROTOCOL)
1038 | 
1039 | 
1040 | # ----------------------------------------------------------函数(结束)-------------------------------------------------------------------------------
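# ---- Illustrative sketch (not part of the original module) --------------------------------------
# A minimal, self-contained demonstration of what optimize_data_ram does column by column:
# integers are downcast to the smallest integer dtype, floats to float32, and low-cardinality
# object columns are converted to category. The frame built here is synthetic.
def _demo_downcasting():
    import numpy as np
    import pandas as pd
    demo = pd.DataFrame({'qty': np.arange(1000),
                         'px': np.random.rand(1000),
                         'sector': np.random.choice(['bank', 'tech'], size=1000)})
    print('before:', demo.memory_usage(deep=True).sum(), 'bytes')
    demo['qty'] = pd.to_numeric(demo['qty'], downcast='integer')  # platform int -> int16 here
    demo['px'] = pd.to_numeric(demo['px'], downcast='float')      # float64 -> float32
    demo['sector'] = demo['sector'].astype('category')            # 2 unique values in 1000 rows
    print('after: ', demo.memory_usage(deep=True).sum(), 'bytes')
# --------------------------------------------------------------------------------------------------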
1041 | 
1042 | if __name__ == "__main__":
1043 |     # Set the run parameters
1044 |     params_data_url = input('请输入输入参数excel的地址:')  # the path is read as a plain string; no quoting needed
1045 |     params_data = pd.read_excel(params_data_url, index_col=0)
1046 |     data_url = params_data.loc['data_url', '参数']
1047 |     sample_list = eval(params_data.loc['sample_list', '参数'])
1048 |     regression_model_list = eval(params_data.loc['regression_model_list', '参数'])
1049 |     rolling_window_list = eval(params_data.loc['rolling_window_list', '参数'])
1050 |     stratification_number = int(params_data.loc['stratification_number', '参数'])  # ensure an int for range() and dict building below
1051 |     output_file_url = '/'.join(params_data_url.split('/')[:-1]) + '/result'
1052 | 
1053 | # ----------------------------------------------------------基础数据准备(开始)------------------------------------------------------------------------
1054 |     action_str = '基础数据'
1055 |     print_high_level_divided_str(action_str)
1056 |     # Load the initial data
1057 |     raw_data = pickle.load(open(data_url + '/raw_data.dat', 'rb'))
1058 |     get_factor_data_date_list = [date.strftime('%Y-%m-%d') for date in pd.read_excel(data_url + '/日期序列-周度.xlsx')['endt'].tolist()]
1059 |     factor_library = pd.read_excel(data_url + '/因子列表-初步检测.xlsx')
1060 |     monetary_fund_return = pd.read_excel(data_url + '/货币基金收益.xlsx', index_col=0)
1061 |     quantile_dict = {0: 'low', **{i: str(i + 1) for i in range(1, stratification_number - 1)}, stratification_number - 1: 'high'}
1062 | 
1063 |     factor_list = factor_library['factor_number'].tolist()
1064 |     factor_name_dict = {factor_library.loc[i, 'factor_number']: factor_library.loc[i, 'factor_name'] for i in range(factor_library.shape[0])}
1065 |     factor_type_dict = {factor_library.loc[i, 'factor_number']: factor_library.loc[i, 'factor_second_class'] for i in range(factor_library.shape[0])}
1066 |     factor_category_dict = {factor_library.loc[i, 'factor_number']: factor_library.loc[i, 'factor_first_class'] for i in range(factor_library.shape[0])}
1067 | 
1068 |     base_info_columns_list = raw_data.columns[:23].tolist()
1069 |     base_info_data = raw_data[base_info_columns_list].copy()
1070 |     index_return_list = ['申万行业收益率', '沪深300收益率', '中证500收益率', '中证800收益率', '上证综指收益率', '申万A股收益率']
1071 |     yield_type_list = ['持仓期收益率'] + [index_name[:-3] + '相对' + index_name[-3:] for index_name in index_return_list[1:]]
1072 |     base_info_columns = base_info_columns_list + [index_name[:-3] + '相对' + index_name[-3:] for index_name in index_return_list[1:]]
1073 |     # Avoid keeping a full copy of the base data in every variable, to free memory
1074 |     data_cleaning_base_columns = ['数据提取日', '财务数据最新报告期', 'stockid', 'sectorname', '是否st', '是否pt', '是否停牌', '上市天数', '持仓期停牌天数占比'] + \
1075 |                                  [sample_name + '成分股' for sample_name in sample_list] + factor_list
1076 | 
1077 |     # Index returns (fetched together with the base data from TinySoft, to avoid a separate query)
1078 |     index_return_df = raw_data.groupby('数据提取日').apply(lambda df: df[index_return_list[1:]].iloc[0, :])
1079 | 
1080 |     # Relative returns (later to be produced in TinySoft directly, keeping both index and relative returns)
1081 |     for index_name in index_return_list[1:]:
1082 |         raw_data[index_name[:-3] + '相对' + index_name[-3:]] = raw_data['持仓期收益率'] - raw_data[index_name]
1083 | 
1084 |     # Market mean and median returns
1085 |     market_mean_return = raw_data.groupby(by=['数据提取日']).apply(lambda df: df[yield_type_list].mean()).astype('float32')
1086 |     market_median_return = raw_data.groupby(by=['数据提取日']).apply(lambda df: df[yield_type_list].median()).astype('float32')
1087 |     print_high_level_divided_str(action_str)
1088 | 
1089 | # ----------------------------------------------------------基础数据准备(结束)------------------------------------------------------------------------
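    # ---- Illustrative sketch (not part of the original pipeline) --------------------------------
    # get_factor_stratification_data is defined elsewhere; this helper only illustrates how the
    # quantile_dict labels built above ('low', '2', ..., 'high') can map onto factor quantile
    # buckets. The factor values here are synthetic.
    def _demo_stratification(stratification_num=5):
        import numpy as np
        import pandas as pd
        labels = {0: 'low', **{i: str(i + 1) for i in range(1, stratification_num - 1)}, stratification_num - 1: 'high'}
        factor_value = pd.Series(np.random.rand(1000))
        bucket = pd.qcut(factor_value, q=stratification_num, labels=[labels[i] for i in range(stratification_num)])
        return factor_value.groupby(bucket).mean()  # one row per bucket, 'low' ... 'high'
    # ----------------------------------------------------------------------------------------------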
1090 | 
1091 | # ----------------------------------------------------------数据处理及分布描述(开始)-------------------------------------------------------------------
1092 |     # 1. Data cleaning
1093 |     clean_data = data_cleaning(raw_data[data_cleaning_base_columns], sample_list=sample_list, factor_list=factor_list,
1094 |                                kicked_sector_list=['申万金融服务', '申万非银金融', '申万综合', '申万银行'],
1095 |                                go_public_days=250, data_processing_base_columns=['数据提取日', 'stockid', '持仓期停牌天数占比'],
1096 |                                get_factor_data_date_list=get_factor_data_date_list)
1097 |     clean_data = optimize_data_ram(clean_data)
1098 |     pickle_dump_data(clean_data, output_file_url, data_name='clean_data')
1099 | 
1100 |     # 2. Data distribution and outlier handling
1101 | 
1102 |     # 2.1 Distribution statistics and outlier data
1103 | 
1104 |     clean_data_after_outlier, factor_raw_data_describe, factor_clean_data_describe, outlier_data = \
1105 |         data_processing(clean_data, sample_list=sample_list, factor_list=factor_list, factor_name_dict=factor_name_dict)
1106 |     clean_data_after_outlier = optimize_data_ram(clean_data_after_outlier)
1107 |     pickle_dump_data(clean_data_after_outlier, output_file_url, data_name='clean_data_after_outlier')
1108 | 
1109 | # ----------------------------------------------------------数据处理及分布描述(结束)-------------------------------------------------------------------
1110 | 
1111 | # ----------------------------------------------------------分组构建(开始)----------------------------------------------------------------------------
1112 | 
1113 |     factor_stratification_data = get_factor_stratification_data(clean_data_after_outlier, sample_list=sample_list, factor_list=factor_list,
1114 |                                                                  stratification_num=stratification_number, quantile_dict=quantile_dict)
1115 |     factor_stratification_data = optimize_data_ram(factor_stratification_data)
1116 |     pickle_dump_data(factor_stratification_data, output_file_url, data_name='factor_stratification_data')
1117 | 
1118 | # ****************************************************************************************************************************************************
1119 | 
1120 |     factor_stratification_return = \
1121 |         get_factor_stratification_return(factor_stratification_data, raw_data[['数据提取日', 'stockid'] + yield_type_list], sample_list=sample_list,
1122 |                                          factor_list=factor_list, startification_num=stratification_number, quantile_dict=quantile_dict,
1123 |                                          yield_type_list=['持仓期收益率'])
1124 |     factor_stratification_return = optimize_data_ram(factor_stratification_return)
1125 |     pickle_dump_data(factor_stratification_return, output_file_url, data_name='factor_stratification_return')
1126 | 
1127 |     # 1. Sub-class (因子小类) factor returns
1128 | 
1129 |     factor_stratification_hp_return = get_factor_stratification_hp_return(factor_stratification_return, market_mean_return, sample_list=sample_list,
1130 |                                                                            factor_list=factor_list, stratification_num=stratification_number,
1131 |                                                                            quantile_dict=quantile_dict, factor_name_dict=factor_name_dict)
1132 |     factor_stratification_hp_return = optimize_data_ram(factor_stratification_hp_return)
1133 |     factor_stratificated_return = transform_dict_to_df(factor_stratification_hp_return, ['sample_scope', 'factor_number'])
1134 | 
1135 |     pickle_dump_data(factor_stratification_hp_return, output_file_url, data_name='factor_stratification_hp_return')
1136 | 
1137 | # ----------------------------------------------------------分组构建(结束)----------------------------------------------------------------------------
1138 | 
1139 | # ----------------------------------------------------------时间序列回归计算alpha、beta(开始)----------------------------------------------------------
1140 | 
1141 |     factor_test_result, _ = get_factor_test_result(factor_stratification_return, index_return_df, sample_list=sample_list, factor_list=factor_list,
1142 |                                                    get_factor_data_date_list=get_factor_data_date_list,
1143 |                                                    regression_model_list=regression_model_list, quantile_dict=quantile_dict,
1144 |                                                    rolling_window_list=rolling_window_list,
1145 |                                                    stratification_num=stratification_number)
1146 |     factor_test_result = optimize_data_ram(factor_test_result)
1147 |     pickle_dump_data(factor_test_result, output_file_url, data_name='factor_test_result')
1148 | 
1149 | # ----------------------------------------------------------时间序列回归计算alpha、beta(结束)----------------------------------------------------------
1150 | 
1151 | # ----------------------------------------------------------显著因子挑选及所需存储数据(开始)------------------------------------------------------------
1152 | 
1153 |     # 2. Collect the regression results into one complete dataframe so they can be saved later
1154 | 
1155 |     factor_test_result_df = transform_dict_to_df(factor_test_result, ['样本范围', '回归模型', '滚动窗口', '因子序号', '档位'])
1156 |     factor_test_result_df = optimize_data_ram(factor_test_result_df)
1157 |     pickle_dump_data(factor_test_result_df, output_file_url, data_name='factor_test_result_df')
1158 | 
1159 |     # 3. Among the significant strata (positive Alpha, p-value below 1), take the stratum with the highest explanatory power as the representative of each factor number for now
1160 | 
1161 |     MES_factor_stratification_number = get_MES_factor_stratification_number_in_factor_number(
1162 |         factor_test_result_df, sample_list, regression_model_list, rolling_window_list, factor_category_dict, factor_type_dict)
1163 |     MES_factor_stratification_number = optimize_data_ram(MES_factor_stratification_number)
1164 |     pickle_dump_data(MES_factor_stratification_number, output_file_url, data_name='MES_factor_stratification_number')
1165 | 
1166 |     # 4. Purify within the factor subtypes
1167 | 
1168 |     all_factor_number_after_purified, MES_factor_number_after_purified = \
1169 |         purify_factor_number_in_factor_type(MES_factor_stratification_number, factor_stratification_return, index_return_df,
1170 |                                             sample_list, regression_model_list, rolling_window_list,
1171 |                                             get_factor_data_date_list, factor_category_dict, factor_type_dict)
1172 |     all_factor_number_after_purified = optimize_data_ram(all_factor_number_after_purified)
1173 |     pickle_dump_data(all_factor_number_after_purified, output_file_url, data_name='all_factor_number_after_purified')
1174 |     MES_factor_number_after_purified = optimize_data_ram(MES_factor_number_after_purified)
1175 |     pickle_dump_data(MES_factor_number_after_purified, output_file_url, data_name='MES_factor_number_after_purified')
1176 | 
1177 |     # 5. 
对因子大类进行提纯 1178 | 1179 | all_factor_type_after_purified, MSE_factor_type_after_purified = \ 1180 | purify_factor_type_in_factor_category(MES_factor_number_after_purified, factor_stratification_return, index_return_df, 1181 | sample_list, regression_model_list, rolling_window_list, get_factor_data_date_list, 1182 | factor_category_dict) 1183 | all_factor_type_after_purified = optimize_data_ram(all_factor_type_after_purified) 1184 | pickle_dump_data(all_factor_type_after_purified, output_file_url, data_name='all_factor_type_after_purified') 1185 | MSE_factor_type_after_purified = optimize_data_ram(MSE_factor_type_after_purified) 1186 | pickle_dump_data(MSE_factor_type_after_purified, output_file_url, data_name='MSE_factor_type_after_purified') 1187 | 1188 | # ----------------------------------------------------------显著因子挑选及所需存储数据(结束)------------------------------------------------------------ 1189 | 1190 | # ----------------------------------------------------------大类因子收益率(开始)----------------------------------------------------------------------- 1191 | 1192 | -------------------------------------------------------------------------------- /Factor_Test/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Dengyi-CN/Quantitative_FOF_Multi_Factors_Framework/65875c7fcf3b6c0a85f98d4e3fffb38f821c7d4a/Factor_Test/__init__.py -------------------------------------------------------------------------------- /Framework/Factor_Construction.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Dengyi-CN/Quantitative_FOF_Multi_Factors_Framework/65875c7fcf3b6c0a85f98d4e3fffb38f821c7d4a/Framework/Factor_Construction.py -------------------------------------------------------------------------------- /Framework/Factor_Model.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | pd.set_option('max_columns', 20) 3 | pd.set_option('display.width', 320) 4 | pd.set_option('display.unicode.east_asian_width', True) 5 | pd.set_option('display.unicode.ambiguous_as_wide', True) 6 | import numpy as np 7 | from Quantitative_FOF_Multi_Factors_Framework.DataBase import read_data_from_oracle_db 8 | from Quantitative_FOF_Multi_Factors_Framework.Framework.Tools.FactorInfoFunc import fill_factor_number_info 9 | from Quantitative_FOF_Multi_Factors_Framework.Framework.Tools.FactorSignalFunc import standardization 10 | from functools import reduce 11 | import pickle 12 | import statsmodels.api as sm 13 | import scipy.stats as stats 14 | import re 15 | import sys 16 | import math 17 | import cvxpy 18 | 19 | 20 | class BaseDataType(object): 21 | 22 | def __init__(self): 23 | pass 24 | 25 | # columns是数据属性(如sz000001、sz000002或factor1、factor2),index是时间序列 26 | pivot_data_structure = pd.DataFrame() 27 | # columns是数据标识(如股票代码、日期、因子值…),index是序号(无特殊形式) 28 | melt_data_structure = pd.DataFrame() 29 | 30 | 31 | class BaseData(object): 32 | """ 33 | 用于获取基础数据 34 | """ 35 | def __init__(self, begin_date, data_freq): 36 | self.begin_date = begin_date 37 | self.data_freq = data_freq 38 | # self.stock_return = pd.DataFrame() 39 | self.stock_active_return = pd.DataFrame() 40 | self.time_series_regression_result = pd.DataFrame() 41 | self.stock_status = pd.DataFrame() 42 | self.industry_classification = pd.DataFrame() 43 | self.floating_mv = pd.DataFrame() 44 | self.factor_raw_data = pd.DataFrame() 45 | self.date_list = None 46 | self.benchmark_weight = 
pd.DataFrame()
47 |         self.factor_stratificated_return = pd.DataFrame()
48 |         self.factor_return_regression = pd.DataFrame()
49 | 
50 |     def optimize_data_ram(self, data):
51 |         """
52 |         Downcast int, float and object columns to smaller dtypes (integer downcasts, float32, category) to save memory
53 |         :param data: a DataFrame, or a dict of DataFrames
54 |         :return: the same object with downcast columns
55 |         """
56 |         action_str = '优化变量存储结构'
57 |         print('\n' + '{0:*>{width}}'.format(action_str, width=40) + '{0:*>{width}}'.format('', width=40 - len(action_str)) + '\n')
58 | 
59 |         if isinstance(data, pd.DataFrame):
60 | 
61 |             print('\t传入数据:' + ','.join([str(dtype) + '(' + str(num) + '列)' for dtype, num in data.dtypes.value_counts().to_dict().items()]))  # dtypes.value_counts() replaces the deprecated get_dtype_counts()
62 |             before_memory = data.memory_usage(deep=True).sum() / 1024 ** 2
63 |             print('\t传入数据大小:' + "{:03.2f}MB".format(before_memory))
64 |             if before_memory <= 100:
65 |                 print('\t数据大小小于100M,暂时不进行优化')
66 |                 return data
67 |             print('\t正在优化数据结构及存储空间……')
68 | 
69 |             if not data.select_dtypes(include=['int']).empty:
70 |                 data[data.select_dtypes(include=['int']).columns] = \
71 |                     data.select_dtypes(include=['int']).apply(pd.to_numeric, downcast='integer')  # int8 at the smallest
72 | 
73 |             if not data.select_dtypes(include=['float']).empty:
74 |                 data[data.select_dtypes(include=['float']).columns] = \
75 |                     data.select_dtypes(include=['float']).apply(pd.to_numeric, downcast='float')  # float32 at the smallest
76 | 
77 |             if not data.select_dtypes(include=['object']).empty:
78 |                 for col in data.select_dtypes(include=['object']).columns:
79 |                     num_unique_values = len(data.select_dtypes(include=['object'])[col].unique())
80 |                     num_total_values = len(data.select_dtypes(include=['object'])[col])
81 |                     if num_unique_values / num_total_values < 0.5:  # only columns with many repeated values benefit from category
82 |                         data.loc[:, col] = data.select_dtypes(include=['object'])[col].astype('category')  # convert object to category
83 |             print('\t优化后数据:' + ','.join([str(dtype) + '(' + str(num) + '列)' for dtype, num in data.dtypes.value_counts().to_dict().items()]))
84 |             print('\t优化后数据大小:' + "{:03.2f}MB".format(data.memory_usage(deep=True).sum() / 1024 ** 2))
85 |             change_pct = before_memory / (data.memory_usage(deep=True).sum() / 1024 ** 2) - 1
86 |             print('\t数据存储优化幅度:' + format(change_pct, '.2%'))
87 | 
88 |         elif isinstance(data, dict):
89 |             type_count = {}
90 |             for key, df in data.items():
91 |                 for dtype, count in df.dtypes.value_counts().to_dict().items():
92 |                     if dtype in type_count.keys():
93 |                         type_count[dtype] += count
94 |                     else:
95 |                         type_count[dtype] = count
96 |             print('\t传入数据:' + ', '.join([str(dtype) + '(' + str(count) + ')' for dtype, count in type_count.items()]))
97 |             before_memory = sum(df.memory_usage(deep=True).sum() for df in data.values()) / 1024 ** 2  # sys.getsizeof(dict) only measures the dict shell, not the DataFrames inside
98 |             print('\t传入数据大小:' + "{:03.2f}MB".format(before_memory))
99 |             if before_memory <= 100:
100 |                 print('\t数据大小小于100M,暂时不进行优化')
101 |                 return data
102 | 
103 |             for key, df in data.items():
104 | 
105 |                 if not df.select_dtypes(include=['int']).empty:
106 |                     df[df.select_dtypes(include=['int']).columns] = \
107 |                         df.select_dtypes(include=['int']).apply(pd.to_numeric, downcast='integer')  # int8 at the smallest
108 | 
109 |                 if not df.select_dtypes(include=['float']).empty:
110 |                     df[df.select_dtypes(include=['float']).columns] = \
111 |                         df.select_dtypes(include=['float']).apply(pd.to_numeric, downcast='float')  # float32 at the smallest
112 | 
113 |                 if not df.select_dtypes(include=['object']).empty:
114 |                     for col in df.select_dtypes(include=['object']).columns:
115 |                         num_unique_values = len(df.select_dtypes(include=['object'])[col].unique())
116 |                         num_total_values = len(df.select_dtypes(include=['object'])[col])
117 |                         if num_unique_values / num_total_values < 0.5:  # only columns with many repeated values benefit from category
118 |                         df.loc[:, col] = df.select_dtypes(include=['object'])[col].astype('category')  # convert object to category
119 |                 data[key] = df
120 | 
121 |             type_count = {}
122 |             for key, df in data.items():
123 |                 for dtype, count in df.dtypes.value_counts().to_dict().items():
124 |                     if dtype in type_count.keys():
125 |                         type_count[dtype] += count
126 |                     else:
127 |                         type_count[dtype] = count
128 |             print('\t优化后数据:' + ', '.join([str(dtype) + '(' + str(count) + ')' for dtype, count in type_count.items()]))
129 |             after_memory = sum(df.memory_usage(deep=True).sum() for df in data.values()) / 1024 ** 2  # measured the same way as before_memory above
130 |             print('\t优化后数据大小:' + "{:03.2f}MB".format(after_memory))
131 |             change_pct = before_memory / after_memory - 1
132 |             print('\t数据存储优化幅度:' + format(change_pct, '.2%'))
133 | 
134 |         print('\n' + '{0:*>{width}}'.format(action_str, width=40) + '{0:*>{width}}'.format('', width=40 - len(action_str)) + '\n')
135 |         return data
136 | 
137 |     def get_data_from_local_database(self, data_sql=None, data_name=None, optimize_data=False):
138 |         data = read_data_from_oracle_db(sql=data_sql)
139 |         data.columns = [column.lower() for column in data.columns.tolist()]
140 |         if optimize_data:
141 |             data = self.optimize_data_ram(data)
142 | 
143 |         if data_name == 'get_data_date_list':
144 |             return data['get_data_date'].tolist()
145 |         elif (data_name == 'stock_return') | (data_name == 'stock_active_return'):
146 |             return data.pivot_table(values=data_name, index='get_data_date', columns='stock_id')
147 |         else:
148 |             return data
149 | 
150 |     def get_stock_return_by_freq(self, active_benchmark=None):
151 |         """
152 |         Compound daily stock returns into returns of the configured data frequency (self.data_freq)
153 |         :param active_benchmark: benchmark index name; if given, active (benchmark-relative) returns are used
154 |             (weekly compounding uses the week numbers from the trading_day_status calendar table)
155 |         :return: pivot DataFrame of returns, index get_data_date, columns stock_id
156 |         """
157 |         if active_benchmark:
158 |             stock_return_sql = '''
159 |                 select a.get_data_date, a.stock_id, a.stock_return, b.DAILY_RETURN, a.STOCK_RETURN - b.DAILY_RETURN as stock_active_return
160 |                 from lyzs_tinysoft.stock_return a
161 |                 LEFT JOIN LYZS_TINYSOFT.INDEX_RETURN b
162 |                 on a.GET_DATA_DATE = b.GET_DATA_DATE
163 |                 where b.STOCK_NAME = \'''' + active_benchmark + '''\' and a.GET_DATA_DATE >= \'''' + self.begin_date + '''\'
164 |                 ORDER BY a.GET_DATA_DATE, a.STOCK_ID
165 |             '''
166 |             stock_return = self.get_data_from_local_database(data_sql=stock_return_sql, data_name='stock_active_return').astype(np.float16)
167 |         else:
168 |             stock_return_sql = '''select get_data_date, stock_id, stock_return from lyzs_tinysoft.stock_return
169 |                                   where get_data_date >= \'''' + self.begin_date + '''\'
170 |                                '''
171 |             stock_return = self.get_data_from_local_database(data_sql=stock_return_sql, data_name='stock_return').astype(np.float16)
172 | 
173 |         if self.data_freq == '周度':
174 |             stock_return = stock_return.div(100)  # the stored daily returns appear to be in percent, so convert to decimals before compounding
175 | 
176 |         trading_day_status = self.get_data_from_local_database(
177 |             data_sql='''select * from lyzs_tinysoft.trading_day_status
178 |                         where trading_day >= \'''' + self.begin_date + '''\' order by trading_day
179 |                      ''')
180 |         trading_day_status = trading_day_status.rename(columns={'trading_day': 'get_data_date'})
181 | 
182 |         stock_list = stock_return.columns.tolist()
183 |         combined_stock_return = trading_day_status[['get_data_date', 'week_number']].merge(stock_return.reset_index(), on=['get_data_date'])
184 |         combined_stock_return = combined_stock_return.groupby(by=['week_number']).apply(
185 |             lambda df: df[stock_list].add(1).cumprod().iloc[-1, :].sub(1)).reset_index()
186 |         combined_stock_return['get_data_date'] = trading_day_status[trading_day_status['week_end_date'] == 1]['get_data_date'].values
187 | 
188 |         result_data = combined_stock_return[['get_data_date'] + stock_list].set_index('get_data_date')
189 | 
190 | 
return result_data 191 | 192 | def get_base_data(self, data_name=None, data_sql=None, active_benchmark=None): 193 | 194 | # if data_name == 'stock_return': 195 | # if self.stock_return.empty: 196 | # self.stock_return = self.get_stock_return_by_freq() 197 | if data_name == 'stock_active_return': 198 | if self.stock_active_return.empty: 199 | self.stock_active_return = self.get_stock_return_by_freq(active_benchmark) 200 | elif data_name == 'stock_status': 201 | if self.stock_status.empty: 202 | self.stock_status = self.get_data_from_local_database( 203 | data_sql='select * from lyzs_tinysoft.stock_status where get_data_date >= \'' + self.begin_date + '\'' 204 | ) 205 | elif data_name == 'industry_classification': 206 | if self.stock_status.empty: 207 | self.stock_status = self.get_data_from_local_database( 208 | data_sql='select * from lyzs_tinysoft.stock_status where get_data_date >= \'' + self.begin_date + '\'' 209 | ) 210 | # self.industry_classification = self.stock_status.pivot(values='sw_sector_id', index='get_data_date', columns='stock_id') 211 | self.industry_classification = self.stock_status[['get_data_date', 'stock_id', 'sw_sector_id']] 212 | elif data_name == 'floating_mv': 213 | if self.stock_status.empty: 214 | self.stock_status = self.get_data_from_local_database( 215 | data_sql=''' 216 | select get_data_date, stock_id, floating_mv from lyzs_tinysoft.stock_status 217 | where get_data_date >= \'''' + self.begin_date + '''\' 218 | ''') 219 | self.floating_mv = self.stock_status.pivot_table(values='floating_mv', index='get_data_date', columns='stock_id') 220 | 221 | elif data_name == 'time_series_regression_result': 222 | if self.time_series_regression_result.empty: 223 | self.time_series_regression_result = self.get_data_from_local_database( 224 | data_sql='select * from lyzs_tinysoft.factor_return_regression where get_data_date >= \'' + self.begin_date + '\'', 225 | optimize_data=True 226 | ) 227 | 228 | elif data_name == 'factor_raw_data': 229 | if self.factor_raw_data.empty: 230 | if not data_sql: 231 | data_sql = 'select * from lyzs_tinysoft.factor_raw_data where get_data_date >= \'' + self.begin_date + '\'' 232 | self.factor_raw_data = self.get_data_from_local_database(data_sql=data_sql, optimize_data=True) 233 | 234 | elif data_name == 'date_list': 235 | if not self.date_list: 236 | self.date_list = self.get_data_from_local_database( 237 | ''' 238 | select get_data_date from lyzs_tinysoft.get_data_date_library 239 | where get_data_freq = \'''' + self.data_freq + '''\' and get_data_date >= \'''' + self.begin_date + '''\' 240 | ''' 241 | )['get_data_date'].tolist() 242 | elif data_name == 'benchmark_weight': 243 | if self.benchmark_weight.empty: 244 | self.benchmark_weight = self.get_data_from_local_database( 245 | data_sql=''' 246 | select get_data_date, stock_id, weight from lyzs_tinysoft.index_weight 247 | where index_name = \'''' + active_benchmark + '''\' and get_data_date >= \'''' + self.begin_date + '''\' 248 | ''' 249 | ) 250 | 251 | elif data_name == 'all_stocks_list': 252 | if self.stock_status.empty: 253 | self.stock_status = self.get_data_from_local_database( 254 | data_sql=''' 255 | select get_data_date, stock_id, floating_mv from lyzs_tinysoft.stock_status 256 | where get_data_date >= \'''' + self.begin_date + '''\' 257 | ''') 258 | self.all_stocks_list = self.stock_status['stock_id'].unique().tolist() 259 | 260 | elif data_name == 'factor_stratificated_return': 261 | if self.factor_stratificated_return.empty: 262 | self.factor_stratificated_return = 
self.get_data_from_local_database( 263 | data_sql=''' 264 | select get_data_date, factor_number, type_name, return from lyzs_tinysoft.factor_stratificated_return 265 | ''' 266 | ) 267 | elif data_name == 'factor_return_regression': 268 | if self.factor_return_regression.empty: 269 | self.factor_return_regression = self.get_data_from_local_database( 270 | data_sql=''' 271 | select * from lyzs_tinysoft.factor_return_regression 272 | ''' 273 | ) 274 | 275 | 276 | class Model(object): 277 | 278 | def __init__(self): 279 | pass 280 | 281 | def cross_section_data_regression(self, left_var_data=None, factor_melt_data=None, industry_classification_melt_data=None, 282 | floating_mv_pivot_data=None, used_industry_name='sw_sector_id'): 283 | 284 | # left_var_data是等式左边的变量,为pivot类型 285 | # factor_melt_data为melt类型 286 | 287 | # 先获取必要数据 288 | stock_active_return = left_var_data 289 | get_data_date_list = list(set(stock_active_return.index).intersection(set(factor_melt_data['get_data_date']))) 290 | get_data_date_list.sort() 291 | 292 | factor_list = factor_melt_data['factor_number'].unique().tolist() 293 | all_stock_list = factor_melt_data['stock_id'].unique().tolist() 294 | # 下面的变量用于存储所有的回归结果 295 | all_industry_list = industry_classification_melt_data[used_industry_name].dropna().unique().tolist() 296 | right_var_list = factor_list + all_industry_list 297 | 298 | # 用于存储因子收益相关的结果 299 | regression_result = {i: pd.DataFrame(index=get_data_date_list, columns=right_var_list) for i in ['coef', 'coef_t_value', 'coef_p_value']} 300 | regression_result['r-suqared'] = pd.DataFrame(index=get_data_date_list, columns=['r-squared', 'Adj-rsquared']) 301 | regression_residual = pd.DataFrame(index=get_data_date_list, columns=all_stock_list) 302 | 303 | factor_exposure = {} 304 | 305 | # 每个截面分别进行回归 306 | for j, begin_date in enumerate(get_data_date_list[:-1], 1): 307 | 308 | begin_date_factor_data = factor_melt_data[factor_melt_data['get_data_date'] == begin_date].pivot_table( 309 | values='raw_value', index='stock_id', columns='factor_number') 310 | begin_date_industry_data = industry_classification_melt_data[industry_classification_melt_data['get_data_date'] == begin_date] 311 | # 得到本次回归所需要的期初、期末日期 312 | end_date = get_data_date_list[j] 313 | 314 | # 得到本次截面下的所有股票、行业 315 | begin_date_stock_list = begin_date_factor_data.index.tolist() 316 | begin_date_industry_list = begin_date_industry_data[used_industry_name].dropna().unique().tolist() 317 | begin_date_right_var_list = factor_list + begin_date_industry_list 318 | 319 | # 初始化回归数据 320 | regression_raw_data = pd.DataFrame(index=begin_date_stock_list, columns=begin_date_right_var_list) 321 | 322 | regression_raw_data['stock_active_return'] = stock_active_return.loc[end_date, begin_date_stock_list] 323 | regression_raw_data[factor_list] = begin_date_factor_data.loc[begin_date_stock_list, factor_list] 324 | # regression_raw_data['market_factor'] = 1 325 | regression_raw_data[begin_date_industry_list] = pd.get_dummies(begin_date_industry_data.set_index('stock_id')[used_industry_name]).loc[ 326 | begin_date_stock_list, begin_date_industry_list] 327 | 328 | # 丢掉nan数据 329 | regression_data = regression_raw_data.dropna(how='any') 330 | 331 | # 得到wls所需要的市值权重 332 | wls_weight = 1 / floating_mv_pivot_data.loc[begin_date, regression_data.index] 333 | 334 | # 因子数据标准化 335 | regression_data[factor_list] = regression_raw_data[factor_list].apply( 336 | lambda f: standardization(regression_raw_data, f.name, False)[f.name], axis=0) 337 | factor_exposure[begin_date] = 
regression_raw_data[begin_date_right_var_list] 338 | 339 | regression_results = sm.WLS(regression_data['stock_active_return'], 340 | regression_data[begin_date_right_var_list].astype(float), 341 | weights=wls_weight).fit() 342 | 343 | regression_result['coef'].loc[end_date] = regression_results.params 344 | regression_result['coef_t_value'].loc[end_date] = regression_results.tvalues 345 | regression_result['coef_p_value'].loc[end_date] = regression_results.pvalues 346 | regression_result['r-suqared'].loc[end_date] = [regression_results.rsquared, regression_results.rsquared_adj] 347 | 348 | regression_residual.loc[begin_date] = regression_results.resid 349 | 350 | print('Factors Model('+ str(len(factor_list)) + ' factors) -> linear regression ->', end_date) 351 | 352 | return regression_result, factor_exposure, regression_residual 353 | 354 | def time_series_data_regression(self): 355 | pass 356 | 357 | 358 | class Factor(object): 359 | # 因子就像是坐标系,是为了找到恒星的地图 360 | 361 | def __init__(self): 362 | self.__factor_number = None # 类内private属性,类外最好不要访问,若要取值,请用get_factor_number方法 363 | self.factor_raw_value = None 364 | self.date = None 365 | 366 | def set_factor_number(self, factor_number): 367 | 368 | if not isinstance(factor_number, str): 369 | raise TypeError('\tfactor_number必须是字符串类型') 370 | if len(re.split(r'(factor)(\d+)', factor_number)) not in [2, 3, 4]: # 将字符串分成s1,factor,XXX,s4四个部分,如果没有匹配到factorXXX,则只有1个部分 371 | raise ValueError('\tfactor_number必须是factorXXX的格式,XXX为序号,如123') 372 | # sql = ''' 373 | # select factor_number 374 | # from lyzs_tinysoft.factor_library 375 | # where factor_number = '%s' 376 | # ''' 377 | # if read_data_from_oracle_db(sql % factor_number).empty: 378 | # raise ValueError('\t' + factor_number + '不在数据库\"lyzs_tinysoft\"的数据表\"factor_library\"中,请先添加相应的基础数据') 379 | 380 | self.__factor_number = factor_number 381 | # print('创建了Factor的factor_number') 382 | 383 | def get_factor_number(self): 384 | return self.__factor_number 385 | 386 | def set_factor_raw_value(self, factor_raw_value): 387 | self.factor_raw_value = factor_raw_value 388 | 389 | def get_factor_raw_value(self, begin_and_end_date=('', '')): 390 | 391 | if not self.factor_raw_value: 392 | if begin_and_end_date[0] != '': # 有指定日期范围的情况下 393 | sql = ''' 394 | select stock_id, get_data_date, factor_raw_value 395 | from lyzs_tinysoft.factor_raw_data 396 | where factor_number = '%s' 397 | and get_data_date between '%s' and '%s' 398 | order by get_data_date, stock_id 399 | ''' 400 | self.factor_raw_value = read_data_from_oracle_db(sql % (self.__factor_number, begin_and_end_date[0], begin_and_end_date[1])) 401 | else: 402 | sql = ''' 403 | select stock_id, get_data_date, factor_raw_value 404 | from lyzs_tinysoft.factor_raw_data 405 | where factor_number = '%s' 406 | order by get_data_date, stock_id 407 | ''' 408 | self.factor_raw_value = read_data_from_oracle_db(sql % self.__factor_number) 409 | 410 | return self.factor_raw_value 411 | 412 | 413 | class FactorSignal(object): 414 | # 用于因子信号生成 415 | 416 | def __init__(self, data_freq='周度'): 417 | self.data_freq = data_freq 418 | 419 | def stratify_ts_data_regression(self): 420 | # 用于分档形成时间序列数据进行回归 421 | 422 | # 1. 
分档收益率时间序列数据、回归要用的时间序列数据 423 | 424 | # 1.1 分档收益率时间序列数据 425 | # 每个时点:所选股票池内,根据因子从高到低排序,将股票分成若干个组合,每个组合的平均收益即为一条单位数据 426 | 427 | pass 428 | 429 | 430 | class CrossSectionDataSignal(FactorSignal): 431 | 432 | def __init__(self, base_data): 433 | super(CrossSectionDataSignal, self).__init__() 434 | self.base_data = base_data 435 | self.stock_active_return = base_data.stock_active_return 436 | self.industry_classification = base_data.industry_classification 437 | self.floating_mv = base_data.floating_mv 438 | 439 | def regression(self, factor_data=None, used_industry_name='sw_sector_id', factor_number=None): 440 | """ 441 | 函数用于每个截面数据进行回归 442 | :param factor_data: 443 | :return: 444 | """ 445 | # 将self数据先弄出来,防止被修改,且方便调试 446 | data_freq = self.data_freq 447 | 448 | # 先获取必要数据 449 | stock_active_return = self.stock_active_return 450 | 451 | get_data_date_list = list(set(stock_active_return.index).intersection(set(factor_data.index))) 452 | get_data_date_list.sort() 453 | 454 | industry_classification = self.industry_classification 455 | floating_mv = self.floating_mv 456 | # factor_return只用于存储因子收益相关的结果 457 | factor_return = pd.DataFrame(index=get_data_date_list, columns=['factor_return', 't_value', 'p_value', 'adj_rsquared']) 458 | 459 | # 下面的变量用于存储所有的回归结果 460 | all_industry_list = industry_classification[used_industry_name].dropna().unique().tolist() 461 | all_regression_result = {} 462 | all_regression_coef = pd.DataFrame(index=get_data_date_list, columns=['factor', 'market_factor'] + all_industry_list) 463 | all_regression_t_value = pd.DataFrame(index=get_data_date_list, columns=['factor', 'market_factor'] + all_industry_list) 464 | all_regression_p_value = pd.DataFrame(index=get_data_date_list, columns=['factor', 'market_factor'] + all_industry_list) 465 | all_regression_adj_rsquared = pd.Series(index=get_data_date_list) 466 | 467 | # 每个截面分别进行回归 468 | for j, begin_date in enumerate(get_data_date_list[:-1], 1): 469 | 470 | # 得到本次回归所需要的期初、期末日期 471 | end_date = get_data_date_list[j] 472 | # 得到本次截面下的所有股票、行业 473 | temp_stock_list = factor_data.loc[begin_date].dropna().index.tolist() 474 | temp_industry_data = industry_classification[industry_classification['get_data_date'] == begin_date] 475 | industry_dummy_factor = pd.get_dummies(temp_industry_data.set_index('stock_id')[used_industry_name]) 476 | industry_list = industry_dummy_factor.columns.tolist() 477 | 478 | regression_raw_data = pd.DataFrame( 479 | index=temp_stock_list, 480 | columns=['stock_active_return'] + ['factor'] + ['market_factor'] + industry_list 481 | ) 482 | 483 | regression_raw_data['stock_active_return'] = stock_active_return.loc[end_date, temp_stock_list].copy() 484 | regression_raw_data['factor'] = factor_data.loc[begin_date, temp_stock_list].copy() 485 | regression_raw_data['market_factor'] = 1 486 | regression_raw_data[industry_list] = industry_dummy_factor 487 | 488 | # 丢掉nan数据 489 | regression_data = regression_raw_data.dropna(how='any') 490 | 491 | # 得到wls所需要的市值权重 492 | wls_weight = 1 / floating_mv.loc[begin_date, regression_data.index] 493 | 494 | # 因子数据标准化 495 | regression_data['factor'] = standardization(regression_data, 'factor', False)['factor'].copy() 496 | 497 | # 做单因子回归 498 | single_factor_regression_results = sm.WLS(regression_data['stock_active_return'], 499 | regression_data[['factor', 'market_factor'] + industry_list].astype(float), 500 | weights=wls_weight).fit() 501 | 502 | factor_return.loc[end_date, 'factor_return'] = single_factor_regression_results.params['factor'] 503 | factor_return.loc[end_date, 
't_value'] = single_factor_regression_results.tvalues['factor'] 504 | factor_return.loc[end_date, 'p_value'] = single_factor_regression_results.pvalues['factor'] 505 | factor_return.loc[end_date, 'adj_rsquared'] = single_factor_regression_results.rsquared_adj 506 | 507 | all_regression_coef.loc[end_date] = single_factor_regression_results.params 508 | all_regression_t_value.loc[end_date] = single_factor_regression_results.tvalues 509 | all_regression_p_value.loc[end_date] = single_factor_regression_results.pvalues 510 | all_regression_adj_rsquared.loc[end_date] = single_factor_regression_results.rsquared_adj 511 | 512 | print(factor_number + ' -> CrossSectionDataSignal -> regression ->', end_date) 513 | 514 | all_regression_result['coef'] = all_regression_coef 515 | all_regression_result['t_value'] = all_regression_t_value 516 | all_regression_result['p_value'] = all_regression_p_value 517 | all_regression_result['adj_rsquared'] = all_regression_adj_rsquared 518 | 519 | return factor_return, all_regression_result 520 | 521 | def correlation(self, factor_data=None, return_lag_number=1): 522 | """ 523 | 用于每个截面数据进行排序相关 524 | :param factor_data: 525 | :return: 526 | """ 527 | 528 | # 将self数据先弄出来,防止被修改,且方便调试 529 | data_freq = self.data_freq 530 | # 先获取必要数据 531 | if self.stock_active_return.empty: 532 | self.base_data.get_base_data('stock_active_return') 533 | stock_active_return = self.stock_active_return 534 | 535 | get_data_date_list = list(set(stock_active_return.index).intersection(set(factor_data.index))) 536 | get_data_date_list.sort() 537 | 538 | if self.industry_classification.empty: 539 | self.base_data.get_base_data('industry_classification') 540 | industry_classification = self.industry_classification 541 | 542 | if self.floating_mv.empty: 543 | self.base_data.get_base_data('floating_mv') 544 | floating_mv = self.floating_mv 545 | 546 | factor_ic = pd.DataFrame(index=get_data_date_list, columns=['spearman_correlation', 'p_value']) 547 | 548 | for j, begin_date in enumerate(get_data_date_list[:-return_lag_number], 1): 549 | 550 | # 得到本次回归所需要的期初、期末日期 551 | end_date = get_data_date_list[j] 552 | return_end_date = get_data_date_list[j + return_lag_number - 1] 553 | 554 | 555 | # 得到本次截面下的所有股票 556 | stock_list = factor_data.loc[begin_date].dropna().index.tolist() 557 | 558 | correlation_data = pd.DataFrame( 559 | index=stock_list, 560 | columns=['stock_active_return'] + ['factor'] 561 | ) 562 | 563 | # 收益率为持仓期收益率 564 | correlation_data['stock_active_return'] = \ 565 | stock_active_return.loc[end_date:return_end_date, stock_list].add(1).cumprod().loc[return_end_date].subtract(1).copy() 566 | correlation_data['factor'] = factor_data.loc[begin_date, stock_list].copy() 567 | 568 | # 丢掉nan数据 569 | correlation_data = correlation_data.dropna(how='any') 570 | 571 | # 做单因子回归 572 | factor_ic.loc[return_end_date, 'spearman_correlation'] = stats.spearmanr(correlation_data['stock_active_return'], correlation_data['factor'])[0] 573 | factor_ic.loc[return_end_date, 'p_value'] = stats.spearmanr(correlation_data['stock_active_return'], correlation_data['factor'])[1] 574 | 575 | print('CrossSectionDataSignal -> correlation (t+' + str(return_lag_number) + ') ->', return_end_date) 576 | 577 | return factor_ic 578 | 579 | def ic_time_attenuate(self, factor_data=None, max_lag_number=6): 580 | 581 | factor_ic = pd.DataFrame(columns=['ic_' + str(i) for i in range(1, max_lag_number + 1)]) 582 | factor_ic_p_value = pd.DataFrame(columns=['ic_' + str(i) for i in range(1, max_lag_number + 1)]) 583 | 584 | for i in 
range(1, max_lag_number + 1): 585 | ic_result = self.correlation(factor_data=factor_data, return_lag_number=i) 586 | factor_ic['ic_' + str(i)] = ic_result['spearman_correlation'] 587 | factor_ic_p_value['ic_' + str(i)] = ic_result['p_value'] 588 | 589 | return factor_ic, factor_ic_p_value 590 | 591 | 592 | class BackTest(Model): 593 | 594 | def __init__(self, base_data): 595 | super(BackTest, self).__init__() 596 | self.date_list = base_data.date_list 597 | self.stock_status = base_data.stock_status 598 | self.all_stocks_list = base_data.all_stocks_list 599 | 600 | def forecast_factor_return(self, factor_return_data): 601 | current_data_length = factor_return_data.shape[0] 602 | cos_value = [math.cos(x) for x in np.linspace(-math.pi / 2, 0, current_data_length)] 603 | weight_list = [x / sum(cos_value) for x in cos_value] 604 | forecasted_factor_return = factor_return_data.mul(weight_list, axis=0).cumsum().iloc[-1, :].dropna() 605 | return forecasted_factor_return 606 | 607 | def forecast_factor_residual_risk(self): 608 | pass 609 | 610 | def get_prior_return_and_risk(self, post_factor_return=None, factor_exposure_data=None, post_stock_residual_return=None, 611 | date=None, all_factors=None, feasible_stocks_list=None): 612 | 613 | # 3.2 t+1期因子预测收益率、股票残差收益率 614 | forecasted_factor_return = self.forecast_factor_return(post_factor_return.loc[:date, all_factors].dropna(how='all')) 615 | forecasted_stock_beta_return = factor_exposure_data.mul(forecasted_factor_return).sum(axis=1) 616 | forecasted_stock_residual_return = self.forecast_factor_return(post_stock_residual_return.loc[:date, feasible_stocks_list]) 617 | forecasted_stock_total_return = forecasted_stock_beta_return + forecasted_stock_residual_return 618 | 619 | # 3.3 t+1期因子预测共同风险矩阵 620 | factors_return_cov = np.asmatrix(post_factor_return.loc[:date, all_factors].dropna(how='all').astype(float).cov()) 621 | factor_exposure_matrix = np.asmatrix(factor_exposure_data) 622 | forecasted_common_risk = factor_exposure_matrix.dot(factors_return_cov).dot(factor_exposure_matrix.T) 623 | 624 | # 3.4 t+1期股票残差收益率 -> t+1期异质风险矩阵 625 | stock_residual_return_matrix = post_stock_residual_return.loc[:date, feasible_stocks_list].copy() 626 | stock_residual_return_matrix.loc['预测', feasible_stocks_list] = forecasted_stock_residual_return 627 | forecasted_idiosyncratic_risk = np.diag(stock_residual_return_matrix.astype(float).var()) 628 | 629 | # 3.5 加总风险矩阵,注意是var不是std 630 | forecasted_risk = forecasted_common_risk + forecasted_idiosyncratic_risk 631 | return forecasted_stock_total_return, forecasted_risk 632 | 633 | def get_historical_post_return_and_risk(self, date_list=None, factor_exposure_data=None, factor_return_data=None, 634 | stock_residual_return_data=None): 635 | 636 | post_return = {} 637 | post_risk = {} 638 | 639 | for j, date in enumerate(date_list[:-1]): 640 | post_date = date_list[j + 1] 641 | 642 | date_factors = factor_exposure_data[date].columns.tolist() 643 | date_stocks = factor_exposure_data[date].index.tolist() 644 | 645 | # return 646 | post_factor_return = factor_return_data.loc[post_date] 647 | stock_beta_return = factor_exposure_data[date].mul(post_factor_return.loc[date_factors], axis=1).sum(axis=1) 648 | stock_residual_return = stock_residual_return_data.loc[post_date, date_stocks] 649 | stock_post_return = stock_beta_return + stock_residual_return 650 | 651 | # risk 652 | post_factor_return_cov = factor_return_data.loc[:post_date, date_factors].astype(float).cov() 653 | post_factor_return_cov_matrix = 
np.asmatrix(post_factor_return_cov) 654 | factor_exposure_matrix = np.asmatrix(factor_exposure_data[date]) 655 | common_risk_matrix = factor_exposure_matrix.dot(post_factor_return_cov_matrix).dot(factor_exposure_matrix.T) 656 | idiosyncratic_risk = stock_residual_return_data.loc[:post_date, date_stocks].astype(float).var() 657 | idiosyncratic_risk_matrix = np.asmatrix(np.diag(idiosyncratic_risk)) 658 | risk_matrix = common_risk_matrix + idiosyncratic_risk_matrix 659 | 660 | post_return[date] = stock_post_return 661 | post_risk[date] = risk_matrix 662 | print('BackTest -> get post return and risk -> ' + date) 663 | 664 | return post_return, post_risk 665 | 666 | def optimization(self, forecasted_stock_return=None, forecasted_risk=None, date_factor_exposure_data=None, benchmark_weight_data=None, 667 | stock_industry_classification=None, date=None, industries=None, stocks_list=None, current_position=None): 668 | 669 | # stocks_list是本次优化可以交易的股票池,current_position是本次可以交易的且有持仓的信息 670 | stocks_count = len(stocks_list) 671 | x = cvxpy.Variable(shape=(stocks_count, 1)) 672 | if current_position is not None: 673 | x_0 = current_position.values 674 | x_s = cvxpy.Variable(shape=(stocks_count, 1)) 675 | else: 676 | x_0 = np.zeros(stocks_count) 677 | x_s = np.zeros(stocks_count) 678 | x_s.shape = (stocks_count, 1) 679 | 680 | x_0.shape = (stocks_count, 1) 681 | x_b = cvxpy.Variable(shape=(stocks_count, 1)) 682 | 683 | miu = 0.5 684 | q = forecasted_stock_return.values.copy() 685 | q.shape = (stocks_count, 1) 686 | portfolio_return = q.T * x 687 | risk_punishment = cvxpy.quad_form(x, miu * forecasted_risk) 688 | 689 | buy_cost_series = np.array([0.0003] * stocks_count) 690 | buy_cost_series.shape = (stocks_count, 1) 691 | buy_cost = buy_cost_series.T * x_b 692 | sale_cost_series = np.array([0.0013] * stocks_count) 693 | sale_cost_series.shape = (stocks_count, 1) 694 | if current_position is not None: 695 | sale_cost = sale_cost_series.T * x_s 696 | else: 697 | sale_cost = sale_cost_series.T.dot(x_s) 698 | 699 | # 等式约束 700 | eq = [] 701 | 702 | # a. 行业中性 703 | 704 | A = np.asmatrix(date_factor_exposure_data[industries].T.astype(float).values) 705 | date_benchmark_stock_weight = benchmark_weight_data[benchmark_weight_data['get_data_date'] == date].dropna() 706 | date_benchmark_stock_industry = stock_industry_classification[stock_industry_classification['get_data_date'] == date] 707 | industry_weights = date_benchmark_stock_weight.merge(date_benchmark_stock_industry, on=['stock_id'])[['sw_sector_id', 'weight']] 708 | industry_weights_sum = industry_weights.groupby(by=['sw_sector_id']).sum().loc[industries] 709 | 710 | b = industry_weights_sum.values 711 | # st_1_right_matrix.shape = (len(industries), 1) 712 | 713 | eq += [A * x == b] 714 | 715 | # b. 仓位约束 716 | 717 | A1 = np.ones(stocks_count) 718 | A1.shape = (stocks_count, 1) 719 | b1 = np.array([1]) 720 | b1.shape = (1, 1) 721 | eq += [A1.T * x == b1] 722 | 723 | # c. 多空交易成本约束 724 | 725 | eq += [x - x_0 == x_b - x_s] 726 | 727 | # 不等式约束 728 | ineq = [] 729 | 730 | # a. 个股权重上下限 731 | G = np.diag(np.ones(stocks_count)).astype(float) 732 | 733 | # 注意,很可能由于上下限约束太强,导致无解 734 | cap_bound = np.array([0.01] * stocks_count) 735 | cap_bound.shape = (stocks_count, 1) 736 | floor_bound = np.array([0] * stocks_count) 737 | floor_bound.shape = (stocks_count, 1) 738 | ineq += [G * x >= floor_bound, G * x <= cap_bound] 739 | 740 | # b. 
多空交易仓位
741 |         if current_position is not None:
742 |             ineq += [x_b >= 0, x_s >= 0]
743 | 
744 |         optimums_results = cvxpy.Problem(cvxpy.Maximize(portfolio_return - risk_punishment - buy_cost - sale_cost), eq + ineq)
745 |         optimums_results.solve(solver=cvxpy.SCS, verbose=True)
746 |         if current_position is not None:
747 |             cost = pd.DataFrame(buy_cost.value).iloc[0, 0] + pd.DataFrame(sale_cost.value).iloc[0, 0]
748 |         else:
749 |             cost = pd.DataFrame(buy_cost.value).iloc[0, 0]
750 | 
751 |         return optimums_results, x, cost
752 | 
753 |     def back_test(self, factors=None, factor_exposure_data=None, factor_return_data=None, stock_residual_return=None, benchmark_weight_data=None,
754 |                   stock_industry_classification=None):
755 | 
756 |         back_test_date_list = pd.Series(self.date_list)[pd.Series(self.date_list) >= '2010-01-01'].tolist()  # self, not the module-level back_test instance
757 |         back_test_portfolio_info = pd.DataFrame(index=back_test_date_list,
758 |                                                 columns=['solver_status', 'return', 'risk', 'total_cost'])
759 |         stocks_list_info = pd.DataFrame(index=back_test_date_list, columns=['feasible'])
760 |         back_test_stocks_weight = pd.DataFrame(index=back_test_date_list, columns=self.all_stocks_list)
761 |         holding_position_info = {}
762 |         trading_detail = pd.DataFrame()
763 | 
764 |         for j, date in enumerate(back_test_date_list):
765 | 
766 |             # Run a portfolio optimization on every date to get the optimal weights
767 | 
768 |             # 1. Prepare the base data
769 |             all_factors = factor_exposure_data[date].columns.tolist()
770 |             stocks = factor_exposure_data[date].index.tolist()
771 |             industries = list(set(all_factors).difference(set(factors)))
772 | 
773 |             # 2. Determine the tradable stock universe at the current date
774 |             # 2.1 stocks that are not ST/PT, not suspended, and listed for more than one year
775 |             stocks_list_1 = self.stock_status[(self.stock_status['get_data_date'] == date) &
776 |                                               (self.stock_status['is_st'] == 0) &
777 |                                               (self.stock_status['is_pt'] == 0) &
778 |                                               (self.stock_status['is_suspended'] == 0) &
779 |                                               (self.stock_status['days_from_public_date'] >= 365)]['stock_id'].tolist()
780 | 
781 |             # 2.2 stocks with enough idiosyncratic-risk data
782 |             # Factor and residual returns come from per-date cross-section regressions, so some stocks in a given cross-section have badly incomplete residual-return time series
783 |             residual_enough_stocks = stock_residual_return.loc[:date, stocks].apply(
784 |                 lambda s: s.dropna().shape[0] >= 52, axis=0).replace(False, np.nan).dropna().index.tolist()
785 | 
786 |             feasible_stocks_list = list(set(stocks_list_1).intersection(set(residual_enough_stocks)))
787 |             feasible_stocks_list.sort()
788 |             stocks_list_info.at[date, 'feasible'] = feasible_stocks_list  # .at can store a list in a single cell
789 | 
790 |             # 2.3 Determine the stocks that are currently held and can take part in the optimization
791 |             if j == 0:
792 |                 current_position = None
793 |             else:
794 |                 current_position = holding_position_info[back_test_date_list[j - 1]].loc[feasible_stocks_list].fillna(0).copy()
795 | 
796 |             # 3. For each stock on this date: factor exposures; forecast factor returns; forecast common-risk matrix; forecast idiosyncratic-risk matrix
797 |             # 3.1 factor exposures at period t
798 |             date_factor_exposure_data = factor_exposure_data[date].loc[feasible_stocks_list, all_factors]
799 | 
800 |             prior_return, prior_risk = self.get_prior_return_and_risk(post_factor_return=factor_return_data,
801 |                                                                       factor_exposure_data=date_factor_exposure_data,
802 |                                                                       post_stock_residual_return=stock_residual_return,
803 |                                                                       date=date, all_factors=all_factors,
804 |                                                                       feasible_stocks_list=feasible_stocks_list)
805 | 
806 |             # ***************************************************Portfolio optimization -> optimal weights***************************************************
807 |             optimums_results, x, trading_cost = self.optimization(
808 |                 forecasted_stock_return=prior_return, forecasted_risk=prior_risk,
809 |                 date_factor_exposure_data=date_factor_exposure_data, benchmark_weight_data=benchmark_weight_data,
810 |                 stock_industry_classification=stock_industry_classification, date=date, industries=industries,
811 |                 stocks_list=feasible_stocks_list, current_position=current_position)
812 | 
813 |             # ***************************************************Portfolio optimization -> optimal weights***************************************************
814 |             # Record the optimization results
815 |             back_test_portfolio_info.loc[date, 'solver_status'] = optimums_results.status
816 |             back_test_portfolio_info.loc[date, 'return'] = pd.DataFrame(prior_return.values.dot(x.value)).iloc[0, 0]
817 |             back_test_portfolio_info.loc[date, 'risk'] = math.sqrt(x.value.T.dot(prior_risk).dot(x.value))
818 |             back_test_stocks_weight.loc[date, feasible_stocks_list] = pd.DataFrame(x.value).iloc[:, 0].apply(
819 |                 lambda w: 0 if np.abs(w) < 0.00001 else w).values
820 |             holding_position_info[date] = back_test_stocks_weight.loc[date].replace(0, np.nan).dropna()
821 | 
822 |             # Record the trading details
823 |             if j == 0:
824 |                 current_portfolio_weight = pd.Series(0, index=feasible_stocks_list)
825 |                 rebalance_stocks_list = feasible_stocks_list.copy()
826 |             else:
827 |                 current_portfolio_weight = back_test_stocks_weight.loc[
828 |                     back_test_date_list[j - 1], stocks_list_info.loc[back_test_date_list[j - 1], 'feasible']]
829 |                 rebalance_stocks_list = list(set(stocks_list_info.loc[back_test_date_list[j - 1], 'feasible']).intersection(
830 |                     set(feasible_stocks_list)))
831 | 
832 |             target_portfolio_weight = back_test_stocks_weight.loc[date, feasible_stocks_list]
833 |             rebalance_stocks_list.sort()
834 | 
835 |             weight_change_info = target_portfolio_weight.loc[rebalance_stocks_list] - current_portfolio_weight.loc[rebalance_stocks_list]
836 | 
837 |             buy_stocks_weight = weight_change_info[weight_change_info > 0].apply(lambda w: np.nan if np.abs(w) < 0.00001 else w).dropna()
838 |             buy_info = pd.DataFrame(index=buy_stocks_weight.index, columns=['former_weight', 'new_weight'])
839 |             buy_info['former_weight'] = current_portfolio_weight.loc[buy_stocks_weight.index]
840 |             buy_info['new_weight'] = target_portfolio_weight.loc[buy_stocks_weight.index]
841 |             buy_info['increment'] = buy_info['new_weight'] - buy_info['former_weight']
842 |             buy_info['commissions'] = buy_info['increment'] * 0.0003  # 3 bp buy commission, matching buy_cost_series above
843 |             buy_info['position'] = 'buy'
844 |             buy_info['date'] = date
845 |             buy_info = buy_info.reset_index().rename(columns={'index': 'stock_id'})
846 | 
847 |             sell_stocks_weight = weight_change_info[weight_change_info < 0].apply(lambda w: np.nan if np.abs(w) < 0.00001 else w).dropna()
848 |             sell_info = pd.DataFrame(index=sell_stocks_weight.index, columns=['former_weight', 'new_weight'])
849 |             sell_info['former_weight'] = current_portfolio_weight.loc[sell_stocks_weight.index]
851 |             sell_info['increment'] = sell_info['new_weight'] - sell_info['former_weight']
852 |             sell_info['commissions'] = sell_info['increment'].abs() * 0.0013
853 |             sell_info['position'] = 'sell'
854 |             sell_info['date'] = date
855 |             sell_info = sell_info.reset_index().rename(columns={'index': 'stock_id'})
856 | 
857 |             temp_trading_detail = pd.concat([buy_info, sell_info]).reset_index(drop=True)[
858 |                 ['date', 'stock_id', 'former_weight', 'new_weight', 'increment', 'commissions']]
859 |             back_test_portfolio_info.loc[date, 'total_cost'] = temp_trading_detail['commissions'].sum()
860 |             trading_detail = pd.concat([trading_detail, temp_trading_detail]).reset_index(drop=True)
861 | 
862 |             print('******************************************************************')
863 |             print('*\t\tBack Testing -> ' + date)
864 |             print('******************************************************************')
865 | 
866 |         return back_test_portfolio_info, stocks_list_info, back_test_stocks_weight, holding_position_info, trading_detail
867 | 
868 | 
869 | if __name__ == '__main__':
870 | 
871 |     base_data = BaseData(begin_date='2013-01-01', data_freq='周度')
872 | 
873 |     # Base data:
874 | 
875 |     base_data.get_base_data(data_name='stock_active_return', active_benchmark='中证800')
876 |     base_data.get_base_data(data_name='stock_status')
877 |     base_data.get_base_data(data_name='industry_classification')
878 |     base_data.get_base_data(data_name='floating_mv')
879 |     base_data.get_base_data(data_name='factor_raw_data')  # all factor data is fetched by default
880 |     base_data.get_base_data(data_name='benchmark_weight', active_benchmark='中证800')
881 |     base_data.get_base_data(data_name='date_list')
882 |     base_data.get_base_data(data_name='all_stocks_list')
883 | 
884 |     cs_signal = CrossSectionDataSignal(base_data=base_data)
885 | 
886 |     factor_number_list = base_data.factor_raw_data['factor_number'].unique().tolist()
887 |     factor_return_dict = {}
888 |     all_regression_result_dict = {}
889 |     ic_dict = {}
890 |     ic_p_value_dict = {}
891 | 
892 |     for factor_number in factor_number_list:
893 | 
894 |         temp_data = base_data.factor_raw_data[base_data.factor_raw_data['factor_number'] == factor_number]
895 |         temp_pivot_data = temp_data.pivot_table(values='raw_value', index='get_data_date', columns='stock_id')
896 |         factor_return_dict[factor_number], all_regression_result_dict[factor_number] = \
897 |             cs_signal.regression(factor_data=temp_pivot_data, factor_number=factor_number)
898 |         ic_dict[factor_number], ic_p_value_dict[factor_number] = cs_signal.ic_time_attenuate(factor_data=temp_pivot_data, max_lag_number=6)
899 | 
900 |     pickle.dump(factor_return_dict, open('D://Quantitative_FOF_Framework_data//201301tillnow_test//factor_return_dict.dat', 'wb'),
901 |                 pickle.HIGHEST_PROTOCOL)
902 |     pickle.dump(all_regression_result_dict, open('D://Quantitative_FOF_Framework_data//201301tillnow_test//all_regression_result_dict.dat', 'wb'),
903 |                 pickle.HIGHEST_PROTOCOL)
904 |     pickle.dump(ic_dict, open('D://Quantitative_FOF_Framework_data//201301tillnow_test//ic_dict.dat', 'wb'),
905 |                 pickle.HIGHEST_PROTOCOL)
906 |     pickle.dump(ic_p_value_dict, open('D://Quantitative_FOF_Framework_data//201301tillnow_test//ic_p_value_dict.dat', 'wb'),
907 |                 pickle.HIGHEST_PROTOCOL)
908 | 
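    # Quick round-trip sanity check (a sketch; uncomment to run after the dumps above).
    # pickle requires binary file modes: 'wb' when dumping, 'rb' when loading back.
    # with open('D://Quantitative_FOF_Framework_data//201301tillnow_test//factor_return_dict.dat', 'rb') as f:
    #     assert set(pickle.load(f).keys()) == set(factor_return_dict.keys())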
909 |     # Back test
910 |     # bt_factor_list = ['factor3', 'factor6', 'factor9', 'factor12', 'factor15', 'factor18', 'factor21', 'factor24', 'factor27']
911 |     # bt_factor_raw_data = base_data.factor_raw_data[base_data.factor_raw_data['factor_number'].apply(lambda s: s in bt_factor_list)]
912 |     #
913 |     # model = Model()
914 |     # regression_result, factor_exposure, stock_residual = \
915 |     #     model.cross_section_data_regression(left_var_data=base_data.stock_active_return, factor_melt_data=bt_factor_raw_data,
916 |     #                                         industry_classification_melt_data=base_data.industry_classification,
917 |     #                                         floating_mv_pivot_data=base_data.floating_mv)
918 |     #
919 |     # back_test = BackTest(base_data)
920 |     # result = back_test.back_test()
921 |     #
922 |     # # Attempt to build a pure-factor strategy
923 |     #
924 |     # # 1. Need a process that picks out the significant single factors
925 |     #
926 |     # # 1.1 Significance tests for all single factors
927 |     #
928 |     # single_factor_list = list(all_regression_result_dict.keys())
929 |     #
930 |     #
931 |     # all_regression_result_dict['factor1']['p_value']
932 |     #
933 |     #
934 |     # all_regression_result_dict['factor1']['adj_rsquared']
935 |     #
936 |     # # 2. Combine the single factors: linearly and nonlinearly
937 |     #
938 |     #
939 |     #
940 |     # # 3. Factor return forecasting: forecast factor returns and stock idiosyncratic returns, then derive each stock's expected return
941 |     #
942 |     # # 4. Risk matrix forecasting: build the covariance matrix from expected factor returns and the idiosyncratic risk matrix from stock idiosyncratic returns
943 |     #
944 |     # # 5. Portfolio optimization: obtain the optimal portfolio for each period
945 |     #
946 |     # # 6. Historical NAV analysis of the optimal portfolios, i.e. strategy analysis
947 | 
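    # A minimal sketch of step 1.1 above, assuming each entry of all_regression_result_dict
    # carries a 'p_value' column as the commented lookups suggest; the 0.1 threshold and the
    # "significant in at least half the periods" rule are illustrative, not prescribed here.
    # significant_single_factors = [
    #     factor_number for factor_number, reg_result in all_regression_result_dict.items()
    #     if (reg_result['p_value'] <= 0.1).mean() >= 0.5
    # ]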
948 | 
949 | 
--------------------------------------------------------------------------------
/Framework/ResultDisplay.py:
--------------------------------------------------------------------------------
1 | import sys, os
2 | sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "..")))
3 | sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "../..")))
4 | import pandas as pd
5 | pd.set_option('max_columns', 20)
6 | pd.set_option('display.width', 400)
7 | pd.set_option('display.unicode.east_asian_width', True)
8 | pd.set_option('display.unicode.ambiguous_as_wide', True)
9 | import numpy as np
10 | import pickle
11 | from Quantitative_FOF_Multi_Factors_Framework.DataBase import read_data_from_oracle_db
12 | from Quantitative_FOF_Multi_Factors_Framework.Factor_Test.Stratification_Method import transform_dict_to_df, optimize_data_ram, pickle_dump_data
13 | import itertools
14 | from collections import Counter
15 | 
16 | 
17 | def get_factor_info():
18 |     factor_library = read_data_from_oracle_db('select * from lyzs_tinysoft.factor_library')
19 |     # factor_library = pd.read_excel('/Users/yi.deng/Desktop/file/database/因子列表-初步检测.xlsx')
20 |     factor_list = factor_library['FACTOR_NUMBER'].tolist()
21 |     factor_name_dict = {factor_library.loc[i, 'FACTOR_NUMBER']: factor_library.loc[i, 'FACTOR_NAME'] for i in range(factor_library.shape[0])}
22 |     factor_second_class_dict = {factor_library.loc[i, 'FACTOR_NUMBER']: factor_library.loc[i, 'SECOND_CLASS'] for i in
23 |                                 range(factor_library.shape[0])}
24 |     factor_first_class_dict = {factor_library.loc[i, 'FACTOR_NUMBER']: factor_library.loc[i, 'FIRST_CLASS'] for i in range(factor_library.shape[0])}
25 |     return factor_list, factor_name_dict, factor_first_class_dict, factor_second_class_dict
26 | 
27 | 
28 | def fill_factor_number_info(data, factor_name_dict=None, factor_first_class_dict=None, factor_second_class_dict=None):
29 | 
30 |     if factor_name_dict:
31 |         data['factor_name'] = data['factor_number'].map(factor_name_dict)
32 |     if factor_first_class_dict:
33 |         data['first_class'] = data['factor_number'].map(factor_first_class_dict)
34 |     if factor_second_class_dict:
35 |         data['second_class'] = data['factor_number'].map(factor_second_class_dict)
36 |     return data
37 | 
38 | 
39 | if __name__ == "__main__":
40 | 
41 |     # Test
42 |     output_file_url = 'D:/Quantitative_FOF_Framework_data/ResultDisplay_20190110'
43 | 
44 |     factor_list, factor_name_dict, factor_first_class_dict, factor_second_class_dict = get_factor_info()
45 | 
46 |     factor_return_reg_columns = ['regression_model', 'rolling_window', 'factor_number', 'type_name', 'get_data_date',
47 |                                  'alpha_significance', 'alpha', 'alpha_ste', 'alpha_t_value', 'alpha_p_value',
48 |                                  'beta_significance', 'beta', 'beta_ste', 'beta_t_value', 'beta_p_value',
49 |                                  'adj_rsquared']
50 | 
51 |     factor_return_reg_result_sql = 'select ' + ','.join(factor_return_reg_columns) + ' from lyzs_tinysoft.factor_return_regression'
52 | 
53 |     factor_return_reg_result = read_data_from_oracle_db(factor_return_reg_result_sql).rename(
54 |         columns={col.upper(): col for col in factor_return_reg_columns})[factor_return_reg_columns]
55 | 
56 |     # factor_return_reg_result = pickle.load(open('/Users/yi.deng/Desktop/file/database/factor_return_regression.dat', 'rb'))
57 | 
58 |     criterion = 0.1
59 |     significant_factor_return_reg = factor_return_reg_result[factor_return_reg_result['alpha_p_value'] <= criterion].copy()
60 | 
61 |     date_sql = '''
62 |                 select get_data_date
63 |                 from lyzs_tinysoft.get_data_date_library
64 |                 where get_data_freq = \'周度\'
65 |                 order by get_data_date
66 |                '''
67 |     all_get_data_date = read_data_from_oracle_db(date_sql)['GET_DATA_DATE']
68 |     all_get_data_date.name = 'get_data_date'
69 | 
70 |     ranking_columns = ['regression_model', 'rolling_window', 'factor_number', 'type_name', 'get_data_date',
71 |                        'alpha_significance', 'alpha', 'alpha_ste', 'alpha_t_value', 'alpha_p_value',
72 |                        'beta_significance', 'beta', 'beta_ste', 'beta_t_value', 'beta_p_value',
73 |                        'adj_rsquared']
74 | 
75 |     # For each date, tally the number of significant buckets per model and rolling window
76 | 
77 |     # 1.1 Time series
78 | 
79 |     primary_keys_name_list = ['regression_model', 'rolling_window', 'factor_number']
80 |     significant_pk_all_list = [significant_factor_return_reg[column_name].unique().tolist() for column_name in primary_keys_name_list]
81 |     primary_keys_list = list(itertools.product(*significant_pk_all_list))
82 | 
83 |     week_data_rolling_pct_list = [4, 5, 6, 12, 36, 52]
84 |     factor_return_reg_significant_ts_pct = {}
85 | 
86 |     for primary_key in primary_keys_list:
87 |         temp_ts_pct_pivot_data = pd.DataFrame(index=all_get_data_date, columns=['第' + str(i + 1) + '档收益率' for i in range(10)])
88 |         # Some buckets may have no significant results at all, so the corresponding column can be missing after the pivot
89 |         temp_pivot_data = significant_factor_return_reg[significant_factor_return_reg[primary_keys_name_list].isin(primary_key).all(1)].pivot_table(
90 |             index='get_data_date', columns='type_name', values='alpha')
91 |         temp_ts_pct_pivot_data.loc[temp_pivot_data.index, temp_pivot_data.columns] = temp_pivot_data
92 | 
93 |         temp_ts_pct_melt_data = temp_ts_pct_pivot_data.reset_index().rename(columns={'GET_DATA_DATE': 'get_data_date'}).melt(
94 |             id_vars=['get_data_date'], var_name=['type_name'], value_name='alpha')
95 | 
96 |         for rolling_num in week_data_rolling_pct_list:
97 |             temp_ts_pct_melt_data['last_' + str(rolling_num) + '_week'] = np.nan
98 |             rolling_result = temp_ts_pct_melt_data.groupby(by=['type_name']).apply(
99 |                 lambda df: df['alpha'].rolling(rolling_num, min_periods=1).apply(
100 |                     lambda alpha: len(pd.Series(alpha).dropna()) / len(alpha))).reset_index()
101 |             temp_ts_pct_melt_data.loc[rolling_result['level_1'], 'last_' + str(rolling_num) + '_week'] = rolling_result['alpha'].values
102 | 
103 |         factor_return_reg_significant_ts_pct[primary_key] = temp_ts_pct_melt_data.copy()
104 |         print('1.1 time series: ' + ','.join([str(pk) for pk in primary_key]))
105 | 
106 |     factor_return_reg_significant_ts_pct_df = transform_dict_to_df(factor_return_reg_significant_ts_pct, primary_keys_name_list)
107 |     factor_return_reg_significant_ts_pct_df = optimize_data_ram(factor_return_reg_significant_ts_pct_df)
108 |     pickle_dump_data(factor_return_reg_significant_ts_pct_df, output_file_url, data_name='factor_return_reg_significant_ts_pct_df')
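    # Toy illustration (sketch) of the rolling share computed above: with min_periods=1,
    # each window reports the fraction of non-NaN entries, i.e. the share of recent weeks
    # in which this bucket's alpha was significant.
    # s = pd.Series([0.01, np.nan, 0.02, np.nan])  # NaN = not significant that week
    # s.rolling(2, min_periods=1).apply(lambda a: len(pd.Series(a).dropna()) / len(a))
    # # -> [1.0, 0.5, 0.5, 0.5]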
109 | 
110 |     # # 1.2.1 Cross section
111 |     # factor_return_reg_significant = factor_return_reg_result.groupby(by=['regression_model', 'rolling_window', 'factor_number', 'get_data_date']).apply(
112 |     #     lambda df: sum(df['alpha_p_value'] <= criterion)).reset_index().rename(columns={0: '10档显著个数'})
113 |     # factor_return_reg_significant_cs_pct = factor_return_reg_significant.groupby(by=['regression_model', 'rolling_window', 'get_data_date']).apply(
114 |     #     lambda df: df.sort_values(by=['10档显著个数'], ascending=False)).reset_index(drop=True)
115 |     # factor_return_reg_significant_cs_pct['10档显著占比'] = factor_return_reg_significant_cs_pct['10档显著个数'].div(10)
116 |     # factor_return_reg_significant_cs_pct = \
117 |     #     fill_factor_number_info(factor_return_reg_significant_cs_pct, factor_name_dict=None, factor_first_class_dict=factor_first_class_dict,
118 |     #                             factor_second_class_dict=factor_second_class_dict)
119 |     #
120 |     # # 1.2.2 Time-series behaviour of the cross section
121 |     #
122 |     # for rolling_num in week_data_rolling_pct_list:
123 |     #     temp_rolling = factor_return_reg_significant_cs_pct.groupby(by=['regression_model', 'rolling_window', 'factor_number']).apply(
124 |     #         lambda cr_pct: cr_pct['10档显著占比'].rolling(rolling_num, min_periods=1).mean()).reset_index()
125 |     #     factor_return_reg_significant_cs_pct.loc[temp_rolling['level_3'], 'last_' + str(rolling_num) + '_week'] = temp_rolling['10档显著占比'].values
126 |     #
127 |     # factor_return_reg_significant_cs_pct = optimize_data_ram(factor_return_reg_significant_cs_pct)
128 |     # pickle_dump_data(factor_return_reg_significant_cs_pct, output_file_url, data_name='factor_return_reg_significant_cs_pct')
129 |     #
130 |     # # 2. Rank-based factor significance
131 |     #
132 |     # # 2.1 Ranked by alpha
133 |     #
134 |     # ranking_sql = '''
135 |     #                SELECT ''' + ','.join(ranking_columns) + ''',
136 |     #                rank () OVER (PARTITION BY REGRESSION_MODEL, ROLLING_WINDOW, GET_DATA_DATE, FACTOR_NUMBER ORDER BY ALPHA DESC) rank
137 |     #                FROM LYZS_TINYSOFT.FACTOR_RETURN_REGRESSION
138 |     #                WHERE ALPHA_P_VALUE <= 0.1
139 |     #                '''
140 |     #
141 |     # ranking_alpha_data = read_data_from_oracle_db(ranking_sql)
142 |     # ranking_alpha_data.columns = ranking_columns + ['rank']
143 |     # ranking_alpha_data = ranking_alpha_data.sort_values(by=['regression_model', 'rolling_window', 'factor_number', 'type_name', 'get_data_date'])
144 |     #
145 |     # primary_keys_name_list = ['regression_model', 'rolling_window', 'factor_number']
146 |     # significant_pk_all_list = [ranking_alpha_data[column_name].unique().tolist() for column_name in primary_keys_name_list]
147 |     # primary_keys_list = list(itertools.product(*significant_pk_all_list))
148 |     #
149 |     # factor_number_sql = '''
150 |     #                      select factor_number
151 |     #                      from lyzs_tinysoft.factor_library
152 |     #                      order by factor_number
153 |     #                     '''
154 |     # factor_number_list = read_data_from_oracle_db(factor_number_sql)['FACTOR_NUMBER']
155 |     # week_data_rolling_pct_list = [12, 24, 52]
156 |     #
157 |     # factor_return_alpha_ranking_consistency = {}
158 |     #
159 |     # for primary_key in primary_keys_list:
160 |     #
161 |     #     # primary_keys: ('OLS', 32, 'factor1')
162 |     #     temp_rank_pivot_data = pd.DataFrame(index=all_get_data_date, columns=['第' + str(i + 1) + '档收益率' for i in range(10)])
163 |     #
164 |     #     # Some buckets may have no significant results at all, so the corresponding column can be missing after the pivot
165 |     #     temp_pivot_data = ranking_alpha_data[ranking_alpha_data[primary_keys_name_list].isin(primary_key).all(1)].pivot_table(
166 |     #         index='get_data_date', columns='type_name', values='rank')
167 |     #     temp_rank_pivot_data.loc[temp_pivot_data.index, temp_pivot_data.columns] = temp_pivot_data
168 |     #     temp_rank_melt_data = temp_rank_pivot_data.reset_index().rename(columns={'GET_DATA_DATE': 'get_data_date'}).melt(
169 |     #         id_vars=['get_data_date'], var_name=['type_name'], value_name='rank')
170 |     #
171 |     #     for rolling_num in week_data_rolling_pct_list:
172 |     #         rolling_rank_number = temp_rank_melt_data.groupby(by=['type_name']).apply(
173 |     #             lambda df:
174 |     #             df['rank'].rolling(rolling_num, min_periods=1).apply(lambda rolling_data: Counter(pd.Series(rolling_data).dropna()).most_common(1)[0][0])
175 |     #         ).reset_index()
176 |     #         # Counter(rolling_data).most_common(2) -> [(1, 5), (2, 3)]: the first tuple means the most frequent value, 1, appeared 5 times
177 |     #         # Counter(rolling_data).most_common(1)[0][0] is therefore the most frequent value itself
178 |     #         # Counter(rolling_data).most_common(1)[0][1] is how many times that value appeared
179 |     #         rolling_rank_number_pct = temp_rank_melt_data.groupby(by=['type_name']).apply(
180 |     #             lambda df:
181 |     #             df['rank'].rolling(rolling_num, min_periods=1).apply(
182 |     #                 lambda rolling_data: Counter(pd.Series(rolling_data).dropna()).most_common(1)[0][1] / rolling_num)
183 |     #         ).reset_index()
184 |     #
185 |     #         temp_rank_melt_data.loc[rolling_rank_number['level_1'], 'last_' + str(rolling_num) + '_week_most_freq_rank'] = rolling_rank_number['rank']
186 |     #         temp_rank_melt_data.loc[rolling_rank_number_pct['level_1'], 'last_' + str(rolling_num) + '_week_most_freq_rank_pct'] = \
187 |     #             rolling_rank_number_pct['rank']
188 |     #
189 |     #     # Compute the most frequent rank over the whole history, looking back over all dates up to each point
190 |     #     temp_rank_melt_data_cumsum = temp_rank_melt_data.groupby(by=['type_name']).apply(
191 |     #         lambda df: pd.get_dummies(df['rank']).cumsum().apply(
192 |     #             lambda rank_cumsum: [np.argmax(rank_cumsum), rank_cumsum.max()]
193 |     #             if rank_cumsum.max() != 0
194 |     #             else [np.nan, np.nan], axis=1
195 |     #         )
196 |     #     ).reset_index()
197 |     #
198 |     #     # If nothing is significant at all, NaN appears here
199 |     #     temp_rank_melt_data_cumsum[0] = temp_rank_melt_data_cumsum[0].apply(lambda rank_list: [np.nan, np.nan] if rank_list != rank_list else rank_list)
200 |     #
201 |     #     # Pre-compute the number of dates up to each point, used later as the denominator
202 |     #     temp_rank_melt_data_cumsum[1] = 1
203 |     #     temp_rank_melt_data_date_length = temp_rank_melt_data_cumsum.groupby(by=['type_name']).apply(lambda df: df.cumsum()[1]).reset_index()
204 |     #
205 |     #     temp_rank_melt_data.loc[temp_rank_melt_data_cumsum['level_1'], 'till_now_most_freq_rank'] = \
206 |     #         temp_rank_melt_data_cumsum[0].apply(lambda l: l[0])
207 |     #     # Note the indices all align with temp_rank_melt_data_cumsum
208 |     #     temp_rank_melt_data.loc[temp_rank_melt_data_cumsum['level_1'], 'till_now_most_freq_rank_pct'] = \
209 |     #         temp_rank_melt_data_cumsum[0].apply(lambda l: l[1]).div(temp_rank_melt_data_date_length.loc[temp_rank_melt_data_cumsum['level_1']][1])
210 |     #
211 |     #     factor_return_alpha_ranking_consistency[primary_key] = temp_rank_melt_data.copy()
212 |     #     print('2.1 ranked by alpha: ' + ','.join([str(pk) for pk in primary_key]))
213 |     #
214 |     # factor_return_alpha_ranking_consistency_df = transform_dict_to_df(factor_return_alpha_ranking_consistency, primary_keys_name_list)
215 |     # factor_return_alpha_ranking_consistency_df = optimize_data_ram(factor_return_alpha_ranking_consistency_df)
216 |     # pickle_dump_data(factor_return_alpha_ranking_consistency_df, output_file_url, data_name='factor_return_alpha_ranking_consistency_df')
217 |     #
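    # A runnable toy check (sketch) of the Counter-based rolling mode used in 2.1 and 2.2:
    # Counter([1, 1, 1, 2, 2]).most_common(1)        # -> [(1, 3)]
    # Counter([1, 1, 1, 2, 2]).most_common(1)[0][0]  # -> 1, the most frequent rank in the window
    # Counter([1, 1, 1, 2, 2]).most_common(1)[0][1]  # -> 3, its count (divide by rolling_num for a share)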
218 |     # # 2.2 Ranked by r_squared
219 |     #
220 |     # ranking_sql = '''
221 |     #                SELECT ''' + ','.join(ranking_columns) + ''',
222 |     #                rank () OVER (PARTITION BY REGRESSION_MODEL, ROLLING_WINDOW, GET_DATA_DATE, FACTOR_NUMBER ORDER BY ADJ_RSQUARED DESC) rank
223 |     #                FROM LYZS_TINYSOFT.FACTOR_RETURN_REGRESSION
224 |     #                WHERE ALPHA_P_VALUE <= 0.1
225 |     #                '''
226 |     #
227 |     # ranking_rsquared_data = read_data_from_oracle_db(ranking_sql)
228 |     # ranking_rsquared_data.columns = ranking_columns + ['rank']
229 |     # ranking_rsquared_data = ranking_rsquared_data.sort_values(by=['regression_model', 'rolling_window', 'factor_number', 'type_name', 'get_data_date'])
230 |     #
231 |     # primary_keys_name_list = ['regression_model', 'rolling_window', 'factor_number']
232 |     # significant_pk_all_list = [ranking_rsquared_data[column_name].unique().tolist() for column_name in primary_keys_name_list]
233 |     # primary_keys_list = list(itertools.product(*significant_pk_all_list))
234 |     #
235 |     # factor_number_sql = '''
236 |     #                      select factor_number
237 |     #                      from lyzs_tinysoft.factor_library
238 |     #                      order by factor_number
239 |     #                     '''
240 |     # factor_number_list = read_data_from_oracle_db(factor_number_sql)['FACTOR_NUMBER']
241 |     # week_data_rolling_pct_list = [12, 24, 52]
242 |     #
243 |     # factor_return_rsquared_ranking_consistency = {}
244 |     #
245 |     # for primary_key in primary_keys_list:
246 |     #
247 |     #     # primary_keys: ('OLS', 32, 'factor1')
248 |     #     temp_rank_pivot_data = pd.DataFrame(index=all_get_data_date, columns=['第' + str(i + 1) + '档收益率' for i in range(10)])
249 |     #     # Some buckets may have no significant results at all, so the corresponding column can be missing after the pivot
250 |     #     temp_pivot_data = ranking_rsquared_data[ranking_rsquared_data[primary_keys_name_list].isin(primary_key).all(1)].pivot_table(
251 |     #         index='get_data_date', columns='type_name', values='rank')
252 |     #     temp_rank_pivot_data.loc[temp_pivot_data.index, temp_pivot_data.columns] = temp_pivot_data
253 |     #     temp_rank_melt_data = temp_rank_pivot_data.reset_index().rename(columns={'GET_DATA_DATE': 'get_data_date'}).melt(
254 |     #         id_vars=['get_data_date'], var_name=['type_name'], value_name='rank')
255 |     #
256 |     #     for rolling_num in week_data_rolling_pct_list:
257 |     #         rolling_rank_number = temp_rank_melt_data.groupby(by=['type_name']).apply(
258 |     #             lambda df:
259 |     #             df['rank'].rolling(rolling_num, min_periods=1).apply(lambda rolling_data: Counter(pd.Series(rolling_data).dropna()).most_common(1)[0][0])
260 |     #         ).reset_index()
261 |     #         # Counter(rolling_data).most_common(2) -> [(1, 5), (2, 3)]: the first tuple means the most frequent value, 1, appeared 5 times
262 |     #         # Counter(rolling_data).most_common(1)[0][0] is therefore the most frequent value itself
263 |     #         # Counter(rolling_data).most_common(1)[0][1] is how many times that value appeared
264 |     #         rolling_rank_number_pct = temp_rank_melt_data.groupby(by=['type_name']).apply(
265 |     #             lambda df:
266 |     #             df['rank'].rolling(rolling_num, min_periods=1).apply(
267 |     #                 lambda rolling_data: Counter(pd.Series(rolling_data).dropna()).most_common(1)[0][1] / rolling_num)
268 |     #         ).reset_index()
269 |     #
270 |     #         temp_rank_melt_data.loc[rolling_rank_number['level_1'], 'last_' + str(rolling_num) + '_week_most_freq_rank'] = rolling_rank_number['rank']
271 |     #         temp_rank_melt_data.loc[rolling_rank_number_pct['level_1'], 'last_' + str(rolling_num) + '_week_most_freq_rank_pct'] = \
272 |     #             rolling_rank_number_pct['rank']
273 |     #
274 |     #     # Compute the most frequent rank over the whole history, looking back over all dates up to each point
275 |     #
276 |     #     temp_rank_melt_data_cumsum = temp_rank_melt_data.groupby(by=['type_name']).apply(
277 |     #         lambda df: pd.get_dummies(df['rank']).cumsum().apply(
278 |     #             lambda rank_cumsum: [np.argmax(rank_cumsum), rank_cumsum.max()] if rank_cumsum.max() != 0 else [np.nan, np.nan], axis=1
279 |     #         )
280 |     #     ).reset_index()
281 |     #
282 |     #     # If nothing is significant at all, NaN appears here
283 |     #     temp_rank_melt_data_cumsum[0] = temp_rank_melt_data_cumsum[0].apply(
284 |     #         lambda rank_list: [np.nan, np.nan] if rank_list != rank_list else rank_list)
285 |     #
286 |     #     # Pre-compute the number of dates up to each point, used later as the denominator
287 |     #     temp_rank_melt_data_cumsum[1] = 1
288 |     #     temp_rank_melt_data_date_length = temp_rank_melt_data_cumsum.groupby(by=['type_name']).apply(lambda df: df.cumsum()[1]).reset_index()
289 |     #
290 |     #     temp_rank_melt_data.loc[temp_rank_melt_data_cumsum['level_1'], 'till_now_most_freq_rank'] = temp_rank_melt_data_cumsum[0].apply(lambda l: l[0])
291 |     #     # Note the indices all align with temp_rank_melt_data_cumsum
292 |     #     temp_rank_melt_data.loc[temp_rank_melt_data_cumsum['level_1'], 'till_now_most_freq_rank_pct'] = \
293 |     #         temp_rank_melt_data_cumsum[0].apply(lambda l: l[1]).div(temp_rank_melt_data_date_length.loc[temp_rank_melt_data_cumsum['level_1']][1])
294 |     #
295 |     #     factor_return_rsquared_ranking_consistency[primary_key] = temp_rank_melt_data.copy()
296 |     #     print('2.2 ranked by r_squared: ' + ','.join([str(pk) for pk in primary_key]))
297 |     #
298 |     # factor_return_rsquared_ranking_consistency_df = transform_dict_to_df(factor_return_rsquared_ranking_consistency, primary_keys_name_list)
299 |     # factor_return_rsquared_ranking_consistency_df = optimize_data_ram(factor_return_rsquared_ranking_consistency_df)
300 |     # pickle_dump_data(factor_return_rsquared_ranking_consistency_df, output_file_url, data_name='factor_return_rsquared_ranking_consistency_df')
301 | 
302 | 
--------------------------------------------------------------------------------
/Framework/Tools/DateStatus.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Dengyi-CN/Quantitative_FOF_Multi_Factors_Framework/65875c7fcf3b6c0a85f98d4e3fffb38f821c7d4a/Framework/Tools/DateStatus.py
--------------------------------------------------------------------------------
/Framework/Tools/FactorInfoFunc.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import numpy as np
3 | from Quantitative_FOF_Multi_Factors_Framework.DataBase import read_data_from_oracle_db
4 | 
5 | 
6 | def get_factor_info():
7 |     factor_library = read_data_from_oracle_db('select * from lyzs_tinysoft.factor_library')
8 |     # factor_library = pd.read_excel('/Users/yi.deng/Desktop/file/database/因子列表-初步检测.xlsx')
9 |     factor_list = factor_library['FACTOR_NUMBER'].tolist()
10 |     factor_name_dict = {factor_library.loc[i, 'FACTOR_NUMBER']: factor_library.loc[i, 'FACTOR_NAME'] for i in range(factor_library.shape[0])}
11 |     factor_second_class_dict = {factor_library.loc[i, 'FACTOR_NUMBER']: factor_library.loc[i, 'SECOND_CLASS'] for i in
12 |                                 range(factor_library.shape[0])}
13 |     factor_first_class_dict = {factor_library.loc[i, 'FACTOR_NUMBER']: factor_library.loc[i, 'FIRST_CLASS'] for i in range(factor_library.shape[0])}
14 |     return factor_list, factor_name_dict, factor_first_class_dict, factor_second_class_dict
15 | 
16 | 
17 | def fill_factor_number_info(data):
18 |     factor_list, factor_name_dict, factor_first_class_dict, factor_second_class_dict = get_factor_info()
19 |     if factor_name_dict:
20 |         data['factor_name'] = data['factor_number'].map(factor_name_dict)
21 |     if factor_first_class_dict:
22 |         data['first_class'] = data['factor_number'].map(factor_first_class_dict)
23 |     if factor_second_class_dict:
24 |         data['second_class'] = data['factor_number'].map(factor_second_class_dict)
25 |     return data
26 | 
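# Hypothetical usage sketch (the frame and its values are illustrative): any melt-format
# DataFrame with a 'factor_number' column can be annotated with names and class levels.
# demo = pd.DataFrame({'factor_number': ['factor1', 'factor2'], 'ic': [0.03, -0.01]})
# demo = fill_factor_number_info(demo)  # adds factor_name / first_class / second_class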
/Framework/Tools/FactorSignalFunc.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import numpy as np
3 | from statsmodels.tsa.stattools import adfuller
4 | import statsmodels.api as sm
5 | import math
6 | from decimal import Decimal
7 | 
8 | # ------------------------- helper functions ------------------------------
9 | 
10 | 
11 | def get_quantile_data_by_factor_quantile(single_date_data, factor_number, floor_quantile, cap_quantile):
12 | 
13 |     stock_floor_quantile = single_date_data[factor_number].quantile(floor_quantile)
14 |     stock_cap_quantile = single_date_data[factor_number].quantile(cap_quantile)
15 |     if floor_quantile == 0:  # the lower bound is exclusive, so nudge the minimum down a little to include it
16 |         stock_floor_quantile = stock_floor_quantile - 0.1
17 | 
18 |     quantile_data = single_date_data[(single_date_data[factor_number] > stock_floor_quantile) &
19 |                                      (single_date_data[factor_number] <= stock_cap_quantile)].copy()
20 |     return quantile_data
21 | 
22 | 
23 | def get_factor_stratification_info(data, sample_scope=None, factor_number_list=None, stratification_num=10, quantile_dict=None):
24 | 
25 |     cap_quantile_list = [(quantile + 1) / stratification_num for quantile in range(0, stratification_num)]
26 |     floor_quantile_list = [quantile / stratification_num for quantile in range(0, stratification_num)]
27 | 
28 |     factor_stratification_data = {}
29 | 
30 |     factor_data_columns = ['get_data_date', 'stockid']
31 | 
32 |     for sample_name in sample_scope:
33 | 
34 |         print('\tStart stratifying by factor: ' + sample_name)
35 | 
36 |         for factor_number in factor_number_list:
37 | 
38 |             for i in range(stratification_num):
39 |                 # This block builds the portfolio for each quantile bucket.
40 |                 factor_stratification_data[(sample_name, factor_number, quantile_dict[i])] = \
41 |                     data[(sample_name, factor_number)][factor_data_columns + [factor_number]].groupby(by=['get_data_date']).apply(
42 |                         lambda df: get_quantile_data_by_factor_quantile(df, factor_number, floor_quantile_list[i], cap_quantile_list[i]))
43 | 
44 |                 factor_stratification_data[(sample_name, factor_number, quantile_dict[i])].index = \
45 |                     range(factor_stratification_data[(sample_name, factor_number, quantile_dict[i])].shape[0])
46 | 
47 |                 min_count = factor_stratification_data[(sample_name, factor_number, quantile_dict[i])].groupby(
48 |                     by=['get_data_date']).count()['stockid'].min()
49 |                 max_count = factor_stratification_data[(sample_name, factor_number, quantile_dict[i])].groupby(
50 |                     by=['get_data_date']).count()['stockid'].max()
51 | 
52 |                 print('\tFinished stratifying by factor: ' + sample_name + '-' + factor_number + '-bucket ' + quantile_dict[i] +
53 |                       ' (' + format(min_count, '.0f') + '/' + format(max_count, '.0f') + ')')
54 | 
55 |     return factor_stratification_data
56 | 
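# Toy check (sketch) of the bucket boundaries above: with floor/cap quantiles of 0.0/0.5,
# the strict '>' bound would drop the minimum, hence the nudge in get_quantile_data_by_factor_quantile.
# demo = pd.DataFrame({'get_data_date': ['2019-01-04'] * 4, 'stockid': list('abcd'),
#                      'factor1': [1.0, 2.0, 3.0, 4.0]})
# get_quantile_data_by_factor_quantile(demo, 'factor1', 0.0, 0.5)  # keeps the 1.0 and 2.0 rows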
57 | 
58 | def get_factor_stratification_portfolio_return(factor_stratification_data, stock_return_df, sample_scope=None, factor_number_list=None,
59 |                                                stratification_num=10, quantile_dict=None, yield_type_list=None, get_factor_data_date_list=None):
60 | 
61 |     factor_stratification_return = {}
62 |     # for each sample universe
63 |     for sample_name in sample_scope:
64 | 
65 |         print('\tStart computing per-bucket returns after stratification: ' + sample_name)
66 | 
67 |         for factor_number in factor_number_list:
68 | 
69 |             # for each bucket
70 |             for i in range(stratification_num):
71 |                 factor_stratification_return[(sample_name, factor_number, quantile_dict[i])] = \
72 |                     pd.DataFrame(index=get_factor_data_date_list, columns=['因子均值'] + yield_type_list)
73 | 
74 |                 factor_stratification_return[(sample_name, factor_number, quantile_dict[i])]['因子均值'] = \
75 |                     factor_stratification_data[(sample_name, factor_number, quantile_dict[i])].groupby(by=['get_data_date']).apply(
76 |                         lambda df: df[factor_number].mean()).astype('float32')
77 | 
78 |                 for yield_type in yield_type_list:
79 |                     tempo_yield_data = factor_stratification_data[(sample_name, factor_number, quantile_dict[i])].merge(
80 |                         stock_return_df[['get_data_date', 'stockid'] + [yield_type]], on=['get_data_date', 'stockid'], how='left').copy()
81 | 
82 |                     factor_stratification_return[(sample_name, factor_number, quantile_dict[i])][yield_type] = \
83 |                         tempo_yield_data.groupby(by=['get_data_date']).apply(lambda df: df[yield_type].mean()).astype('float32')
84 | 
85 |                     print('\tFinished computing per-bucket returns: ' + sample_name + '-' + factor_number + '-bucket ' + quantile_dict[i] + '-' + yield_type)
86 | 
87 |     return factor_stratification_return
88 | 
89 | 
90 | def set_ts_data_constant_test_result(time_series_data):
91 |     result = pd.Series(index=['样本数', 't值', 'p值', '显著性', '最大滞后项'])
92 |     adftest = adfuller(time_series_data.astype(float), regression="ct")
93 |     result.loc['样本数'] = adftest[3]
94 |     result.loc['t值'] = round(adftest[0], 2)
95 |     result.loc['p值'] = Decimal(format(adftest[1], '.3f'))
96 |     if adftest[0] <= adftest[4]['1%']:
97 |         result.loc['显著性'] = '***'
98 |     elif adftest[0] <= adftest[4]['5%']:
99 |         result.loc['显著性'] = '**'
100 |     elif adftest[0] <= adftest[4]['10%']:
101 |         result.loc['显著性'] = '*'
102 |     else:
103 |         result.loc['显著性'] = '不显著'
104 | 
105 |     result.loc['最大滞后项'] = adftest[2]
106 |     return result
107 | 
108 | 
109 | def set_linear_regression_result(regression_result, result_content_list, method='OLS'):
110 |     if (method == 'OLS') | (method == 'WLS'):
111 |         result = pd.Series(index=result_content_list)
112 |         result.loc['Adj. R-squared'] = round(regression_result.rsquared_adj, 4)
113 |     elif method == 'RLM':
114 |         result = pd.Series(index=result_content_list)
115 | 
116 |     result.loc[['Alpha', 'Beta']] = [round(value, 3) for value in list(regression_result.params)]
117 |     result.loc[['Alpha t值', 'Beta t值']] = [round(value, 3) for value in list(regression_result.tvalues)]
118 |     result.loc[['Alpha p值', 'Beta p值']] = [round(value, 4) for value in list(regression_result.pvalues)]
119 |     result.loc[['Alpha标准误', 'Beta标准误']] = [round(value, 3) for value in list(regression_result.bse)]
120 |     if regression_result.pvalues[0] <= 0.01:
121 |         result.loc['Alpha显著性'] = '***'
122 |     elif regression_result.pvalues[0] <= 0.05:
123 |         result.loc['Alpha显著性'] = '**'
124 |     elif regression_result.pvalues[0] <= 0.1:
125 |         result.loc['Alpha显著性'] = '*'
126 |     else:
127 |         result.loc['Alpha显著性'] = ''
128 | 
129 |     if regression_result.pvalues[1] <= 0.01:
130 |         result.loc['Beta显著性'] = '***'
131 |     elif regression_result.pvalues[1] <= 0.05:
132 |         result.loc['Beta显著性'] = '**'
133 |     elif regression_result.pvalues[1] <= 0.1:
134 |         result.loc['Beta显著性'] = '*'
135 |     else:
136 |         result.loc['Beta显著性'] = ''
137 |     return result
138 | 
139 | 
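# Usage sketch for the ADF wrapper above (toy trend-stationary series; the returned Series
# carries the Chinese index labels used by set_ts_data_constant_test_result):
# ts = pd.Series(0.1 * np.arange(120) + np.random.randn(120))
# print(set_ts_data_constant_test_result(ts))  # sample size, t value, p value, stars, max lag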
140 | def get_stratify_ts_data_regression_result(factor_stratification_return, index_return_df, sample_scope=None, factor_number_list=None,
141 |                                             get_factor_data_date_list=None, regression_model_list=None,
142 |                                             quantile_dict=None, rolling_window_list=None, stratification_num=10):
143 | 
144 |     # ts_constant_test_result_dict = {}
145 |     factor_test_result = {}
146 |     result_content_list = ['Alpha显著性', 'Alpha', 'Alpha t值', 'Alpha标准误', 'Alpha p值', 'Beta显著性', 'Beta', 'Beta t值', 'Beta标准误', 'Beta p值']
147 |     result_value_content_list = ['Alpha', 'Alpha t值', 'Alpha标准误', 'Alpha p值', 'Beta', 'Beta t值', 'Beta标准误', 'Beta p值']
148 | 
149 |     for sample_name in sample_scope:
150 | 
151 |         print('\tStart single-factor time-series regression tests: ' + sample_name)
152 | 
153 |         for factor_number in factor_number_list:
154 | 
155 |             factor_return_df = pd.DataFrame(index=get_factor_data_date_list, columns=['因子收益率', 'Alpha', 'Beta'])
156 |             factor_return_df['Beta'] = index_return_df[sample_name + '收益率']
157 |             factor_return_df['Alpha'] = 1
158 | 
159 |             # # 1. Stationarity tests for the time series
160 |             # for i in range(stratification_num):
161 |             #     factor_return_df['因子收益率'] = factor_stratification_return[(sample_name, factor_number, quantile_dict[i])]['持仓期收益率']
162 |             #
163 |             #     # (1) Run the regression for every bucket; some benchmarks only start in 2007, hence the dropna
164 |             #     ts_regression_df = factor_return_df.dropna()
165 |             #     ts_regression_df.index = pd.Series(ts_regression_df.index).apply(lambda d: pd.to_datetime(d))
166 |             #
167 |             #     # (2) Stationarity tests on the factor-return and index-return series
168 |             #     ts_constant_test_result_dict[(sample_name, factor_number, quantile_dict[i])] = \
169 |             #         pd.DataFrame(index=['因子收益率', '指数收益率'], columns=['样本数', 't值', 'p值', '显著性', '最大滞后项'])
170 |             #     ts_constant_test_result_dict[(sample_name, factor_number, quantile_dict[i])].loc['因子收益率'] = \
171 |             #         set_ts_data_constant_test_result(ts_regression_df['因子收益率'])
172 |             #     ts_constant_test_result_dict[(sample_name, factor_number, quantile_dict[i])].loc['指数收益率'] = \
173 |             #         set_ts_data_constant_test_result(ts_regression_df['Beta'])
174 | 
175 |             # 2. Rolling-window regressions
176 |             for regression_model in regression_model_list:
177 |                 # (1) for each regression model
178 | 
179 |                 for rolling_window in rolling_window_list:
180 |                     # (2) for each rolling-window length
181 |                     rolling_window_end_date_list = get_factor_data_date_list[rolling_window - 1:]
182 | 
183 |                     for i in range(stratification_num):
184 | 
185 |                         # (3) run the rolling regression separately for every bucket
186 |                         if (regression_model == 'WLS') | (regression_model == 'OLS'):
187 |                             factor_test_result[(sample_name, factor_number, quantile_dict[i], regression_model, rolling_window)] = \
188 |                                 pd.DataFrame(index=rolling_window_end_date_list, columns=result_content_list + ['Adj. R-squared'])
189 |                         elif regression_model == 'RLM':
190 |                             factor_test_result[(sample_name, factor_number, quantile_dict[i], regression_model, rolling_window)] = \
191 |                                 pd.DataFrame(index=rolling_window_end_date_list, columns=result_content_list)
192 | 
193 |                         for date_i, date in enumerate(rolling_window_end_date_list, rolling_window):
194 |                             # date is the last date of each rolling window
195 |                             regression_period = get_factor_data_date_list[date_i - rolling_window:date_i]
196 |                             regression_data = pd.DataFrame(index=regression_period, columns=['因子收益率', 'Alpha', 'Beta'])
197 |                             regression_data['因子收益率'] = \
198 |                                 factor_stratification_return[(sample_name, factor_number, quantile_dict[i])]['持仓期收益率'].loc[regression_period]
199 |                             regression_data['Alpha'] = 1
200 |                             regression_data['Beta'] = index_return_df[sample_name + '收益率'].loc[regression_period]
201 | 
202 |                             if regression_model == 'RLM':
203 |                                 regression_result = sm.RLM(regression_data.loc[regression_period, '因子收益率'].astype(float),
204 |                                                            regression_data.loc[regression_period, ['Alpha', 'Beta']].astype(float)).fit()
205 |                             elif regression_model == 'OLS':
206 |                                 regression_result = sm.OLS(regression_data.loc[regression_period, '因子收益率'].astype(float),
207 |                                                            regression_data.loc[regression_period, ['Alpha', 'Beta']].astype(
208 |                                                                float)).fit().get_robustcov_results()
209 |                             elif regression_model == 'WLS':
210 |                                 weight_dict = {'cos': [1 / (math.cos(x) / sum([math.cos(x) for x in np.linspace(0, math.pi / 2, rolling_window, endpoint=False)]))
211 |                                                        for x in np.linspace(0, math.pi / 2, rolling_window, endpoint=False)]}  # endpoint=False: cos(pi/2) == 0 would otherwise blow up the newest weight
212 | 
213 |                                 regression_result = sm.WLS(regression_data.loc[regression_period, '因子收益率'].astype(float),
214 |                                                            regression_data.loc[regression_period, ['Alpha', 'Beta']].astype(float),
215 |                                                            weights=weight_dict['cos']).fit().get_robustcov_results()
216 | 
217 |                             factor_test_result[(sample_name, factor_number, quantile_dict[i], regression_model, rolling_window)].loc[date] = \
218 |                                 set_linear_regression_result(regression_result, result_content_list, method=regression_model)
219 | 
220 |                         # Pull the date index out into a column; this DataFrame holds one factor's results for one bucket, and all buckets are merged later
221 |                         factor_test_result[(sample_name, factor_number, quantile_dict[i], regression_model, rolling_window)] = \
222 |                             factor_test_result[(sample_name, factor_number, quantile_dict[i], regression_model, rolling_window)].reset_index().rename(
223 |                                 columns={'index': '数据提取日'})
224 |                         factor_test_result[(sample_name, factor_number, quantile_dict[i], regression_model, rolling_window)][
225 |                             result_value_content_list] = \
226 |                             factor_test_result[(sample_name, factor_number, quantile_dict[i], regression_model, rolling_window)][
227 |                                 result_value_content_list].apply(pd.to_numeric, downcast='float')
228 |                         if (regression_model == 'OLS') | (regression_model == 'WLS'):
229 |                             factor_test_result[(sample_name, factor_number, quantile_dict[i], regression_model, rolling_window)]['Adj. R-squared'] = \
230 |                                 factor_test_result[(sample_name, factor_number, quantile_dict[i], regression_model, rolling_window)][
231 |                                     'Adj. R-squared'].apply(pd.to_numeric, downcast='float')
232 | 
233 |                         print('\tFinished time-series regression tests: ' + sample_name + '-' + factor_number + '-bucket ' + quantile_dict[i] +
234 |                               '-model ' + regression_model + '-rolling window ' + str(rolling_window))
235 | 
236 |     return factor_test_result
237 | 
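# Toy look (sketch) at the cosine WLS weights above for rolling_window = 5: they grow
# monotonically toward the most recent observation, so newer returns dominate the fit.
# xs = np.linspace(0, math.pi / 2, 5, endpoint=False)
# S = sum(math.cos(x) for x in xs)
# [round(1 / (math.cos(x) / S), 2) for x in xs]  # -> [3.66, 3.85, 4.52, 6.22, 11.83] approximately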
238 | 
239 | def standardization(factor_data, factor_name, fillna=True):
240 |     # factor_data is a DataFrame, e.g. regression_data_after_y; factor_name is the column to process, e.g. '因子暴露'.
241 |     Dm = factor_data[factor_name].median()
242 |     Dm1 = abs(factor_data[factor_name] - Dm).median()  # median absolute deviation
243 |     cap_index = factor_data[factor_name] > (Dm + 5 * Dm1)
244 |     floor_index = factor_data[factor_name] < (Dm - 5 * Dm1)
245 |     factor_data.loc[cap_index, factor_name] = Dm + 5 * Dm1  # winsorize at median +/- 5 * MAD
246 |     factor_data.loc[floor_index, factor_name] = Dm - 5 * Dm1
247 |     factor_data[factor_name] = \
248 |         (factor_data[factor_name] - factor_data[factor_name].mean()) / factor_data[factor_name].std()  # z-score
249 |     if fillna:
250 |         factor_data = factor_data.fillna(0)
251 | 
252 |     return factor_data
253 | 
254 | 
--------------------------------------------------------------------------------
/Framework/Tools/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Dengyi-CN/Quantitative_FOF_Multi_Factors_Framework/65875c7fcf3b6c0a85f98d4e3fffb38f821c7d4a/Framework/Tools/__init__.py
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Quantitative_FOF_Framework
2 | Quantitative FOF framework
--------------------------------------------------------------------------------
/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Dengyi-CN/Quantitative_FOF_Multi_Factors_Framework/65875c7fcf3b6c0a85f98d4e3fffb38f821c7d4a/__init__.py
--------------------------------------------------------------------------------
/test.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Dengyi-CN/Quantitative_FOF_Multi_Factors_Framework/65875c7fcf3b6c0a85f98d4e3fffb38f821c7d4a/test.xlsx
--------------------------------------------------------------------------------