├── Signal ├── __init__.py ├── CrossSectionalModels │ ├── Base │ │ ├── __init__.py │ │ ├── ModelTest │ │ │ ├── __init__.py │ │ │ ├── ScoreMethod.py │ │ │ └── ModelTest.py │ │ ├── CrossSectionalModelBase.py │ │ └── CrossSectionalModelSklearn.py │ ├── test │ │ ├── __init__.py │ │ ├── paraDictLasso.json │ │ ├── paraDictRidge.json │ │ ├── testCSFeatureSelector.py │ │ ├── testCSModel.py │ │ └── testModelLinear.py │ ├── __init__.py │ ├── CrossSectionalModel │ │ ├── README.md │ │ ├── __init__.py │ │ ├── CrossSectionalModelKNN.py │ │ ├── CrossSectionalModelLinearSklearn.py │ │ ├── CrossSectionalModelTreeSklearn.py │ │ └── CrossSectionalModelLinearStat.py │ └── README.md ├── README.md ├── SignalFunctionsDev │ ├── GetSignals.py │ ├── trainTestSlice.py │ └── GenerateSignal.py ├── FeatureSelectors │ ├── test.py │ ├── CrossSectionalFeatureSelectorLinear.py │ └── CrossSectionalFeatureSelectorBase.py ├── SignalBase.py └── SignalSynthesis.py ├── .gitignore ├── Tool ├── __pycache__ │ ├── __init__.cpython-36.pyc │ ├── __init__.cpython-37.pyc │ ├── globals.cpython-37.pyc │ ├── GeneralData.cpython-37.pyc │ ├── GeneralDataBase.cpython-36.pyc │ ├── GeneralDataBase.cpython-37.pyc │ └── FactorProfileBase.cpython-37.pyc ├── __init__.py ├── globalVars.py ├── FactorProfileBase.py ├── logger.py ├── GeneralDataBase.py ├── Factor.py ├── GeneralData.py └── DataPreProcessing.py ├── GetData ├── __init__.py ├── get_new_data.py ├── getDataFromWINDdatabase.py ├── manipulateToTable.py ├── getDataFromMySQLdatabase.py ├── backtestDataApi.py ├── backtestDatabase.py └── loadData.py ├── meeting log.md ├── README.md ├── Director ├── systhesisDirector.py └── singleFactorDirector.py └── report ├── DataPreProcessing.py └── FactorAnalyse.py /Signal/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /Signal/CrossSectionalModels/Base/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /Signal/CrossSectionalModels/test/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /Signal/CrossSectionalModels/Base/ModelTest/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /Signal/CrossSectionalModels/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | -------------------------------------------------------------------------------- /Signal/CrossSectionalModels/test/paraDictLasso.json: -------------------------------------------------------------------------------- 1 | {"fit_intercept": true, "alpha": 1} -------------------------------------------------------------------------------- /Signal/CrossSectionalModels/test/paraDictRidge.json: -------------------------------------------------------------------------------- 1 | {"fit_intercept": true, "alpha": 0.3} -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | 2 | *.pkl 3 | *.pyc 4 | .ipynb_checkpoints/ 5 | __pycache__/ 6 | .idea/ 7 | .vscode/ 8 | .spyproject 9 | 10 | /data 
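# kept out of version control: pickled data (*.pkl), compiled Python caches (*.pyc,
# __pycache__/), notebook checkpoints (.ipynb_checkpoints/) and editor folders
# (.idea/, .vscode/, .spyproject); the local /data folder with the large raw inputs
# is likewise excluded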
-------------------------------------------------------------------------------- /Tool/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eiahb3838ya/AlphaSignalFromMachineLearning/HEAD/Tool/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /Tool/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eiahb3838ya/AlphaSignalFromMachineLearning/HEAD/Tool/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /Tool/__pycache__/globals.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eiahb3838ya/AlphaSignalFromMachineLearning/HEAD/Tool/__pycache__/globals.cpython-37.pyc -------------------------------------------------------------------------------- /Tool/__pycache__/GeneralData.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eiahb3838ya/AlphaSignalFromMachineLearning/HEAD/Tool/__pycache__/GeneralData.cpython-37.pyc -------------------------------------------------------------------------------- /Tool/__pycache__/GeneralDataBase.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eiahb3838ya/AlphaSignalFromMachineLearning/HEAD/Tool/__pycache__/GeneralDataBase.cpython-36.pyc -------------------------------------------------------------------------------- /Tool/__pycache__/GeneralDataBase.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eiahb3838ya/AlphaSignalFromMachineLearning/HEAD/Tool/__pycache__/GeneralDataBase.cpython-37.pyc -------------------------------------------------------------------------------- /GetData/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Fri Dec 4 15:20:00 2020 4 | 5 | @author: Evan Hu (Yi Fan Hu) 6 | 7 | """ 8 | 9 | from .loadData import * -------------------------------------------------------------------------------- /Tool/__pycache__/FactorProfileBase.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eiahb3838ya/AlphaSignalFromMachineLearning/HEAD/Tool/__pycache__/FactorProfileBase.cpython-37.pyc -------------------------------------------------------------------------------- /Tool/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Tue Dec 1 16:10:32 2020 4 | 5 | @author: eiahb 6 | """ 7 | from .logger import Logger 8 | from .GeneralData import GeneralData 9 | from .Factor import Factor 10 | 11 | -------------------------------------------------------------------------------- /Signal/README.md: -------------------------------------------------------------------------------- 1 | # Signal 2 | 3 | ## CrossSectionalModels 4 | 5 | 包含各类可以进行Cross Sectional Factor分析预测的models: 6 | 7 | - 线性 8 | - 树 9 | - 网络等 10 | 11 | ## FeatureSelectors 12 | 13 | 包含各类可以进行Feature Selection的selectors: 14 | 15 | - 线性 16 | - 树 17 | - 网络等 18 | 19 | 
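A minimal usage sketch for one of these cross-sectional models (the data below is synthetic, and the import assumes the project root containing the `BackTesting` package is on `sys.path`, mirroring the import paths used in the Director scripts):

```python
import numpy as np
from BackTesting.Signal.CrossSectionalModels.CrossSectionalModel.CrossSectionalModelLinearSklearn import CrossSectionalModelRidge

# one cross-section: 500 stocks, 10 factor exposures, synthetic next-period returns
X = np.random.randn(500, 10)
y = X @ np.random.randn(10) + 0.1 * np.random.randn(500)

# hyper-parameters can come from a paraDict, a json file (jsonPath=...), or a paraGrid for GridSearchCV
model = CrossSectionalModelRidge(paraDict={"fit_intercept": True, "alpha": 0.3})
model.fit(X, y)
signal = model.predict(X)

print(model.get_para())                              # {'fit_intercept': True, 'alpha': 0.3}
print(model.get_score(y, signal, scoreMethod="r2"))  # scoring helpers inherited from ModelTest
```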
-------------------------------------------------------------------------------- /Signal/CrossSectionalModels/CrossSectionalModel/README.md: -------------------------------------------------------------------------------- 1 | # CrossSectionalModel 2 | 3 | ## CrossSectionalModelKNN.py 4 | 5 | KNN的方法 6 | 7 | ## CrossSectionalModelLinearSklearn.py 8 | 9 | Sklearn like的线性方法 10 | 11 | ## CrossSectionalModelLinearStat.py 12 | 13 | statmodel 的线性方法:比起sklearn,可以直接得到回归的summary! 14 | 15 | ## CrossSectionalModelTreeSklearn.py 16 | 17 | Tree的方法 -------------------------------------------------------------------------------- /Signal/CrossSectionalModels/Base/ModelTest/ScoreMethod.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Wed Dec 16 16:23:09 2020 4 | 5 | @author: Mengjie Ye 6 | """ 7 | 8 | from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score 9 | 10 | 11 | scoreMethodDict = { 12 | 'r2':r2_score, 13 | 'mse':mean_squared_error, 14 | 'mae':mean_absolute_error 15 | } -------------------------------------------------------------------------------- /GetData/get_new_data.py: -------------------------------------------------------------------------------- 1 | # %% 2 | import pandas as pd 3 | import os 4 | 5 | data_path = "C:\\Users\\eiahb\\Documents\\MyFiles\\WorkThing\\tf\\01task\\GeneticProgrammingProject\\AlphaSignalFromMachineLearning\\data\\newdata" 6 | 7 | 8 | # %% 9 | os.listdir(data_path) 10 | # %% 11 | a_file = os.listdir(data_path)[-1] 12 | 13 | pd.read_csv(os.path.join(data_path, a_file), index_col=0) 14 | 15 | 16 | # %% 17 | -------------------------------------------------------------------------------- /Signal/CrossSectionalModels/README.md: -------------------------------------------------------------------------------- 1 | # CrossSectionalModels 2 | 3 | ## Base 4 | 5 | CrossSectionalModel中的model所需要继承的类 6 | 7 | - CrossSectionalModelBase(整个CrossSectionalModel的抽象基类) 8 | - CrossSectionalModelSklearn(CrossSectionalModel中sklearn like models的父类) 9 | - ModelTest(对于model进行测试的工具包) 10 | 11 | ## CrossSectionalModel 12 | 13 | 各种已经写好的models 14 | 15 | ## LinearModelTest 16 | 17 | 对于线性模型的测试程序 18 | 19 | -------------------------------------------------------------------------------- /Signal/CrossSectionalModels/CrossSectionalModel/__init__.py: -------------------------------------------------------------------------------- 1 | """different models for training""" 2 | 3 | from .CrossSectionalModelKNN import CrossSectionalModelKNN 4 | from .CrossSectionalModelLinearStat import CrossSectionalModelLinear,CrossSectionalModelOLS,CrossSectionalModelRidge,CrossSectionalModelLasso 5 | from .CrossSectionalModelTreeSklearn import CrossSectionalModelDecisionTree, CrossSectionalModelXGBoost 6 | 7 | __all__ = [ 8 | 'CrossSectionalModelKNN', 9 | 'CrossSectionalModelLinear', 10 | 'CrossSectionalModelOLS', 11 | 'CrossSectionalModelRidge', 12 | 'CrossSectionalModelLasso', 13 | 'CrossSectionalModelDecisionTree' 14 | ] -------------------------------------------------------------------------------- /Tool/globalVars.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Thu Dec 3 11:33:44 2020 4 | 5 | @author: eiahb 6 | """ 7 | 8 | 9 | 10 | 11 | 12 | def initialize(inputLogger = None): 13 | if inputLogger ==None: 14 | import logging 15 | inputLogger = logging.getLogger() 16 | global varList 17 | varList = [] 18 | register("logger", 
inputLogger)
    register("factor", {})
    register("barra", {})
    register("materialData", {})
    register("utilData", {})


def register(name, aVar):
    globals()[name] = aVar
    globals()['varList'].append(name)
    logger.info('{}:{} is now in global'.format(name, aVar))
    return(globals()['varList'])

def list_vars():
    return(globals()['varList'])
-------------------------------------------------------------------------------- /GetData/getDataFromWINDdatabase.py: --------------------------------------------------------------------------------
import cx_Oracle
import pandas as pd

'''
Change the Oracle instant-client path below to match the local installation.
'''
cx_Oracle.init_oracle_client(lib_dir=r"d:\oracle\instantclient_19_9")

conn = cx_Oracle.connect('student2001212409', 'QTA_ymj_2020', '219.223.208.202:1521/orcl')

cursor = conn.cursor()


def get_data_from_windDB(description_sql, name):
    cursor.execute(description_sql)
    result = cursor.fetchall()
    col_list = [i[0] for i in cursor.description]
    res_df = pd.DataFrame(result, columns=col_list)
    '''
    Change the output path below to the local storage location.
    '''
    res_df.to_pickle('D:/AlphaSignalFromMachineLearning/GetData/tables/windDBData/{}'.format(name))


l2_indicator_sql = "SELECT S_INFO_WINDCODE, TRADE_DT,\
    S_LI_INITIATIVEBUYRATE, S_LI_INITIATIVESELLRATE,\
    S_LI_LARGEBUYRATE, S_LI_LARGESELLRATE FROM FILESYNC.ASHAREL2INDICATORS ORDER BY TRADE_DT "
l2_indicator_name = 'ASHAREL2INDICATORS'
get_data_from_windDB(l2_indicator_sql, l2_indicator_name)
-------------------------------------------------------------------------------- /Signal/CrossSectionalModels/Base/CrossSectionalModelBase.py: --------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Wed Dec 9 19:52:53 2020

@author: Evan Hu (Yi Fan Hu)

"""

import abc

class CrossSectionalModelBase(object, metaclass=abc.ABCMeta):
    @abc.abstractmethod
    def __init__(self, **kwargs):
        self.parameter = {}
        self.model = None

    @abc.abstractmethod
    def fit(self, X_train, y_train):
        # fit the model with the input data
        # self.model.fit(X, y)
        pass

    @abc.abstractmethod
    def predict(self, X):
        # the one method to be called to perform prediction
        # return(self.model.predict(X))
        pass

    @abc.abstractmethod
    def get_para(self):
        # return the hyperparameters of the model,
        # possibly read from a json-like file or another module;
        # for the CV cases, run cross validation here to decide the hyperparameters

        # if self.parameter == {}:
        #     do something
        # else:
        #     return(self.parameter)
        pass

    @abc.abstractmethod
    def get_model(self):
        # return the model
        pass

-------------------------------------------------------------------------------- /Signal/SignalFunctionsDev/GetSignals.py: --------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Tue Dec 22 21:25:50 2020

@author: 国欣然
"""
from BackTesting.Signal import CrossSectionalModels
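# GetSignals estimates one row of factor returns per date: for each pair of consecutive
# dates it fits the supplied cross-sectional model on the factor exposures of date0
# against the stock returns of date1, and stores the fitted coefficients (Model.params)
# as the factor returns of date0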
import pandas as pd
import json
import sys
import matplotlib.pyplot as plt
sys.path.append("../../")


def GetSignals(model, code_list, factor_list, control_factor_list, start_date, end_date, jsonPath=None, paraDict={}):

    Factor_date_dict = get_daily_factor_preprocessed(code_list, factor_list, control_factor_list, start_date, end_date)
    date_list = list(Factor_date_dict.keys())
    '''
    daily_return = daily_quote.pivot('code','datetime','return')
    '''
    daily_quote = DatabaseReader.get_daily_quote(code_list, start_date, end_date)
    daily_close = daily_quote.pivot('code', 'datetime', 'close')
    daily_return = daily_close.pct_change(axis=1)

    FactorReturn_df = pd.DataFrame(None, index=date_list, columns=factor_list)
    for i in range(len(date_list) - 1):
        date0 = date_list[i]
        date1 = date_list[i + 1]
        factor_date0 = Factor_date_dict[date0]
        return_date1 = daily_return[date1]
        Model = model(jsonPath, paraDict=paraDict)
        Model.fit(factor_date0, return_date1)
        FactorReturn_df.loc[date0] = Model.params
    return FactorReturn_df.dropna()



-------------------------------------------------------------------------------- /Signal/CrossSectionalModels/Base/ModelTest/ModelTest.py: --------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Wed Dec 16 14:10:33 2020

@author: Mengjie Ye
"""
import os
import sys
sys.path.append(os.getcwd())
import abc
# =============================================================================
# try:
#     from .ScoreMethod import scoreMethodDict
# except:
#     from ScoreMethod import scoreMethodDict
# =============================================================================
from .ScoreMethod import scoreMethodDict

class ModelTest():

    def __init__(self, model = None):
        '''
        model: sklearn like model
        '''
        self.model = model
        # self.scoreMethodDict = scoreMethodDict

    def get_score(self, y_true, y_pred, scoreMethod = 'r2', scoreMethodDict = scoreMethodDict, **kwargs):
        '''
        get score of the prediction based on the scoreMethod

        ----

        y_true: array-like of shape (n_samples,) or
                (n_samples, n_outputs)
        y_pred: array-like of shape (n_samples,) or
                (n_samples, n_outputs)
        scoreMethod: metrics to score the model
                'r2', 'mse', 'mae'
        '''

        scoreMethod = scoreMethodDict[scoreMethod]
        return scoreMethod(y_true, y_pred)
-------------------------------------------------------------------------------- /meeting log.md: --------------------------------------------------------------------------------
# Meeting Log

## 2020-12-02

**Discussion**:

Overall framework design and the responsibilities of each module

**Tasks**:

Study class inheritance

## 2020-12-09

**Discussion**: overall framework design, module responsibilities and task assignment, with a focus on the design of the Signal module

**Tasks:**

Implement the methods that get_signal in the Signal module will use:

- linear (叶梦婕)
- tree-based (国欣然)
- network-based (薛岚天)

Along these three lines, implement the CrossSectionalModels (AlphaSignalFromMachineLearning\BackTesting\Signal\CrossSectionalModels) and the FeatureSelectors (AlphaSignalFromMachineLearning\BackTesting\Signal\FeatureSelectors)

## 2020-12-16

**Discussion**:

- progress of last week's tasks
  - linear (OLS, Ridge, Lasso), tree-based and KNN CrossSectionalModels have been written
  - the linear models have been tested
  - ways to obtain parameters: jsonPath, paraDict, CrossValidation

- design of the SignalBase methods

**Tasks**:

- **code review**
- Implement the CrossSectionalModelSklearn and ModelTest classes, so that new CrossSectionalModel classes can simply inherit from these two
  - CrossSectionalModelSklearn: provides the shared init logic and the fit logic (CV or a plain fit)
  - ModelTest: a toolkit for testing a model, e.g. computing scores, plotting, etc.
- Tidy up the folders and give every directory a README
- Implement the SignalBase methods
  - generate_signals (国欣然)
  - train_test_slice (叶梦婕)
  - preprocessing (叶文轩)
  - get_signal (国欣然)
  - smoothing, logger (薛岚天)

## 2020-12-23

**Discussion**:

- progress of last week's tasks
- concrete parameters of train_test_slice and the requirements on the underlying slicing utilities
- concrete steps in preprocessing: implementation of the mask and the design of the corresponding masked-array (MA) class, and the choice of dimensions for the preprocessing methods
- basic framework of generate_signals
- generalizing the smoothing methods, usage of the Logger class

**Tasks**:

- everyone polishes their own code
- improve the underlying slicing utilities and the architecture of the strategy base class (胡逸凡)
- look for factors with good performance, explain why they should perform well and whether the required data will stay available
  - technical indicators: momentum and reversal (国欣然)
  - fundamental indicators: financial-statement data (叶文轩)
  - investor-behaviour indicators (叶梦婕)
  - low-frequency versions of high-frequency factors (薛岚天)
-------------------------------------------------------------------------------- /README.md: --------------------------------------------------------------------------------
# AlphaSignalFromMachineLearning

## Overview
Factor research involves many repetitive, well-defined steps. We therefore split factor-signal generation and strategy backtesting into two separate modules, implement the common research workflow once, and expose interfaces through which the models can be swapped, so that factor-synthesis experiments can be set up quickly.
On the signal side, the core cross-sectional prediction model can be replaced by different predictors, e.g. linear models, tree models or neural-network models, so that different models can be compared within the same pipeline. For more freedom, users can customize the per-period prediction procedure through inheritance and polymorphism; if the predefined prediction windows do not fit (for example, predicting in the last 30 minutes before the daily close, or more complex prediction horizons), the whole class can be inherited and rewritten. This design reflects what we found in practice: we want to swap the usual experimental components with little effort, while keeping enough freedom to change the framework at any time.
On the strategy side we take the signals produced by the Signal module, apply a suitable order-placement strategy, and currently implement a simple long-short portfolio whose PnL is computed to inspect the quality of the signals.

## Module design

This project mainly consists of the following modules:

- Signal module: takes the factors produced by the upstream module, applies a series of operations, and produces the signals passed on to the Strategy module
- Strategy module: takes the signals produced by the Signal module and processes them further into a stock-allocation strategy

### 1. Signal module

This module provides two main functions, generateSignals and smoothing, i.e. producing the raw signals and smoothing them.

#### generateSignals

An iteration; every step of the loop does the following:

- slice factor: slice the factors passed in from the Factor module
- preprocessing: preprocess the sliced factors
  - masking: drop stocks listed for less than one year, ST stocks, stocks at the price limits, etc.
  - winsorization, neutralization, standardization, etc.

- getSignal: analyse the preprocessed factors with various models (linear, tree, ...) to produce the signal

Finally, generateSignals outputs a table that contains the signal for every day in the period

#### smoothing

Smooths the raw signals produced by generateSignals


### 2. Strategy module

Takes the signals output by the Signal module and turns them into a stock-allocation strategy using different methods:
- long-short
- long only
- various portfolio optimizations via cvxopt

## Directory layout
### \director
Director classes that call the Signal module and the Strategy module and describe the whole backtest flow; both single-factor and multi-factor backtests are supported.
### \get data
The loadData helper functions that import the data, plus other scripts that pull data from the databases.
### \signal
The main Signal module
### \Tool
The shared classes Factor and GeneralData, and the globalVars module used as a trick for defining global variables
### \report
Backtest results

## Usage
-------------------------------------------------------------------------------- /Tool/FactorProfileBase.py: --------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Thu May 28 14:18:05 2020

@author: Evan
@reviewer: Robert
"""
from abc import abstractmethod
import abc

class FactorProfileBase(object, metaclass=abc.ABCMeta):
    def __init__(self):
        """
        Parameter
        ----------------------------
        functionName: str
        datasetName_list: list[str]
        parameters_dict: dict()
        """

        self.factorName = ""
        self.functionName = ""
        self.reliedDatasetNames = []
        self.parameters = {}



    @abstractmethod
    def get_relied_dataset(self):
        pass

    def get_factor_kwargs(self, verbose = 0):
        """
        Parameter
        -------------------------------
        verbose: int, 0 or 1; if verbose == 1, the relied dataset is included in the output; otherwise only factorName, functionName, reliedDatasetNames and parameters are returned
        """
        out = dict()
        out.update({'factorName':self.factorName})
        out.update({'functionName':self.functionName})
        out.update({'reliedDatasetNames':self.reliedDatasetNames})
        out.update({'parameters':self.parameters})

        if verbose == 0:
            return(out)
        elif verbose == 1:
            out.update({'dataset':self.dataset})
            return(out)
        else:
            raise ValueError('verbose can only be 0 or 1.')

-------------------------------------------------------------------------------- /Signal/FeatureSelectors/test.py: --------------------------------------------------------------------------------
import numpy as np
from sklearn.datasets import load_boston

def create_dataset():
    boston = load_boston()
    target = np.array(boston.feature_names) == "DIS"
    X = boston.data[:, np.logical_not(target)]
    y = boston.data[:, target].squeeze()

    return X, y


if __name__ == "__main__":


    from CrossSectionalFeatureSelectorLinear import CrossSectionalFeatureSelectionLasso


    # fit and predict
    paraDictLasso = {'fit_intercept':True,'alpha':0.3}
    selector = CrossSectionalFeatureSelectionLasso(paraDict = paraDictLasso)

    print("+++++++ Before training +++++++")
    print(selector.getSelector())
    print(selector.getPara())

    X, y = create_dataset()

    print("+++++++ Training +++++++")
    selector.fit(X, y)

    print("+++++++ After training +++++++")
    print(selector.getSelector())
    print(selector.getPara())

    print("+++++++ Predicting +++++++")
    pred = selector.transform(X)

    # fit_transform directly
    selector = CrossSectionalFeatureSelectionLasso(paraDict = paraDictLasso)

    print("+++++++ Before training +++++++")
    print(selector.getSelector())
    print(selector.getPara())

    X, y = create_dataset()

    print("+++++++ Fit_transform +++++++")
    pred = selector.fit_transform(X, y)

    print("+++++++ After fit_transform +++++++")
    print(selector.getSelector())
    print(selector.getPara())




-------------------------------------------------------------------------------- /GetData/manipulateToTable.py: --------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Fri Dec 4 11:03:08 2020

@author: Evan Hu
"""

import pandas as pd
import os


SKIP_N_ROWS = 2

# C:\Users\eiahb\Documents\MyFiles\WorkThing\tf\02data
DATA_PATH = '..\\..\\..\\02data'
dataFileList = ['AShareEODPrices.pkl', 'Barra_CNE5_factordata.pkl']
#%%
OUTPUT_PATH = 'GetData\\tables\\materialData'

os.makedirs(OUTPUT_PATH, exist_ok=True)

aDataFile = dataFileList[0]
aFilePath = os.path.join(DATA_PATH, aDataFile)

df = pd.read_pickle(aFilePath)
toPivotColumns = df.columns[2:]

for aPivotTarget in toPivotColumns:
    pivotedTable = df.pivot(df.columns[1], df.columns[0], aPivotTarget)
    pivotedTable = pivotedTable.iloc[SKIP_N_ROWS:, :]
    pivotedTable.index = pd.DatetimeIndex(pivotedTable.index)
    pivotedTable.to_csv(os.path.join(OUTPUT_PATH, '{}.csv'.format(aPivotTarget)))


#%%
OUTPUT_PATH = 'GetData\\tables\\barra'
os.makedirs(OUTPUT_PATH, exist_ok=True)
aDataFile = dataFileList[1]
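# --- Barra factor data ---
# the block below repeats the long-to-wide pivot for the Barra file: the pivot
# arguments are swapped relative to the price block above (index = first column,
# columns = second column), presumably because the raw Barra table stores the date
# and the stock code in the opposite order; every value column again becomes one
# date-by-stock CSV, e.g. a long frame with columns (date, code, BETA) pivoted via
# df.pivot('date', 'code', 'BETA') turns into a table indexed by date with one
# column per stock code (BETA is just an illustrative column name)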
39 | aFilePath = os.path.join(DATA_PATH, aDataFile) 40 | df = pd.read_pickle(aFilePath) 41 | df = df.drop(columns = 'Unnamed: 0') 42 | toPivotColumns = df.columns[2:] 43 | aPivotTarget = toPivotColumns[0] 44 | SKIP_N_ROWS = 0 45 | for aPivotTarget in toPivotColumns: 46 | pivotedTable = df.pivot(df.columns[0], df.columns[1], aPivotTarget) 47 | pivotedTable = pivotedTable.iloc[SKIP_N_ROWS:, :] 48 | pivotedTable.index = pd.DatetimeIndex(pivotedTable.index.astype(str)) 49 | pivotedTable.to_csv(os.path.join(OUTPUT_PATH, '{}.csv'.format(aPivotTarget))) 50 | 51 | -------------------------------------------------------------------------------- /Signal/CrossSectionalModels/CrossSectionalModel/CrossSectionalModelKNN.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon Dec 14 14:18:58 2020 4 | 5 | @author: Lantian 6 | """ 7 | 8 | import sys 9 | sys.path.append("../../") 10 | from BackTesting.Signal.CrossSectionalModels.Base.CrossSectionalModelBase import CrossSectionalModelBase 11 | from sklearn.neighbors import KNeighborsClassifier 12 | import json 13 | 14 | class CrossSectionalModelKNN(CrossSectionalModelBase): 15 | def __init__(self, jsonpath = None, para = {}): 16 | #take both jsonpath and the input para into consideration 17 | if jsonpath is not None: 18 | with open (jsonpath, 'r') as f: 19 | para.update(json.load(f)) 20 | with open (jsonpath, 'w') as f: 21 | json.dump(para, f) 22 | self.para = para 23 | self.model = KNeighborsClassifier(**self.para) 24 | 25 | def fit(self,X_train,Y_train): 26 | self.model.fit(X_train,Y_train) 27 | 28 | def predict(self, X): 29 | return self.model.predict(X) 30 | 31 | def get_para(self): 32 | return self.para 33 | 34 | def get_model(self): 35 | return self.model 36 | 37 | def predict_score(self,x,y): 38 | return self.model.score(x,y) 39 | 40 | if __name__=='__main__': 41 | from testCSModel import create_classification_dataset 42 | X_train, y_train, X_test, y_test = create_classification_dataset() 43 | 44 | param = {'weights': 'distance'} 45 | data = {'n_neighbors': 4} 46 | with open ('data.json','w') as f: 47 | json.dump(data,f) 48 | model = CrossSectionalModelKNN(jsonpath = 'data.json', para = param) 49 | model.fit(X_train,y_train) 50 | predy = model.predict(X_test) -------------------------------------------------------------------------------- /GetData/getDataFromMySQLdatabase.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Fri Nov 27 16:44:47 2020 4 | 5 | @author: cy 6 | """ 7 | 8 | import pyodbc 9 | # import numpy as np 10 | # from pandas import DataFrame,Timestamp,Series 11 | import pandas as pd 12 | # import csv 13 | # from WindPy import w 14 | import time 15 | #%% 数据库参数 16 | driver='MySQL ODBC 8.0 Unicode Driver' 17 | host = '192.168.41.56' 18 | port = '3306' 19 | user = 'inforesdep01' 20 | passwd = 'tfyfInfo@1602' 21 | db = 'wind' 22 | #%% 需要的資料表 23 | require_data = { 24 | 'AShareEODPrices':[ 25 | 'S_INFO_WINDCODE', 26 | 'TRADE_DT', 27 | 'S_DQ_OPEN', 28 | 'S_DQ_HIGH', 29 | 'S_DQ_LOW', 30 | 'S_DQ_CLOSE', 31 | 'S_DQ_CHANGE', 32 | 'S_DQ_PCTCHANGE', 33 | 'S_DQ_VOLUME', 34 | 'S_DQ_AMOUNT', 35 | 'S_DQ_ADJPRECLOSE', 36 | 'S_DQ_ADJOPEN', 37 | 'S_DQ_ADJHIGH', 38 | 'S_DQ_ADJLOW', 39 | 'S_DQ_ADJCLOSE', 40 | 'S_DQ_ADJFACTOR', 41 | 'S_DQ_AVGPRICE', 42 | ] 43 | 44 | } 45 | 46 | #%% 数据提取 47 | cnxn = pyodbc.connect('DRIVER='+driver+';SERVER='+host+';DATABASE='+db+';UID='+user+';PWD='+passwd) 48 | 
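# the block below builds the SELECT list programmatically: joining the requested column
# names with ", a." and prefixing the first one through "select a.{}" yields
#     select a.S_INFO_WINDCODE, a.TRADE_DT, a.S_DQ_OPEN, ..., a.S_DQ_AVGPRICE
#     from AShareEODPrices as a
# so all requested fields are pulled in a single query over the pyodbc connection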
columns_list = require_data['AShareEODPrices'] 49 | columns_string = ", a.".join(["{}"]*len(columns_list)).format(*columns_list) 50 | sql=(''' 51 | select a.{} 52 | from AShareEODPrices as a 53 | '''.format(columns_string) 54 | ) 55 | 56 | # aTable = 'AShareEODPrices' 57 | # aColumn = require_data[aTable][0] 58 | # sql=(''' 59 | # select a.S_INFO_WINDCODE, a.TRADE_DT, a.{} 60 | # from {} as a 61 | # '''.format(aColumn, aTable) 62 | # ) 63 | #%% 64 | start_time = time.time() 65 | factor=pd.read_sql(sql, cnxn) 66 | print(time.time() - start_time) 67 | 68 | -------------------------------------------------------------------------------- /Signal/FeatureSelectors/CrossSectionalFeatureSelectorLinear.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Sun Dec 13 19:50:33 2020 4 | 5 | @author: Mengjie Ye 6 | """ 7 | 8 | from CrossSectionalFeatureSelectorBase import CrossSectionalFeatureSelectionBase 9 | from sklearn.linear_model import LinearRegression,Ridge,Lasso 10 | import pandas as pd 11 | import json 12 | 13 | class CrossSectionalFeatureSelectionLasso(CrossSectionalFeatureSelectionBase): 14 | 15 | def __init__(self, jsonPath = None, paraDict = {}): 16 | self.parameter = paraDict 17 | if jsonPath is not None: 18 | with open(jsonPath,'r') as f: 19 | self.parameter = json.load(f) 20 | self.selector = Lasso(**self.parameter) 21 | 22 | def fit(self, X, y): 23 | self.selector.fit(X, y) 24 | 25 | def transform(self, X): 26 | try: 27 | coef = self.selector.coef_ 28 | coefIdx = [i for i in range(len(coef)) if coef[i] != 0] 29 | return X[:,coefIdx] 30 | except : 31 | raise Exception('please fit your selector first!') 32 | 33 | def fit_transform(self, X, y): 34 | self.selector.fit(X, y) 35 | coef = self.selector.coef_ 36 | coefIdx = [i for i in range(len(coef)) if coef[i] != 0] 37 | return X[:,coefIdx] 38 | 39 | def getPara(self): 40 | if self.parameter!={}: 41 | return pd.DataFrame.from_dict(self.parameter, 42 | orient='index', 43 | columns= ['ParaValue']) 44 | else: 45 | print('Hyper parameters are default') 46 | 47 | def getSelector(self): 48 | return self.selector 49 | 50 | -------------------------------------------------------------------------------- /Signal/FeatureSelectors/CrossSectionalFeatureSelectorBase.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Thu Dec 10 11:10:59 2020 4 | 5 | @author: Evan Hu (Yi Fan Hu) 6 | 7 | """ 8 | 9 | import abc 10 | 11 | class CrossSectionalFeatureSelectionBase(object, metaclass=abc.ABCMeta): 12 | @abc.abstractmethod 13 | def __init__(self, **kwargs): 14 | self.parameter = {} 15 | self.selector = None 16 | pass 17 | 18 | @abc.abstractmethod 19 | def fit(self, X_train): 20 | # fit the model with the input data 21 | # self.model.fit(X,y) 22 | pass 23 | 24 | @abc.abstractmethod 25 | def transform(self, X): 26 | # the one method that to be called to perform prediction 27 | # return(self.model.predict(X)) 28 | pass 29 | 30 | @abc.abstractmethod 31 | def fit_transform(self, X): 32 | # the one method that to be called to perform prediction 33 | # return(self.model.predict(X)) 34 | pass 35 | 36 | @abc.abstractmethod 37 | def get_para(self): 38 | # return the hyperparameter of the model 39 | # maybe from another file json-like or another module 40 | # for the cv cases 41 | # do some how cv or things to decide the hyperparameter in this 42 | 43 | # if self.parameter == {}: 44 | # do something 45 | # 
else: 46 | # return(self.parameter) 47 | pass 48 | 49 | @abc.abstractmethod 50 | def get_selector(self): 51 | # return the hyperparameter of the model 52 | # maybe from another file json-like or another module 53 | # for the cv cases 54 | # do some how cv or things to decide the hyperparameter in this 55 | 56 | # if self.parameter == {}: 57 | # do something 58 | # else: 59 | # return(self.parameter) 60 | pass -------------------------------------------------------------------------------- /Signal/SignalBase.py: -------------------------------------------------------------------------------- 1 | from abc import abstractmethod, ABCMeta, abstractstaticmethod 2 | 3 | 4 | class SignalBase(object,metaclass=ABCMeta): 5 | @abstractmethod 6 | def __init__(self): 7 | pass 8 | 9 | @abstractmethod 10 | def initialize(self): 11 | # Do some prepare to generate signals 12 | # load the factors 13 | # update the factors 14 | # prepare some preprocess metetials 15 | pass 16 | 17 | @abstractmethod 18 | def generate_signals(self): 19 | # the main main func of this class 20 | # iter through all time periods and get the signals 21 | # for each iteration: call train_test_slice, preprocessing, get_signal 22 | pass 23 | 24 | @abstractstaticmethod 25 | def train_test_slice(factors, dependents, trainStart, trainEnd, testStart, testEnd): 26 | # split all the factors and dependents to train part and test part according to input, 27 | # if end part isn't passed in, slice one period as default, 28 | # if the test start isn't passed in, 29 | # take the very next time period of trainEnd, 30 | # the input of factors could be a list of factors or just one Factor 31 | pass 32 | 33 | @abstractmethod 34 | def preprocessing(self): 35 | # apply preprocess in here including 36 | # clean up nans and 停牌 ST ect, 37 | # deal with extreme points 38 | # and other stuff 39 | # use np.ma module technic here should be suitable 40 | # please make it modulized and easy to maintain (take cleanUpRules as inputs ect.) 
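        # e.g. an illustrative masked-array sketch: build a boolean mask of the
        # suspended / ST / limit-hit / newly-listed names for the current slice and
        # wrap the factor panel with np.ma.masked_array(factorValues, mask=badStockMask),
        # so that the de-extreme and standardize steps simply skip the masked entries
        # (factorValues and badStockMask are hypothetical names)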
41 | pass 42 | 43 | @abstractmethod 44 | def get_signal(self): 45 | # define how we get signal for one interation 46 | # the obviuos version will be use feature selection and models 47 | # to predict crossSectional expected returns of next period 48 | pass 49 | 50 | @abstractmethod 51 | def smoothing(self): 52 | # smoothing methods defind at the end 53 | # typicaly is the moving average of n days 54 | # use partial function technic here will be suitable 55 | pass -------------------------------------------------------------------------------- /Tool/logger.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Sun Dec 20 20:00:54 2020 4 | 5 | @author: Lantian 6 | """ 7 | import logging 8 | import datetime 9 | import os 10 | 11 | 12 | class Logger(object): 13 | ''' 14 | record the log when u want 15 | u need to create a folder log under the folder tool 16 | problem may be found using spyder, use cmd instead 17 | ''' 18 | def __init__(self, loggerFolder ="log\\", exeFileName="", level=logging.DEBUG): 19 | self.logger = logging.getLogger() 20 | self.logger.setLevel(level) 21 | fmt = '%(asctime)-15s %(filename)s[line:%(lineno)d] - %(levelname)s - %(name)s : %(message)s' 22 | if len(self.logger.handlers) == 0: 23 | formatter = logging.Formatter(fmt=fmt) 24 | streamHandler = logging.StreamHandler() 25 | streamHandler.setFormatter(formatter) 26 | streamHandler.setLevel(logging.INFO) 27 | self.logger.addHandler(streamHandler) 28 | 29 | logRecordFile = os.path.join(loggerFolder, exeFileName+"_"+datetime.datetime.now().strftime("%Y-%m-%d.log")) 30 | fileHandler=logging.FileHandler(logRecordFile, encoding='utf-8') 31 | fileHandler.setFormatter(formatter) 32 | fileHandler.setLevel(logging.DEBUG) 33 | self.logger.addHandler(fileHandler) 34 | 35 | 36 | 37 | def debug(self,msg): 38 | self.logger.debug(msg) 39 | 40 | def info(self, msg): 41 | self.logger.info(msg) 42 | 43 | 44 | def warning(self, msg): 45 | self.logger.warning(msg) 46 | 47 | def error(self, msg): 48 | self.logger.error(msg) 49 | 50 | def critical(self, msg): 51 | self.logger.critical(msg) 52 | 53 | def log(self, level, msg): 54 | self.logger.log(level, msg) 55 | 56 | def setLevel(self, level): 57 | self.logger.setLevel(level) 58 | 59 | def disable(self): 60 | logging.disable(50) 61 | 62 | if __name__=='__main__': 63 | fileName = 'loggerTest' 64 | logger = Logger(fileName) 65 | logger.info('start running '+fileName) 66 | logger.warning('something wrong with '+fileName) 67 | logger.critical('we have to break '+fileName) 68 | logging.shutdown() 69 | 70 | -------------------------------------------------------------------------------- /Signal/CrossSectionalModels/CrossSectionalModel/CrossSectionalModelLinearSklearn.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Fri Dec 11 12:16:04 2020 4 | 5 | @author: Mengjie Ye 6 | """ 7 | # import os 8 | # import sys 9 | from sklearn.linear_model import LinearRegression,Ridge,Lasso 10 | 11 | # ============================================================================= 12 | # abspath = os.path.abspath('.') 13 | # sys.path.append(abspath+'\..') 14 | # try: 15 | # from .Base.CrossSectionalModelSklearn import CrossSectionalModelSklearn 16 | # except: 17 | # from Base.CrossSectionalModelSklearn import CrossSectionalModelSklearn 18 | # ============================================================================= 19 | 20 | from 
BackTesting.Signal.CrossSectionalModels.Base.CrossSectionalModelSklearn import CrossSectionalModelSklearn 21 | 22 | # import matplotlib.pyplot as plt 23 | 24 | #%% OLS 25 | class CrossSectionalModelOLS(CrossSectionalModelSklearn): 26 | 27 | def __init__(self, jsonPath = None, paraDict = {}, json_first = True): 28 | # super(CrossSectionalModelOLS,self).__init__(jsonPath = None, paraDict = {}) 29 | super().__init__(jsonPath = jsonPath, 30 | paraDict = paraDict, 31 | json_first = json_first) 32 | self.model = LinearRegression(**self.parameter) 33 | 34 | 35 | #%% Ridge 36 | class CrossSectionalModelRidge(CrossSectionalModelSklearn): 37 | 38 | def __init__(self,jsonPath = None, paraDict = {}, paraGrid = None, json_first = True): 39 | super().__init__(jsonPath = jsonPath, 40 | paraDict = paraDict, 41 | paraGrid = paraGrid, 42 | json_first = json_first) 43 | self.model = Ridge(**self.parameter) 44 | 45 | #%% Lasso 46 | class CrossSectionalModelLasso(CrossSectionalModelSklearn): 47 | 48 | def __init__(self,jsonPath = None, paraDict = {}, paraGrid = None, json_first = True): 49 | super().__init__(jsonPath = jsonPath, 50 | paraDict = paraDict, 51 | paraGrid = paraGrid, 52 | json_first = json_first) 53 | self.model = Lasso(**self.parameter) 54 | 55 | 56 | #%% 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | -------------------------------------------------------------------------------- /Tool/GeneralDataBase.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Spyder Editor 4 | 5 | This is a temporary script file 6 | """ 7 | import abc 8 | # from abc import ABC 9 | import numpy as np 10 | # import cupy as np 11 | import pandas as pd 12 | 13 | 14 | class GeneralDataBase(object, metaclass=abc.ABCMeta): 15 | @abc.abstractmethod 16 | def __init__(self): 17 | # print('GeneralDataBase __init__') 18 | self.name = '' 19 | self.generalData = np.ndarray(0) 20 | self.timestamp = pd.Index([]) 21 | self.columnNames = [] 22 | self.metadata = {} 23 | 24 | def __str__(self): 25 | if self.generalData.shape[0]>=6: 26 | outputString = "{} : datashape of {} \nhead:\n{}\ntail:\n{}".format(self.name,\ 27 | self.generalData.shape,\ 28 | self.generalData[:6, :],\ 29 | self.generalData[-6:, :]) 30 | else: 31 | outputString = "{} : datashape of {}".format(self.name,\ 32 | self.generalData.shape) 33 | return(outputString) 34 | 35 | def __repr__(self): 36 | if self.generalData.shape[0]>=6: 37 | outputString = "{} : datashape of {} \nhead:\n{}\ntail:\n{}".format(self.name,\ 38 | self.generalData.shape,\ 39 | self.generalData[:6, :],\ 40 | self.generalData[-6:, :]) 41 | else: 42 | outputString = "{} : datashape of {}".format(self.name,\ 43 | self.generalData.shape) 44 | return(outputString) 45 | 46 | @abc.abstractmethod 47 | def get_data_tail(self, n = 10): 48 | pass 49 | 50 | @abc.abstractmethod 51 | def get_data_head(self, n = 10): 52 | pass 53 | 54 | @abc.abstractmethod 55 | def get_data(self, start = None, end = None, get_loc_method = None): 56 | pass 57 | 58 | @abc.abstractmethod 59 | def get_timestamp(self): 60 | pass 61 | 62 | @abc.abstractmethod 63 | def get_columnNames(self): 64 | pass 65 | 66 | 67 | 68 | 69 | -------------------------------------------------------------------------------- /Signal/CrossSectionalModels/CrossSectionalModel/CrossSectionalModelTreeSklearn.py: -------------------------------------------------------------------------------- 1 | from sklearn.tree import DecisionTreeRegressor 2 | from xgboost import XGBRegressor 3 | from 
BackTesting.Signal.CrossSectionalModels.Base.CrossSectionalModelBase import CrossSectionalModelBase
import json
import sys
from sklearn.metrics import r2_score

sys.path.append("../../")
#import matplotlib.pyplot as plt


class CrossSectionalModelDecisionTree(CrossSectionalModelBase):
    # there are two ways to get parameters
    def __init__(self, jsonPath = None, paraDict = {}):
        if jsonPath is not None:
            with open(jsonPath, 'r') as f:
                self.parameter = json.load(f)
        else:
            self.parameter = paraDict

        self.model = DecisionTreeRegressor(**self.parameter)

    def fit(self, X_train, y_train):
        self.model.fit(X_train, y_train)

    def predict(self, X):
        return self.model.predict(X)

    def get_model(self):
        return self.model

    def get_score(self, y_true, y_pre):
        # R2 of the prediction against the realized values
        # (sklearn's model.score expects features and targets, not two target vectors)
        return r2_score(y_true, y_pre)

    def get_para(self):
        return self.parameter


class CrossSectionalModelXGBoost(CrossSectionalModelBase):
    # there are two ways to get parameters
    def __init__(self, jsonPath=None, paraDict={}):
        if jsonPath is not None:
            with open(jsonPath, 'r') as f:
                self.parameter = json.load(f)
        else:
            self.parameter = paraDict

        self.model = XGBRegressor(**self.parameter)

    def fit(self, X_train, y_train):
        self.model.fit(X_train, y_train)

    def predict(self, X):
        return self.model.predict(X)

    def get_model(self):
        return self.model

    def get_score(self, y_true, y_pre):
        # R2 of the prediction against the realized values
        return r2_score(y_true, y_pre)

    def get_para(self):
        return self.parameter


if __name__ == '__main__':
    from testCSModel import create_regression_dataset
    import matplotlib.pyplot as plt
    X_train, y_train, X_test, y_test = create_regression_dataset()

    paraDicts = {'max_depth': 6}

    model = CrossSectionalModelDecisionTree(jsonPath=None, paraDict=paraDicts)
    model.fit(X_train, y_train)
    pred_y = model.predict(X_test)

    plt.scatter(y_test, pred_y)
    plt.title('y_pred vs y_real')
    plt.xlabel('y_real')
    plt.ylabel('y_pred')
    plt.show()



-------------------------------------------------------------------------------- /Signal/CrossSectionalModels/test/testCSFeatureSelector.py: --------------------------------------------------------------------------------
import numpy as np
from sklearn.datasets import load_iris

def create_dataset():
    iris = load_iris()
    X = iris.data

    return X


if __name__ == "__main__":

    from CrossSectionalFeatureSelectorBase import CrossSectionalFeatureSelectionBase

    # from *your python file* import *your selector*
    class MySelector(CrossSectionalFeatureSelectionBase):
        def __init__(self, **kwargs):
            super(MySelector, self).__init__(**kwargs)

        def fit(self, X_train):
            # fit the model with the input data
            # self.model.fit(X,y)
            pass

        def transform(self, X):
            # the one method to be called to perform prediction
            # return(self.model.predict(X))
            pass

        def fit_transform(self, X):
            # the one method to be called to perform prediction
            # return(self.model.predict(X))
            pass

        def getPara(self):
            # return the hyperparameter of the model
            # maybe from another file json-like or another module
            # for the cv cases
            # do some how cv or things to decide the hyperparameter in this

            # if
self.parameter == {}: 42 | # do something 43 | # else: 44 | # return(self.parameter) 45 | pass 46 | 47 | def getSelector(self): 48 | # return the hyperparameter of the model 49 | # maybe from another file json-like or another module 50 | # for the cv cases 51 | # do some how cv or things to decide the hyperparameter in this 52 | 53 | # if self.parameter == {}: 54 | # do something 55 | # else: 56 | # return(self.parameter) 57 | pass 58 | 59 | 60 | # fit and predict 61 | selector = MySelector() 62 | 63 | print("+++++++ Before training +++++++") 64 | print(selector.getSelector()) 65 | print(selector.getPara()) 66 | 67 | X = create_dataset() 68 | 69 | print("+++++++ Training +++++++") 70 | selector.fit(X) 71 | 72 | print("+++++++ After training +++++++") 73 | print(selector.getSelector()) 74 | print(selector.getPara()) 75 | 76 | print("+++++++ Predicting +++++++") 77 | pred = selector.transform(X) 78 | 79 | # fit_transform directly 80 | selector = MySelector() 81 | 82 | print("+++++++ Before training +++++++") 83 | print(selector.getSelector()) 84 | print(selector.getPara()) 85 | 86 | X = create_dataset() 87 | 88 | print("+++++++ Fit_transform +++++++") 89 | pred = selector.fit_transform(X) 90 | 91 | print("+++++++ After fit_transform +++++++") 92 | print(selector.getSelector()) 93 | print(selector.getPara()) 94 | 95 | 96 | 97 | 98 | -------------------------------------------------------------------------------- /Signal/CrossSectionalModels/test/testCSModel.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn.datasets import load_iris, load_boston 3 | 4 | 5 | def create_classification_dataset(): 6 | # features: 4 numeric, predictive attributes 7 | # target: 3 int classes (0, 1, 2) 8 | 9 | iris = load_iris() 10 | X = iris.data 11 | y = iris.target 12 | 13 | n_sample = len(X) 14 | 15 | X_train = X[:int(.7 * n_sample)] 16 | y_train = y[:int(.7 * n_sample)] 17 | X_test = X[int(.7 * n_sample):] 18 | y_test = y[int(.7 * n_sample):] 19 | 20 | return X_train, y_train, X_test, y_test 21 | 22 | 23 | def create_regression_dataset(): 24 | # features: 13 numeric/categorical predictive attributes 25 | # target: 3 int(0, 1, 2) 26 | boston = load_boston() 27 | target = np.array(boston.feature_names) == "DIS" 28 | X = boston.data[:, np.logical_not(target)] 29 | y = boston.data[:, target].squeeze() 30 | 31 | n_sample = len(X) 32 | 33 | X_train = X[:int(.7 * n_sample)] 34 | y_train = y[:int(.7 * n_sample)] 35 | X_test = X[int(.7 * n_sample):] 36 | y_test = y[int(.7 * n_sample):] 37 | 38 | return X_train, y_train, X_test, y_test 39 | 40 | if __name__ == "__main__": 41 | 42 | from CrossSectionalModelBase import CrossSectionalModelBase 43 | 44 | # from *your python file* import *your model* 45 | class MyModel(CrossSectionalModelBase): 46 | def __init__(self, **kwargs): 47 | super(MyModel, self).__init__(**kwargs) 48 | 49 | def fit(self, X_train, y_train): 50 | # fit the model with the input data 51 | # self.model.fit(X,y) 52 | pass 53 | 54 | def predict(self, X): 55 | # the one method that to be called to perform prediction 56 | # return(self.model.predict(X)) 57 | pass 58 | 59 | def getPara(self): 60 | # return the hyperparameter of the model 61 | # maybe from another file json-like or another module 62 | # for the cv cases 63 | # do some how cv or things to decide the hyperparameter in this 64 | 65 | # if self.parameter == {}: 66 | # do something 67 | # else: 68 | # return(self.parameter) 69 | pass 70 | 71 | def getModel(self): 72 | # return 
the model 73 | pass 74 | 75 | 76 | model = MyModel() 77 | 78 | print("+++++++ Before training +++++++") 79 | print(model.getModel()) 80 | print(model.getPara()) 81 | 82 | X_train, y_train, X_test, y_test = create_regression_dataset() # or create_classification_dataset() 83 | 84 | print("+++++++ Training +++++++") 85 | model.fit(X_train, y_train) 86 | 87 | print("+++++++ After training +++++++") 88 | print(model.getModel()) 89 | print(model.getPara()) 90 | 91 | print("+++++++ Predicting +++++++") 92 | pred = model.predict(X_test) 93 | 94 | def mse(y, y_hat): 95 | return np.mean((y-y_hat)**2) 96 | print(mse(y_test, pred)) -------------------------------------------------------------------------------- /Tool/Factor.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Thu May 28 15:11:11 2020 4 | 5 | @author: Evan 6 | @reviewer: Robert 7 | """ 8 | # for convenience to try with spyder 9 | # use python -m Factor is the standard way to call modules main function 10 | #%% 11 | import pickle 12 | try : 13 | from .FactorProfileBase import FactorProfileBase 14 | from .GeneralData import GeneralData 15 | # print("from .FactorProfileBase import FactorProfileBase") 16 | except : 17 | from FactorProfileBase import FactorProfileBase 18 | from GeneralData import GeneralData 19 | # print("from FactorProfileBase import FactorProfileBase") 20 | 21 | 22 | DEFAULT_PROTOCOL = 2 23 | #%% 24 | class Factor(FactorProfileBase, GeneralData): 25 | def __init__(self, name: str = None, generalData = None, timestamp = None, columnNames = None,\ 26 | functionName = None, reliedDatasetNames: dict = None, parameters_dict: dict = None, **kwargs): 27 | 28 | FactorProfileBase.__init__(self) 29 | GeneralData.__init__(self, name, generalData, timestamp, columnNames, **kwargs) 30 | 31 | 32 | self.functionName = functionName 33 | self.reliedDatasetNames = reliedDatasetNames 34 | self.parameters_dict = parameters_dict 35 | 36 | 37 | 38 | def get_relied_dataset(self): 39 | outputDataset = {} 40 | for k, v in self.reliedDatasetNames.items(): 41 | for dataset in v: 42 | try: 43 | outputDataset.update({ 44 | dataset:getattr(globalVars, k)[dataset] 45 | }) 46 | except AttributeError : 47 | print("There is no dataset named {} in global".format(k)) 48 | return(outputDataset) 49 | 50 | def save(self, f, pickle_module=pickle, pickle_protocol=DEFAULT_PROTOCOL): 51 | with open(f, 'wb') as factorfilehandle: 52 | pickle_module.dump(self, factorfilehandle) 53 | 54 | @staticmethod 55 | def load(f, pickle_module=pickle, pickle_protocol=DEFAULT_PROTOCOL): 56 | with open(f, 'rb') as factorfilehandle: 57 | unpickler = pickle.Unpickler(factorfilehandle) 58 | factor = unpickler.load() 59 | return(factor) 60 | 61 | 62 | #%% 63 | if __name__ == '__main__': 64 | 65 | 66 | import pandas as pd 67 | import os 68 | PROJECT_ROOT = 'C:\\Users\\eiahb\\Documents\\MyFiles\\WorkThing\\tf\\01task\\GeneticProgrammingProject\\AlphaSignalFromMachineLearning\\' 69 | 70 | os.chdir(PROJECT_ROOT) 71 | from Tool import globalVars 72 | from GetData import load_data, align_all_to 73 | from Tool import Logger 74 | # loggerFolder = PROJECT_ROOT+"Tool\\log\\" 75 | # logger = Logger(loggerFolder, 'log') 76 | globalVars.initialize() 77 | 78 | # read h5 79 | # 用例 1 80 | load_data("materialData", 81 | os.path.join(os.path.join(PROJECT_ROOT,"data"), "h5") 82 | ) 83 | 84 | 85 | factorName = "close" 86 | functionName = "test" 87 | 88 | reliedDatasetNames= {'materialData':['open', 'close']} 89 | 
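    # reliedDatasetNames maps a globalVars attribute name to the dataset keys this
    # factor depends on; get_relied_dataset() resolves every entry through
    # getattr(globalVars, 'materialData')['open'] etc., so those datasets must already
    # be registered in globalVars before the factor profile is used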
parameters_dict = dict() 90 | 91 | klass = Factor(name = factorName, 92 | generalData = globalVars.materialData['close'], 93 | functionName = functionName, 94 | reliedDatasetNames = reliedDatasetNames, 95 | parameters_dict = parameters_dict 96 | ) 97 | reliedDatasets = klass.get_relied_dataset() 98 | klass.save('try.pickle') 99 | # factor = Factor.load('try.pickle') 100 | factor = Factor.load(os.path.join(os.path.join(PROJECT_ROOT,'data'), 'factors\\try.pickle')) 101 | 102 | 103 | # %% 104 | 105 | 106 | # %% 107 | -------------------------------------------------------------------------------- /Director/systhesisDirector.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Sat Jan 2 13:30:30 2021 4 | 5 | @author: Ye Donggua 6 | 7 | """ 8 | import pandas as pd 9 | from sklearn.metrics import mean_squared_error 10 | 11 | from Tool import globalVars 12 | from Tool.Factor import Factor 13 | from Tool.DataPreProcessing import DeExtremeMethod, ImputeMethod, StandardizeMethod 14 | from GetData.loadData import load_material_data 15 | from BackTesting.Signal.CrossSectionalModels.CrossSectionalModel import CrossSectionalModelXGBoost 16 | 17 | 18 | class SysthesisDirector(object): 19 | def __init__(self, signalGeneratorClass, params, logger=None): 20 | self.signalGeneratorClass = signalGeneratorClass 21 | self.params = params 22 | self.logger = logger 23 | # factors used during back testing 24 | self.factorNameList = params["factorNameList"] 25 | # run的时候会有实例 26 | self.signalGenerator = None 27 | 28 | def run(self): 29 | # initialize the globalVars 30 | globalVars.initialize() 31 | self.logger.info('globalVars is initialized') 32 | 33 | # load material data 34 | loadedDataList = load_material_data() 35 | self.logger.info('material data {0} is loaded'.format(loadedDataList)) 36 | 37 | # load factors 38 | # TODO: should load from some factor.json file latter rather than simply load from material data 39 | # self.factorNameList 40 | # # 给globalVars注册factors(dict) 41 | # # key:factor的名字,value:generalData 42 | 43 | if 'factors' not in globalVars.varList: 44 | globalVars.register('factors', {}) 45 | 46 | # TODO: factorManager 47 | for factorName in self.factorNameList: 48 | 49 | globalVars.factors[factorName] = Factor(factorName, globalVars.materialData[factorName]) 50 | print(factorName, 'is now in globalVars.factors') 51 | self.logger.info("factor {0} is loaded".format(factorName)) 52 | self.logger.info("all factors are loaded") 53 | 54 | # calculate the signal 55 | # 设置计算factors的参数 56 | # 可直接传pipeline 57 | # maskList:mask的nameList 58 | self.logger.info("start to generate signalGenerator") 59 | self.signalGenerator = self.signalGeneratorClass(model=CrossSectionalModelXGBoost, logger=self.logger) 60 | signals = self.signalGenerator.generate_signals(**self.params) 61 | 62 | return signals 63 | 64 | 65 | if __name__ == '__main__': 66 | import logging 67 | import numpy as np 68 | from Tool.logger import Logger 69 | from BackTesting.Signal.SignalSynthesis import SignalSynthesis 70 | 71 | np.warnings.filterwarnings('ignore') 72 | 73 | logger = Logger("SignalDirector") 74 | logger.setLevel(logging.INFO) 75 | params = { 76 | "startDate": pd.to_datetime('20200101'), 77 | "endDate": pd.to_datetime('20200301'), 78 | "panelSize": 3, 79 | "trainTestGap": 1, 80 | "maskList": None, 81 | "deExtremeMethod": DeExtremeMethod.MeanStd(), 82 | "imputeMethod": ImputeMethod.JustMask(), 83 | "standardizeMethod": StandardizeMethod.StandardScaler(), 
        "pipeline": None,
        # "factorNameList": ['close', 'amount', 'free_circulating_market_cap'],
        "factorNameList": ['large_sell_rate', 'large_buy_rate', 'initiative_sell_rate', 'initiative_buy_rate'],
        # params for XGBoost
        "modelParams": {
            "jsonPath": None,
            "paraDict": {
                "n_estimators": 50,
                "random_state": 42,
                "max_depth": 2}
        },
        # metric function for machine learning models
        "metric_func": mean_squared_error,
        # smoothing params
        "smoothing_params": None
    }

    director = SysthesisDirector(SignalSynthesis, params=params, logger=logger)
    director.run()
-------------------------------------------------------------------------------- /Signal/CrossSectionalModels/Base/CrossSectionalModelSklearn.py: --------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Thu Dec 17 00:19:56 2020

@author: Mengjie Ye
"""

# import os
# import sys
# sys.path.append(os.getcwd())
from .CrossSectionalModelBase import CrossSectionalModelBase
from .ModelTest.ModelTest import ModelTest
# =============================================================================
# try:
#     from .CrossSectionalModelBase import CrossSectionalModelBase
#     from .ModelTest.ModelTest import ModelTest
# except:
#     from CrossSectionalModelBase import CrossSectionalModelBase
#     from ModelTest.ModelTest import ModelTest
# =============================================================================
import pandas as pd
import json
from sklearn.model_selection import GridSearchCV

# import matplotlib.pyplot as plt

class CrossSectionalModelSklearn(CrossSectionalModelBase, ModelTest):

    def __init__(self, jsonPath = None,
                 paraDict = {}, paraGrid = None,
                 json_first = True):
        # take both jsonPath and the input paraDict into consideration;
        # json_first = True: when the merged parameters conflict,
        # the value from the json file wins
        self.parameter = paraDict
        if jsonPath is not None:
            if json_first:
                with open(jsonPath, 'r') as f:
                    # !!!
should be a copy of paraDict 40 | # python: call by reference 41 | temp = paraDict.copy() 42 | temp.update(json.load(f)) 43 | # paraDict = temp 44 | # use self.parameter directly rather than paraDict 45 | # avoid id(paraDict) changes 46 | self.parameter = temp 47 | else: 48 | with open(jsonPath, 'r') as f: 49 | temp = json.load(f) 50 | temp.update(paraDict) 51 | # paraDict = temp 52 | self.parameter = temp 53 | # restore the new paraDict into that jsonPath 54 | with open(jsonPath, 'w') as f: 55 | # json.dump(paraDict, f) 56 | json.dump(self.parameter, f) 57 | # self.parameter = paraDict 58 | # define your model when inherit this class 59 | self.model = None 60 | # if we want to use Cross Validation to search for the best para 61 | # input the paraGrid for Grid Search 62 | self.paraGrid = paraGrid 63 | 64 | def fit(self, X_train, y_train, **kwargs): 65 | # use cv to get best model and para 66 | # or just fit the model 67 | # set kwargs for GridSearchCV() 68 | if self.paraGrid is not None: 69 | reg = GridSearchCV( 70 | self.model, self.paraGrid, **kwargs 71 | ) 72 | reg.fit(X_train, y_train) 73 | self.parameter = reg.best_params_ 74 | self.model = reg.best_estimator_ 75 | else: 76 | self.model.fit(X_train, y_train) 77 | 78 | def predict(self, X): 79 | return self.model.predict(X) 80 | 81 | def get_para(self, verbal = False): 82 | if self.parameter!={}: 83 | # verbal: if True: display the parameter as a dataframe 84 | # verbal: if False: return a dict 85 | if verbal is False: 86 | return self.parameter 87 | else: 88 | return pd.DataFrame.from_dict(self.parameter, 89 | orient='index', 90 | columns= ['ParaValue']) 91 | else: 92 | print('Hyper parameters are default') 93 | 94 | def get_model(self): 95 | return self.model 96 | 97 | def get_coef(self): 98 | # get estimated coefficients for the linear regression problem 99 | return self.model.coef_ 100 | 101 | def get_intercept(self): 102 | # get estimated intercept for the linear regression problem 103 | return self.model.intercept_ 104 | 105 | 106 | 107 | 108 | 109 | -------------------------------------------------------------------------------- /Director/singleFactorDirector.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Sat Jan 2 13:30:30 2021 4 | 5 | @author: Ye Donggua 6 | 7 | """ 8 | import pandas as pd 9 | from sklearn.metrics import mean_squared_error 10 | 11 | from Tool import globalVars 12 | from Tool.GeneralData import GeneralData 13 | from Tool.Factor import Factor 14 | from Tool.DataPreProcessing import DeExtremeMethod, ImputeMethod, StandardizeMethod 15 | from GetData.loadData import load_material_data 16 | from BackTesting.Signal.CrossSectionalModels.CrossSectionalModel import CrossSectionalModelXGBoost 17 | 18 | 19 | class SingelFactorDirector(object): 20 | def __init__(self,toTestFactor:GeneralData, signalGeneratorClass, logger=None): 21 | self.signalGeneratorClass = signalGeneratorClass 22 | self.logger = logger 23 | # factors used during back testing 24 | self.factorNameList = [] 25 | # run的时候会有实例 26 | self.signalGenerator = None 27 | self.toTestFactor = toTestFactor 28 | 29 | def run(self): 30 | # initialize the globalVars 31 | globalVars.initialize() 32 | self.logger.info('globalVars is initialized') 33 | 34 | # load material data 35 | loadedDataList = load_material_data() 36 | self.logger.info('material data {0} is loaded'.format(loadedDataList)) 37 | 38 | # load factors 39 | # TODO: should load from some factor.json file latter rather 
than simply load from material data 40 | # self.factorNameList 41 | 42 | # toLoadFactors = ['close', 43 | # 'high', 44 | # 'low', 45 | # 'open', 46 | # 'volume' 47 | # ] 48 | # # 给globalVars注册factors(dict) 49 | # # key:factor的名字,value:generalData 50 | 51 | 52 | if 'factors' not in globalVars.varList: 53 | globalVars.register('factors', {}) 54 | globalVars.factors['{}_factor'.format(self.toTestFactor.name)] = Factor('{}_factor'.format(self.toTestFactor.name),self.toTestFactor) 55 | self.factorNameList.append('{}_factor'.format(self.toTestFactor.name)) 56 | # TODO: factorManager 57 | # for factorName in toLoadFactors: 58 | 59 | # globalVars.factors[factorName] = Factor(factorName, globalVars.materialData[factorName]) 60 | # print(factorName, 'is now in globalVars.factors') 61 | # self.factorNameList.append(factorName) 62 | # self.logger.info("factor {0} is loaded".format(factorName)) 63 | # self.logger.info("all factors are loaded") 64 | 65 | # calculate the signal 66 | # 设置计算factors的参数 67 | # 可直接传pipeline 68 | # maskList:mask的nameList 69 | params = { 70 | "startDate": pd.to_datetime('2020-01-01'), 71 | "endDate": pd.to_datetime('2020-10-31'), 72 | "panelSize": 3, 73 | "trainTestGap": 1, 74 | "maskList": None, 75 | "deExtremeMethod": DeExtremeMethod.MeanStd(), 76 | "imputeMethod": ImputeMethod.JustMask(), 77 | "standardizeMethod": StandardizeMethod.StandardScaler(), 78 | "pipeline": None, 79 | "factorNameList": self.factorNameList, 80 | # params for XGBoost 81 | "modelParams": { 82 | "jsonPath": None, 83 | "paraDict": { 84 | "n_estimators": 50, 85 | "random_state": 42, 86 | "max_depth": 2} 87 | }, 88 | # metric function for machine learning models 89 | "metric_func": mean_squared_error, 90 | # smoothing params 91 | "smoothing_params": None 92 | } 93 | self.logger.info("start to generate signalGenerator") 94 | self.signalGenerator = self.signalGeneratorClass(model=CrossSectionalModelXGBoost, logger=self.logger) 95 | signals = self.signalGenerator.generate_signals(**params) 96 | 97 | return signals 98 | 99 | 100 | if __name__ == '__main__': 101 | import logging 102 | import numpy as np 103 | from Tool.logger import Logger 104 | from BackTesting.Signal.SignalSynthesis import SignalSynthesis 105 | 106 | np.warnings.filterwarnings('ignore') 107 | 108 | logger = Logger("SignalDirector") 109 | logger.setLevel(logging.INFO) 110 | 111 | director = SignalDirector(toTestFactor = ????, SignalSynthesis, logger=logger) 112 | director.run() 113 | -------------------------------------------------------------------------------- /Signal/CrossSectionalModels/test/testModelLinear.py: -------------------------------------------------------------------------------- 1 | 2 | import numpy as np 3 | from sklearn.datasets import load_iris, load_boston 4 | import matplotlib.pyplot as plt 5 | 6 | 7 | def create_classification_dataset(): 8 | # features: 4 numeric, predictive attributes 9 | # target: 3 int classes (0, 1, 2) 10 | 11 | iris = load_iris() 12 | X = iris.data 13 | y = iris.target 14 | 15 | n_sample = len(X) 16 | 17 | X_train = X[:int(.7 * n_sample)] 18 | y_train = y[:int(.7 * n_sample)] 19 | X_test = X[int(.7 * n_sample):] 20 | y_test = y[int(.7 * n_sample):] 21 | 22 | return X_train, y_train, X_test, y_test 23 | 24 | 25 | def create_regression_dataset(): 26 | # features: 13 numeric/categorical predictive attributes 27 | # target: 3 int(0, 1, 2) 28 | boston = load_boston() 29 | target = np.array(boston.feature_names) == "DIS" 30 | X = boston.data[:, np.logical_not(target)] 31 | y = boston.data[:, 
target].squeeze() 32 | 33 | n_sample = len(X) 34 | 35 | X_train = X[:int(.7 * n_sample)] 36 | y_train = y[:int(.7 * n_sample)] 37 | X_test = X[int(.7 * n_sample):] 38 | y_test = y[int(.7 * n_sample):] 39 | 40 | return X_train, y_train, X_test, y_test 41 | 42 | if __name__ == "__main__": 43 | # ============================================================================= 44 | # import sys 45 | # import os 46 | # sys.path.append(os.path.abspath('.') + '\..') 47 | # from CrossSectionalModel.CrossSectionalModelLinearSklearn import CrossSectionalModelOLS, CrossSectionalModelRidge, CrossSectionalModelLasso 48 | # ============================================================================= 49 | from BackTesting.Signal.CrossSectionalModels.CrossSectionalModel.CrossSectionalModelLinearSklearn \ 50 | import CrossSectionalModelOLS, CrossSectionalModelRidge, CrossSectionalModelLasso 51 | 52 | paraDictOLS = {'fit_intercept':True} 53 | paraDictRidge = {'fit_intercept':True,'alpha':0.3} 54 | paraDictLasso = {'fit_intercept':True,'alpha':1} 55 | # paraDictLasso2 = {'fit_intercept':True,'alpha':0.2} 56 | 57 | paraGridRidge = {'alpha':[x for x in np.arange(0.1,2,0.2)]} 58 | paraGridLasso = {'alpha':[x for x in np.arange(0.1,2,0.2)]} 59 | 60 | modelOLS = CrossSectionalModelOLS(paraDict = paraDictOLS) 61 | modelRidge = CrossSectionalModelRidge(paraDict = paraDictRidge) 62 | modelLasso = CrossSectionalModelLasso(paraDict = paraDictLasso) 63 | 64 | modelRidgeJson = CrossSectionalModelRidge(jsonPath = 'paraDictRidge.json') 65 | modelLassoJson = CrossSectionalModelLasso(jsonPath = 'paraDictLasso.json') 66 | # modelLassoJson2 = CrossSectionalModelLasso(jsonPath = 'paraDictLasso2.json',json_first = False) 67 | 68 | modelRidgeCV = CrossSectionalModelRidge(paraGrid = paraGridRidge) 69 | 70 | # ????用完modelLassoJson之后 paraDict变了。。。 71 | # paraDict.update()的问题??? 
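# =============================================================================
# A minimal illustration of the call-by-reference issue discussed around here
# (plain Python behaviour, not code from this repo): a dict passed into a
# constructor and mutated with update() changes for the caller as well, unless
# it is copied first -- which is why CrossSectionalModelSklearn updates a
# paraDict.copy() and keeps the merged result in self.parameter.
#
#   d = {'alpha': 1}
#   def merge_in_place(para): para.update({'alpha': 99}); return para
#   def merge_on_copy(para): temp = para.copy(); temp.update({'alpha': 99}); return temp
#   merge_in_place(d)   # d is now {'alpha': 99} -- the caller's dict was mutated
#   d = {'alpha': 1}
#   merge_on_copy(d)    # d stays {'alpha': 1}
# =============================================================================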
72 | # 先用paraDict.update()的话,会把paraDict存在那个函数里面的感觉,id的问题 73 | # 直接用self.parameter 74 | 75 | # modelLassoJson = CrossSectionalModelLasso(jsonPath = 'paraDictLasso.json', 76 | # paraDict = paraDictLasso2) 77 | modelLassoCV = CrossSectionalModelLasso(paraGrid = paraGridLasso) 78 | 79 | 80 | # 用上面这些做测试 81 | # modelLassoJson2 = CrossSectionalModelLasso(jsonPath = 'paraDictLasso2.json',json_first = False) 82 | # modelLassoCV2 = CrossSectionalModelLasso(paraGrid = paraGridLasso) 83 | model = modelLassoCV 84 | print("+++++++ Before training +++++++") 85 | print(model.get_model()) 86 | print(model.get_para()) 87 | 88 | X_train, y_train, X_test, y_test = create_regression_dataset() # or create_classification_dataset() 89 | 90 | print("+++++++ Training +++++++") 91 | model.fit(X_train, y_train) 92 | 93 | print("+++++++ After training +++++++") 94 | print(model.get_model()) 95 | print(model.get_para(verbal = True)) 96 | 97 | print("+++++++ Predicting +++++++") 98 | pred = model.predict(X_test) 99 | 100 | # ============================================================================= 101 | # def mse(y, y_hat): 102 | # return np.mean((y-y_hat)**2) 103 | # print(mse(y_test, pred)) 104 | # ============================================================================= 105 | print("+++++ Rregression Coefficient ++++++") 106 | print(model.get_coef()) 107 | print(model.get_score(y_test, y_pred=pred, scoreMethod = 'r2')) 108 | 109 | plt.scatter(y_test,pred) 110 | plt.title('y_pred vs y_real') 111 | plt.xlabel('y_real') 112 | plt.ylabel('y_pred') 113 | plt.show() 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | -------------------------------------------------------------------------------- /Signal/SignalFunctionsDev/trainTestSlice.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Sun Dec 20 18:44:20 2020 4 | 5 | @author: Mengjie Ye 6 | """ 7 | 8 | import numpy as np 9 | import pandas as pd 10 | 11 | from datetime import datetime 12 | def train_test_slice(factors, dependents=None, trainStart=None, trainEnd=None, testStart=None, testEnd=None): 13 | # split all the factors and toPredicts to train part and test part according to input, 14 | # if trainStart = trainEnd: the user doesn't use panel data 15 | # slice factors at that date 16 | # else we slice factors from trainStart to trainEnd (closed date interval) 17 | # dependents always sliced by trainEnd 18 | # if dependents is None, return {} (can be used when we slice maskDict) 19 | factorTrainDict, factorTestDict = {}, {} 20 | dependentTrainDict, dependentTestDict = {}, {} 21 | 22 | if trainStart == trainEnd: 23 | for factor in factors: 24 | factorTrainDict[factor.name] = factor.get_data(at = trainEnd) 25 | factorTestDict[factor.name] = factor.get_data(at = testEnd) 26 | else: 27 | for factor in factors: 28 | factorTrainDict[factor.name] = np.vstack((factor.get_data(trainStart, trainEnd), 29 | factor.get_data(at = trainEnd))) 30 | factorTestDict[factor.name] = np.vstack((factor.get_data(testStart, testEnd), 31 | factor.get_data(at = testEnd))) 32 | if dependents is not None: 33 | for name, dependent in dependents.items(): 34 | dependentTrainDict[name] = dependent.get_data(at = trainEnd) 35 | dependentTestDict[name] = dependent.get_data(at = testEnd) 36 | 37 | return factorTrainDict, factorTestDict, dependentTrainDict, dependentTestDict 38 | 39 | 40 | 41 | #%% test 42 | from Tool import globalVars 43 | from GetData.loadData import load_material_data, 
simple_load_factor 44 | globalVars.initialize() 45 | loadedDataList = load_material_data() 46 | 47 | #TODO:use logger latter 48 | shiftedReturn = globalVars.materialData['pctChange'].get_shifted(-1) 49 | # TODO: take the -1 as a para: the period we want to shift 50 | shiftedReturn.metadata.update({'shiftN':-1}) 51 | shiftedReturn.name = 'shiftedReturn' 52 | dependents = {} 53 | factorNameList = [] 54 | allTradeDatetime = shiftedReturn.timestamp 55 | dependents.update({'shiftedReturn':shiftedReturn}) 56 | toLoadFactors = ['close', 57 | 'high', 58 | 'low', 59 | 'open' 60 | ] 61 | 62 | for aFactor in toLoadFactors: 63 | simple_load_factor(aFactor) 64 | factorNameList.append(aFactor) 65 | 66 | # shiftedReturn.metadata.update({'shiftN':-1}) 67 | # shiftedReturn.name = 'shiftedReturn' 68 | # allTradeDatetime = shiftedReturn.timestamp 69 | 70 | # dependents.update({'shiftedReturn':shiftedReturn}) 71 | 72 | 73 | # TODO: should load from some factor.json file latter 74 | ############ this part realy socks 75 | ############ to be modified latter 76 | ############ with nice designed globalVars 77 | ############ the factor in globalVars.factors is a dict 78 | toLoadFactors = ['adj_close', 79 | 'adj_high', 80 | 'adj_low', 81 | 'adj_open' 82 | ] 83 | factorNameList = [] 84 | for aFactor in toLoadFactors: 85 | simple_load_factor(aFactor) 86 | factorNameList.append(aFactor) 87 | 88 | 89 | def get_last_trade_date(date, n=1): 90 | assert allTradeDatetime[allTradeDatetimedate][n], 'index out of range' 95 | return allTradeDatetime[allTradeDatetime>date][n] 96 | 97 | backTestDate = allTradeDatetime[10] 98 | panelSize = 5 99 | trainTestGap = 1 100 | testEnd = backTestDate 101 | testStart = get_last_trade_date(testEnd, panelSize - 1) 102 | trainEnd = get_last_trade_date(testEnd, trainTestGap) 103 | trainStart = get_last_trade_date(trainEnd, panelSize - 1) 104 | 105 | factorTrainDict, factorTestDict, dependentTrainDict, dependentTestDict = train_test_slice( 106 | factors = globalVars.factors.values(), dependents = dependents, 107 | trainStart = trainStart, trainEnd = trainEnd, testStart = testStart, testEnd = testEnd 108 | ) 109 | #%% 110 | 111 | #%% 112 | 113 | #%% 114 | trainStart = '2020-11-30' 115 | trainEnd = '2020-12-02' 116 | # trainStart = pd.to_datetime(trainStart) 117 | # trainEnd = pd.to_datetime(trainEnd) 118 | f1 = globalVars.factors['adj_close'] 119 | trainFactors = f1.get_data(trainStart,None) 120 | 121 | # TODO GeneralData写一个get_data one day之类的东西 122 | 123 | 124 | 125 | -------------------------------------------------------------------------------- /Signal/CrossSectionalModels/CrossSectionalModel/CrossSectionalModelLinearStat.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Fri Dec 11 12:16:04 2020 4 | 5 | @author: Mengjie Ye 6 | """ 7 | 8 | from BackTesting.Signal.CrossSectionalModels.Base.CrossSectionalModelBase import CrossSectionalModelBase 9 | # from sklearn.linear_model import LinearRegression,Ridge,Lasso 10 | 11 | import statsmodels.api as sm 12 | from statsmodels.api import OLS,WLS 13 | import pandas as pd 14 | import numpy as np 15 | import json 16 | from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score 17 | import abc 18 | 19 | class CrossSectionalModelLinear(CrossSectionalModelBase): 20 | 21 | def __init__(self, jsonPath = None, paraDict = {}): 22 | self.parameter = paraDict 23 | if jsonPath is not None: 24 | with open(jsonPath,'r') as f: 25 | self.parameter = json.loads(f) 26 
| self.fit_intercept = self.parameter.get('fit_intercept',True) 27 | self.model = None 28 | 29 | def fit(self, X_train, y_train): 30 | if self.fit_intercept: 31 | X_train = sm.add_constant(X_train) 32 | self.model = OLS(y_train, X_train) 33 | self.res = self.model.fit() 34 | return self.res 35 | 36 | 37 | def predict(self, X): 38 | if self.fit_intercept: 39 | X = sm.add_constant(X) 40 | return self.res.predict(X) 41 | 42 | def get_para(self): 43 | if self.parameter!={}: 44 | return pd.DataFrame.from_dict(self.parameter, 45 | orient='index', 46 | columns= ['ParaValue']) 47 | else: 48 | print('Hyper parameters are default') 49 | 50 | 51 | def get_model(self): 52 | try: 53 | return self.res 54 | except: 55 | print('fit your model first!') 56 | return None 57 | 58 | 59 | def get_score(self, y_real, **kwargs): 60 | ''' 61 | get score of the prediction based on the scoreMethod 62 | 63 | ---- 64 | 65 | y: y_real 66 | kwargs: 67 | scoreMethod: str 68 | 'r2': r2_score 69 | 'mse': mean_squared_error 70 | 'mae': mean_absolute_error 71 | X: ndarray, input X to get y_pred 72 | y_pred: input y_pred directly 73 | ''' 74 | if 'y_pred' in kwargs.keys(): 75 | y_pred = kwargs['y_pred'] 76 | elif 'X' in kwargs.keys(): 77 | y_pred = self.res.predict(kwargs['X']) 78 | 79 | 80 | def r2(y_real, y_pred): 81 | return r2_score(y_real, y_pred) 82 | def mse(y_real, y_pred): 83 | return mean_squared_error(y_real, y_pred) 84 | def mae(y_real, y_pred): 85 | return mean_absolute_error(y_real, y_pred) 86 | 87 | methodDict = {'r2':r2, 'mse':mse, 'mae':mae} 88 | scoreMethod = kwargs.get('scoreMethod','r2') 89 | scoreMethod = methodDict[scoreMethod] 90 | return scoreMethod(y_real, y_pred) 91 | 92 | def get_coef(self): 93 | ''' 94 | get estimated coefficients for the linear regression problem 95 | ''' 96 | return self.res.params 97 | 98 | def get_model_summary(self): 99 | ''' 100 | get summary of the model 101 | 102 | return 103 | ---- 104 | summary of model: coef, pvalue, t-statistics, R2, R2_adj... 105 | ''' 106 | return self.res.summary() 107 | 108 | 109 | #%% 110 | class CrossSectionalModelOLS(CrossSectionalModelLinear): 111 | pass 112 | 113 | 114 | #%% 115 | class CrossSectionalModelRidge(CrossSectionalModelLinear): 116 | 117 | def __init__(self, jsonPath = None, paraDict = {}): 118 | self.parameter = paraDict 119 | if jsonPath is not None: 120 | with open(jsonPath,'r') as f: 121 | self.parameter = json.loads(f) 122 | self.fit_intercept = self.parameter.get('fit_intercept',True) 123 | self.model = None 124 | 125 | def fit(self, X, y, **kwargs): 126 | if self.fit_intercept: 127 | X = sm.add_constant(X) 128 | try: 129 | self.alpha = self.parameter['alpha'] 130 | except: 131 | raise Exception('cannot find alpha! please set the penalty of Ridge') 132 | else: 133 | self.model = OLS(y, X) 134 | self.res = self.model.fit_regularized(alpha = self.alpha, L1_wt = 0, **kwargs) 135 | 136 | #%% 137 | class CrossSectionalModelLasso(CrossSectionalModelBase): 138 | 139 | def fit(self, X, y, **kwargs): 140 | if self.fit_intercept: 141 | X = sm.add_constant(X) 142 | try: 143 | self.alpha = kwargs['alpha'] 144 | except: 145 | raise Exception('cannot find alpha! 
please set the penalty of Lasso') 146 | else: 147 | self.model = OLS(y, X) 148 | self.res = self.model.fit_regularized(alpha = self.alpha, L1_wt = 1, **kwargs) 149 | 150 | 151 | 152 | 153 | 154 | 155 | -------------------------------------------------------------------------------- /Tool/GeneralData.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Tue Dec 1 15:24:11 2020 4 | 5 | @author: Evan 6 | """ 7 | # for convenience to try with spyder 8 | # use python -m Factor is the standard way to call modules main function 9 | 10 | try: 11 | from Tool.GeneralDataBase import GeneralDataBase 12 | # print('from Tool.GeneralDataBase import GeneralDataBase') 13 | except Exception: 14 | from GeneralDataBase import GeneralDataBase 15 | # print('from GeneralDataBase import GeneralDataBase') 16 | 17 | 18 | 19 | import copy 20 | import numpy as np 21 | import pandas as pd 22 | 23 | class GeneralData(GeneralDataBase): 24 | def __init__(self, name = None, generalData = None, timestamp = None, columnNames = None, **kwargs): 25 | GeneralDataBase.__init__(self) 26 | 27 | # print('GeneralData __init__') 28 | 29 | self.name = name 30 | if generalData is None: 31 | if 'filePath' in kwargs: 32 | try: 33 | filePath = kwargs['filePath'] 34 | generalData = pd.read_csv(filePath, index_col=0) 35 | if '300186.SZ' in generalData.columns: 36 | generalData.drop(columns='300186.SZ', inplace=True) # Todo: 数据bug处理完后删掉 37 | except FileNotFoundError as fnfe: 38 | print(fnfe) 39 | print("please check the file path is currect") 40 | raise fnfe 41 | except Exception as e: 42 | print(e) 43 | print('We have a filePath but we can not load the generalData to pandas df structure') 44 | raise e 45 | 46 | if isinstance(generalData, pd.DataFrame): 47 | try: 48 | self.columnNames = generalData.columns 49 | if 'indexFormat' in kwargs: 50 | indexFormat = kwargs['indexFormat'] 51 | generalData.index = pd.to_datetime(generalData.index, format = indexFormat) 52 | self.timestamp = pd.DatetimeIndex(generalData.index) 53 | self.timestamp = pd.DatetimeIndex(generalData.index.astype(str)) 54 | self.generalData = generalData.to_numpy() 55 | except Exception as e: 56 | raise(e) 57 | 58 | elif isinstance(generalData, np.ndarray): 59 | assert timestamp is not None and columnNames is not None 60 | self.generalData = generalData 61 | 62 | else: 63 | # isinstance(generalData, GeneralData): 64 | assert timestamp is None and columnNames is None 65 | try: 66 | self.generalData = generalData.generalData 67 | self.columnNames = generalData.columnNames 68 | self.timestamp = generalData.timestamp 69 | if self.name == None: 70 | self.name = generalData.name 71 | except: 72 | raise TypeError('Must be np ndarray or pandas DataFrame or GeneralData like') 73 | 74 | if timestamp is not None: 75 | assert len(timestamp) == self.generalData.shape[0], 'the timestammp should \ 76 | match the generalData size' 77 | self.timestamp = timestamp 78 | 79 | 80 | if columnNames is not None: 81 | assert len(columnNames) == self.generalData.shape[1], 'the columnNames should \ 82 | match the generalData size' 83 | self.columnNames = columnNames 84 | 85 | self.metadata.update({k:v for k, v in kwargs.items()}) 86 | 87 | def get_data_tail(self, n = 10): 88 | return(self.generalData[-n:, :]) 89 | 90 | def get_data_head(self, n = 10): 91 | return(self.generalData[:n, :]) 92 | 93 | def get_data(self, start = None, end = None, at = None, get_loc_method = 'ffill'): 94 | if at is None: 95 | if start is None: 
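# (usage note, illustrative rather than shipped code: get_data(at='2018-03-09') returns the
#  single cross-sectional row for that date, while get_data(start, end) slices rows
#  [start_loc, end_loc) -- the end row itself is excluded unless the two locations coincide,
#  which is why callers such as train_test_slice stack get_data(start, end) with
#  get_data(at=end) to obtain a closed interval)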
96 | start = self.timestamp[0] 97 | if end is None: 98 | end = self.timestamp[-1] 99 | else: 100 | start = at 101 | end = at 102 | 103 | if not isinstance(start, int): 104 | try: 105 | start_loc = self.timestamp.get_loc(start, get_loc_method) 106 | except KeyError as ke: 107 | print(''' 108 | The start time is out of range or not in the index when you are not using default get_loc_method 109 | ''') 110 | raise(ke) 111 | except Exception as e: 112 | raise e 113 | else: 114 | start_loc = start 115 | 116 | if not isinstance(end, int): 117 | try: 118 | end_loc = self.timestamp.get_loc(end, get_loc_method) 119 | except KeyError as ke: 120 | print(''' 121 | The end time is out of range or not in the index when you are not using default get_loc_method. 122 | ''') 123 | raise(ke) 124 | else: 125 | end_loc = end 126 | 127 | assert (isinstance(start_loc, int)), "The input type of start and end should be int of loc \ 128 | or datetime or str that datetimeIndex accessible" 129 | assert (isinstance(end_loc, int)), "The input type of start and end should be int of loc \ 130 | or datetime or str that datetimeIndex accessible" 131 | 132 | if start_loc == end_loc: 133 | return(self.generalData[start_loc, :]) 134 | else: 135 | return(self.generalData[start_loc:end_loc, :]) 136 | 137 | def get_columnNames(self): 138 | return(self.columnNames) 139 | 140 | def get_timestamp(self): 141 | return(self.timestamp) 142 | 143 | def is_same_shape(self, anotherCls): 144 | assert isinstance(anotherCls, GeneralData) 145 | return(self.generalData.shape == anotherCls.generalData.shape) 146 | 147 | def get_shifted_generalData(self, shiftN): 148 | toOutput = self.generalData.copy() 149 | if shiftN >= 0: 150 | toOutput[-shiftN:, :] = np.nan 151 | else: 152 | toOutput[:-shiftN, :] = np.nan 153 | shifted = np.roll(toOutput, shiftN, axis = 0) 154 | return(shifted) 155 | 156 | def get_shifted(self, shiftN): 157 | toOutput = copy.copy(self) 158 | toOutput.generalData = toOutput.get_shifted_generalData(shiftN) 159 | return(toOutput) 160 | 161 | def to_DataFrame(self): 162 | return(pd.DataFrame(self.generalData, index=self.timestamp, columns=self.columnNames)) 163 | 164 | 165 | def align_with(self, alignTo): 166 | data_df = self.to_DataFrame() 167 | reindexed = data_df.reindex(index=alignTo.timestamp, columns=alignTo.columnNames) 168 | toReturn = GeneralData(self.name, generalData=reindexed) 169 | return(toReturn) 170 | 171 | def __str__(self): 172 | return(str(self.to_DataFrame())) 173 | 174 | def __repr__(self): 175 | return(str(self.to_DataFrame())) 176 | 177 | if __name__ == "__main__": 178 | DATA_PATH = 'C:/Users/eiahb/Documents/MyFiles/WorkThing/tf/01task/GeneticProgrammingProject/AlphaSignalFromMachineLearning\\GetData/tables//materialData//S_DQ_ADJOPEN.csv' 179 | klass = GeneralData(name = 'adj_open', filePath = DATA_PATH, indexFormat = "%Y%m%d") 180 | # klass.get_data('2005', '2014-01-06') 181 | isinstance(klass, GeneralData) 182 | 183 | 184 | #%% how to get data of a single slice 185 | klass.get_data(at = '2018-03-09') 186 | klass.get_data(start = '2018-03-09', end = '2018-03-09') 187 | 188 | 189 | 190 | 191 | 192 | 193 | 194 | 195 | 196 | 197 | 198 | 199 | 200 | 201 | 202 | 203 | 204 | 205 | 206 | 207 | 208 | 209 | 210 | 211 | 212 | 213 | 214 | 215 | 216 | 217 | 218 | 219 | 220 | -------------------------------------------------------------------------------- /GetData/backtestDataApi.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import datetime 3 | 4 | 
from GetData.backtestDatabase import BacktestDatabase as DatabaseReader 5 | 6 | 7 | class BacktestDataApi: 8 | @classmethod 9 | def get_universe(cls, bk, date, factor_list=None): 10 | """ 11 | 板块名称字符串,支持 “沪深300”、“中证500” 12 | :param bk: 版块名称,比如 沪深300 13 | :param date: 指定的日期, 比如 2018-11-30 目前只有每个月最后一个交易日的数据,其他时间返回的数据是None 14 | :param factor_list: 暂时不支持指定,默认返回全部的factor列表 15 | :return: 16 | """ 17 | # 获取最近交易日的日期,自动过滤非交易日 18 | date = cls.get_nearest_trading_date(date) 19 | stock_info = DatabaseReader.get_stock_info(None, date, date, factor_list) 20 | # 根据 stock_code_list 获取指定日期的因子数据 21 | if bk in ["沪深300", "hs300", "300", "HS", "000300.SH"]: 22 | df = DatabaseReader.get_index_weight("000300.SH", start_date=date, end_date=date) 23 | selected_df = stock_info[stock_info['code'].isin(list(df['code']))] 24 | elif bk in ["中证500", "zz500", "500", "000905.SH"]: 25 | df = DatabaseReader.get_index_weight("000905.SH", start_date=date, end_date=date) 26 | selected_df = stock_info[stock_info['code'].isin(list(df['code']))] 27 | elif bk == '全A': 28 | selected_df = stock_info 29 | else: 30 | raise ValueError("输入的版块名称错误 :{0}".format(bk)) 31 | selected_df.set_index('code', inplace=True) 32 | return selected_df 33 | 34 | @classmethod 35 | def get_index_weight(cls, index_id, date, factor_list=None): 36 | """ 37 | 获取指数在某一天的权重信息 38 | :param date: 指定的日期,date 输入格式为 yyyy-mm-dd 的方式 39 | :param index_id: 对应的指数代码 40 | :param factor_list: 具体返回哪些因子列表 41 | :return: 42 | """ 43 | pd_data = DatabaseReader.get_index_weight(index_id, start_date=date, end_date=date).copy(deep=True) 44 | if pd_data is not None: 45 | pd_data.set_index("code", inplace=True) 46 | # 列名重命名 权重 isST 重命名为 ST 保持和股票因子 ST数据一致 47 | pd_data.rename(columns={"weight": "权重"}, inplace=True) 48 | 49 | weight_data = pd_data.loc[:, ["权重"]] 50 | # 然后获取因子数据 51 | stock_info = DatabaseReader.get_stock_info(weight_data.index.tolist(), date, date, factor_list) 52 | stock_info.set_index('code', inplace=True) 53 | pd_data = pd.concat([weight_data, stock_info], axis=1, sort=False) 54 | return pd_data 55 | 56 | @classmethod 57 | def get_index_weight_by_date_list(cls, index_id, date_list, factor_list=None): 58 | """ 59 | 获取指数在某个时间列表的的权重信息 60 | :param date_list: 指定日期组成的list,比如["2005-04-29", "2005-05-09"] 61 | :param index_id: 对应的指数代码 62 | :param factor_list: 具体返回哪些因子列表 63 | :return: 64 | """ 65 | result = dict() 66 | for date in date_list: 67 | pd_data = DatabaseReader.get_index_weight(index_id, start_date=date, end_date=date).copy(deep=True) 68 | if pd_data is not None: 69 | pd_data.set_index("code", inplace=True) 70 | pd_data.rename(columns={"weight": "权重"}, inplace=True) 71 | weight_data = pd_data.loc[:, ["权重"]] 72 | # 然后获取因子数据 73 | stock_info = DatabaseReader.get_stock_info(weight_data.index.tolist(), date, date, factor_list) 74 | stock_info.set_index('code', inplace=True) 75 | pd_data = pd.concat([weight_data, stock_info], axis=1, sort=False) 76 | result[pd.to_datetime(date).strftime("%Y-%m-%d")] = pd_data 77 | else: 78 | result[pd.to_datetime(date).strftime("%Y-%m-%d")] = None 79 | 80 | return result 81 | 82 | @classmethod 83 | def get_period_return(cls, code_list, start_date=None, end_date=None): 84 | """ 85 | 返回某个股票,或者某列股票,在某个时间区间段内的收益, 注意是区间段的收益,不是时间收益率序列 86 | 87 | 股票或指数的字符串代码,或者代码的list,指数行情目前只需要 000300.SH,000016.SH 和 399905.SZ,以及申万一级行业指数 88 | 如果传入的是单个代码,返回一个浮点数。如果传入的是一个list,返回一个series,index是股票代码,值是收益率。 89 | 收益率= enddate_close/startdate_close-1 90 | (采用后复权的价格) 91 | :param stock_code: 股票代码列表 92 | :param start_date: 开始日期 93 | :param end_date: 结束日期 94 | :return: 95 | """ 
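# Usage sketch (the same call as in the __main__ demo at the bottom of this file):
#   ret = BacktestDataApi.get_period_return(['000001.SZ', '000002.SZ'], '2020-05-20', '2020-06-20')
#   # -> a Series named 收益率, indexed by code, equal to close(end_date)/close(start_date) - 1
#   #    computed on adjusted prices, with both dates shifted to the nearest trading day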
96 | # 对于时间的预处理,获取最近的交易日期 97 | # 获取所有可 交易日期序列 98 | start_date = cls.get_nearest_trading_date(start_date) 99 | end_date = cls.get_nearest_trading_date(end_date) 100 | if start_date > end_date: 101 | raise ValueError("输入的开始日期{0} 大于 结束日期{1}".format(start_date, end_date)) 102 | # 开始日期 103 | start_date_data = DatabaseReader.get_daily_quote(code_list, start_date, start_date).set_index('code') 104 | # 结束日期 105 | end_date_data = DatabaseReader.get_daily_quote(code_list, end_date, end_date).set_index('code') 106 | # 两者拼接 107 | data = pd.concat([start_date_data['close'], end_date_data['close']], 108 | axis=1, sort=False) 109 | data = data.loc[code_list] 110 | data.columns = ["起始日", "结束日"] 111 | ret = data.loc[:, "结束日"] / data.loc[:, "起始日"] - 1 112 | ret.name = "收益率" 113 | return ret 114 | 115 | @classmethod 116 | def get_stock_return_timeseries(cls, stock_code, start_date, ndays=1): 117 | """ 118 | 基于某个日期,往前或者往后推,去计算某个时间段的收益率时间日期序列 119 | :param stock_code: 股票或指数字符串代码,或者是代码的list 120 | :param start_date: 指定某个日期 121 | :param ndays: 交易日天数,最大支持220,最小支持1. 正数表示往未来看nday,负数表示往历史看nday 122 | 如果传入的stock_code是list,返回一个DataFrame,index是交易日期,columns是股票代码。 123 | 值是每个股票在历史/未来ndays天的收益率序列,行数等于ndays。如果股票当天停牌,则收益率值为nan。 124 | 在更新版的数据库中,会直接采用 涨跌幅2 这个字段,现在暂时使用其他方法来解决 125 | :return: 126 | """ 127 | # 获取开始时间段 128 | start_date = cls.get_nearest_trading_date(start_date) 129 | # 获取所有可 交易日期序列 130 | trading_days = DatabaseReader.get_all_trade_days() 131 | # 开始日期 132 | # 寻找开始节点 133 | temp = trading_days <= start_date 134 | idx = temp.sum() 135 | trading_days_list = trading_days.tolist() 136 | # 从零开始,比如 idx是1,那么30个之后是30, 其实是 1+ndays-1 然后是从零开始,所以需要减去2 137 | end_index = max(idx, idx + ndays-2) 138 | end_date = trading_days_list[end_index] 139 | end_date = pd.to_datetime(end_date) 140 | # 从数据库获取对应的数据 141 | pd_data = DatabaseReader.get_daily_quote(stock_code, start_date=start_date, end_date=end_date) 142 | return pd_data 143 | 144 | @classmethod 145 | def get_period_quote_timeseries(cls, code_list, start_date, end_date): 146 | df = DatabaseReader.get_daily_quote(code_list, start_date, end_date) 147 | ret_df = df.pivot(index='datetime', columns='code', values='pctChange') 148 | return ret_df 149 | 150 | @classmethod 151 | def get_nearest_trading_date(cls, input_date): 152 | if input_date is None: 153 | input_date = datetime.datetime.today() 154 | trading_days = DatabaseReader.get_all_trade_days() 155 | # 获取距离今日最近交易日的数据 ,如果今天不是交易日,那么看上一个交易日是否是交易日 156 | input_date = pd.to_datetime(input_date) 157 | while input_date not in trading_days: 158 | input_date -= datetime.timedelta(days=1) 159 | return input_date 160 | 161 | @classmethod 162 | def get_all_trade_days(cls, start_date=None, end_date=None): 163 | return DatabaseReader.get_all_trade_days(start_date, end_date) 164 | 165 | 166 | if __name__ == "__main__": 167 | import time 168 | from GetData.loadData import load_material_data 169 | from Tool import globalVars 170 | 171 | globalVars.initialize() 172 | load_material_data() 173 | 174 | s = time.time() 175 | print(BacktestDataApi.get_universe('沪深300', '2020-05-20', ['close', ])) 176 | print(time.time() - s) 177 | 178 | s = time.time() 179 | print(BacktestDataApi.get_index_weight('000300.SH', '2020-05-20', ['close', ])) 180 | print(time.time() - s) 181 | 182 | s = time.time() 183 | print(BacktestDataApi.get_index_weight_by_date_list('000300.SH', ['2020-05-20', '2020-05-21'], 184 | ['close', 'circulating_market_cap'])) 185 | print(time.time() - s) 186 | 187 | s = time.time() 188 | print(BacktestDataApi.get_period_return(['000001.SZ', 
'000002.SZ'], '2020-05-20', '2020-06-20')) 189 | print(time.time() - s) 190 | 191 | s = time.time() 192 | print(BacktestDataApi.get_stock_return_timeseries(['000001.SZ', '000002.SZ'], '2020-05-20', 30)) 193 | print(time.time() - s) 194 | 195 | s = time.time() 196 | print(BacktestDataApi.get_nearest_trading_date('2020-06-20')) 197 | print(time.time() - s) 198 | 199 | -------------------------------------------------------------------------------- /Signal/SignalFunctionsDev/GenerateSignal.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Fri Dec 18 14:21:37 2020 4 | 5 | @author: Evan Hu (Yi Fan Hu) 6 | 7 | """ 8 | 9 | import numpy as np 10 | import numpy.ma as ma 11 | from sklearn.pipeline import Pipeline 12 | from sklearn.base import TransformerMixin 13 | from abc import abstractmethod, ABCMeta, abstractstaticmethod 14 | 15 | from Tool import globalVars 16 | from Tool.GeneralData import GeneralData 17 | from Tool.DataPreProcessing import * 18 | from GetData.loadData import load_material_data, simple_load_factor 19 | from BackTesting.Signal.SignalBase import SignalBase 20 | from CrossSectionalModels import CrossSectionalModel 21 | # %% 22 | 23 | class SignalFactorFromCSV(SignalBase, metaclass=ABCMeta): 24 | 25 | def __init__(self): 26 | self.rawSignals = GeneralData('rawSignals') 27 | self.factorNameList = [] 28 | self.dependents = {} 29 | self.allTradeDatetime = [] 30 | self.metadata = {} 31 | 32 | def initialize(self): 33 | globalVars.initialize() 34 | loadedDataList = load_material_data() 35 | 36 | # TODO:use logger latter 37 | print('We now have {} in our globalVar now'.format(loadedDataList)) 38 | 39 | try: 40 | shiftedReturn = globalVars.materialData['pctChange'].get_shifted(-1) 41 | except AttributeError as ae: 42 | print(ae) 43 | raise AttributeError('There\'s no pctChange in globalVars') 44 | except Exception as e: 45 | print(e) 46 | raise 47 | 48 | shiftedReturn.metadata.update({'shiftN': -1}) 49 | shiftedReturn.name = 'shiftedReturn' 50 | self.allTradeDatetime = shiftedReturn.timestamp 51 | self.dependents.update({'shiftedReturn': shiftedReturn}) 52 | 53 | # TODO: should load from some factor.json file latter 54 | ############ this part realy socks 55 | ############ to be modified latter 56 | ############ with nice designed globalVars 57 | ############ the factor in globalVars.factors is a dict 58 | toLoadFactors = ['close', 59 | 'high', 60 | 'low', 61 | 'open' 62 | ] 63 | 64 | for aFactor in toLoadFactors: 65 | simple_load_factor(aFactor) 66 | self.factorNameList.append(aFactor) 67 | 68 | @abstractmethod 69 | def generate_signals(self,*,model=None,panelsize=None,trainTestGap=1, 70 | prestart=None, preend=None,coverlist=['ST','suspended'], 71 | data_dict, control_dict, 72 | imputeMethod=ImputeMethod.JustMask(), 73 | standardizeMethod=StandardizeMethod.MinMaxScaler(feature_range=(0, 1)), 74 | deExtremeMethod=DeExtremeMethod.Quantile(method='clip')): 75 | # the main main func of this class 76 | # iter through all time periods and get the signals 77 | # for each iteration: call train_test_slice, preprocessing, get_signal 78 | 79 | if prestart is None and preend is None: 80 | alldate = self.allTradeDatetime 81 | else: 82 | alldate = self.allTradeDatetime[prestart,preend+1] 83 | if panelsize is not None and trainTestGap is not None: 84 | for date in alldate: 85 | testStart = getLastTradeDate 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | @abstractmethod 95 | def train_test_slice(factors, dependents, 96 
| trainStart, trainEnd, 97 | testStart, testEnd): 98 | 99 | factorTrainDict, factorTestDict = {}, {} 100 | dependentTrainDict, dependentTestDict = {}, {} 101 | 102 | if trainStart is None: 103 | for factor in factors: 104 | factorTrainDict[factor.name] = factor.get_data(at=trainEnd) 105 | factorTestDict[factor.name] = factor.get_data(at=testEnd) 106 | else: 107 | for factor in factors: 108 | factorTrainDict[factor.name] = np.vstack(factor.get_data(trainStart, trainEnd), 109 | factor.get_data(at=trainEnd)) 110 | factorTestDict[factor.name] = np.vstack(factor.get_data(testStart, testEnd), 111 | factor.get_data(at=testEnd)) 112 | 113 | for dependent in dependents: 114 | dependentTrainDict[dependent.name] = dependent.get_data(at=trainEnd) 115 | dependentTestDict[dependent.name] = dependent.get_data(at=testEnd) 116 | # split all the factors and toPredicts to train part and test part according to input, 117 | # if end part isn't passed in, slice one period as default, 118 | # if the test start isn't passed in, 119 | # take the very next time period of trainEnd, 120 | # the input of factors could be a list of factors or just one Factor 121 | return factorTrainDict, factorTestDict, dependentTrainDict, dependentTestDict 122 | 123 | @staticmethod 124 | def preprocessing(dataDict, maskDict, *, deExtremeMethod=None, imputeMethod=None, 125 | standardizeMethod=None, pipeline=None): 126 | # generating the mask 127 | mask = None 128 | for _, maskData in maskDict.items(): 129 | if mask is None: 130 | mask = np.zeros(maskData.shape) 131 | mask = np.logical_or(mask, maskData) 132 | 133 | # generating the pipeline 134 | if pipeline is not None: 135 | assert (isinstance(pipeline, Pipeline)) 136 | else: 137 | l = [] 138 | if deExtremeMethod is not None: 139 | assert (isinstance(deExtremeMethod, TransformerMixin)) 140 | l.append(("de extreme", deExtremeMethod)) 141 | if imputeMethod is not None: 142 | assert (isinstance(imputeMethod, TransformerMixin)) 143 | l.append(("impute", imputeMethod)) 144 | if standardizeMethod is not None: 145 | assert (isinstance(standardizeMethod, TransformerMixin)) 146 | l.append(("standardize", standardizeMethod)) 147 | l.append(('passthrough', 'passthrough')) 148 | pipeline = Pipeline(l) 149 | 150 | # processing the data 151 | processedDataDict = dict() 152 | for dataField, data in dataDict.items(): 153 | for _, maskData in maskDict.items(): 154 | assert (data.shape == maskData.shape) 155 | maskedData = ma.masked_array(data, mask=mask) 156 | maskedData = pipeline.fit_transform(maskedData.T, None).T # transforming horizontally(stocks-level) 157 | 158 | # check the masked proportion 159 | # minNoMaskProportion = min(1 - np.mean(maskedData.mask, axis=0)) 160 | # if minNoMaskProportion < maskThreshold: 161 | # raise ValueError("The remained proportion of data {} is {:.2%} ," 162 | # "lower than the setting threshold {:.2%}" 163 | # .format(dataField, minNoMaskProportion, maskThreshold)) 164 | processedDataDict[dataField] = maskedData 165 | 166 | return processedDataDict 167 | 168 | @abstractmethod 169 | # define how we get signal for one interation 170 | # the obviuos version will be use feature selection and models 171 | # to predict crossSectional expected returns of next perio 172 | def get_signal(self,X_train, y_train, X_test, y_test,model=None): 173 | if model is None: 174 | premodel = CrossSectionalModel.CrossSectionalModelDecisionTree(jsonPath=None, paraDict=paraDicts) 175 | else: 176 | premodel = CrossSectionalModel.model(jsonPath=None, paraDict=paraDicts) 177 | 178 | 
premodel.fit(X_train, y_train) 179 | pred_y = model.predict(X_test) 180 | 181 | return pred_y 182 | 183 | 184 | def smoothing(self, periods=10, factors=None): 185 | # smoothing methods defind at the end 186 | # typicaly is the moving average of n days 187 | # use partial function technic here will be suitable 188 | ''' 189 | now the self here is something like what we see in 190 | the generalData.py, in there must be some differences, 191 | cause i haven't understand the whole procedure... 192 | it left to be improved later 193 | ''' 194 | weights = np.ones(periods) / periods 195 | for factor in factors: 196 | if (self.columnNames.count(factor) == 0): 197 | print('non-exist factor ' + factor) 198 | continue 199 | index = self.generalData.index(factor) 200 | self.generalData[:, index] = np.convolve(self.generalData[:, index], weights) 201 | -------------------------------------------------------------------------------- /Tool/DataPreProcessing.py: -------------------------------------------------------------------------------- 1 | import copy as cp 2 | import numpy as np 3 | import numpy.ma as ma 4 | import sklearn 5 | import sklearn.impute 6 | import sklearn.preprocessing 7 | from sklearn.base import TransformerMixin, BaseEstimator 8 | 9 | from Tool import globalVars 10 | 11 | 12 | __all__ = ['DeExtremeMethod', 'StandardizeMethod', 'ImputeMethod', 'TransformerBase'] 13 | 14 | 15 | class TransformerBase(TransformerMixin, BaseEstimator): 16 | def __init__(self): 17 | self.transformer = None 18 | 19 | def _mask_x(self, X): 20 | if isinstance(X, np.ma.masked_array): 21 | X_ = X.compressed() 22 | elif isinstance(X, np.ndarray): 23 | X_ = X 24 | else: 25 | raise ValueError("X should be np.array or np.ma.masked_array") 26 | return X_.reshape(-1, 1) 27 | 28 | def fit_transform(self, X, y=None): 29 | X_ = cp.deepcopy(X) 30 | for i in range(X_.shape[1]): 31 | X_t = X_[:, i] 32 | if len(X_t[~X_t.mask & np.isnan(X_t)]) <= 1: 33 | # globalVars.logger.logger.warning("the number of remaining data after masking" 34 | # " is lower than 2") 35 | continue 36 | X_t[~X_t.mask] = self.transformer.fit_transform(self._mask_x(X_t)).reshape(-1) 37 | return X_ 38 | 39 | def fit(self, X, y=None, **fit_params): 40 | pass 41 | 42 | def transform(self, X): 43 | return self.fit_transform(X) 44 | 45 | 46 | # %% impute method 47 | class SimpleImputer(TransformerBase): 48 | def __init__(self, *, missing_values=np.nan, strategy="mean", fill_value=None, verbose=0, copy=True, 49 | add_indicator=False): 50 | super(SimpleImputer, self).__init__() 51 | self.transformer = sklearn.impute.SimpleImputer(missing_values=missing_values, 52 | strategy=strategy, 53 | fill_value=fill_value, 54 | verbose=verbose, copy=copy, 55 | add_indicator=add_indicator) 56 | 57 | 58 | class JustMask(TransformerMixin, BaseEstimator): 59 | def __init__(self): 60 | pass 61 | 62 | def fit_transform(self, X, y=None, **fit_params): 63 | X_ = cp.deepcopy(X) 64 | X_[np.isnan(X_)] = ma.masked 65 | return X_ 66 | 67 | def fit(self, X, y=None, **fit_params): 68 | pass 69 | 70 | def transform(self, X): 71 | return self.fit_transform(X) 72 | 73 | 74 | class ImputeMethod: 75 | SimpleImputer = SimpleImputer 76 | JustMask = JustMask 77 | 78 | 79 | # %% de extreme method 80 | class MedianStd(TransformerMixin, BaseEstimator): 81 | def __init__(self, *, multiple=5.2, method='clip'): 82 | assert method in ['clip', 'mask'] 83 | 84 | self.multiple = multiple 85 | self.method = method 86 | 87 | def fit_transform(self, X, y=None, **fit_params): 88 | X_ = cp.deepcopy(X) 89 | 90 
| median = np.median(X_, axis=0) 91 | distance_to_median = np.abs(X_ - median) 92 | median_of_distance = np.median(distance_to_median) 93 | 94 | upper_limit = median + self.multiple * median_of_distance # upper bound 95 | lower_limit = median - self.multiple * median_of_distance # lower bound 96 | 97 | u_outlier = X_[X_ > upper_limit] 98 | l_outliner = X_[X_ < lower_limit] 99 | if self.method == 'clip': 100 | if len(u_outlier) > 0: 101 | u_outlier = upper_limit 102 | if len(l_outliner) > 0: 103 | l_outliner = lower_limit 104 | elif self.method == 'mask': 105 | if len(u_outlier) > 0: 106 | u_outlier = ma.masked 107 | if len(l_outliner) > 0: 108 | l_outliner = ma.masked 109 | 110 | return X_ 111 | 112 | def fit(self, X, y=None, **fit_params): 113 | pass 114 | 115 | def transform(self, X): 116 | return self.fit_transform(X) 117 | 118 | 119 | class MeanStd(TransformerMixin, BaseEstimator): 120 | def __init__(self, *, multiple=5.2, method='clip'): 121 | assert method in ['clip', 'mask'] 122 | 123 | self.multiple = multiple 124 | self.method = method 125 | 126 | def fit_transform(self, X, y=None, **fit_params): 127 | X_ = cp.deepcopy(X) 128 | 129 | mean = np.mean(X_, axis=0) 130 | distance_to_mean = np.abs(X_ - mean) 131 | median_of_distance = np.median(distance_to_mean) 132 | 133 | upper_limit = mean + self.multiple * median_of_distance # upper bound 134 | lower_limit = mean - self.multiple * median_of_distance # lower bound 135 | 136 | u_outlier = X_[X_ > upper_limit] 137 | l_outliner = X_[X_ < lower_limit] 138 | if self.method == 'clip': 139 | if len(u_outlier) > 0: 140 | u_outlier = upper_limit 141 | if len(l_outliner) > 0: 142 | l_outliner = lower_limit 143 | elif self.method == 'mask': 144 | if len(u_outlier) > 0: 145 | u_outlier = ma.masked 146 | if len(l_outliner) > 0: 147 | l_outliner = ma.masked 148 | 149 | return X_ 150 | 151 | def fit(self, X, y=None, **fit_params): 152 | pass 153 | 154 | def transform(self, X): 155 | return self.fit_transform(X) 156 | 157 | 158 | class Quantile(TransformerMixin, BaseEstimator): 159 | def __init__(self, *, multiple=5.2, method='clip'): 160 | assert method in ['clip', 'mask'] 161 | 162 | self.multiple = multiple 163 | self.method = method 164 | 165 | def fit_transform(self, X, y=None, **fit_params): 166 | X_ = cp.deepcopy(X) 167 | 168 | quantile = np.quantile(X_, [0.25, 0.5, 0.75], axis=0) 169 | gap1 = quantile[2] - quantile[1] 170 | gap2 = quantile[1] - quantile[0] 171 | 172 | upper_limit = quantile[2] + self.multiple * gap1 # upper bound 173 | lower_limit = quantile[0] - self.multiple * gap2 # lower bound 174 | 175 | u_outlier = X_[X_ > upper_limit] 176 | l_outliner = X_[X_ < lower_limit] 177 | if self.method == 'clip': 178 | if len(u_outlier) > 0: 179 | u_outlier = upper_limit 180 | if len(l_outliner) > 0: 181 | l_outliner = lower_limit 182 | elif self.method == 'mask': 183 | if len(u_outlier) > 0: 184 | u_outlier = ma.masked 185 | if len(l_outliner) > 0: 186 | l_outliner = ma.masked 187 | 188 | return X_ 189 | 190 | def fit(self, X, y=None, **fit_params): 191 | pass 192 | 193 | def transform(self, X): 194 | return self.fit_transform(X) 195 | 196 | 197 | class DeExtremeMethod: 198 | MedianStd = MedianStd 199 | MeanStd = MeanStd 200 | Quantile = Quantile 201 | 202 | 203 | # %% standardize method 204 | class MaxAbsScaler(TransformerBase): 205 | def __init__(self, *, copy=True): 206 | super(MaxAbsScaler, self).__init__() 207 | self.transformer = sklearn.preprocessing.MaxAbsScaler(copy=copy) 208 | 209 | 210 | class MinMaxScaler(TransformerBase): 211 | 
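# (note on the constructor below: sklearn added the `clip` argument to MinMaxScaler in
#  version 0.24, so on older versions the wrapper builds the scaler without it and any
#  user-supplied clip value is silently dropped; this wrapper is what StandardizeMethod
#  exposes to the preprocessing pipeline)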
def __init__(self, feature_range=(0, 1), *, copy=True, clip=False): 212 | super(MinMaxScaler, self).__init__() 213 | if sklearn.__version__ >= '0.24': 214 | self.transformer = sklearn.preprocessing.MinMaxScaler(feature_range=feature_range, 215 | copy=copy, clip=clip) 216 | else: 217 | self.transformer = sklearn.preprocessing.MinMaxScaler(feature_range=feature_range, 218 | copy=copy) 219 | 220 | 221 | class StandardScaler(TransformerBase): 222 | def __init__(self, *, copy=True, with_mean=True, with_std=True): 223 | super(StandardScaler, self).__init__() 224 | self.transformer = sklearn.preprocessing.StandardScaler(copy=copy, 225 | with_mean=with_mean, 226 | with_std=with_std) 227 | 228 | 229 | class RobustScalar(TransformerBase): 230 | def __init__(self, *, with_centering=True, with_scaling=True, 231 | quantile_range=(25.0, 75.0), copy=True, unit_variance=False): 232 | super(RobustScalar, self).__init__() 233 | self.transformer = sklearn.preprocessing.RobustScalar(with_centering=with_centering, 234 | with_scaling=with_scaling, 235 | quantile_range=quantile_range, 236 | copy=copy, 237 | unit_variance=unit_variance) 238 | 239 | 240 | class StandardizeMethod: 241 | MaxAbsScaler = MaxAbsScaler 242 | MinMaxScaler = MinMaxScaler 243 | StandardScaler = StandardScaler 244 | RobustScalar = RobustScalar 245 | 246 | -------------------------------------------------------------------------------- /GetData/backtestDatabase.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import pandas as pd 4 | import numpy as np 5 | 6 | from functools import lru_cache 7 | 8 | from Tool import globalVars 9 | 10 | cur_path = os.path.abspath(os.path.dirname(__file__)) 11 | 12 | ALL_TRADING_DAYS_DATA_PATH = os.path.join(cur_path, 'tables/all_trade_days.npy') 13 | INDEX_DATA_DIR = os.path.join(cur_path, 'tables/indexData') 14 | INDEX_QUOTE_DATA_DIR = os.path.join(INDEX_DATA_DIR, 'indexQuote') 15 | INDEX_WEIGHT_DATA_DIR = os.path.join(INDEX_DATA_DIR, 'indexWeight') 16 | 17 | 18 | class BacktestDatabase: 19 | fields = ['industry_sw1_name', 'industry_zx1_name', 'name', 'listed_date', 'is_st', 20 | 'is_exist', 'industry_zx1_name', 'is_trading', 'market_cap', 'circulating_market_cap', 21 | 'free_circulating_market_cap', 'open', 'high', 'low', 'close', 'volume', 'amount'] 22 | 23 | @classmethod 24 | @lru_cache(maxsize=1000) 25 | def get_all_trade_days(cls, start_date=None, end_date=None): 26 | """ 27 | 获取指定日期时间段的交易日期数据,如果没有输入,默认返回所有交易时间段的数据 28 | :param start_date: 开始日期,如果没有输入 29 | :param end_date: 30 | :return: 31 | """ 32 | all_trade_dates = pd.DatetimeIndex(np.load(ALL_TRADING_DAYS_DATA_PATH, allow_pickle=True)) 33 | 34 | start_date = pd.to_datetime(start_date) if start_date is not None else all_trade_dates[0] 35 | end_date = pd.to_datetime(end_date) if end_date is not None else all_trade_dates[-1] 36 | trading_days = all_trade_dates[(all_trade_dates >= start_date) & (all_trade_dates <= end_date)] 37 | if len(trading_days) > 0: 38 | return trading_days 39 | else: 40 | return None 41 | 42 | @classmethod 43 | def _parse_start_and_end_date(cls, start_date, end_date): 44 | start_date, end_date = pd.to_datetime(start_date), pd.to_datetime(end_date) 45 | trade_dates = cls.get_all_trade_days(start_date, end_date) 46 | return trade_dates[0], trade_dates[-1], trade_dates 47 | 48 | @classmethod 49 | def get_next_trade_date(cls, date, n=1): 50 | if n == 0: 51 | return date 52 | try: 53 | all_trade_days = cls.get_all_trade_days() 54 | return all_trade_days[all_trade_days > 
date][n-1] 55 | except IndexError: 56 | raise IndexError("The given {0} days after date {1} out of the upper bound {2}" 57 | .format(n, date, all_trade_days[-1])) 58 | 59 | @classmethod 60 | # @lru_cache(maxsize=1000) 61 | def get_daily_factor(cls, code_list, factor_list, start_date, end_date): 62 | """ 63 | 从数据库中读取 某一段 时间切片上 特定股票的 日频因子数据 64 | :param start_date: 输入指定的开始日期 65 | :param end_date: 输入指定的结束日期 66 | :param factor_list: 需要请求的那些字段,默认返回所有字段,比如 factor_list = ["is_st", "market_cap"] 67 | :param code_list: 返回特定的几只股票的数据 68 | :return: 69 | """ 70 | 71 | # 将输入的时间转换成 datetime 类型,这样就可以直接和数据库里面的日期 比较 72 | start_date, end_date, trade_dates = cls._parse_start_and_end_date(start_date, end_date) 73 | if isinstance(code_list, str): 74 | code_list = code_list.split(",") 75 | if factor_list is None: 76 | factor_list = cls.fields 77 | l = [] 78 | for field in factor_list: 79 | general_data = globalVars.materialData[field] 80 | if start_date == end_date: 81 | data = general_data.get_data(at=start_date).reshape(1, -1) 82 | else: 83 | data = general_data.get_data(start=start_date, end=cls.get_next_trade_date(end_date)) 84 | df = pd.DataFrame(data, columns=general_data.columnNames) 85 | df['datetime'] = trade_dates 86 | melted = df.melt(id_vars=['datetime'], value_vars=code_list, value_name=field, var_name='code') 87 | sr = melted[field] 88 | l.append(sr) 89 | db_data = pd.concat(l, axis=1) 90 | db_data['code'] = melted['code'].values 91 | db_data['datetime'] = melted['datetime'] 92 | 93 | return db_data 94 | 95 | @classmethod 96 | def get_daily_quote(cls, code_list, start_date, end_date): 97 | """ 98 | 从数据库中读取 某一段 时间切片上 特定股票的 日频因子数据 99 | :param start_date: 输入指定的开始日期 100 | :param end_date: 输入指定的结束日期 101 | :param code_list: 返回特定的几只股票的数据 102 | :return: 103 | """ 104 | index_code_list = [] 105 | for code in ["000300.SH", "000905.SH"]: 106 | if code in code_list: 107 | index_code_list.append(code) 108 | code_list.remove(code) 109 | quote_list = ['open', 'high', 'low', 'close', 'volume', 'preclose', 'amount'] 110 | stock_df = cls.get_daily_factor(code_list, quote_list, start_date, end_date) 111 | for index_code in index_code_list: 112 | index_df = cls._load_index_quote(index_code) 113 | index_df = index_df[(index_df['datetime'] >= start_date) & (index_df['datetime'] <= end_date)] 114 | stock_df = pd.concat([stock_df, index_df[stock_df.columns]]) 115 | stock_df['pctChange'] = stock_df['close'] / stock_df['preclose'] - 1 116 | return stock_df 117 | 118 | @classmethod 119 | @lru_cache(maxsize=1000) 120 | def _load_index_quote(cls, index_code): 121 | return pd.read_pickle(os.path.join(INDEX_QUOTE_DATA_DIR, index_code)) 122 | 123 | @classmethod 124 | @lru_cache(maxsize=1000) 125 | def _load_index_weight(cls, index_code): 126 | return pd.read_pickle(os.path.join(INDEX_WEIGHT_DATA_DIR, index_code)) 127 | 128 | @classmethod 129 | @lru_cache(maxsize=1000) 130 | def get_index_weight(cls, index_code, start_date, end_date): 131 | """ 132 | 通过 wss 时间切片的方法,从数据库中读取 某一天 时间切片上所有股票的 相关因子数据 133 | :param start_date: 输入指定的开始日期 134 | :param end_date: 输入指定的结束日期 135 | :param index_code: 股票指数代码,目前只支持000300.XSHG 和 000905.XSHG 136 | :return: 137 | """ 138 | if index_code in ["沪深300", "hs300", "300", "HS", "000300.SH", "000300.XSHG"]: 139 | index_code = "000300.SH" 140 | 141 | elif index_code in ["中证500", "zz500", "500", "000905.SH", "000905.XSHG"]: 142 | index_code = "000905.SH" 143 | assert index_code in ['000300.SH', '000905.SH'] 144 | 145 | start_date, end_date, trade_dates = cls._parse_start_and_end_date(start_date, end_date) 146 | 
df = cls._load_index_weight(index_code) 147 | return df[(df['datetime'] >= start_date) & (df['datetime'] <= end_date)] 148 | 149 | @classmethod 150 | def get_stock_info(cls, code_list, start_date, end_date, field_list=None): 151 | """ 152 | 通过 wss 时间切片的方法,从数据库中读取 某一天 时间切片上所有股票的 相关因子数据 153 | :param start_date: 输入指定的开始日期 154 | :param end_date: 输入指定的结束日期 155 | :param code_list: 返回特定的几只股票的数据 156 | :param field_list: 指定字段 157 | :return: 158 | """ 159 | fixed_list = ['is_trading', 'market_cap', 'circulating_market_cap', 'free_circulating_market_cap'] 160 | if field_list is not None: 161 | for field in field_list: 162 | assert field in cls.fields 163 | field_list = list(set(field_list) | set(fixed_list)) 164 | else: 165 | field_list = fixed_list 166 | df = cls.get_daily_factor(code_list, field_list, start_date, end_date) 167 | 168 | temp_fileds = "sec_name,ipo_date,delist_date,industry_sw,industry_citic".split(',') 169 | rename_dict = {'sec_name': 'name', 170 | 'ipo_date': 'ipo_date', 171 | 'delist_date': 'delist_date', 172 | 'industry_sw': 'industry_sw1_name', 173 | 'industry_citic': 'industry_zx1_name'} 174 | for field in temp_fileds: 175 | d = np.load(f'{cur_path}/tables/tempData/{field}.npy', allow_pickle=True).item() 176 | df[rename_dict[field]] = df['code'].map(d) 177 | df['is_exist'] = (df['ipo_date'] <= df['datetime']) & (df['delist_date'] >= df['datetime']) 178 | # Todo: 'is_st' # 是否st 179 | return df 180 | 181 | 182 | if __name__ == '__main__': 183 | from GetData.loadData import load_material_data 184 | 185 | BacktestDatabase.get_all_trade_days() 186 | globalVars.initialize() 187 | load_material_data() 188 | BacktestDatabase.get_daily_quote(['000300.SH'], pd.to_datetime("2020-01-01"), 189 | pd.to_datetime("2020-02-28")) 190 | BacktestDatabase.get_daily_factor(['000001.SZ', '000002.SZ'], ["close", "open"], pd.to_datetime("2020-01-01"), 191 | pd.to_datetime("2020-02-28")) 192 | BacktestDatabase.get_index_weight('沪深300', pd.to_datetime("2020-01-22"), 193 | pd.to_datetime("2020-01-22")) 194 | 195 | # start_date = pd.to_datetime('2016-01-01') 196 | # end_date = pd.to_datetime('2021-01-05') 197 | # all_trade_days = BacktestDatabase.get_all_trade_days() 198 | # to_get_date_list = all_trade_days[(all_trade_days >= start_date) & (all_trade_days <= end_date)] 199 | # for index_code in ['000300.SH', '000905.SH']: 200 | # df = pd.read_pickle(os.path.join(INDEX_WEIGHT_DATA_DIR, index_code)) 201 | # l = [] 202 | # for date in to_get_date_list: 203 | # if date not in df['datetime']: 204 | # sl = df['datetime'][df['datetime'] <= date] 205 | # if len(sl) == 0: 206 | # print(date) 207 | # continue 208 | # shift_date = sl.iat[-1] 209 | # tmp_df = df[df['datetime'] == shift_date].copy(deep=True) 210 | # tmp_df['datetime'] = date 211 | # l.append(tmp_df) 212 | # l.append(df) 213 | # res_df = pd.concat(l) 214 | # res_df.sort_values('datetime').to_pickle(os.path.join(INDEX_WEIGHT_DATA_DIR, index_code)) 215 | 216 | 217 | 218 | -------------------------------------------------------------------------------- /GetData/loadData.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Fri Dec 4 14:43:53 2020 4 | 5 | @author: Evan Hu (Yi Fan Hu) 6 | 7 | """ 8 | #%% 9 | import os 10 | import pandas as pd 11 | from copy import deepcopy 12 | try: 13 | from Tool import globalVars 14 | from Tool import GeneralData 15 | from Tool.Factor import Factor 16 | except : 17 | PROJECT_ROOT = 
'C:\\Users\\eiahb\\Documents\\MyFiles\\WorkThing\\tf\\01task\\GeneticProgrammingProject\\AlphaSignalFromMachineLearning\\' 18 | os.chdir(PROJECT_ROOT) 19 | print("change wd to {}".format(PROJECT_ROOT)) 20 | from Tool import globalVars 21 | from Tool import GeneralData 22 | from Tool.Factor import Factor 23 | 24 | 25 | try: 26 | logger = globalVars.logger 27 | except : 28 | import logging 29 | logger = logging.getLogger() 30 | #%% DataFileDict 31 | 32 | ##################################### 33 | #materialData index format 為 "%Y%m%d" 34 | ##################################### 35 | materialDataFileDict = { 36 | # 'hx1': 'hx1.csv', 37 | # 'hx2': 'hx2.csv', 38 | # 'hx3': 'hx3.csv', 39 | # 'alpha3': 'alpha3.csv', 40 | # 'alpha13': 'alpha13.csv', 41 | # 'alpha14': 'alpha14.csv', 42 | # 'alpha15': 'alpha15.csv', 43 | # 'alpha16': 'alpha16.csv', 44 | # 'alpha17': 'alpha17.csv', 45 | 'close': 'S_DQ_ADJCLOSE.csv', 46 | 'high': 'S_DQ_ADJHIGH.csv', 47 | 'low': 'S_DQ_ADJLOW.csv', 48 | 'open': 'S_DQ_ADJOPEN.csv', 49 | # 'preclose': 'S_DQ_ADJPRECLOSE.csv', 50 | 'amount': 'S_DQ_AMOUNT.csv', 51 | 'volume': 'S_DQ_VOLUME.csv', 52 | 'pctChange': 'S_DQ_PCTCHANGE.csv', 53 | 54 | 'close_moneyflow_pct_value':'CLOSE_MONEYFLOW_PCT_VALUE.csv', 55 | 'close_moneyflow_pct_volume': 'CLOSE_MONEYFLOW_PCT_VOLUME.csv', 56 | 'close_net_inflow_rate_value': 'CLOSE_NET_INFLOW_RATE_VALUE.csv', 57 | 'close_net_inflow_rate_volume': 'CLOSE_NET_INFLOW_RATE_VOLUME.csv', 58 | 'moneyflow_pct_value': 'MONEYFLOW_PCT_VALUE.csv', 59 | 'moneyflow_pct_volume':'MONEYFLOW_PCT_VOLUME.csv', 60 | 'net_inflow_rate_value':'NET_INFLOW_RATE_VALUE.csv', 61 | 'net_inflow_rate_volume':'NET_INFLOW_RATE_VOLUME.csv', 62 | 'open_moneyflow_pct_value':'OPEN_MONEYFLOW_PCT_VALUE.csv', 63 | 'open_moneyflow_pct_volume':'OPEN_MONEYFLOW_PCT_VOLUME.csv', 64 | 'open_net_inflow_rate_value':'OPEN_NET_INFLOW_RATE_VALUE.csv', 65 | 'open_net_inflow_rate_volume':'OPEN_NET_INFLOW_RATE_VOLUME.csv', 66 | 's_mfd_inflow':'S_MFD_INFLOW.csv', 67 | 's_mfd_inflowvolume':'S_MFD_INFLOWVOLUME.csv', 68 | 's_mfd_inflow_closevolume':'S_MFD_INFLOW_CLOSEVOLUME.csv', 69 | 's_mfd_inflow_openvolume':'S_MFD_INFLOW_OPENVOLUME.csv' 70 | 71 | 72 | # 'raw_close': 'S_DQ_ADJCLOSE.csv', 73 | # 'raw_high': 'S_DQ_ADJHIGH.csv', 74 | # 'raw_low': 'S_DQ_ADJLOW.csv', 75 | # 'raw_open': 'S_DQ_ADJOPEN.csv', 76 | # 'is_buy_limit': 'S_DQ_BUYLIMIT.csv', # 是否涨停 77 | # 'is_sell_limit': 'S_DQ_SELLLIMIT.csv', # 是否跌停 78 | # 'is_trading': 'S_DQ_TRADE.csv', 79 | # 'market_cap': 'S_VAL_MV.csv', # 总市值 80 | # 'circulating_market_cap': 'S_DQ_MV.csv', # 流通市值 81 | # 'free_circulating_market_cap': 'S_FREE_MV.csv', # 自由流通市值 82 | # 'large_sell_rate': 'S_LI_LARGESELLRATE.csv', # 大卖比率 83 | # 'large_buy_rate': 'S_LI_LARGEBUYRATE.csv', # 大买比率 84 | # 'initiative_sell_rate': 'S_LI_INITIATIVESELLRATE.csv', # 主卖比率 85 | # 'initiative_buy_rate': 'S_LI_INITIATIVEBUYRATE.csv', # 主买比率 86 | # 'ipo_date': '' # 上市日期 87 | # 'is_exist': '' # 是否存续中 88 | # 'is_st': '' # 是否st 89 | # 'industry_zx1_name': '' # 中信一级行业名称 90 | # 'industry_sw1_name': '' # 申万一级行业名称 91 | # 'name': '' # 股票简称 92 | 93 | } 94 | 95 | 96 | ##################################### 97 | #barra index format 為 "%Y-%m-%d" 98 | ##################################### 99 | barraFileDict = { 100 | 'beta':'beta.csv', 101 | 'blev':'BLEV.csv', 102 | 'bp':'BP.csv', 103 | 'cetop':'CETOP.csv', 104 | # 'cmra':'CMRA.csv', 105 | 'dastd':'DASTD.csv', 106 | # 'dtoa':'DTOA.csv', 107 | # 'egrlf':'EGRLF.csv', 108 | # 'egro':'EGRO.csv', 109 | # 'egrsf':'EGRSF.csv', 110 | # 'epfwd':'EPFWD.csv', 111 | 
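    # Keys are the in-memory Barra factor names, values the CSV file names;
    # commented-out factors are simply not loaded.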
'etop':'ETOP.csv', 112 | # 'hsigma':'HSIGMA.csv', 113 | # 'mlev':'MLEV.csv', 114 | 'mom':'momentum.csv', 115 | 'nonlinear_size':'Non_linear_size.csv', 116 | # 'report_period':'REPORT_PERIOD.csv', 117 | # 'sgro':'SGRO.csv', 118 | 'size':'size.csv', 119 | 'stoa':'STOA.csv', 120 | 'stom':'STOM.csv', 121 | 'stoq':'STOQ.csv', 122 | 'beta': 'beta.csv', 123 | 'blev': 'BLEV.csv' 124 | } 125 | 126 | #%% load data functions 127 | def load_data_csv(dataFileDict, TABLE_PATH, dictName = None, **kwargs): 128 | toReturnList = [] 129 | # add dictionary to globalVars 130 | if dictName not in globalVars.varList: 131 | logger.info("The dict {} was not in globalVars".format(dictName)) 132 | globalVars.register(dictName, {}) 133 | for k, v in dataFileDict.items(): 134 | if k not in globalVars.__getattribute__(dictName): 135 | data = GeneralData(name = k, filePath = os.path.join(TABLE_PATH, v), **kwargs) 136 | globalVars.__getattribute__(dictName)[k] = data 137 | # print('==================================================================\n\ 138 | # {} is now in globalVars.{}\n'.format(k, dictName), data) 139 | logger.info('{} is now in globalVars.{}'.format(k, dictName)) 140 | toReturnList.append(k) 141 | else: 142 | # print('==================================================================\n\ 143 | # {} is already in globalVars.{}\n'.format(k, dictName)) 144 | logger.info('{} is already in globalVars.{}'.format(k, dictName)) 145 | return(toReturnList) 146 | 147 | def load_data(name, filedir, filetype = 'h5', dataFileDict = None,**kwargs): 148 | ''' 149 | 關於 globalVars: 150 | 現在不支持直接在 globalVars 中建立非字典的變量(以前會看讀取的數據是不是字典,不是的話直接建立變量如 globalVars.變量), 151 | 但是仍然支持在讀數據時在 globalVars 中建立字典,會提示 "The dict {} was not in globalVars".format(name) 152 | 153 | 可以用兩種不同的方式讀取數據,如果沒有指定的話就是使用 h5 檔案 154 | h5: 155 | 指定字典名稱 name 與 檔案位置可以使用與檔案名稱不同的字典名 156 | 會先嘗試用 filedir 直接讀取檔案如: .\\AlphaSignalFromMachineLearning\\GetData\\h5\\materialData.h5 157 | 那麼就會直接讀該檔案,並用 name 在 globalVars 中建立字典 158 | 如果指定的 filedir 沒法讀取(他不是 h5 file 或是我們只指定了資料夾位置),會嘗試讀取 filedir 資料夾底下的 name.h5 檔 (就是 "{}.h5".format(name)) 159 | 並且使用 name 在 globalVars 中建立字典 160 | csv: 161 | 會先嘗試讀取 filedir 資料夾底下的 csv 檔案,如果找不到 csv 如: .\\GeneticProgrammingProject\\AlphaSignalFromMachineLearning\\GetData\\tables 162 | 如果讀不到就會嘗試: os.path.join(filedir, name) 底下的資料夾如: .\\GeneticProgrammingProject\\AlphaSignalFromMachineLearning\\GetData\\tables\\barra 163 | 思路與 h5 是一樣的 164 | 注意使用 csv 模式時候需要傳入 dataFileDict 來指定 csv 的檔名,否則報錯 165 | 此外可以使用 indexFormat 來指定不同的 datetime 的 format (此目的為適應不同 csv 檔案標註日期的格式) default 為 "%Y-%m-%d" 166 | 167 | 168 | ''' 169 | if filetype == 'h5': 170 | # 如果用的是 h5 file 171 | if name not in globalVars.varList: 172 | logger.info("The dict {} was not in globalVars".format(name)) 173 | globalVars.register(name, {}) 174 | try: 175 | assert os.path.exists(filedir), "FileNotFound, please check the file path is currect" 176 | hdf = pd.HDFStore(filedir) 177 | except Exception as e: 178 | print(e) 179 | print("try with {}".format(os.path.join(filedir, '{}.h5'.format(name)))) 180 | assert os.path.exists(os.path.join(filedir, '{}.h5'.format(name))), "FileNotFound, please check the file path is currect" 181 | hdf = pd.HDFStore(os.path.join(filedir, '{}.h5'.format(name))) 182 | for rawk in hdf.keys(): 183 | k = rawk.split('/')[1] 184 | v = GeneralData(k, hdf.get(k)) 185 | globalVars.__getattribute__(name)[k] = v 186 | logger.info('{} is now in globalVars.{}'.format(list(hdf.keys()), name)) 187 | return(list(hdf.keys())) 188 | elif filetype == "csv": 189 | # 如果用的是 csv file 190 | 
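        # csv mode requires an explicit dataFileDict mapping keys to file names;
        # an indexFormat kwarg may be passed to match the csv date format
        # (default "%Y-%m-%d"). A minimal call, mirroring 用例 3 at the bottom of this file:
        #     load_data("barra", os.path.join(PROJECT_ROOT, "data", "tables", "barra"),
        #               filetype="csv", dataFileDict=barraFileDict)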
indexFormat = "%Y-%m-%d" 191 | if dataFileDict == None: 192 | print("dataFileDict is needed in csv mode") 193 | raise Exception 194 | if "indexFormat" in kwargs: 195 | indexFormat = kwargs['indexFormat'] 196 | try: 197 | toReturnList = load_data_csv(dataFileDict = dataFileDict, TABLE_PATH = filedir, dictName=name, indexFormat = indexFormat) 198 | except FileNotFoundError as fnfe: 199 | filedir = os.path.join(filedir, name) 200 | print("try with {}".format(filedir)) 201 | toReturnList = load_data_csv(dataFileDict = dataFileDict, TABLE_PATH = filedir, dictName=name, indexFormat = indexFormat) 202 | except Exception as e: 203 | print(e) 204 | raise e 205 | return(toReturnList) 206 | 207 | 208 | 209 | 210 | 211 | def simple_load_factor(factorName): 212 | if 'factors' not in globalVars.varList: 213 | globalVars.register('factors', {}) 214 | globalVars.factors['{}_factor'.format(factorName)] = Factor('{}_factor'.format(factorName), globalVars.materialData[factorName]) 215 | print(factorName, 'is now in globalVars.factors') 216 | 217 | 218 | 219 | def align_data(data, alignTo): 220 | data_df = pd.DataFrame(data.generalData, index=data.timestamp, columns=data.columnNames) 221 | reindexed = data_df.reindex(index=alignTo.timestamp, columns=alignTo.columnNames) 222 | toReturn = GeneralData(data.name, generalData=reindexed) 223 | return(toReturn) 224 | 225 | def align_all_to(dict_, alignTo): 226 | dict__ = deepcopy(dict_) 227 | for k, v in dict__.items(): 228 | dict__[k] = align_data(v, alignTo) 229 | return(dict__) 230 | 231 | 232 | 233 | 234 | 235 | #%% main 236 | if __name__ == '__main__': 237 | PROJECT_ROOT = "c:\\Users\\eiahb\\Documents\\MyFiles\\WorkThing\\tf\\01task\\GeneticProgrammingProject\\AlphaSignalFromMachineLearning\\" 238 | import os 239 | os.chdir(PROJECT_ROOT) 240 | from Tool import globalVars 241 | 242 | globalVars.initialize() 243 | # read h5 244 | # 用例 1 245 | # load_data("barra", 246 | # os.path.join(os.path.join(PROJECT_ROOT,"data"), "h5") 247 | # ) 248 | 249 | # 用例 2 250 | load_data("materialData", 251 | os.path.join(os.path.join(os.path.join(PROJECT_ROOT,"data"), "h5"), "materialData_newData.h5") 252 | ) 253 | 254 | # #read csv 255 | # # 用例 3 256 | # load_data("barra", 257 | # os.path.join(os.path.join(os.path.join(PROJECT_ROOT,"data"), "tables"), "barra"), 258 | # filetype="csv",dataFileDict=barraFileDict 259 | # ) 260 | 261 | # 用例 4 262 | # load_data("materialData", 263 | # os.path.join(os.path.join(PROJECT_ROOT,"data"), "tables"), 264 | # filetype="csv",dataFileDict=materialDataFileDict,indexFormat = "%Y%m%d" 265 | # ) 266 | 267 | 268 | 269 | 270 | 271 | # %% 272 | # h5_path = "C:\\Users\\eiahb\\Documents\MyFiles\\WorkThing\\tf\\01task\\GeneticProgrammingProject\\AlphaSignalFromMachineLearning\\data\\h5" 273 | # hdf = pd.HDFStore(os.path.join(h5_path, '{}.h5'.format("materialData_newData"))) 274 | # # %% 275 | # for k, v in globalVars.materialData.items(): 276 | # print(v) 277 | # hdf.put(k, v.to_DataFrame()) 278 | # hdf.close() 279 | 280 | # %% 281 | # %% 282 | -------------------------------------------------------------------------------- /Signal/SignalSynthesis.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Sat Jan 2 13:30:30 2021 4 | 5 | @author: Ye Donggua 6 | 7 | """ 8 | 9 | import pandas as pd 10 | import numpy as np 11 | import numpy.ma as ma 12 | from copy import copy 13 | from sklearn.pipeline import Pipeline 14 | from sklearn.base import TransformerMixin 15 | from 
tqdm.notebook import tqdm 16 | 17 | from Tool import globalVars 18 | from Tool.GeneralData import GeneralData 19 | from Tool.DataPreProcessing import * 20 | from BackTesting.Signal.SignalBase import SignalBase 21 | from GeneticPogramming.utils import get_strided 22 | 23 | 24 | # %% 25 | 26 | class SignalSynthesis(SignalBase): 27 | 28 | def __init__(self, model=None, logger=None): 29 | super().__init__() 30 | # ml model 31 | self.model = model 32 | # 同一个logger 33 | self.logger = logger 34 | # director传进来的 35 | self.factorNameList = [] 36 | # smoothing之前的signal 37 | self.rawSignals = None 38 | # 因变量的dict 39 | self.dependents = {} 40 | self.metadata = {} 41 | # dateTimeIndex 42 | self.allTradeDatetime = None 43 | 44 | # 机器学习模型的训练结果 45 | self.train_loss_list = [] 46 | self.test_loss_list = [] 47 | self.features_importance = [] 48 | 49 | self.initialize() 50 | 51 | def initialize(self): 52 | ''' 53 | initialize self.dependents & allTradeDatetime 54 | 55 | Raises 56 | ------ 57 | AttributeError 58 | 'There\'s no pctChange in globalVars' 59 | 60 | Returns 61 | ------- 62 | None. 63 | 64 | ''' 65 | try: 66 | shiftedReturn = globalVars.materialData['pctChange'].get_shifted(-1) 67 | except AttributeError as ae: 68 | print(ae) 69 | raise AttributeError('There\'s no pctChange in globalVars') 70 | except Exception as e: 71 | print(e) 72 | raise 73 | 74 | shiftedReturn.metadata.update({'shiftN': -1}) 75 | shiftedReturn.name = 'shiftedReturn' 76 | 77 | self.dependents.update({'shiftedReturn': shiftedReturn}) 78 | self.allTradeDatetime = shiftedReturn.timestamp 79 | 80 | def get_last_trade_date(self, date, n=1): 81 | if n == 0: 82 | return date 83 | try: 84 | return self.allTradeDatetime[self.allTradeDatetime < date][-n] 85 | except IndexError: 86 | raise IndexError("The given {0} days before date {1} out of the lower bound {2}" 87 | .format(n, date, self.allTradeDatetime[0])) 88 | 89 | def get_next_trade_date(self, date, n=1): 90 | if n == 0: 91 | return date 92 | try: 93 | return self.allTradeDatetime[self.allTradeDatetime > date][n-1] 94 | except IndexError: 95 | raise IndexError("The given {0} days after date {1} out of the upper bound {2}" 96 | .format(n, date, self.allTradeDatetime[-1])) 97 | 98 | def generate_signals(self, startDate, endDate, panelSize=1, trainTestGap=1, maskList=None, 99 | deExtremeMethod=None, imputeMethod=None, 100 | standardizeMethod=None, pipeline=None, factorNameList=None, 101 | modelParams=None, metric_func=None, 102 | smoothing_params=None): 103 | ''' 104 | 105 | 106 | Parameters 107 | ---------- 108 | startDate : TYPE 109 | startDate of the back testing date interval (left-closed, right-closed). 110 | endDate : TYPE 111 | endDate of the back testing date interval (left-closed, right-closed). 112 | panelSize : TYPE, optional 113 | time length of the panel factor used. The default is 1: use one-period factor. 114 | trainTestGap : TYPE, optional 115 | gap between trainData and testData. 116 | The default is 1, use day-T to train model and feed day-T+1 testData to predict. 117 | maskList : TYPE, optional 118 | maskNameList. The default is None. 119 | deExtremeMethod : TYPE, optional 120 | method for deExtreme factors. The default is None. 121 | imputeMethod : TYPE, optional 122 | method for impute factors. The default is None. 123 | standardizeMethod : TYPE, optional 124 | method for standardize factors. The default is None. 125 | pipeline : TYPE, optional 126 | can input preprocessing method like deExtremeMethod... 127 | can also input a pipeline directly. 
The default is None. 128 | factorNameList : TYPE, optional 129 | factors used to generateSignals. The default is None. 130 | modelParams : TYPE, optional 131 | paras of the model that generates signals. The default is None. 132 | metric_func : TYPE, optional 133 | DESCRIPTION. The default is None. 134 | periods : TYPE, optional 135 | smoothing params. The default is 10. 136 | method : TYPE, optional 137 | smoothing method. The default is 'linear'. 138 | 139 | Returns 140 | ------- 141 | resultDict : TYPE 142 | DESCRIPTION. 143 | 144 | ''' 145 | # set startDate & endDate is input is None 146 | # [startDate,endDate] is the dates interval for backTesting, closed interval 147 | self.factorNameList = factorNameList 148 | 149 | if maskList is None: 150 | maskList = [] 151 | 152 | 153 | # assert whether panelSize is out of range 154 | # default panelSize should be 1 155 | toStart = len(self.allTradeDatetime[self.allTradeDatetime <= startDate]) - panelSize - trainTestGap + 1 156 | assert toStart >= 0, 'panelSize out of range' 157 | endDateShiftOneDay = self.get_next_trade_date(endDate, 1) 158 | backTestDates = self.allTradeDatetime[(self.allTradeDatetime >= startDate) & 159 | (self.allTradeDatetime <= endDateShiftOneDay)] 160 | self.logger.info('start to generate signals from {} ot {}'.format(startDate, endDateShiftOneDay)) 161 | # mL存mask的generalData 162 | mL = [] 163 | for mask in maskList: 164 | mL.append(globalVars.factors[mask]) 165 | 166 | factorL = [] 167 | for factor in factorNameList: 168 | factorL.append(globalVars.factors[factor]) 169 | 170 | signalList = [] 171 | for backTestDate in tqdm(backTestDates): 172 | # if use default panelSize = 1, Start == End 173 | # set dates for train_test_slice 174 | testEnd = backTestDate 175 | testStart = self.get_last_trade_date(testEnd, panelSize - 1) 176 | trainEnd = self.get_last_trade_date(testEnd, trainTestGap) 177 | trainStart = self.get_last_trade_date(trainEnd, panelSize - 1) 178 | 179 | # get the mask of train and test sets 180 | if mL: 181 | maskTrainDict, maskTestDict, _, _ = self.train_test_slice( 182 | factors=mL, dependents=None, 183 | trainStart=trainStart, trainEnd=trainEnd, testStart=testStart, testEnd=testEnd 184 | ) 185 | else: 186 | maskTrainDict, maskTestDict = {}, {} 187 | 188 | # get factors and dependents for each backTestingDate 189 | factorTrainDict, factorTestDict, dependentTrainDict, dependentTestDict = self.train_test_slice( 190 | factors=factorL, dependents=self.dependents, 191 | trainStart=trainStart, trainEnd=trainEnd, testStart=testStart, testEnd=testEnd 192 | ) 193 | # preprocess factors 194 | processedTrainDict = self.preprocessing(factorTrainDict, maskTrainDict, deExtremeMethod=deExtremeMethod, 195 | imputeMethod=imputeMethod, standardizeMethod=standardizeMethod, 196 | pipeline=pipeline) 197 | 198 | processedTestDict = self.preprocessing(factorTestDict, maskTestDict, deExtremeMethod=deExtremeMethod, 199 | imputeMethod=imputeMethod, standardizeMethod=standardizeMethod, 200 | pipeline=pipeline) 201 | 202 | # stack factorDict to 3D 203 | trainStack = np.ma.stack([processedTrainDict[factor] for factor in factorNameList]).transpose(2, 1, 0) 204 | testStack = np.ma.stack([processedTestDict[factor] for factor in factorNameList]).transpose(2, 1, 0) 205 | # reshape to 2D: XTrain.shape = (nStocks, panelSize*nFields) 206 | XTrain = trainStack.reshape(trainStack.shape[0], -1) 207 | XTest = testStack.reshape(testStack.shape[0], -1) 208 | 209 | signalDict = {} 210 | # there may be several dependents, for loop 211 | for k in 
dependentTrainDict.keys(): 212 | signal = np.zeros(dependentTestDict[k].shape) * np.nan 213 | # concantenate X and y 214 | dataTrain = np.ma.concatenate([dependentTrainDict[k].reshape(-1, 1), XTrain], axis=1) 215 | dataTest = np.ma.concatenate([dependentTestDict[k].reshape(-1, 1), XTest], axis=1) 216 | 217 | # clean data 218 | # 只要dataTrain或mask里有nan就直接mask 219 | naOrMaskTrain = np.sum(np.logical_or(np.isnan(dataTrain).data, dataTrain.mask), axis=1) 220 | naOrMaskTest = np.sum(np.logical_or(np.isnan(dataTest).data, dataTest.mask), axis=1) 221 | 222 | dataTrainCleaned = dataTrain[naOrMaskTrain == 0, :] 223 | dataTestCleaned = dataTest[naOrMaskTest == 0, :] 224 | 225 | self.logger.debug('Actual available training data account for {:.2%} ({} / {})' 226 | .format(len(dataTrainCleaned)/len(dataTrain), len(dataTrainCleaned), len(dataTrain))) 227 | self.logger.debug('Actual available testing data account for {:.2%} ({} / {})' 228 | .format(len(dataTestCleaned) / len(dataTest), len(dataTestCleaned), len(dataTest))) 229 | 230 | predictY = self.get_signal(X_train=dataTrainCleaned[:, 1:], y_train=dataTrainCleaned[:, 0], 231 | X_test=dataTestCleaned[:, 1:], y_test=dataTestCleaned[:, 0], 232 | model=self.model(**modelParams), metric_func=metric_func) 233 | signal[naOrMaskTest == 0] = predictY 234 | signalDict[k] = signal 235 | signalList.append(signalDict) 236 | self.logger.debug('{} finished'.format(backTestDate)) 237 | 238 | resultDict = {} 239 | rawSignals = {} 240 | for dependent in self.dependents.keys(): 241 | signal = np.c_[[x[dependent] for x in signalList]] 242 | signalGeneralData = GeneralData(name=dependent, generalData=signal, 243 | timestamp=pd.DatetimeIndex(backTestDates), 244 | columnNames=factorL[0].columnNames) 245 | rawSignals[dependent] = signalGeneralData 246 | if smoothing_params is not None: 247 | smoothedSignalGeneralData = self.smoothing(signalGeneralData, 248 | smoothing_params['periods'], smoothing_params['method']) 249 | resultDict[dependent] = smoothedSignalGeneralData 250 | else: 251 | resultDict[dependent] = signalGeneralData 252 | self.rawSignals = rawSignals 253 | return resultDict 254 | 255 | @staticmethod 256 | def train_test_slice(factors, dependents=None, trainStart=None, trainEnd=None, testStart=None, testEnd=None): 257 | # split all the factors and toPredicts to train part and test part according to input, 258 | # if trainStart = trainEnd: the user doesn't use panel data 259 | # slice factors at that date 260 | # else we slice factors from trainStart to trainEnd (closed date interval) 261 | # dependents always sliced by trainEnd 262 | # if dependents is None, return {} (can be used when we slice maskDict) 263 | factorTrainDict, factorTestDict = {}, {} 264 | dependentTrainDict, dependentTestDict = {}, {} 265 | 266 | if trainStart == trainEnd: 267 | for factor in factors: 268 | factorTrainDict[factor.name] = factor.get_data(at=trainEnd).reshape(1, -1) 269 | factorTestDict[factor.name] = factor.get_data(at=testEnd).reshape(1, -1) 270 | else: 271 | for factor in factors: 272 | factorTrainDict[factor.name] = np.vstack((factor.get_data(trainStart, trainEnd), 273 | factor.get_data(at=trainEnd))) 274 | factorTestDict[factor.name] = np.vstack((factor.get_data(testStart, testEnd), 275 | factor.get_data(at=testEnd))) 276 | if dependents is not None: 277 | for name, dependent in dependents.items(): 278 | dependentTrainDict[name] = dependent.get_data(at=trainEnd) 279 | dependentTestDict[name] = dependent.get_data(at=testEnd) 280 | 281 | return factorTrainDict, factorTestDict, 
dependentTrainDict, dependentTestDict 282 | 283 | @staticmethod 284 | def preprocessing(dataDict, maskDict, *, deExtremeMethod=None, imputeMethod=None, 285 | standardizeMethod=None, pipeline=None): 286 | # generating the mask 287 | mask = None 288 | for _, maskData in maskDict.items(): 289 | if mask is None: 290 | mask = np.zeros(maskData.shape) 291 | mask = np.logical_or(mask, maskData) 292 | 293 | # generating the pipeline 294 | if pipeline is not None: 295 | assert (isinstance(pipeline, Pipeline)) 296 | else: 297 | l = [] 298 | if deExtremeMethod is not None: 299 | assert (isinstance(deExtremeMethod, TransformerMixin)) 300 | l.append(("de extreme", deExtremeMethod)) 301 | if imputeMethod is not None: 302 | assert (isinstance(imputeMethod, TransformerMixin)) 303 | l.append(("impute", imputeMethod)) 304 | if standardizeMethod is not None: 305 | assert (isinstance(standardizeMethod, TransformerMixin)) 306 | l.append(("standardize", standardizeMethod)) 307 | l.append(('passthrough', 'passthrough')) 308 | pipeline = Pipeline(l) 309 | 310 | # processing the data 311 | processedDataDict = dict() 312 | for dataField, data in dataDict.items(): 313 | 314 | for _, maskData in maskDict.items(): 315 | assert (data.shape == maskData.shape) 316 | if mask is None: 317 | maskedData = ma.masked_array(data, mask=np.zeros(data.shape)) 318 | else: 319 | maskedData = ma.masked_array(data, mask=mask) 320 | 321 | # transforming horizontally(stocks-level) 322 | 323 | maskedData = pipeline.fit_transform(maskedData.T, None).T 324 | # check the masked proportion 325 | # minNoMaskProportion = min(1 - np.mean(maskedData.mask, axis=0)) 326 | # if minNoMaskProportion < maskThreshold: 327 | # raise ValueError("The remained proportion of data {} is {:.2%} ," 328 | # "lower than the setting threshold {:.2%}" 329 | # .format(dataField, minNoMaskProportion, maskThreshold)) 330 | processedDataDict[dataField] = maskedData 331 | 332 | return processedDataDict 333 | 334 | # define how we get signal for one interation 335 | # the obviuos version will be use feature selection and models 336 | # to predict crossSectional expected returns of next perio 337 | def get_signal(self, X_train, y_train, X_test, y_test, model=None, metric_func=None): 338 | 339 | model.fit(X_train, y_train) 340 | pred_y = model.predict(X_test) 341 | 342 | trainLoss = metric_func(model.predict(X_train), y_train) 343 | testLoss = metric_func(pred_y, y_test) 344 | 345 | self.train_loss_list.append(trainLoss) 346 | self.test_loss_list.append(testLoss) 347 | self.features_importance.append(model.get_model().feature_importances_) 348 | 349 | # self.logger.info("Model {} training loss: {}, testing loss: {}".format(model.model, trainLoss, testLoss)) 350 | 351 | return pred_y 352 | 353 | @staticmethod 354 | def smoothing(data, periods=10, method='linear'): 355 | # smoothing methods defind at the end 356 | # typicaly is the moving average of n days 357 | # use partial function technic here will be suitable 358 | toOutputGeneral = copy(data) 359 | if method == 'linear': 360 | npdata = toOutputGeneral.generalData 361 | strided = get_strided(npdata, periods) 362 | toOutput = strided.mean(axis=1) # Todo: 使得信号出现太多NaN, 可能需要调整 363 | toOutputGeneral.generalData = toOutput 364 | elif method == 'exp': 365 | pass 366 | else: 367 | print('non-existing method when smoothing') 368 | return (toOutputGeneral) 369 | 370 | 371 | # %% 372 | if __name__ == '__main__': 373 | ss = SignalSynthesis() 374 | -------------------------------------------------------------------------------- 
/report/DataPreProcessing.py: -------------------------------------------------------------------------------- 1 | __all__ = ['FactorDeExtremeMethod', 'FactorStandardizeMethod', 'GroupingMethod', 'WeightMethod', 2 | 'FactorNeutralizeMethod'] 3 | import pandas as pd 4 | import numpy as np 5 | import statsmodels.api as sm 6 | 7 | 8 | class FactorDeExtremeMethod(object): 9 | """去极值方法""" 10 | def __init__(self, **kwargs): 11 | pass 12 | 13 | @classmethod 14 | def Method_Median(cls, factor_series, multiple=5.2): 15 | """ 16 | 中位数去极值法,参考天软文档 17 | 参数 multiple 是用于计算上下轨的倍数, 默认值为5.2 18 | factor_series中可能有空值nan,计算均值、中位数等统计量时会跳过空值 19 | """ 20 | median = factor_series.dropna().median() 21 | distance_to_median = (factor_series - median).abs() # 每个数据点与中位数的距离 22 | median_of_distance = distance_to_median.dropna().median() # 中位数距离的中位数 23 | 24 | upper_limit = median + multiple * median_of_distance # 上轨 25 | lower_limit = median - multiple * median_of_distance # 下轨 26 | # 替换数据 27 | result = factor_series.copy() 28 | result[result > upper_limit] = upper_limit 29 | result[result < lower_limit] = lower_limit 30 | return result 31 | 32 | @classmethod 33 | def Method_Mean_Std(cls, factor_series, multiple=3): 34 | """ 35 | n倍标准差法,参考天软文档 36 | 参数 multiple 是用于计算上下轨的倍数, 默认值为3 37 | factor_series中可能有空值nan,计算均值、中位数等统计量时会跳过空值 38 | """ 39 | mean = factor_series.dropna().mean() 40 | std = factor_series.dropna().std() 41 | 42 | upper_limit = mean + multiple * std # 上轨 43 | lower_limit = mean - multiple * std # 下轨 44 | # 替换数据 45 | result = factor_series.copy() 46 | result[result > upper_limit] = upper_limit 47 | result[result < lower_limit] = lower_limit 48 | return result 49 | 50 | @classmethod 51 | def Method_Quantile(cls, factor_series, multiple=1.5): 52 | """ 53 | n倍标准差法,参考天软文档 54 | 参数 multiple 是用于计算上下轨的倍数, 默认值为3 55 | factor_series中可能有空值nan,计算均值、中位数等统计量时会跳过空值 56 | """ 57 | quantile = factor_series.dropna().quantile([0.25, 0.5, 0.75]) 58 | gap1 = quantile[0.75] - quantile[0.5] 59 | gap2 = quantile[0.5] - quantile[0.25] 60 | 61 | upper_limit = quantile[0.75] + multiple * gap1 # 上轨 62 | lower_limit = quantile[0.25] - multiple * gap2 # 下轨 63 | # 替换数据 64 | result = factor_series.copy() 65 | result[result > upper_limit] = upper_limit 66 | result[result < lower_limit] = lower_limit 67 | return result 68 | 69 | 70 | class FactorStandardizeMethod(object): 71 | def __init__(self): 72 | pass 73 | 74 | @classmethod 75 | def Method_Z_Score(cls, factor_series): 76 | # z-score标准化 77 | mean = factor_series.dropna().mean() 78 | std = factor_series.dropna().std() 79 | return (factor_series - mean) / std 80 | 81 | @classmethod 82 | def Method_0_1(cls, factor_series): 83 | # [0, 1]正规化 84 | return (factor_series - factor_series.min()) / (factor_series.max() - factor_series.min()) 85 | 86 | @classmethod 87 | def Method_1_1(cls, factor_series): 88 | # [-1, 1]标准化 89 | return 2 * (factor_series - factor_series.min()) / (factor_series.max() - factor_series.min()) - 1 90 | 91 | @classmethod 92 | def Method_Percentile(cls, factor_series): 93 | # 百分比打分法 94 | ss = factor_series.rank() 95 | ss /= ss.max() 96 | return ss 97 | 98 | 99 | class FactorNeutralizeMethod(object): 100 | def __init__(self): 101 | pass 102 | 103 | @classmethod 104 | def Method_Residual(cls, factor_series, control_factor_df): 105 | # 线性回归不能出现NAN,此处仅取出y和X都没NaN的行 106 | dropna_df = pd.concat([factor_series, control_factor_df], axis=1).dropna() 107 | # 把行业等标签类因子变为哑变量 108 | dff = pd.get_dummies(dropna_df) 109 | # 回归 110 | result = sm.OLS(np.array(dropna_df.iloc[:, 0]), 
np.array(dff.iloc[:, 1:])).fit() 111 | # 把回归的残差放回原有的日期集合里 112 | result_sr = pd.Series(index=factor_series.index) 113 | result_sr.loc[dropna_df.index] = result.resid 114 | 115 | return result_sr 116 | 117 | 118 | class GroupingMethod(object): 119 | def __init__(self): 120 | pass 121 | 122 | @classmethod 123 | def Method_Blank_QCut(cls, df, group_num, group_list, weight_method, max_stock_num): 124 | """ 125 | 空白Qcut,即不做任何因子的控制,直接对universe的股票按score进行QCut。 126 | :param df: universe df,index是股票wind代码,columns至少要有filter和score这两列。 127 | :param group_num: 分组数 128 | :param group_list: 分组列表 ['Qgroup_num', 'Qgroup_num-1', ... , 'Q2', 'Q1'] 129 | :param weight_method: 配权方式 130 | :param max_stock_num: 细分小组内最大的选股数量 131 | :return: result_dict: keys是group_list里的分组名称,values是Series,Series的index是股票代码,values是权重 132 | """ 133 | filter_df = df[df['filter']] 134 | filter_df['grouping'] = pd.qcut(filter_df['score'], group_num, group_list, duplicates='drop') 135 | df.loc[filter_df.index, 'grouping'] = filter_df['grouping'] 136 | 137 | result_dict = dict() 138 | # 得出各组的权重 139 | for group_label in group_list: 140 | temp_df = filter_df[filter_df['grouping'] == group_label] 141 | temp_df = temp_df.sort_values('score', ascending=False).head(max_stock_num) # 限制股票数量 142 | if weight_method == 'EW': 143 | temp_df['weight'] = 1 / len(temp_df) 144 | weight_series = temp_df['weight'] 145 | elif weight_method == 'LVW': 146 | weight_series = np.sqrt(temp_df['circulating_market_cap']) / \ 147 | np.sqrt(temp_df['circulating_market_cap']).sum() 148 | elif weight_method == 'VW': 149 | weight_series = np.sqrt(temp_df['market_cap']) / \ 150 | np.sqrt(temp_df['market_cap']).sum() 151 | result_dict[group_label] = weight_series 152 | return result_dict 153 | 154 | @classmethod 155 | def Method_Group_By_Benchmark(cls, df, benchmark_df, control_dict, group_num, group_list, factor_group_list, 156 | weight_method, max_stock_num): 157 | """ 158 | 根据benchmark的因子分布来做中性配权。 159 | :param df: universe df,index是股票wind代码,columns至少要有filter和score这两列和control_dict里提到的因子。 160 | :param benchmark_df: benchmark股票池的因子df,与df的结构一致 161 | :param control_dict: 需要控制的因子,若不为空必须是OrderDict,需要控制的因子名称为key,分组数量为value, 162 | 非数字型因子,如行业分类,则value为空字符串 : ""。将按顺序依次控制分组。 163 | 例如:OrderedDict([('申万一级行业', ''), ('流通市值', 3)]) 164 | :param group_num:分组数 165 | :param group_list:分组列表 ['Qgroup_num', 'Qgroup_num-1', ... 
, 'Q2', 'Q1'] 166 | :param factor_group_list:根据control_dict提到的因子进行分组的组名list 167 | :param weight_method:配权方式 168 | :param max_stock_num:细分小组内最大的选股数量 169 | :return:result_dict: keys是group_list里的分组名称,values是Series,Series的index是股票代码,values是权重 170 | result_count_df:每个细分小组内的最大选股数量 171 | """ 172 | # 只对在filter方法中过滤得出的股票进行打分和分组。 173 | filter_df = df[df['filter']] 174 | 175 | is_numeric_dict = {} 176 | count = 0 # 记录控制到第几层 177 | # 先对基准篮子分组 178 | for control_factor, control_group_num in control_dict.items(): 179 | # 判断是数字型因子还是标签型因子,比如PE-TTM就是数字型,行业就是标签型。 180 | is_numeric = pd.api.types.is_numeric_dtype(benchmark_df[control_factor]) 181 | is_numeric_dict[control_factor] = is_numeric 182 | # 第1层时,不需要先通过上一层控制后再分组。 183 | if count == 0: 184 | if is_numeric: 185 | # 按指数成份股的因子划定分位区间 186 | benchmark_df['group_' + control_factor] = pd.qcut(benchmark_df[control_factor], 187 | control_group_num, duplicates='drop', 188 | precision=10) 189 | else: 190 | benchmark_df['group_' + control_factor] = benchmark_df[control_factor] 191 | # 非第1层时,需要先从第1层到上一层,层层控制下来再分组。所以要先groupby(factor_group_list[:count]) 192 | else: 193 | if is_numeric: 194 | def q_cut_func(x): return pd.qcut(x, min(control_group_num, len(x)), duplicates='drop', precision=10) 195 | gb = benchmark_df.groupby(factor_group_list[:count]) 196 | benchmark_df['group_' + control_factor] = gb[control_factor].transform(q_cut_func) 197 | count += 1 198 | 199 | every_group_selected_stock_dict = dict() # 记录各组选出的股票,及其权重。二重dict 200 | for group_label in group_list: 201 | every_group_selected_stock_dict[group_label] = dict() 202 | 203 | gb = benchmark_df.groupby(factor_group_list) 204 | result_count_list = [] 205 | # 从股票池选出对应细分小组的股票 206 | for index, temp_df in gb: 207 | pos_df = filter_df.copy() 208 | # 按照基准股票池的分组标准取出股票池对应的股票到pos_df 209 | if isinstance(index, str) or isinstance(index, pd.Interval): 210 | index = [index] 211 | for i, sub_group in enumerate(index): 212 | control_factor_list = list(control_dict.keys()) 213 | # 数字型 214 | if isinstance(sub_group, pd.Interval): 215 | pos_df = pos_df[(sub_group.left < pos_df[control_factor_list[i]]) & 216 | (pos_df[control_factor_list[i]] <= sub_group.right)] 217 | # 标签型 218 | else: 219 | pos_df = pos_df[pos_df[control_factor_list[i]] == sub_group] 220 | if len(pos_df) == 0: 221 | break 222 | # length需要drop_duplicates,不然有些score重复值比较多的pos_df会不够分 223 | length = len(pos_df['score'].drop_duplicates()) 224 | if 0 < length <= group_num: 225 | # 如果股票池在这个细分小组里面的股票数量不够分组数,那么全选。 226 | if weight_method == 'EW': 227 | pos_df['weight'] = temp_df['权重'].sum() / length 228 | elif weight_method == 'LVW': 229 | pos_df['weight'] = temp_df['权重'].sum() * np.sqrt(pos_df['circulating_market_cap']) / \ 230 | np.sqrt(pos_df['circulating_market_cap']).sum() 231 | elif weight_method == 'VW': 232 | pos_df['weight'] = temp_df['权重'].sum() * np.sqrt(pos_df['market_cap']) / \ 233 | np.sqrt(pos_df['market_cap']).sum() 234 | 235 | for group_label in group_list: 236 | # 记录选中的股票和权重 237 | every_group_selected_stock_dict[group_label].update(pos_df['weight'].to_dict()) 238 | # 记录选股数量 239 | result_count_list.append(list(index) + [group_label, len(pos_df)]) 240 | 241 | elif group_num < length: 242 | # 如果股票池在这个细分小组里的股票数量大于分组数,那么可以按score去排序分层 243 | try: 244 | pos_df['grouping'] = pd.qcut(pos_df['score'], group_num, labels=group_list, 245 | duplicates='drop', precision=10) 246 | except: 247 | pos_df['grouping'] = pd.cut(pos_df['score'], group_num, labels=group_list, 248 | duplicates='drop', precision=10) 249 | ggb = pos_df.groupby('grouping') 250 | for group_label, 
temp_df1 in ggb: 251 | temp_df1 = temp_df1.sort_values('score', ascending=False).head(max_stock_num) 252 | if weight_method == 'EW': 253 | temp_df1['weight'] = temp_df['权重'].sum() / len(temp_df1) 254 | weight_series = temp_df1['weight'] 255 | elif weight_method == 'LVW': 256 | weight_series = temp_df['权重'].sum() * np.sqrt(temp_df1['circulating_market_cap']) / \ 257 | np.sqrt(temp_df1['circulating_market_cap']).sum() 258 | elif weight_method == 'VW': 259 | weight_series = temp_df['权重'].sum() * np.sqrt(temp_df1['market_cap']) / \ 260 | np.sqrt(temp_df1['market_cap']).sum() 261 | 262 | every_group_selected_stock_dict[group_label].update(weight_series.to_dict()) 263 | result_count_list.append(list(index) + [group_label, len(temp_df1)]) 264 | 265 | else: 266 | # 在股票备选池里找不到任何一个落在此分组区间的股票,直接把基准的股票拿来代替. 267 | for group_label in group_list: 268 | every_group_selected_stock_dict[group_label].update(temp_df['权重'].to_dict()) 269 | result_count_list.append(list(index) + [group_label, len(temp_df)]) 270 | 271 | result_count_df = pd.DataFrame(result_count_list, columns=factor_group_list + ['grouping', '数量']) 272 | result_dict = dict() 273 | for group_label in group_list: 274 | weight_series = pd.Series(every_group_selected_stock_dict[group_label]) 275 | # weight_series /= weight_series.sum() # 归一化 276 | result_dict[group_label] = weight_series 277 | return result_dict, result_count_df 278 | 279 | @classmethod 280 | def Method_Group_By_Universe(cls, df, benchmark_df, control_dict, group_num, group_list, factor_group_list, 281 | weight_method, max_stock_num): 282 | """ 283 | 根据universe的因子分布来分组。 284 | :param df: universe df,index是股票wind代码,columns至少要有filter和score这两列和control_dict里提到的因子。 285 | :param benchmark_df: benchmark股票池的因子df,与df的结构一致 286 | :param control_dict: 需要控制的因子,若不为空必须是OrderDict,需要控制的因子名称为key,分组数量为value, 287 | 非数字型因子,如行业分类,则value为空字符串 : ""。将按顺序依次控制分组。 288 | 例如:OrderedDict([('申万一级行业', ''), ('流通市值', 3)]) 289 | :param group_num:分组数 290 | :param group_list:分组列表 ['Qgroup_num', 'Qgroup_num-1', ... 
, 'Q2', 'Q1'] 291 | :param factor_group_list:根据control_dict提到的因子进行分组的组名list 292 | :param weight_method:配权方式 293 | :param max_stock_num:细分小组内最大的选股数量 294 | :return:result_dict: keys是group_list里的分组名称,values是Series,Series的index是股票代码,values是权重 295 | result_count_df:每个细分小组内的最大选股数量 296 | """ 297 | # 只对在filter方法中过滤得出的股票进行打分和分组。 298 | filter_df = df[df['filter']].copy() 299 | # 1.分组 300 | is_numeric_dict = {} 301 | count = 0 302 | for control_factor, control_group_num in control_dict.items(): 303 | # 判断是数字型因子还是标签型因子,比如PE-TTM就是数字型,行业就是标签型。 304 | is_numeric = pd.api.types.is_numeric_dtype(filter_df[control_factor]) 305 | is_numeric_dict[control_factor] = is_numeric 306 | if count == 0: 307 | if is_numeric: 308 | # 按指数成份股的因子划定分位区间 309 | filter_df['group_' + control_factor] = pd.qcut(filter_df[control_factor], 310 | control_group_num, duplicates='drop') 311 | else: 312 | filter_df['group_' + control_factor] = filter_df[control_factor] 313 | else: 314 | if is_numeric: 315 | def q_cut_func(x): return pd.qcut(x, min(control_group_num, len(x)), duplicates='drop') 316 | gb = filter_df.groupby(factor_group_list[:count]) 317 | filter_df['group_' + control_factor] = gb[control_factor].transform(q_cut_func) 318 | count += 1 319 | 320 | every_group_selected_stock_dict = dict() # 各组选出的股票。key是分组Q1、Q2...value是list 321 | for group_label in group_list: 322 | every_group_selected_stock_dict[group_label] = [] 323 | gb = filter_df.groupby(factor_group_list) 324 | result_count_list = [] 325 | # 2.从股票池选出对应细分小组的股票 326 | for index, temp_df in gb: 327 | pos_df = temp_df.copy() 328 | length = len(pos_df) 329 | if 0 < length < group_num: 330 | # 如果股票池在这个细分小组里面的股票数量不够分组数,那么全选。 331 | for group_label in group_list: 332 | every_group_selected_stock_dict[group_label] += list(pos_df.index) 333 | result_count_list.append(list(index) + [group_label, len(pos_df)]) 334 | elif group_num < length: 335 | # 如果股票池在这个细分小组里的股票数量大于分组数,那么可以按score去排序分层 336 | pos_df['grouping'] = pd.qcut(pos_df['score'], group_num, labels=group_list) 337 | ggb = pos_df.groupby('grouping') 338 | for group_label, temp_df1 in ggb: 339 | temp_df1 = temp_df1.sort_values('score', ascending=False).head(max_stock_num) 340 | every_group_selected_stock_dict[group_label] += list(temp_df1.index) 341 | result_count_list.append(list(index) + [group_label, len(temp_df1)]) 342 | result_count_df = pd.DataFrame(result_count_list, columns=factor_group_list + ['grouping', '数量']) 343 | # 3.配权 344 | result_dict = dict() 345 | for group_label in group_list: 346 | weight_series = WeightMethod.Method_Label_Neutral( 347 | filter_df.loc[every_group_selected_stock_dict[group_label], :], benchmark_df, method=weight_method) 348 | result_dict[group_label] = weight_series 349 | return result_dict, result_count_df 350 | 351 | 352 | class WeightMethod(object): 353 | def __init__(self): 354 | pass 355 | 356 | @classmethod 357 | def Method_Label_Neutral(cls, universe_df, benchmark_df, label_name='industry_zx1_name', method='LVW'): 358 | # 此函数用于生成在某标签上的权重与基准一致(中性)的股票篮子series。同标签的默认总市值加权。常用于行业中性. 
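        # For every label present in the benchmark, the universe stocks carrying that
        # label receive the benchmark's total weight for the label, split equally (EW)
        # or by the square root of total / circulating market cap (VW / LVW); labels
        # with no universe stocks fall back to the benchmark's own constituents.
        # e.g. if the benchmark holds 8% in a label and the universe has two stocks
        # with that label, EW assigns 4% to each before the final renormalisation.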
359 | result_series = pd.Series() 360 | # 361 | s = set(benchmark_df[label_name]) 362 | for label in s: 363 | temp_df = universe_df[universe_df[label_name]==label] 364 | if len(temp_df) == 0: 365 | # 如果这个行业基准有,而股票池没有,直接用基准的权重。 366 | result_series = pd.concat([result_series, benchmark_df[benchmark_df[label_name]==label]['权重']]) 367 | else: 368 | total_weight = benchmark_df[benchmark_df[label_name]==label]['权重'].sum() 369 | if method == 'EW': 370 | # 等权 371 | n = len(temp_df) 372 | for code, _ in temp_df.iterrows(): 373 | result_series.loc[code] = total_weight / n 374 | elif method == 'VW': 375 | # 总市值加权 376 | total_market_cap = np.sqrt(temp_df['market_cap']).sum() 377 | for code, row in temp_df.iterrows(): 378 | result_series.loc[code] = total_weight * np.sqrt(row['market_cap']) / total_market_cap 379 | elif method == 'LVW': 380 | # 流通市值加权 381 | total_market_cap = np.sqrt(temp_df['circulating_market_cap']).sum() 382 | for code, row in temp_df.iterrows(): 383 | result_series.loc[code] = total_weight * np.sqrt(row['circulating_market_cap']) / total_market_cap 384 | 385 | else: 386 | return None 387 | return result_series / result_series.sum() 388 | -------------------------------------------------------------------------------- /report/FactorAnalyse.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.append('..') 3 | 4 | from copy import deepcopy 5 | from collections import OrderedDict 6 | from datetime import timedelta 7 | import pandas as pd 8 | import numpy as np 9 | import scipy.stats as st 10 | from scipy.optimize import curve_fit 11 | from tqdm.notebook import tqdm 12 | import statsmodels.api as sm 13 | from GetData.backtestDataApi import BacktestDataApi as WXDBReader 14 | from BackTesting.DataPreProcessing import GroupingMethod 15 | 16 | 17 | class GroupingTestResultAnalyser: 18 | """ 19 | 这个类用于打包回测结果,以及进行一些回测结果的分析 20 | """ 21 | benchmark_id = '' # 基准指数的代码 22 | group_list = None # 分组的序号,如Q1、Q2、Q3等。 23 | 24 | all_group_ret_df = None # 记录每个分组的净值,index是datetime格式日期,columns是Q1、Q2....,values是净值 25 | ic_series = None # 记录持有期IC的series 26 | turnover_rate_df = None # 记录每个分组的换手率,index是datetime格式日期,columns是Q1、Q2...,values是换手率 27 | result_count_series_dict = None # 记录每个持有期内的分组股数,key是refresh_date,values是series,详见notebook 28 | half_time_period_series = None # 记录每次的持仓的半衰期,index是refresh_date,values是半衰期 29 | daily_ic_df = None # 记录每次持仓IC随持有天数的变化,index是持有天数,columns是refresh_date,values是ic 30 | regression_result_df = None # 记录每次持仓的回归结果,index是refresh_date, columns是回归结果。 31 | industry_alpha_df = None # 记录每次持仓各行业的超额收益,index是refresh_date, columns行业 32 | 33 | ret_df = None # 用于记录回测所得净值的收益率,columns有p、b、excess,分别代表持仓、基准、超额。 34 | nav_df = None # 用于记录回测所得净值,columns有p、b、excess,分别代表持仓、基准、超额。 35 | 36 | # 年化收益率,每年按照365天计算, 参考的是万得计算标准 37 | DAILY_ANNUAL_FACTOR = 365 38 | # 年化 1.5% 39 | RISK_FREE_RATE = 3.0 / 100 / DAILY_ANNUAL_FACTOR 40 | 41 | def __init__(self, result_dict): 42 | for k, v in result_dict.items(): 43 | self.__dict__[k] = v 44 | self.group_list.reverse() # 原本的group_list是Qn...Q3、Q2、Q1这样的顺序,这里把他反过来,从1开始。 45 | self.ret_df = self.all_group_ret_df[[self.group_list[0], self.benchmark_id]] # 取Q1作为'p' 46 | self.ret_df.columns = ('p', 'b') # position 和 benchmark 47 | self.ret_df['excess'] = self.ret_df['p'] - self.ret_df['b'] 48 | self.ret_df = self.ret_df.astype(float) # 没有这一步的话后面计算beta、最大回撤等指标时会报错。 49 | 50 | self.nav_df = (self.ret_df + 1).cumprod() 51 | 52 | def get_annual_return_statistic(self): 53 | """ 54 | 生成回测净值的年度统计 55 | :return:年度统计表 dataframe 
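                 Rows are the calendar years covered by the backtest plus a final
                 '全区间' row summarising the whole period.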
56 | """ 57 | result_df = pd.DataFrame(columns=('累计收益(%)', '年化收益(%)', '最大回撤(%)', 58 | '年化超额收益(%)', '超额最大回撤(%)', 59 | '年化alpha(%)', 'Beta', '跟踪误差(%)', '信息比率', 60 | 'Sharpe比率', '超额Sharpe比率', 'Calmar比率', '超额Calmar比率')) 61 | for year, annual_df in self.ret_df.groupby(self.ret_df.index.year): 62 | # 把净值按年切割,传入分析函数进行净值分析 63 | result_df.loc[year] = self._get_period_return_statistic(annual_df) 64 | # 最后算全区间的净值分析 65 | result_df.loc['全区间'] = self._get_period_return_statistic(self.ret_df) 66 | return result_df 67 | 68 | def _get_period_return_statistic(self, annual_df_): 69 | """ 70 | :param annual_df_: 一段时间内的净值Dataframe,需要有'p'和'b'两列,分别是组合和基准的净值日收益率 71 | :return: d: 一个dict,包含了所有年度统计中需要的字段 72 | """ 73 | annual_df = annual_df_.copy().astype(float) # 没有astype(float)的话后面计算beta、最大回撤等指标时会报错。 74 | annual_df['exceed'] = annual_df['p'] - annual_df['b'] 75 | ann_factor = self.DAILY_ANNUAL_FACTOR 76 | rf = self.RISK_FREE_RATE 77 | 78 | days_delta = annual_df.index[-1] - annual_df.index[0] 79 | num_days = days_delta.days # 待分析的净值共持续多少自然日 80 | 81 | d = dict() 82 | d['累计收益(%)'] = (1 + annual_df['p']).cumprod()[-1] - 1 83 | d['年化收益(%)'] = (1. + d['累计收益(%)']) ** (ann_factor / num_days) - 1 84 | 85 | cov = annual_df[['p', 'b']].cov().values 86 | d['Beta'] = cov[0, 1] / cov[1, 1] 87 | 88 | adj_returns = annual_df['p'] - rf # 减无风险收益率,下同 89 | adj_factor_returns = annual_df['b'] - rf 90 | adj_excess_returns = annual_df['exceed'] - rf 91 | alpha_series = adj_returns - (d['Beta'] * adj_factor_returns) 92 | # alpha_series 可能存在缺失值 93 | alpha_series = alpha_series.dropna() 94 | # 计算累计alpha收益率 95 | cum_returns_final = (alpha_series + 1).cumprod(axis=0)[-1] - 1 96 | d['年化alpha(%)'] = (1. + cum_returns_final) ** (ann_factor / num_days) - 1 97 | 98 | # 计算累计alpha收益率 99 | cum_returns_final = (annual_df['exceed'] + 1).cumprod()[-1] - 1 100 | d['年化超额收益(%)'] = (1. 
+ cum_returns_final) ** (ann_factor / num_days) - 1 101 | 102 | nav_data = (1 + annual_df['p']).cumprod() 103 | max_return = np.fmax.accumulate(nav_data) 104 | try: 105 | d['最大回撤(%)'] = (np.nanmin((nav_data - max_return) / max_return)).min() 106 | except: 107 | d['最大回撤(%)'] = np.nan 108 | 109 | excess_nav_data = (1 + annual_df['exceed']).cumprod() 110 | max_return = np.fmax.accumulate(excess_nav_data) 111 | try: 112 | d['超额最大回撤(%)'] = (np.nanmin((excess_nav_data - max_return) / max_return)).min() 113 | except: 114 | d['超额最大回撤(%)'] = np.nan 115 | 116 | try: 117 | # ddof=1,使得 算std时的分母为n-1。 118 | d['Sharpe比率'] = np.mean(adj_returns) / np.std(annual_df['p'], ddof=1) * \ 119 | np.sqrt(ann_factor) 120 | except: 121 | d['Sharpe比率'] = np.nan 122 | 123 | try: 124 | d['超额Sharpe比率'] = np.mean(adj_excess_returns) / np.std(annual_df['p'], ddof=1) * \ 125 | np.sqrt(ann_factor) 126 | except: 127 | d['超额Sharpe比率'] = np.nan 128 | 129 | try: 130 | d['Calmar比率'] = - d['年化收益(%)'] / d['最大回撤(%)'] 131 | except: 132 | d['Calmar比率'] = np.nan 133 | 134 | try: 135 | d['超额Calmar比率'] = - d['年化超额收益(%)'] / d['超额最大回撤(%)'] 136 | except: 137 | d['超额Calmar比率'] = np.nan 138 | 139 | d['跟踪误差(%)'] = np.nanstd(annual_df['exceed'], ddof=1) 140 | 141 | try: 142 | d['信息比率'] = np.nanmean(annual_df['exceed']) / np.nanstd(annual_df['exceed'], ddof=1) 143 | except: 144 | d['信息比率'] = np.nan 145 | 146 | return d 147 | 148 | 149 | class FactorAnalyserBase(object): 150 | start_date = None # 回测开始日期 151 | end_date = None # 回测结束日期 152 | benchmark_id = None # 基准ID 153 | universe = None # 选股股票池,可选'全A'、'沪深300'、'中证500' 154 | cost_rate = None # 回测时单边交易费率 155 | change_date_method = None # 换仓日期模式,月初换、月末换、自定义。 156 | customized_universe = None # 自定义股票池,尚未支持,占坑。 157 | all_trade_days = None # 中国A股交易日序列 158 | recalculate_date_list = None # 重新计算下期持仓的日期序列 159 | refresh_date_list = None # 重新换仓的日期序列, 应与recalculate_date_list等长且比其滞后至少1日。 160 | raw_universe_df_dict = None # 用于储存在每个recalculate_date里universe的股票因子数据,不对这里的数据清洗、加工 161 | processed_universe_df_dict = None # 用于储存在每个recalculate_date里universe的股票因子数据,所有的数据清洗、加工在此 162 | benchmark_weight_df_dict = None # 用于储存在每个recalculate_date里的基准指数的股票因子数据及权重 163 | 164 | def __init__(self, director, start_date, end_date, benchmark_id, universe, props): 165 | """ 166 | :param director: 计算信号的director 167 | :param start_date:回测开始日期 168 | :param end_date:回测结束日期 169 | :param benchmark_id:基准ID 170 | :param universe:选股股票池 171 | :param props:其余参数,目前可设置:单边交易费率、换仓日期模式、自定义换仓日期、自定义篮子。 172 | """ 173 | self.start_date = start_date 174 | self.end_date = end_date 175 | self.benchmark_id = benchmark_id 176 | self.universe = universe # 如果要自定义universe,传None 177 | 178 | self.cost_rate = props.get('单边交易费率', 0.0015) # 单边 179 | self.change_date_method = props.get('换仓日期模式', '月初换') # 可选“月初换”, “月底换”,“自定义” 180 | # 如果change_date_method取了自定义,那么要把具体的日期序列传递给change_date_list 181 | self.change_date_list = props.get('自定义换仓日期', None) # 数据类型是list 182 | self.customized_universe = props.get('自定义篮子', None) # pd.DataFrame, columns=('wind_code','date') 183 | 184 | self.all_trade_days = pd.DatetimeIndex(WXDBReader.get_all_trade_days()) 185 | 186 | self.recalculate_date_list = None 187 | self.refresh_date_list = None 188 | self.raw_universe_df_dict = dict() 189 | self.processed_universe_df_dict = dict() 190 | self.benchmark_weight_df_dict = dict() 191 | 192 | self.director = director 193 | general_data = self.director.run()["shiftedReturn"] 194 | self.signals = general_data.to_DataFrame() 195 | 196 | def _get_date_list(self): 197 | """重新算持仓日期和调仓日期""" 198 | 
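        # recalculate_date_list holds the dates on which the next holdings are computed
        # from factor data; refresh_date_list holds the corresponding later trading days
        # on which the portfolio is actually rebalanced at the close.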
all_trade_days = self.all_trade_days 199 | if self.change_date_method == '月初换': 200 | # 根据上个月月底因子数据计算篮子,月初第一天下午收盘换仓 201 | init_date = all_trade_days[all_trade_days <= self.start_date][-1] 202 | recalculate_date_list = [init_date] # 重新计算篮子的日期 203 | refresh_date_list = [all_trade_days[all_trade_days > init_date][0]] # 换仓的日期 204 | trade_days = all_trade_days[(all_trade_days > self.start_date) & (all_trade_days <= self.end_date)] 205 | for i, date in enumerate(trade_days): 206 | # -2,因为i得出recalculate_date,而refresh_date比它长一天,另外refresh_date如果在-1则毫无意义。 207 | # 因为这意味着在回测的最后一天收盘时才买入新篮子。 208 | if i >= len(trade_days) - 2: 209 | continue 210 | else: 211 | if trade_days[i].month != trade_days[i + 1].month: 212 | recalculate_date_list.append(date) 213 | refresh_date_list.append(trade_days[i + 1]) 214 | elif self.change_date_method == '月末换': 215 | # 根据上个月倒数第二个交易日的因子数据计算篮子,倒数第一个交易日下午收盘换仓 216 | init_date = all_trade_days[all_trade_days <= self.start_date][-1] 217 | recalculate_date_list = [init_date] # 重新计算篮子的日期 218 | refresh_date_list = [all_trade_days[all_trade_days > init_date][0]] # 换仓的日期 219 | trade_days = all_trade_days[(all_trade_days > self.start_date) & (all_trade_days <= self.end_date)] 220 | for i, date in enumerate(trade_days): 221 | if i >= len(trade_days) - 2: 222 | continue 223 | else: 224 | if trade_days[i].month != trade_days[i + 1].month: 225 | if i != 0: 226 | recalculate_date_list.append(trade_days[i-1]) 227 | refresh_date_list.append(trade_days[i]) 228 | elif self.change_date_method == '自定义': 229 | refresh_date_list = self.change_date_list 230 | # recalculate_date选在每个refresh_day的前一天 231 | recalculate_date_list = [all_trade_days[all_trade_days= self.start_date) & (all_trade_days <= self.end_date)] 234 | refresh_date_list = list(in_period_days[1:]) 235 | recalculate_date_list = list(in_period_days[:-1]) 236 | else: 237 | self.log('参数设定错误,不存在这种换仓日期模式') 238 | return 239 | self.recalculate_date_list, self.refresh_date_list = recalculate_date_list, refresh_date_list 240 | 241 | def prepare_data(self): 242 | self._get_date_list() 243 | 244 | for date in self.recalculate_date_list: 245 | # 没有指定自定义的篮子 246 | if self.customized_universe is None: 247 | 248 | # 按日期记录因子数据到dict里 249 | self.raw_universe_df_dict[date] = WXDBReader.get_universe(self.universe, date) 250 | 251 | # 按日期记录指数权重数据到dict里 252 | self.benchmark_weight_df_dict[date] = WXDBReader.get_index_weight(self.benchmark_id, date) 253 | 254 | # 指定了自定义篮子 255 | else: 256 | pass 257 | # 复制一份出来,后面对这个进行数据清洗。raw_universe_df_dict里的df不去进行任何去极值、标准化、中性化等操作,这些 258 | # 操作全部都在processed_universe_df_dict里的df上操作。 259 | # 1.为的是回测完能够对选出来的组合观察因子原始值的分布,比如市值的分布、市盈率的分布等操作。 260 | # 2.如果选择分组配权方式里的group_by_benchmark模式,只能用原始值去进行选股。不然,假如把universe的因子处理了, 261 | # 再把benchmark的股票因子处理了,两者就无法拿到一起比较了。 262 | self.processed_universe_df_dict = deepcopy(self.raw_universe_df_dict) 263 | 264 | def filter(self): 265 | """ 266 | 初步过滤一些股票,如去除PE<0 的股票等,需要用户自己定义 267 | """ 268 | pass 269 | 270 | def rate_stock(self): 271 | """ 272 | 因子清洗、合成逻辑,, 需要自己定义。关键是给self.processed_universe_df_dict 的每个df加一列 'score',分数越高越好 273 | :return: 274 | """ 275 | pass 276 | 277 | def grouping_test(self, group_num, control_dict, group_by_benchmark=False, weight_method='LVW', max_stock_num=30): 278 | """ 279 | 分组收益分析,基于Score分组,由大到小分别是Q1、Q2...Q group_num,快速回测 280 | :param group_num: 分组数量 281 | :param control_dict: 需要控制的因子,若不为空必须是OrderDict,需要控制的因子名称为key,分组数量为value, 282 | 非数字型因子,如行业分类,则value为空字符串 : ""。将按顺序依次控制分组。 283 | 例如:OrderedDict([('industry_zx1_name', ''), ('circulating_market_cap', 3)]) 284 | :param 
group_by_benchmark: 是否按基准的因子去划分 285 | :param weight_method:配权方法,目前只支持EW、LVW和VW,分别是等权、流通市值平方根加权,总市值平方根加权 286 | :param max_stock_num 每个分组网格内最大的持股数 287 | :return: result_analyser:一个GroupingTestResultAnalyser类的对象。 288 | 包含8个结果。分组回测的净值、分组回测每个持有周期的IC、分组回测每次的换手率、 289 | 控制变量下每组的股票数量。。。。等。详见GroupingTestResultAnalyser类的属性定义。 290 | """ 291 | if weight_method not in ['EW', 'LVW', 'VW']: 292 | self.log('错误的权重方法:{0},权重方法必须为EW、LVW或者VW'.format(weight_method)) 293 | return 294 | result_count_series_dict = dict() 295 | # 分组 296 | group_list = ['Q' + str(x + 1) for x in range(group_num)] 297 | group_list.reverse() 298 | # 如果不需要控制变量 299 | if not control_dict: 300 | for date, df in tqdm(self.processed_universe_df_dict.items()): 301 | weight_sr_dict = GroupingMethod.Method_Blank_QCut(df, group_num, group_list, 302 | weight_method, max_stock_num) 303 | for group_label, weight_series in weight_sr_dict.items(): 304 | # 把权重添加到raw和processed的df里,列名为group_label:Q1、Q2..... 305 | self.processed_universe_df_dict[date].loc[weight_series.index, group_label] = weight_series 306 | self.raw_universe_df_dict[date].loc[weight_series.index, group_label] = weight_series 307 | result_count_series_dict[date] = df.reset_index().groupby('grouping')['code'].count().sort_index() 308 | 309 | # 控制变量 310 | else: 311 | # 根据control_dict提到的因子进行分组的组名list 312 | factor_group_list = ['group_' + x for x in control_dict.keys()] 313 | # 基于benchmark去确定分组区间 314 | if group_by_benchmark: 315 | # 这种控制变量方法下只能通过原始值去分组,而不能用处理过的因子数据。 316 | for date, df in tqdm(self.raw_universe_df_dict.items()): 317 | benchmark_df = self.benchmark_weight_df_dict.get(date, None) 318 | weight_sr_dict, result_count_df = GroupingMethod.Method_Group_By_Benchmark( 319 | df, benchmark_df, control_dict, group_num, group_list, factor_group_list, 320 | weight_method, max_stock_num) 321 | for group_label, weight_series in weight_sr_dict.items(): 322 | # 把权重添加到raw和processed的df里,列名为group_label:Q1、Q2..... 323 | self.raw_universe_df_dict[date].loc[weight_series.index, group_label] = weight_series 324 | self.processed_universe_df_dict[date].loc[weight_series.index, group_label] = weight_series 325 | result_count_series_dict[date] = result_count_df.groupby(factor_group_list + ['grouping']).sum() 326 | 327 | # 基于股票池确定因子分组区间 328 | else: 329 | for date, df in tqdm(self.processed_universe_df_dict.items()): 330 | benchmark_df = self.benchmark_weight_df_dict.get(date, None) 331 | weight_sr_dict, result_count_df = GroupingMethod.Method_Group_By_Universe( 332 | df, benchmark_df, control_dict, group_num, group_list, factor_group_list, 333 | weight_method, max_stock_num) 334 | for group_label, weight_series in weight_sr_dict.items(): 335 | # 把权重添加到raw和processed的df里,列名为group_label:Q1、Q2..... 
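                    # Weights are written into both the raw and the processed universe
                    # frames, so the raw (uncleaned) factor values of the selected stocks
                    # can still be inspected after the backtest.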
336 | self.raw_universe_df_dict[date].loc[weight_series.index, group_label] = weight_series 337 | self.processed_universe_df_dict[date].loc[weight_series.index, group_label] = weight_series 338 | result_count_series_dict[date] = result_count_df.groupby(factor_group_list + ['grouping']).sum() 339 | 340 | # 通过以上步骤,得到了记录了每个分组的选股数量的result_count_series_dict。 341 | # 且processed_universe_df_dict和raw_universe_df_dict多了Q1、Q2..列,每列是每组的权重 342 | # 以下开始回测 343 | 344 | # 用来记录回测中各分组、以及benchmark的日收益率。从第一个建仓日开始,直到回测的结束日期。 345 | all_group_ret_df = pd.DataFrame(columns=group_list+[self.benchmark_id], 346 | index=self.all_trade_days[(self.all_trade_days >= self.refresh_date_list[0]) & 347 | (self.all_trade_days <= self.end_date)]) 348 | # 下面这几个都是用来储存回测结果的空变量。具体意义可以看GroupingTestResultAnalyser的属性定义。 349 | ic_series = pd.Series(index=self.refresh_date_list) 350 | daily_ic_df = pd.DataFrame(index=self.refresh_date_list, columns=range(1, 21)) 351 | half_time_period_series = pd.Series(index=self.refresh_date_list) 352 | last_period_weight_dict = dict() 353 | turnover_rate_df = pd.DataFrame(columns=group_list, index=self.refresh_date_list) 354 | regression_result_df = pd.DataFrame(columns=('score_系数', 'score_t值', 'score_p值', 355 | 'R-squared', 'R-squared_Adj', 'F值'), 356 | index=self.refresh_date_list) 357 | industry_alpha_list = [] 358 | for i, recalculate_date in tqdm(enumerate(self.recalculate_date_list)): 359 | refresh_date = self.refresh_date_list[i] 360 | next_refresh_date = self.refresh_date_list[i+1] if i < len(self.refresh_date_list)-1 else self.end_date 361 | if refresh_date == next_refresh_date: 362 | break 363 | all_pos_df = self.processed_universe_df_dict[recalculate_date] 364 | benchmark_df = self.benchmark_weight_df_dict[recalculate_date] 365 | # 一次性取出来所有组里选到的股票和基准股票的收益率,假设在refresh_date的收盘时换仓,所以取ret数据时start_date要+1日 366 | # 此外为了计算因子半衰期,start_date~end_date要取满40天。 367 | all_stock_list = list(set(all_pos_df[group_list].dropna(how='all').index) | set(benchmark_df.index)) + \ 368 | [self.benchmark_id] 369 | trade_day_after_39 = self.all_trade_days[min(self.all_trade_days.get_loc(refresh_date)+39, 370 | len(self.all_trade_days))] 371 | all_stock_ret_df = WXDBReader.get_period_quote_timeseries(all_stock_list, refresh_date+timedelta(days=1), 372 | max(next_refresh_date, 373 | trade_day_after_39)) 374 | # 基准指数的各股票本期收益率序列 375 | benchmark_ret_df = all_stock_ret_df[benchmark_df.index] 376 | benchmark_ret_df = benchmark_ret_df[(benchmark_ret_df.index > refresh_date) & 377 | (benchmark_ret_df.index <= next_refresh_date)] 378 | if len(benchmark_ret_df) == 0: 379 | continue 380 | for group_label in group_list: 381 | # 算组合收益率时间序列 382 | pos_df = all_pos_df[all_pos_df[group_label] > 0] # 取权重大于0的出来。 383 | # 有些退市股票可能被选中,而这些股票取不到行情数据,这里先通过交集过滤得出stock_list。今后因子数据库补全 384 | # '是否退市'这个字段后可以在self.filter里通过它过滤掉,到时候删掉下面这两行代码。并解除往下第三行的注释。 385 | stock_list = list(set(pos_df.index) & set(all_stock_ret_df.columns)) 386 | ret_df = all_stock_ret_df[stock_list] 387 | 388 | # ret_df = all_stock_ret_df[pos_df.index] 389 | # 前面的收益率是取了至少40天的,这里我们要开始计算净值,所以只取两个持有期中间的日期。 390 | ret_df = ret_df[(ret_df.index > refresh_date) & (ret_df.index <= next_refresh_date)] 391 | pos_ret_series = (pos_df[group_label] * ret_df).sum(axis=1) # 组合的每日收益率 392 | 393 | # 计算组合换手率 394 | if i > 0: 395 | # 把上期和今期的组合权重合并到一张df里,并补0到nan处,计算换手率 396 | aligned_df = pd.concat([last_period_weight_dict[group_label], pos_df[group_label]], 397 | axis=1, sort=True) 398 | aligned_df.fillna(0, inplace=True) 399 | delta_weight = aligned_df.iloc[:, 1] - aligned_df.iloc[:, 0] 400 | # 

                # excess return of the Q1 portfolio in each industry
                if group_label == 'Q1':
                    pos_df = all_pos_df[all_pos_df['Q1'] > 0].copy()

                    # normalise the weights so that each industry sums to 1,
                    # i.e. treat every industry as an independent, fully invested book.
                    def func(series):
                        return series / series.sum()

                    pos_df['adj_weight'] = pos_df.groupby('industry_zx1_name')['Q1'].apply(func)
                    benchmark_df['adj_weight'] = benchmark_df.groupby('industry_zx1_name')['权重'].apply(func)
                    # multiply each stock's weight by its daily return and compound over the holding period
                    # to obtain the cumulative P&L rate.
                    pos_df['pnl_rate'] = ((1+pos_df['adj_weight'] * ret_df).cumprod()-1).iloc[-1, :]
                    benchmark_df['pnl_rate'] = ((1+benchmark_df['adj_weight'] * benchmark_ret_df).cumprod()-1).iloc[-1, :]
                    # aggregate by industry and align the two sides in one dataframe
                    aligned_df = pd.concat([pos_df.groupby('industry_zx1_name')['pnl_rate'].sum(),
                                            benchmark_df.groupby('industry_zx1_name')['pnl_rate'].sum()],
                                           axis=1, sort=True)
                    aligned_df['超额收益率'] = aligned_df.iloc[:, 0] - aligned_df.iloc[:, 1]
                    industry_alpha_list.append(aligned_df['超额收益率'])

                all_group_ret_df.loc[pos_ret_series.index, group_label] = pos_ret_series
                last_period_weight_dict[group_label] = pos_df[group_label]
            # record the benchmark index return series
            all_group_ret_df.loc[ret_df.index, self.benchmark_id] = all_stock_ret_df[self.benchmark_id].loc[ret_df.index]

            # holding-period IC and regression analysis, without grouping
            period_ret_series = ((1+all_stock_ret_df).cumprod()-1).iloc[-1, :]  # holding-period return of every stock
            period_ret_series.rename('next_period_ret', inplace=True)
            aligned_df = pd.concat([all_pos_df[['circulating_market_cap', 'industry_zx1_name', 'score']],
                                    period_ret_series], axis=1, sort=True).dropna()
            # IC
            ic_series.loc[refresh_date] = st.spearmanr(aligned_df['score'], aligned_df['next_period_ret']).correlation
            # regression
            dummied_X_df = pd.get_dummies(aligned_df[['score', 'industry_zx1_name']])  # expand the level-1 industry into dummies
            dummied_X_df['const'] = 1.  # regression intercept
            y = np.array(aligned_df['next_period_ret'])
            X = np.array(dummied_X_df)
            if len(X) > 0:
                try:
                    regression_result = sm.OLS(y, X).fit()
                    regression_result_df.loc[refresh_date, :] = [regression_result.params[0], regression_result.tvalues[0],
                                                                 regression_result.pvalues[0], regression_result.rsquared,
                                                                 regression_result.rsquared_adj, regression_result.fvalue]
                except Exception:
                    regression_result_df.loc[refresh_date, :] = np.nan
            else:
                regression_result_df.loc[refresh_date, :] = np.nan
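
            # Interpretation of the regression above: pd.get_dummies keeps the non-categorical column 'score'
            # first and appends the industry dummies after it, so params[0], tvalues[0] and pvalues[0] refer to
            # the factor score. The fit is the usual cross-sectional factor-return regression,
            #     r_i = beta * score_i + sum_k gamma_k * industry_{i,k} + c + eps_i,
            # and beta can be read as the per-period return earned by one unit of (typically standardised)
            # score after controlling for industry membership.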

            # IC half-life, without grouping
            # for j in range(len(all_stock_ret_df)):
            #     aligned_df = pd.concat([all_pos_df['score'], all_stock_ret_df.iloc[j, :]], axis=1, sort=True).dropna()
            #     daily_ic_df.loc[refresh_date, j+1] = st.spearmanr(aligned_df.iloc[:, 0], aligned_df.iloc[:, 1]).correlation
            # half_time_period_series.loc[refresh_date] = FactorAnalyserBase.cal_half_time_period_fun(daily_ic_df.loc[refresh_date])
            # print(recalculate_date.strftime("%Y-%m-%d") + "Done!")

        all_group_ret_df.iloc[0, :] = 0.
        industry_alpha_df = pd.concat(industry_alpha_list, axis=1, sort=False)
        if len(industry_alpha_list) == len(self.recalculate_date_list):
            industry_alpha_df.columns = self.recalculate_date_list
        else:
            # some periods may have been skipped, so only label the columns that were actually produced
            industry_alpha_df.columns = self.recalculate_date_list[:len(industry_alpha_list)]
        result_dict = {'all_group_ret_df': all_group_ret_df, 'ic_series': ic_series,
                       'turnover_rate_df': turnover_rate_df, 'result_count_series_dict': result_count_series_dict,
                       'half_time_period_series': half_time_period_series, 'daily_ic_df': daily_ic_df,
                       'regression_result_df': regression_result_df, 'group_list': group_list,
                       'benchmark_id': self.benchmark_id, 'industry_alpha_df': industry_alpha_df}
        result_analyser = GroupingTestResultAnalyser(result_dict)
        return result_analyser

    @classmethod
    def cal_half_time_period_fun(cls, daily_ic_series):
        """
        Compute the factor half-life; see the notebook for the exact derivation.
        :param daily_ic_series: pd.Series whose index is the holding horizon in days (1~60 by default)
                                and whose values are the corresponding ICs.
        :return: half_time_period: float, the half-life implied by the fitted decay
        """
        def func(x, a, b):
            return a * np.exp(-b * x)
        _daily_ic_series = daily_ic_series.dropna()
        X = np.linspace(1, len(_daily_ic_series), len(_daily_ic_series))
        y = np.array(_daily_ic_series, dtype=np.float32)
        # fit the parameters a, b of the function `func`
        popt1, pcov1 = curve_fit(func, X, y)
        half_time_period = np.log(2) / popt1[1]
        return half_time_period

    def set_filter(self, filter_series, date):
        self.raw_universe_df_dict[date]['filter'] = filter_series
        self.processed_universe_df_dict[date]['filter'] = filter_series

    def set_score(self, score_series, date):
        self.raw_universe_df_dict[date]['score'] = score_series
        self.processed_universe_df_dict[date]['score'] = score_series

    def log(self, context):
        print(context)


# class RegressionMultipleFactorAnalyser(FactorAnalyserBase):
#     pass
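
# A minimal, self-contained sketch of the exponential-decay fit performed by cal_half_time_period_fun above,
# run on synthetic IC data. The helper name and the numbers are illustrative only; nothing else in this
# module calls it.
def _demo_half_time_period():
    import numpy as np
    from scipy.optimize import curve_fit

    horizons = np.arange(1, 41, dtype=float)          # holding horizons of 1..40 days
    synthetic_ic = 0.08 * np.exp(-0.05 * horizons)    # an IC profile decaying from 0.08 at 5% per day

    def func(x, a, b):
        return a * np.exp(-b * x)

    popt, _ = curve_fit(func, horizons, synthetic_ic)
    half_life = np.log(2) / popt[1]
    # with noiseless synthetic data the fit recovers b ~= 0.05, i.e. a half-life of ln(2) / 0.05 ~= 13.9 days
    return half_life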


if __name__ == '__main__':
    import datetime
    import logging
    from sklearn.metrics import mean_squared_error
    from Tool.logger import Logger
    from Tool.DataPreProcessing import DeExtremeMethod, ImputeMethod, StandardizeMethod
    from BackTesting.Signal.SignalSynthesis import SignalSynthesis
    from BackTesting.systhesisDirector import SysthesisDirector as Director

    np.warnings.filterwarnings('ignore')

    logger = Logger("SignalDirector")
    logger.setLevel(logging.INFO)

    params = {
        "startDate": pd.to_datetime('20170301'),
        "endDate": pd.to_datetime('20201231'),
        "panelSize": 5,
        "trainTestGap": 1,
        "maskList": None,
        "deExtremeMethod": DeExtremeMethod.MeanStd(),
        "imputeMethod": ImputeMethod.JustMask(),
        "standardizeMethod": StandardizeMethod.StandardScaler(),
        "pipeline": None,
        "factorNameList": ['close', 'amount', 'open', 'high', 'low'],
        # params for XGBoost
        "modelParams": {
            "jsonPath": None,
            "paraDict": {
                "n_estimators": 50,
                "random_state": 42,
                "max_depth": 2}
        },
        # metric function for machine learning models
        "metric_func": mean_squared_error,
        # smoothing params
        "smoothing_params": None
    }

    director = Director(SignalSynthesis, params=params, logger=logger)

    class SampleStrategy(FactorAnalyserBase):
        def __init__(self, start_date, end_date, benchmark_id, universe, props):
            super(SampleStrategy, self).__init__(director, start_date, end_date, benchmark_id,
                                                 universe, props)

        def filter(self):
            for date, df in self.raw_universe_df_dict.items():
                df['上市天数'] = (date - df['ipo_date']).dt.days + 1  # days since listing, in calendar days
                self.set_filter(df['is_trading'].astype(bool) & (df['上市天数'] > 180) & df['is_exist'], date)

        def rate_stock(self):
            """
            Stock-scoring logic (de-extreme, neutralise, standardise, ...); to be defined by the user.
            """
            for date, df in self.processed_universe_df_dict.items():
                score = self.signals.loc[date, df.index.to_list()]
                self.set_score(score, date)


    fab = SampleStrategy(params['startDate'], params['endDate'], '000905.SH', '全A',
                         {"换仓日期模式": "每日换", '单边交易费率': 0.})  # props: rebalance daily, zero one-way cost
    fab.prepare_data()
    fab.filter()
    fab.rate_stock()

    start_ = datetime.datetime.now()
    # result = fab.grouping_test(5, OrderedDict([('industry_zx1_name', ''), ('circulating_market_cap', 5)]),
    #                            group_by_benchmark=True, weight_method='LVW')
    result = fab.grouping_test(5, OrderedDict([('circulating_market_cap', 5)]),
                               group_by_benchmark=True, weight_method='VW')
    print(datetime.datetime.now() - start_)
    # print(result.__dict__)
    result.get_annual_return_statistic()


--------------------------------------------------------------------------------