├── Data ├── GetData.py └── __init__.py ├── DataBase ├── __init__.py ├── database.py ├── database_sql.py └── initialize.py ├── Engine └── LogEngine.py ├── EvaluationIndicitor └── Indicator.py ├── FactorAnalysis └── FactorAnalysis.py ├── FactorCalculation ├── FactorBase.py ├── FundamentalGrowthFactor.py ├── FundamentalOperateFactor.py ├── FundamentalProfitFactor.py ├── FundamentalQualityFactor.py ├── FundamentalSolvencyFactor.py ├── FundamentalValueFactor.py ├── HighFrequencyDistributionFactor.py ├── HighFrequencyFundFlowFactor.py ├── HighFrequencyHighFreqFactor.py ├── HighFrequencyVolPriceFactor.py ├── TechnicalHighFreqFactor.py ├── TechnicalLiquidityFactor.py ├── TechnicalMiningFactor.py ├── TechnicalMomentumFactor.py ├── TechnicalSizeFactor.py └── __init__.py ├── FactorProcess └── FactorProcess.py ├── Forecast ├── ReturnForecast.py ├── RiskForecast.py ├── __init__.py └── test.py ├── LabelPool └── Labelpool.py ├── Object.py ├── Optimization.py ├── Run ├── FactorCalculate.py ├── FactorCollinearity.py ├── MultiFactorStrategy.py ├── MultiFactorTest.py ├── SingleFactorTest.py └── __init__.py ├── StockPool └── StockPool.py ├── __init__.py ├── constant.py ├── factor_direction.json └── factor_name.json /Data/GetData.py: -------------------------------------------------------------------------------- 1 | import cx_Oracle 2 | from typing import AnyStr 3 | import pandas as pd 4 | import os 5 | from constant import ( 6 | KeyName as KN, 7 | PriceVolumeName as PVN, 8 | FilePathName as FPN, 9 | SpecialName as SN, 10 | ExchangeName as EN) 11 | 12 | path = 'A:\\数据\\' 13 | 14 | file_dict = { 15 | "Industry_Index": '行业指数标识.csv', 16 | "Industry_Size": '规模指数标识.csv', 17 | "Daily_stock_market": 'AStockData_new.csv', 18 | "Stock_connect_index": "陆股通指数数据.csv", 19 | "N_Capital_position": "北上资金沪股通持仓明细.csv" 20 | } 21 | 22 | 23 | class SQL(object): 24 | con_wind = cx_Oracle.connect('ZD_RESEARCH_WIND/zdresearchwind1234$@47.112.235.108:1521/zd_research') 25 | 26 | FS = {"BST": "ASHAREBALANCESHEET", 27 | "IST": "ASHAREINCOME", 28 | "CFT": "ASHARECASHFLOW"} 29 | 30 | def __init__(self): 31 | pass 32 | 33 | def finance_SQL(self, 34 | sql_keys, 35 | date_sta: int, 36 | date_end: int, 37 | table_type: str, 38 | ): 39 | """ 40 | 41 | :param sql_keys: {财务报表表名缩写:{财务内容关键字:缩写,}},财务报表表名缩写:资产负债表[BST], 利润表[IST],现金流量表[CFT] 42 | :param date_sta: 数据起始时间 43 | :param date_end: 数据截止时间 44 | :param table_type: 报表类型:合并报表,母公司报表/调成,更改前 45 | :return: 返回Oracle SQL语句 46 | """ 47 | sql_dict = {} 48 | sql_key_dict = {} 49 | 50 | for key_, values_ in sql_keys.items(): 51 | 52 | if values_ == {}: 53 | continue 54 | 55 | sheet_name = self.FS[key_] 56 | keys_str = "" 57 | keys_str2 = "" 58 | for K_, ab in values_.items(): 59 | keys_str += ", {} {}".format(K_, ab) 60 | keys_str2 += ", {}.{}".format(key_, ab) 61 | sql_key_dict[key_] = keys_str2 62 | 63 | sql_dict[key_] = self._sql_finance(keys_str, sheet_name, table_type, date_sta, date_end) 64 | 65 | if len(sql_dict) == 1: 66 | key_list = list(sql_dict.keys()) 67 | sql = "SELECT * FROM ({})".format(sql_dict[key_list[0]]) 68 | 69 | elif len(sql_dict) == 2: 70 | key_list = list(sql_dict.keys()) 71 | sql = "SELECT {key_0}.* {key_value_1} " \ 72 | "FROM ({sql0}) {key_0} " \ 73 | "LEFT JOIN ({sql1}) {key_1} " \ 74 | "ON {key_0}.\"{code}\" = {key_1}.\"{code}\" " \ 75 | "AND {key_0}.\"{date1}\" = {key_1}.\"{date1}\" " \ 76 | "AND {key_0}.\"{date2}\" = {key_1}.\"{date2}\" " \ 77 | "AND {key_0}.\"type\" = {key_1}.\"type\" ".format(code=KN.STOCK_ID.value, 78 | date1=SN.ANN_DATE.value, 79 | date2=SN.REPORT_DATE.value, 80 | key_0=key_list[0], 81 | key_1=key_list[1], 82 | key_value_1=sql_key_dict[key_list[1]], 83 | sql0=sql_dict[key_list[0]], 84 | sql1=sql_dict[key_list[1]]) 85 | 86 | elif len(sql_dict) == 3: 87 | key_list = list(sql_dict.keys()) 88 | sql = "SELECT {key_0}.* {key_value_1} {key_value_2} " \ 89 | "FROM ({sql0}) {key_0} " \ 90 | "LEFT JOIN ({sql1}) {key_1} " \ 91 | "ON {key_0}.\"{code}\" = {key_1}.\"{code}\" " \ 92 | "AND {key_0}.\"{date1}\" = {key_1}.\"{date1}\" " \ 93 | "AND {key_0}.\"{date2}\" = {key_1}.\"{date2}\" " \ 94 | "AND {key_0}.\"type\" = {key_1}.\"type\" " \ 95 | "LEFT JOIN ({sql2}) {key_2} " \ 96 | "ON {key_1}.\"code\" = {key_2}.\"{code}\" " \ 97 | "AND {key_1}.\"{date1}\" = {key_2}.\"{date1}\" " \ 98 | "AND {key_1}.\"{date2}\" = {key_2}.\"{date2}\" " \ 99 | "AND {key_1}.\"type\" = {key_2}.\"type\" ".format(code=KN.STOCK_ID.value, 100 | date1=SN.ANN_DATE.value, 101 | date2=SN.REPORT_DATE.value, 102 | key_0=key_list[0], 103 | key_1=key_list[1], 104 | key_2=key_list[2], 105 | key_value_1=sql_key_dict[key_list[1]], 106 | key_value_2=sql_key_dict[key_list[2]], 107 | sql0=sql_dict[key_list[0]], 108 | sql1=sql_dict[key_list[1]], 109 | sql2=sql_dict[key_list[2]]) 110 | 111 | else: 112 | print("SQL ERROR!") 113 | sql = "" 114 | return sql 115 | 116 | def stock_index_SQL(self, 117 | bm_index: str, 118 | date_sta: str = '20130101', 119 | date_end: str = '20200401'): 120 | sql = "SELECT " \ 121 | "to_char(to_date(TRADE_DT ,'yyyy-MM-dd'), 'yyyy-MM-dd') \"{date}\" , " \ 122 | "S_DQ_CLOSE \"{close}\", S_DQ_OPEN \"{open}\", S_DQ_HIGH \"{high}\", S_DQ_LOW \"{low}\" " \ 123 | "FROM AINDEXEODPRICES " \ 124 | "WHERE S_INFO_WINDCODE = \'{bm_code}\' " \ 125 | "AND TRADE_DT BETWEEN {sta} AND {end} " \ 126 | "ORDER BY TRADE_DT ".format(date=KN.TRADE_DATE.value, 127 | close=PVN.CLOSE.value, 128 | open=PVN.OPEN.value, 129 | high=PVN.HIGH.value, 130 | low=PVN.LOW.value, 131 | bm_code=bm_index, 132 | sta=date_sta, 133 | end=date_end) 134 | return sql 135 | 136 | def _sql_finance(self, 137 | keys: str, 138 | f_table: str, 139 | table_type: str, 140 | date_sta: int, 141 | date_end: int): 142 | """ 143 | :param keys: ,关键字 缩写, 关键字 缩写 144 | :param f_table: 财务报表名称 145 | :param table_type: 报表类型 146 | :param date_sta:数据起始时间 147 | :param date_end:数据截止时间 148 | :return: 返回Oracle sql语句 149 | """ 150 | sql_BS = "SELECT " \ 151 | "S_INFO_WINDCODE \"{code}\", " \ 152 | "to_char(to_date(ANN_DT ,'yyyy-MM-dd'), 'yyyy-MM-dd') \"{date1}\", " \ 153 | "to_char(to_date(REPORT_PERIOD ,'yyyy-MM-dd'), 'yyyy-MM-dd') \"{date2}\", " \ 154 | "STATEMENT_TYPE \"type\" {keys} " \ 155 | "FROM {f_table} " \ 156 | "WHERE STATEMENT_TYPE = {table_type} " \ 157 | "AND REPORT_PERIOD BETWEEN {sta} AND {end} " \ 158 | "AND regexp_like(S_INFO_WINDCODE, '^[0-9]') " \ 159 | "ORDER BY \"{date1}\" ".format(code=KN.STOCK_ID.value, 160 | date1=SN.ANN_DATE.value, 161 | date2=SN.REPORT_DATE.value, 162 | keys=keys, 163 | f_table=f_table, 164 | table_type=table_type, 165 | sta=date_sta, 166 | end=date_end) 167 | return sql_BS 168 | 169 | # 交易日 170 | def trade_date_SQL(self, 171 | date_sta: str = '20130101', 172 | date_end: str = '20200401', 173 | exchange: str = EN.SSE.value): 174 | 175 | sql_date = "SELECT to_char(to_date(TRADE_DAYS ,'yyyy-MM-dd'), 'yyyy-MM-dd') \"{date}\" " \ 176 | "FROM ASHARECALENDAR " \ 177 | "WHERE S_INFO_EXCHMARKET = \'{exchange}\' " \ 178 | "AND TRADE_DAYS BETWEEN {sta} AND {end} " \ 179 | "ORDER BY TRADE_DAYS ".format(date=KN.TRADE_DATE.value, 180 | exchange=exchange, 181 | sta=date_sta, 182 | end=date_end) 183 | return sql_date 184 | 185 | def trade_date_csv(self, file_path: str = FPN.Trade_Date.value, file_name: str = 'TradeDate.csv'): 186 | trade_date = pd.read_csv(os.path.join(file_path, file_name)) 187 | return trade_date 188 | 189 | def list_date_csv(self, file_path: str = FPN.List_Date.value, file_name: str = 'ListDate.csv'): 190 | list_date = pd.read_csv(os.path.join(file_path, file_name)) 191 | return list_date 192 | 193 | # 个股上市日期 194 | def list_date_SQL(self): 195 | sql_list_date = "SELECT S_INFO_WINDCODE \"{code}\", " \ 196 | "to_char(to_date(S_INFO_LISTDATE ,'yyyy-MM-dd'), 'yyyy-MM-dd') \"{list_date}\" " \ 197 | "FROM ASHAREDESCRIPTION " \ 198 | "WHERE regexp_like(S_INFO_WINDCODE, '^[0-9]')".format(code=KN.STOCK_ID.value, 199 | list_date=KN.LIST_DATE.value) 200 | return sql_list_date 201 | 202 | def query(self, sql): 203 | """ 204 | Oracle SQL 查询 205 | :param sql: 206 | :return: 207 | """ 208 | sql_df = pd.read_sql(sql, self.con_wind) 209 | return sql_df 210 | -------------------------------------------------------------------------------- /Data/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaotfeng/SecuritySelect/6c4653b6bdd6b46118a5fd332d741b542daf9c31/Data/__init__.py -------------------------------------------------------------------------------- /DataBase/__init__.py: -------------------------------------------------------------------------------- 1 | # -*-coding:utf-8-*- 2 | # @Time: 2020/9/11 10:50 3 | # @Author: FC 4 | # @Email: 18817289038@163.com 5 | 6 | import os 7 | from typing import TYPE_CHECKING 8 | 9 | settings = {"driver": "mysql", 10 | "database": "StockData", 11 | "user": 'root', 12 | "password": 'fengchen', 13 | "host": "", 14 | "port": ''} 15 | 16 | if TYPE_CHECKING: 17 | from .database import BaseDatabaseManager 18 | 19 | if "VNPY_TESTING" not in os.environ: 20 | from .initialize import init 21 | 22 | database_manager: "BaseDatabaseManager" = init(settings=settings) 23 | -------------------------------------------------------------------------------- /DataBase/database.py: -------------------------------------------------------------------------------- 1 | # -*-coding:utf-8-*- 2 | # @Time: 2020/9/11 10:15 3 | # @Author: FC 4 | # @Email: 18817289038@163.com 5 | 6 | from abc import ABC, abstractmethod 7 | from enum import Enum 8 | from typing import TYPE_CHECKING, Iterable 9 | 10 | if TYPE_CHECKING: 11 | from Object import GroupData, FactorData, FactorRetData 12 | 13 | 14 | class Driver(Enum): 15 | SQLITE = "sqlite" 16 | MYSQL = "mysql" 17 | POSTGRESQL = "postgresql" 18 | MONGODB = "mongodb" 19 | 20 | 21 | class BaseDatabaseManager(ABC): 22 | 23 | @abstractmethod 24 | def query_factor_data( 25 | self, 26 | factor_name: str, 27 | db_name: str, 28 | **kwargs): 29 | pass 30 | 31 | @abstractmethod 32 | def query_factor_ret_data( 33 | self, 34 | factor_name: tuple, 35 | sta_date: str, 36 | end_date: str, 37 | ret_type: str, 38 | hp: int): 39 | pass 40 | 41 | @abstractmethod 42 | def save_group_data( 43 | self, 44 | datas: Iterable["GroupData"] 45 | ): 46 | pass 47 | 48 | @abstractmethod 49 | def save_fact_ret_data( 50 | self, 51 | datas: Iterable["FactorRetData"] 52 | ): 53 | pass 54 | 55 | @abstractmethod 56 | def save_factor_data( 57 | self, 58 | datas: Iterable["FactorData"], 59 | db_name: str 60 | ): 61 | pass 62 | 63 | @abstractmethod 64 | def check_group_data(self, factor_name: str): 65 | pass 66 | 67 | @abstractmethod 68 | def check_fact_ret_data(self, factor_name: str): 69 | pass 70 | 71 | @abstractmethod 72 | def check_factor_data(self, factor_name: str, db_name: str): 73 | pass 74 | 75 | @abstractmethod 76 | def clean(self, factor_name: str): 77 | pass 78 | 79 | # @abstractmethod 80 | # def save_factor_return_res( 81 | # self, 82 | # datas: Iterable["retData"] 83 | # ): 84 | # pass 85 | -------------------------------------------------------------------------------- /DataBase/database_sql.py: -------------------------------------------------------------------------------- 1 | # -*-coding:utf-8-*- 2 | # @Time: 2020/9/11 10:14 3 | # @Author: FC 4 | # @Email: 18817289038@163.com 5 | 6 | from datetime import datetime 7 | import pymysql 8 | import pandas as pd 9 | import time 10 | from typing import List, Type, Iterable, Dict 11 | from Object import GroupData, FactorData, FactorRetData 12 | from peewee import ( 13 | AutoField, 14 | CharField, 15 | IntegerField, 16 | Database, 17 | DateTimeField, 18 | FloatField, 19 | Model, 20 | MySQLDatabase, 21 | PostgresqlDatabase, 22 | chunked, 23 | ) 24 | 25 | from DataBase.database import BaseDatabaseManager, Driver 26 | from constant import ( 27 | KeyName as KN, 28 | PriceVolumeName as PVN, 29 | SpecialName as SN, 30 | ) 31 | 32 | model_mapping = {"Group": GroupData} 33 | 34 | 35 | def init(driver: Driver, settings: dict): 36 | init_funcs = { 37 | Driver.MYSQL: init_mysql, 38 | Driver.POSTGRESQL: init_postgresql, 39 | } 40 | assert driver in init_funcs 41 | 42 | db = init_funcs[driver](settings) 43 | DB_dict = init_models(db, driver) 44 | return SqlManager(DB_dict) 45 | 46 | 47 | def init_mysql(settings: dict): 48 | keys = {"database", "user", "password", "host", "port"} 49 | settings = {k: v for k, v in settings.items() if k in keys} 50 | global MySQL_con 51 | MySQL_con = pymysql.connect(**settings) 52 | db = MySQLDatabase(**settings) 53 | return db 54 | 55 | 56 | def init_postgresql(settings: dict): 57 | keys = {"database", "user", "password", "host", "port"} 58 | settings = {k: v for k, v in settings.items() if k in keys} 59 | db = PostgresqlDatabase(**settings) 60 | return db 61 | 62 | 63 | class ModelBase(Model): 64 | 65 | def to_dict(self): 66 | return self.__data__ 67 | 68 | 69 | def init_models(db: Database, driver: Driver): 70 | class DBFactorRetData(ModelBase): 71 | """ 72 | Candlestick bar data for database storage. 73 | Index is defined unique with datetime, interval, symbol 74 | """ 75 | 76 | id = AutoField() 77 | 78 | date: datetime = DateTimeField() 79 | 80 | factor_return: float = FloatField() 81 | holding_period: int = IntegerField() 82 | factor_T: float = FloatField(null=True) 83 | factor_name: str = CharField() 84 | factor_name_chinese: str = CharField() 85 | ret_type: str = CharField() 86 | 87 | datetime_update: datetime = DateTimeField() 88 | 89 | class Meta: 90 | database = db 91 | indexes = ((("date", "factor_name", "ret_type", "holding_period"), True),) 92 | 93 | @staticmethod 94 | def from_ret(ret: FactorRetData): 95 | """ 96 | Generate DbBarData object from BarData. 97 | """ 98 | db_bar = DBFactorRetData() 99 | 100 | db_bar.date = ret.date 101 | 102 | db_bar.factor_return = ret.factor_return 103 | db_bar.factor_T = ret.factor_T 104 | db_bar.holding_period = ret.holding_period 105 | db_bar.factor_name = ret.factor_name 106 | db_bar.factor_name_chinese = ret.factor_name_chinese 107 | db_bar.ret_type = ret.ret_type 108 | db_bar.datetime_update = datetime.now() 109 | 110 | return db_bar 111 | 112 | def to_bar(self): 113 | """ 114 | Generate GroupData object from DbGroupData. 115 | """ 116 | Ret = FactorRetData() 117 | return Ret 118 | 119 | @staticmethod 120 | def save_all(objs: List["DBFactorRetData"]): 121 | """ 122 | save a list of objects, update if exists. 123 | """ 124 | dicts = [i.to_dict() for i in objs] 125 | with db.atomic(): 126 | if driver is Driver.POSTGRESQL: 127 | for bar in dicts: 128 | DBFactorRetData.insert(bar).on_conflict( 129 | update=bar, 130 | conflict_target=( 131 | DBFactorRetData.stock_id, 132 | DBFactorRetData.date, 133 | ), 134 | ).execute() 135 | else: 136 | for c in chunked(dicts, 1000): 137 | DBFactorRetData.insert_many(c).on_conflict_replace().execute() 138 | 139 | def query_data(self, 140 | factor_names: tuple, 141 | ret_type: str = 'Pearson', 142 | hp: int = 1, 143 | sta_date: str = '2013-01-01', 144 | end_date: str = '2020-04-01'): 145 | 146 | factor_sql = f"SELECT DATE_FORMAT(`date`,'%Y-%m-%d') as `date`, factor_return, factor_name " \ 147 | f"FROM dbfactorretdata " \ 148 | f"WHERE factor_name IN {factor_names} " \ 149 | f"AND ret_type = '{ret_type}' " \ 150 | f"AND holding_period = '{hp}' " \ 151 | f"AND `date` BETWEEN str_to_date('{sta_date}', '%Y-%m-%d') " \ 152 | f"AND str_to_date('{end_date}', '%Y-%m-%d') " 153 | res = pd.read_sql(factor_sql, con=MySQL_con) 154 | return None if res.empty else res 155 | 156 | class DbFactorGroupData(ModelBase): 157 | """ 158 | Candlestick bar data for database storage. 159 | Index is defined unique with datetime, interval, symbol 160 | """ 161 | 162 | id = AutoField() 163 | stock_id: str = CharField() 164 | date: datetime = DateTimeField() 165 | 166 | industry: str = CharField() 167 | group: int = IntegerField() 168 | 169 | stock_return: float = FloatField() 170 | factor_name: str = CharField() 171 | holding_period: int = IntegerField() 172 | factor_name_chinese: str = CharField() 173 | factor_value: float = FloatField(null=True) 174 | factor_type: str = CharField() 175 | 176 | datetime_update: datetime = DateTimeField() 177 | 178 | class Meta: 179 | database = db 180 | indexes = ((("stock_id", "date", "factor_name", "holding_period"), True),) 181 | 182 | @staticmethod 183 | def from_group(group: GroupData): 184 | """ 185 | Generate DbBarData object from BarData. 186 | """ 187 | db_bar = DbFactorGroupData() 188 | 189 | db_bar.stock_id = group.stock_id 190 | db_bar.date = group.date 191 | 192 | db_bar.industry = group.industry 193 | db_bar.group = group.group 194 | 195 | db_bar.stock_return = group.stock_return 196 | db_bar.holding_period = group.holding_period 197 | db_bar.factor_name = group.factor_name 198 | db_bar.factor_value = group.factor_value 199 | db_bar.factor_name_chinese = group.factor_name_chinese 200 | db_bar.factor_type = group.factor_type 201 | 202 | db_bar.datetime_update = datetime.now() 203 | 204 | return db_bar 205 | 206 | def to_bar(self): 207 | """ 208 | Generate GroupData object from DbGroupData. 209 | """ 210 | group = GroupData() 211 | return group 212 | 213 | @staticmethod 214 | def save_all(objs: List["DbFactorGroupData"]): 215 | """ 216 | save a list of objects, update if exists. 217 | """ 218 | dicts = [i.to_dict() for i in objs] 219 | with db.atomic(): 220 | if driver is Driver.POSTGRESQL: 221 | for bar in dicts: 222 | DbFactorGroupData.insert(bar).on_conflict( 223 | update=bar, 224 | conflict_target=( 225 | DbFactorGroupData.stock_id, 226 | DbFactorGroupData.date, 227 | ), 228 | ).execute() 229 | else: 230 | for c in chunked(dicts, 5000): 231 | DbFactorGroupData.insert_many(c).on_conflict_replace().execute() 232 | 233 | class DbFactFinData(ModelBase): 234 | """ 235 | Candlestick bar data for database storage. 236 | Index is defined unique with datetime, interval, symbol 237 | """ 238 | 239 | id = AutoField() 240 | stock_id: str = CharField(max_length=10) 241 | date: datetime = DateTimeField() 242 | date_report: datetime = DateTimeField() 243 | 244 | factor_category: str = CharField(max_length=50) 245 | factor_name: str = CharField(max_length=50) 246 | factor_name_chinese: str = CharField() 247 | factor_value: float = FloatField(null=True) 248 | factor_type: str = CharField(max_length=20) 249 | 250 | datetime_update: datetime = DateTimeField() 251 | 252 | class Meta: 253 | database = db 254 | indexes = ((("stock_id", "date", "factor_category", "factor_name", "factor_type"), True),) 255 | 256 | @staticmethod 257 | def from_factor(factor: FactorData, DataClass: type) -> "ModelBase": 258 | """ 259 | Generate DbFactorData object from FactorData. 260 | """ 261 | 262 | db_bar = DataClass() 263 | 264 | db_bar.stock_id = factor.stock_id 265 | db_bar.date = factor.date # 公布期 266 | db_bar.date_report = factor.date_report # 报告期 267 | 268 | db_bar.factor_category = factor.factor_category 269 | db_bar.factor_name = factor.factor_name 270 | db_bar.factor_name_chinese = factor.factor_name_chinese 271 | db_bar.factor_value = factor.factor_value 272 | db_bar.factor_type = factor.factor_type 273 | 274 | db_bar.datetime_update = datetime.now() 275 | 276 | return db_bar 277 | 278 | def to_bar(self): 279 | """ 280 | Generate GroupData object from DbGroupData. 281 | """ 282 | factor = FactorData() 283 | return factor 284 | 285 | @staticmethod 286 | def save_all(objs: List[ModelBase], DataClass: ModelBase): 287 | """ 288 | save a list of objects, update if exists. 289 | """ 290 | dicts = map(lambda x: x.to_dict(), objs) 291 | with db.atomic(): 292 | if driver is Driver.POSTGRESQL: 293 | for bar in dicts: 294 | DataClass.insert(bar).on_conflict( 295 | update=bar, 296 | conflict_target=( 297 | DataClass.stock_id, 298 | DataClass.date, 299 | ), 300 | ).execute() 301 | else: 302 | i = 1 303 | num = 5000 304 | for c in chunked(dicts, num): 305 | sta = time.time() 306 | print(f"Insert data to database {DataClass.__name__}: {i}-{i + num - 1}") 307 | DataClass.insert_many(c).on_conflict_replace().execute() 308 | print(time.time() - sta) 309 | i += num 310 | 311 | def query_data(self, factor_name: str): 312 | factor_sql = f"SELECT DATE_FORMAT(`date`,'%Y-%m-%d') as `date`, stock_id, factor_value as {factor_name} " \ 313 | f"FROM dbfactfindata " \ 314 | f"WHERE factor_name = '{factor_name}' " # TODO 名称 315 | res = pd.read_sql(factor_sql, con=MySQL_con) 316 | return None if res.empty else res 317 | 318 | class DBFactMTMData(DbFactFinData): 319 | pass 320 | 321 | class DBFactGenProData(DbFactFinData): 322 | pass 323 | 324 | if not db.autoconnect: 325 | db.connect() 326 | 327 | db.create_tables([DBFactorRetData]) 328 | db.create_tables([DbFactorGroupData]) 329 | db.create_tables([DbFactFinData]) 330 | db.create_tables([DBFactMTMData]) 331 | db.create_tables([DBFactGenProData]) 332 | 333 | mapping = {"Ret": DBFactorRetData, 334 | "Group": DbFactorGroupData, 335 | "Fin": DbFactFinData, 336 | "MTM": DBFactMTMData, 337 | "GenPro": DBFactGenProData} 338 | 339 | return mapping 340 | 341 | 342 | class SqlManager(BaseDatabaseManager): 343 | DB_name = ["Ret", "Group", "Fin", "MTM", "GenPro", "IC", "Return"] 344 | 345 | def __init__(self, class_dict: Dict[str, Type[Model]]): 346 | for key_, value_ in class_dict.items(): 347 | setattr(self, key_, value_) 348 | 349 | def query_factor_data( 350 | self, 351 | factor_name: str, 352 | db_name: str, 353 | **kwargs) -> [pd.DataFrame, None]: 354 | model = getattr(self, db_name) 355 | return model.query_data(model, factor_name, **kwargs) 356 | 357 | def query_factor_ret_data(self, 358 | factor_name: tuple, 359 | ret_type: str, 360 | hp: int, 361 | sta_date: str, 362 | end_date: str): 363 | model = getattr(self, 'Ret') 364 | return model.query_data(model, factor_name, ret_type, hp, sta_date, end_date) 365 | 366 | def save_factor_data(self, datas: Iterable[FactorData], db_name: str): 367 | 368 | model = getattr(self, db_name) 369 | ds = map(lambda x: model.from_factor(x, model), datas) 370 | model.save_all(ds, model) 371 | 372 | def save_group_data(self, datas: Iterable[GroupData]): 373 | model = getattr(self, "Group") 374 | ds = map(lambda x: model.from_group(x), datas) 375 | model.save_all(ds) 376 | 377 | def save_fact_ret_data(self, datas: Iterable[FactorRetData]): 378 | model = getattr(self, "Ret") 379 | ds = map(lambda x: model.from_ret(x), datas) 380 | model.save_all(ds) 381 | 382 | # check whether the field exists 383 | def check_group_data(self, factor_name: str): 384 | model = getattr(self, "Group") 385 | data_object = model.select().where(model.factor_name == factor_name) 386 | return False if data_object.__len__() == 0 else True 387 | 388 | def check_factor_data(self, factor_name: str, db_name: str): 389 | model = getattr(self, db_name) 390 | data_object = model.select().where(model.factor_name == factor_name) 391 | return False if data_object.__len__() == 0 else True 392 | 393 | def check_fact_ret_data(self, factor_name: str): 394 | model = getattr(self, "Ret") 395 | data_object = model.select().where(model.factor_name == factor_name) 396 | return False if data_object.__len__() == 0 else True 397 | 398 | # Clear existing fields 399 | def clean(self, factor_name: str): 400 | for name_ in self.DB_name: 401 | model = getattr(self, name_) 402 | model.delete().where(model.factor_name == factor_name).execute() 403 | -------------------------------------------------------------------------------- /DataBase/initialize.py: -------------------------------------------------------------------------------- 1 | # -*-coding:utf-8-*- 2 | # @Time: 2020/9/11 10:52 3 | # @Author: FC 4 | # @Email: 18817289038@163.com 5 | 6 | from .database import BaseDatabaseManager, Driver 7 | 8 | 9 | def init(settings: dict) -> BaseDatabaseManager: 10 | driver = Driver(settings["driver"]) 11 | return init_sql(driver=driver, settings=settings) 12 | 13 | 14 | def init_sql(driver: Driver, settings: dict): 15 | from .database_sql import init 16 | keys = {'database', "host", "port", "user", "password"} 17 | settings = {k: v for k, v in settings.items() if k in keys} 18 | _database_manager = init(driver, settings) 19 | return _database_manager 20 | -------------------------------------------------------------------------------- /Engine/LogEngine.py: -------------------------------------------------------------------------------- 1 | # -*-coding:utf-8-*- 2 | # @Time: 2020/9/2 15:13 3 | # @Author: FC 4 | # @Email: 18817289038@163.com 5 | 6 | import logging 7 | 8 | 9 | class LogEngine(object): 10 | 11 | def __init__(self): 12 | self.level = logging.DEBUG 13 | 14 | self.formatter = logging.Formatter("{asctime} {levelname}: {message}") 15 | pass 16 | -------------------------------------------------------------------------------- /EvaluationIndicitor/Indicator.py: -------------------------------------------------------------------------------- 1 | # -*-coding:utf-8-*- 2 | # @Time: 2020/9/4 8:42 3 | # @Author: FC 4 | # @Email: 18817289038@163.com 5 | 6 | import pandas as pd 7 | import scipy as stats 8 | import numpy as np 9 | import datetime as dt 10 | from sklearn import linear_model 11 | import warnings 12 | 13 | warnings.filterwarnings('ignore') 14 | 15 | 16 | class Indicator(object): 17 | 18 | """ 19 | 时序上收益类指标的计算用到的是对数收益率,该收益率在截面上不具备可加性 20 | """ 21 | cycle = {"D": 365, 22 | "W": 52, 23 | "M": 12, 24 | "Y": 1} 25 | 26 | # 累计收益率 27 | def accumulative_return(self, nav: pd.Series): 28 | ret = np.log(nav[-1] / nav[0]) 29 | return ret 30 | 31 | # 年化累计收益率 32 | def return_a(self, nav: pd.Series, freq: str = 'D'): 33 | 34 | sta, end = nav.index[0], nav.index[-1] 35 | 36 | period = (end - sta).days 37 | 38 | if period == 0: 39 | return 0 40 | else: 41 | ret_a = np.exp(self.accumulative_return(nav)) ** (self.cycle[freq] / period) 42 | return ret_a 43 | 44 | def odds(self, nav: pd.Series, bm: pd.Series) -> float: 45 | 46 | return sum(nav > bm) / len(nav) 47 | 48 | def std_a(self, nav: pd.Series, freq: str = 'D') -> float: 49 | std_a = np.std(nav, ddof=1) * (self.cycle[freq] ** .5) 50 | return std_a 51 | 52 | def max_retreat(self, nav: pd.Series): 53 | # 回撤结束时间点 54 | i = (nav.cummax() - nav).idxmax() 55 | # 回撤开始的时间点 56 | j = (nav[:i]).idxmax() 57 | x = (float(nav[i]) / nav[j]) - 1 58 | return x 59 | 60 | 61 | def shape_a(self, nav: pd.Series, freq: str = "D") -> float: 62 | shape_a = (self.return_a(nav, freq=freq) - 0.03) / self.std_a(nav, freq="D") 63 | return shape_a 64 | -------------------------------------------------------------------------------- /FactorCalculation/FactorBase.py: -------------------------------------------------------------------------------- 1 | # -*-coding:utf-8-*- 2 | # @Time: 2020/9/9 10:49 3 | # @Author: FC 4 | # @Email: 18817289038@163.com 5 | 6 | import pandas as pd 7 | import numpy as np 8 | import os 9 | import copy 10 | from typing import Callable, Dict, Any 11 | 12 | from Data.GetData import SQL 13 | from constant import ( 14 | KeyName as KN, 15 | SpecialName as SN, 16 | FilePathName as FPN, 17 | ExchangeName as EN 18 | ) 19 | 20 | 21 | class FactorBase(object): 22 | 23 | def __init__(self): 24 | self.Q = SQL() 25 | self.list_date = SQL().list_date_csv() 26 | 27 | # 财务数据转换,需要考虑未来数据 28 | def _switch_freq(self, 29 | data_: pd.DataFrame, 30 | name: str, 31 | limit: int = 120, 32 | date_sta: str = '20130101', 33 | date_end: str = '20200401', 34 | exchange: str = EN.SSE.value) -> pd.Series: 35 | """ 36 | 37 | :param data_: 38 | :param name: 需要转换的财务指标 39 | :param limit: 最大填充时期,默认二个季度 40 | :param date_sta: 41 | :param date_end: 42 | :param exchange: 43 | :return: 44 | """ 45 | 46 | def _reindex(data: pd.DataFrame, name_: str): 47 | """填充有风险哦""" 48 | # data_re = data.reindex(trade_date[KN.TRADE_DATE.value]) 49 | data_re = pd.merge(data, trade_date, on=KN.TRADE_DATE.value, how='outer') 50 | data_re.loc[:, data_re.columns != name_] = data_re.loc[:, data_re.columns != name_].fillna(method='ffill') 51 | 52 | return data_re 53 | 54 | sql_trade_date = self.Q.trade_date_SQL(date_sta=date_sta, 55 | date_end=date_end, 56 | exchange=exchange) 57 | trade_date = self.Q.query(sql_trade_date) 58 | 59 | # 保留最新数据 60 | data_sub = data_.groupby(KN.STOCK_ID.value, 61 | group_keys=False).apply( 62 | lambda x: x.sort_values( 63 | by=[KN.TRADE_DATE.value, SN.REPORT_DATE.value]).drop_duplicates(subset=[KN.TRADE_DATE.value], 64 | keep='last')) 65 | data_sub.reset_index(inplace=True) 66 | 67 | # 交易日填充 68 | data_trade_date = data_sub.groupby(KN.STOCK_ID.value, group_keys=False).apply(_reindex, name) 69 | res = data_trade_date.set_index([KN.TRADE_DATE.value, KN.STOCK_ID.value]).sort_index() 70 | 71 | # 历史数据有限填充因子值 72 | res[name] = res[name].groupby(KN.STOCK_ID.value, group_keys=False).apply(lambda x: x.ffill(limit=limit)) 73 | 74 | res.dropna(subset=[name], inplace=True) 75 | if 'index' in res.columns: 76 | res.drop(columns='index', inplace=True) 77 | return res 78 | 79 | # 读取因子计算所需常用数据 80 | def _csv_data(self, 81 | data_name: list, 82 | file_path: str = FPN.factor_inputData.value, 83 | file_name: str = "FactorPool1", 84 | date: str = KN.TRADE_DATE.value, 85 | stock_id: str = KN.STOCK_ID.value): 86 | res = pd.read_csv(os.path.join(file_path, file_name + '.csv'), 87 | usecols=[date, stock_id] + data_name) 88 | return res 89 | 90 | # 读取指数数据 91 | def csv_index(self, 92 | data_name: list, 93 | file_path: str = FPN.factor_inputData.value, 94 | file_name: str = 'IndexInfo', 95 | index_name: str = '', 96 | date: str = KN.TRADE_DATE.value,): 97 | index_data = pd.read_csv(os.path.join(file_path, file_name + '.csv'), 98 | usecols=[date, 'index_name'] + data_name) 99 | res = index_data[index_data['index_name'] == index_name] 100 | return res 101 | 102 | # 读取分钟数据(数据不在一个文件夹中),返回回调函数结果 103 | def csv_HFD_data(self, 104 | data_name: list, 105 | func: Callable = None, 106 | fun_kwargs: dict = {}, 107 | file_path: str = FPN.HFD_Stock_M.value, 108 | sub_file: str = '') -> Dict[str, Any]: 109 | if sub_file == '': 110 | Path = file_path 111 | elif sub_file == '1minute': 112 | Path = FPN.HFD_Stock_M.value 113 | else: 114 | Path = os.path.join(file_path, sub_file) 115 | data_dict = {} 116 | file_names = os.listdir(Path) 117 | 118 | i = 1 119 | for file_name in file_names: 120 | i += 1 121 | if file_name[-3:] == 'csv': 122 | try: 123 | data_df = pd.read_csv(os.path.join(Path, file_name), usecols=['code', 'time'] + data_name) 124 | except Exception as e: 125 | continue 126 | data_df['date'] = file_name[:-4] 127 | data_df.rename(columns={'code': 'stock_id'}, inplace=True) 128 | res = func(data_df, **fun_kwargs) 129 | data_dict[file_name[:-4]] = res 130 | # if i == 3: 131 | # break 132 | 133 | return data_dict 134 | 135 | def _switch_ttm(self, data_: pd.DataFrame, name: str): 136 | """ 137 | 计算TTM,groupby后要排序 138 | """ 139 | 140 | def _pros_ttm(data_sub: pd.DataFrame, name_: str): 141 | data_sub[name_ + '_TTM'] = data_sub[name_].diff(1) 142 | res_ = data_sub[data_sub['M'] == '03'][name_].append(data_sub[data_sub['M'] != '03'][name_ + '_TTM']) 143 | res_ = res_.droplevel(level=KN.STOCK_ID.value).sort_index().rolling(4).sum() 144 | return res_ 145 | 146 | data_copy = copy.deepcopy(data_) 147 | data_copy['M'] = data_copy[SN.REPORT_DATE.value].apply(lambda x: x[5:7]) 148 | data_copy.set_index([SN.REPORT_DATE.value, KN.STOCK_ID.value], inplace=True) 149 | data_copy.sort_index(inplace=True) 150 | 151 | res = data_copy[[name, 'M']].groupby(KN.STOCK_ID.value).apply(_pros_ttm, name) 152 | 153 | res.index = res.index.swaplevel(0, 1) 154 | res.name = name 155 | return res 156 | 157 | 158 | if __name__ == '__main__': 159 | A = FactorBase() 160 | # A.csv_HFD_data() 161 | -------------------------------------------------------------------------------- /FactorCalculation/FundamentalOperateFactor.py: -------------------------------------------------------------------------------- 1 | # -*-coding:utf-8-*- 2 | # @Time: 2020/9/14 13:46 3 | # @Author: FC 4 | # @Email: 18817289038@163.com 5 | 6 | import pandas as pd 7 | import numpy as np 8 | from pyfinance.ols import PandasRollingOLS 9 | import sys 10 | 11 | from FactorCalculation.FactorBase import FactorBase 12 | from Object import FactorInfo 13 | from constant import ( 14 | KeyName as KN, 15 | SpecialName as SN, 16 | FinancialBalanceSheetName as FBSN, 17 | FinancialIncomeSheetName as FISN, 18 | FinancialCashFlowSheetName as FCFSN 19 | ) 20 | 21 | 22 | # 营运能力因子 23 | class FundamentalOperateFactor(FactorBase): 24 | """408001000: 合并报表; 408006000:母公司报表 """ 25 | 26 | def __init__(self): 27 | super(FundamentalOperateFactor, self).__init__() 28 | 29 | @classmethod 30 | def Operate007(cls, 31 | data: pd.DataFrame, 32 | operator_income: str = FISN.Op_Income.value, 33 | operator_cost: str = FISN.Op_Cost.value, 34 | quarter: int = 8, 35 | switch: bool = False): 36 | """ 37 | 营业能力改善因子(RROC_N) 38 | """ 39 | func_name = sys._getframe().f_code.co_name 40 | data.set_index([SN.REPORT_DATE.value, KN.STOCK_ID.value], inplace=True) 41 | data.sort_index(inplace=True) 42 | 43 | # 标准化 44 | reg_input = data[[operator_income, 45 | operator_cost]].groupby(KN.STOCK_ID.value).apply(lambda x: (x - x.mean()) / x.std()) 46 | 47 | # 回归取残差 48 | data[func_name] = reg_input.groupby(KN.STOCK_ID.value, 49 | group_keys=False).apply( 50 | lambda x: cls._reg_rolling(x, operator_cost, operator_income, 51 | has_cons=True, 52 | win=quarter)) 53 | 54 | if switch: 55 | data_fact = cls()._switch_freq(data_=data, name=func_name, limit=120) 56 | else: 57 | data_fact = None 58 | 59 | data = data.reset_index() 60 | 61 | F = FactorInfo() 62 | F.data_raw = data[[SN.ANN_DATE.value, KN.STOCK_ID.value, SN.REPORT_DATE.value, func_name]] 63 | F.data = data_fact 64 | F.factor_type = data['type'][0] 65 | F.factor_category = cls().__class__.__name__ 66 | F.factor_name = func_name 67 | 68 | return F 69 | 70 | @classmethod 71 | def Operate009(cls, 72 | data: pd.DataFrame, 73 | fixed_asset: str = FBSN.Fixed_Asset.value, 74 | operator_total_cost: str = FISN.Op_Total_Cost.value, 75 | quarter: int = 8, 76 | switch: bool = False): 77 | """ 78 | 产能利用率因子(OCFA) 79 | """ 80 | 81 | func_name = sys._getframe().f_code.co_name 82 | data.set_index([SN.REPORT_DATE.value, KN.STOCK_ID.value], inplace=True) 83 | data.sort_index(inplace=True) 84 | 85 | # 回归取残差 86 | data[func_name] = data[[fixed_asset, operator_total_cost]].groupby(KN.STOCK_ID.value, 87 | group_keys=False).apply( 88 | lambda x: cls._reg_rolling(x, 89 | x_name=fixed_asset, 90 | y_name=operator_total_cost, 91 | has_cons=True, 92 | win=quarter)) 93 | 94 | if switch: 95 | data_fact = cls()._switch_freq(data_=data, name=func_name, limit=120) 96 | else: 97 | data_fact = None 98 | 99 | data = data.reset_index() 100 | 101 | F = FactorInfo() 102 | F.data_raw = data[[SN.ANN_DATE.value, KN.STOCK_ID.value, SN.REPORT_DATE.value, func_name]] 103 | F.data = data_fact 104 | F.factor_type = data['type'][0] 105 | F.factor_category = cls().__class__.__name__ 106 | F.factor_name = func_name 107 | 108 | return F 109 | 110 | @classmethod 111 | def Operate006(cls, 112 | data: pd.DataFrame, 113 | operator_income: str = FISN.Op_Income.value, 114 | total_asset: str = FBSN.Total_Asset.value, 115 | switch: bool = False): 116 | 117 | """ 118 | 总资产周转率(TA_Turn_TTM) = 营业收入 / 平均资产总额 119 | :return: 120 | """ 121 | func_name = sys._getframe().f_code.co_name 122 | data.set_index([SN.REPORT_DATE.value, KN.STOCK_ID.value], inplace=True) 123 | data.sort_index(inplace=True) 124 | 125 | data[total_asset] = data[total_asset].groupby(KN.STOCK_ID.value, 126 | group_keys=False).rolling(2, min_periods=1).mean() 127 | data[func_name] = data[operator_income] / data[total_asset] 128 | 129 | if switch: 130 | data_fact = cls()._switch_freq(data_=data, name=func_name, limit=120) 131 | else: 132 | data_fact = None 133 | 134 | data = data.reset_index() 135 | 136 | F = FactorInfo() 137 | F.data_raw = data[[SN.ANN_DATE.value, KN.STOCK_ID.value, SN.REPORT_DATE.value, func_name]] 138 | F.data = data_fact 139 | F.factor_type = data['type'][0] 140 | F.factor_category = cls().__class__.__name__ 141 | F.factor_name = func_name 142 | 143 | return F 144 | 145 | @classmethod 146 | def Operate010(cls, 147 | data: pd.DataFrame, 148 | operator_income: str = FISN.Op_Income.value, 149 | total_asset: str = FBSN.Total_Asset.value, 150 | switch: bool = False): 151 | 152 | """ 153 | 总资产周转率(同比)(TA_Turn_ttm_T) = 本期营业收入 / 本期平均资产总额 - 上期营业收入 / 上期平均资产总额 154 | :return: 155 | """ 156 | func_name = sys._getframe().f_code.co_name 157 | data.set_index([SN.REPORT_DATE.value, KN.STOCK_ID.value], inplace=True) 158 | data.sort_index(inplace=True) 159 | 160 | data[total_asset] = data[total_asset].groupby(KN.STOCK_ID.value, 161 | group_keys=False).rolling(2, min_periods=1).mean() 162 | data["TA_turn_ttm"] = data[operator_income] / data[total_asset] 163 | data[func_name] = data["TA_turn_ttm"].groupby(KN.STOCK_ID.value).diff(1) 164 | 165 | if switch: 166 | data_fact = cls()._switch_freq(data_=data, name=func_name, limit=120) 167 | else: 168 | data_fact = None 169 | 170 | data = data.reset_index() 171 | 172 | F = FactorInfo() 173 | F.data_raw = data[[SN.ANN_DATE.value, KN.STOCK_ID.value, SN.REPORT_DATE.value, func_name]] 174 | F.data = data_fact 175 | F.factor_type = data['type'][0] 176 | F.factor_category = cls().__class__.__name__ 177 | F.factor_name = func_name 178 | 179 | return F 180 | 181 | #################################################################################################################### 182 | @classmethod 183 | def Operate007_data_raw(cls, 184 | sta: int = 20130101, 185 | end: int = 20200401, 186 | f_type: str = '408001000'): 187 | sql_keys = {"IST": {"OPER_REV": f"\"{FISN.Op_Income.value}\"", 188 | "LESS_OPER_COST": f"\"{FISN.Op_Cost.value}\""} 189 | } 190 | 191 | sql_ = cls().Q.finance_SQL(sql_keys, sta, end, f_type) 192 | financial_data = cls().Q.query(sql_) 193 | 194 | # 过滤未上市公司 195 | data_ = pd.merge(financial_data, cls().list_date, on=[KN.STOCK_ID.value], how='left') 196 | financial_data = data_[data_[KN.TRADE_DATE.value] >= data_[KN.LIST_DATE.value]] 197 | 198 | return financial_data 199 | 200 | @classmethod 201 | def Operate009_data_raw(cls, 202 | sta: int = 20130101, 203 | end: int = 20200401, 204 | f_type: str = '408001000'): 205 | sql_keys = {"IST": {"TOT_OPER_COST": f"\"{FISN.Op_Total_Cost.value}\""}, 206 | "BST": {"FIX_ASSETS": f"\"{FBSN.Fixed_Asset.value}\""} 207 | } 208 | 209 | sql_ = cls().Q.finance_SQL(sql_keys, sta, end, f_type) 210 | financial_data = cls().Q.query(sql_) 211 | 212 | # 过滤未上市公司 213 | data_ = pd.merge(financial_data, cls().list_date, on=[KN.STOCK_ID.value], how='left') 214 | financial_data = data_[data_[KN.TRADE_DATE.value] >= data_[KN.LIST_DATE.value]] 215 | 216 | return financial_data 217 | 218 | @classmethod 219 | def Operate006_data_raw(cls, 220 | sta: int = 20130101, 221 | end: int = 20200401, 222 | f_type: str = '408001000'): 223 | sql_keys = {"IST": {"OPER_PROFIT": f"\"{FISN.Op_Income.value}\""}, 224 | "BST": {"TOT_ASSETS": f"\"{FBSN.Total_Asset.value}\""} 225 | } 226 | 227 | sql_ = cls().Q.finance_SQL(sql_keys, sta, end, f_type) 228 | financial_data = cls().Q.query(sql_) 229 | 230 | # 过滤未上市公司 231 | data_ = pd.merge(financial_data, cls().list_date, on=[KN.STOCK_ID.value], how='left') 232 | financial_data = data_[data_[KN.TRADE_DATE.value] >= data_[KN.LIST_DATE.value]] 233 | 234 | # TTM 235 | operator_income = cls()._switch_ttm(financial_data, FISN.Op_Income.value) 236 | total_asset = cls()._switch_ttm(financial_data, FBSN.Total_Asset.value) 237 | financial_data.set_index([SN.REPORT_DATE.value, KN.STOCK_ID.value], inplace=True) 238 | financial_data[FISN.Op_Income.value] = operator_income 239 | financial_data[FBSN.Total_Asset.value] = total_asset 240 | 241 | financial_data.reset_index(inplace=True) 242 | return financial_data 243 | 244 | @classmethod 245 | def Operate010_data_raw(cls, 246 | sta: int = 20130101, 247 | end: int = 20200401, 248 | f_type: str = '408001000'): 249 | 250 | return cls.Operate006_data_raw(sta=sta, end=end, f_type=f_type) 251 | 252 | @staticmethod 253 | def _reg_rolling(reg_: pd.DataFrame, x_name: str, y_name: str, win: int, has_cons: bool = False): 254 | if len(reg_) <= win: 255 | res = pd.Series(index=reg_.index) 256 | else: 257 | try: 258 | X = reg_[x_name] 259 | Y = reg_[y_name] 260 | reg_object = PandasRollingOLS(x=X, y=Y, has_const=False, use_const=has_cons, window=win) 261 | res = pd.Series(reg_object._resids[:, -1], index=reg_.index[win - 1:]) 262 | except Exception as e: 263 | print(e) 264 | res = pd.Series(index=reg_.index) 265 | return res 266 | -------------------------------------------------------------------------------- /FactorCalculation/FundamentalProfitFactor.py: -------------------------------------------------------------------------------- 1 | # -*-coding:utf-8-*- 2 | # @Time: 2020/9/23 15:48 3 | # @Author: FC 4 | # @Email: 18817289038@163.com 5 | 6 | import pandas as pd 7 | import numpy as np 8 | import sys 9 | 10 | from FactorCalculation.FactorBase import FactorBase 11 | from Object import FactorInfo 12 | from constant import ( 13 | KeyName as KN, 14 | SpecialName as SN, 15 | FinancialBalanceSheetName as FBSN, 16 | FinancialIncomeSheetName as FISN, 17 | FinancialCashFlowSheetName as FCFSN 18 | ) 19 | 20 | 21 | # 盈利能力因子 22 | class FundamentalProfitFactor(FactorBase): 23 | """408001000: 合并报表; 408006000:母公司报表 """ 24 | 25 | @classmethod # TODO 26 | def Profit013(cls, 27 | data: pd.DataFrame, 28 | net_profit_in: str = FISN.Net_Pro_In.value, 29 | total_asset: str = FBSN.Total_Asset.value, 30 | switch: bool = False): 31 | """ 32 | 总资产净利率(TTM)(ROA_TTM) 33 | """ 34 | func_name = sys._getframe().f_code.co_name 35 | data.set_index([SN.REPORT_DATE.value, KN.STOCK_ID.value], inplace=True) 36 | data.sort_index(inplace=True) 37 | 38 | data[func_name] = data[net_profit_in] / data[total_asset] 39 | data[func_name][np.isinf(data[func_name])] = np.nan 40 | 41 | if switch: 42 | data_fact = cls()._switch_freq(data_=data, name=func_name) 43 | else: 44 | data_fact = None 45 | 46 | data.reset_index(inplace=True) 47 | 48 | F = FactorInfo() 49 | F.data_raw = data[[SN.ANN_DATE.value, KN.STOCK_ID.value, SN.REPORT_DATE.value, func_name]] 50 | F.data = data_fact 51 | F.factor_type = data['type'][0] 52 | F.factor_category = cls().__class__.__name__ 53 | F.factor_name = func_name 54 | 55 | return F 56 | 57 | @classmethod 58 | def Profit032(cls, 59 | data: pd.DataFrame, 60 | net_profit_in: str = FISN.Net_Pro_In.value, 61 | total_asset: str = FBSN.Total_Asset.value, 62 | switch: bool = False): 63 | """ 64 | 总资产净利率(TTM,同比)(ROA_ttm_T) 65 | :param data: 66 | :param net_profit_in: 67 | :param total_asset: 68 | :param switch: 69 | :return: 70 | """ 71 | func_name = sys._getframe().f_code.co_name 72 | data.set_index([SN.REPORT_DATE.value, KN.STOCK_ID.value], inplace=True) 73 | data.sort_index(inplace=True) 74 | 75 | data['ROA_Q'] = data.groupby(KN.STOCK_ID.value, 76 | group_keys=False).apply( 77 | lambda x: x[net_profit_in].diff(1) / x[total_asset].shift(1)) 78 | data['ROA_Q'][np.isinf(data['ROA_Q'])] = np.nan 79 | data[func_name] = data['ROA_Q'].groupby(KN.STOCK_ID.value).diff(1) 80 | 81 | if switch: 82 | data_fact = cls()._switch_freq(data_=data, name=func_name) 83 | else: 84 | data_fact = None 85 | 86 | data.reset_index(inplace=True) 87 | 88 | F = FactorInfo() 89 | F.data_raw = data[[SN.ANN_DATE.value, KN.STOCK_ID.value, SN.REPORT_DATE.value, func_name]] 90 | F.data = data_fact 91 | F.factor_type = data['type'][0] 92 | F.factor_category = cls().__class__.__name__ 93 | F.factor_name = func_name 94 | 95 | return F 96 | 97 | @classmethod 98 | def Profit025(cls, 99 | data: pd.DataFrame, 100 | Surplus_Reserves: str = FBSN.Surplus_Reserves.value, 101 | Undistributed_Profit: str = FBSN.Undistributed_Profit.value, 102 | net_profit_in: str = FISN.Net_Pro_In.value, 103 | switch: bool = False): 104 | """ 105 | 股利支付率_TTM(DPR_TTM) = 每股股利/每股净利润 = (期末留存收益 - 期初留存收益) / 净利润 106 | 留存收益 = 盈余公积 + 未分配利润 107 | :param data: 108 | :param Surplus_Reserves: 109 | :param Undistributed_Profit: 110 | :param net_profit_in: 111 | :param switch: 112 | :return: 113 | """ 114 | 115 | func_name = sys._getframe().f_code.co_name 116 | data.set_index([SN.REPORT_DATE.value, KN.STOCK_ID.value], inplace=True) 117 | data.sort_index(inplace=True) 118 | 119 | data["RE"] = data[Surplus_Reserves] + data[Undistributed_Profit] 120 | data[func_name] = data['RE'] / data[net_profit_in] 121 | data = data.reset_index() 122 | 123 | if switch: 124 | data_fact = cls()._switch_freq(data_=data, name=func_name) 125 | else: 126 | data_fact = None 127 | 128 | data = data.reset_index() 129 | 130 | F = FactorInfo() 131 | F.data_raw = data[[SN.ANN_DATE.value, KN.STOCK_ID.value, SN.REPORT_DATE.value, func_name]] 132 | F.data = data_fact 133 | F.factor_type = data['type'][0] 134 | F.factor_category = cls().__class__.__name__ 135 | F.factor_name = func_name 136 | 137 | return F 138 | 139 | @classmethod 140 | def Profit026(cls, 141 | data: pd.DataFrame, 142 | net_profit_in: str = FISN.Net_Pro_In.value, 143 | operator_income: str = FISN.Op_Income.value, 144 | switch: bool = False): 145 | 146 | """ 147 | 当期净利润率(NP) 148 | """ 149 | func_name = sys._getframe().f_code.co_name 150 | data.set_index([SN.REPORT_DATE.value, KN.STOCK_ID.value], inplace=True) 151 | data.sort_index(inplace=True) 152 | 153 | data[func_name] = data[net_profit_in] / data[operator_income] 154 | data[np.isinf(data[func_name])] = 0 155 | 156 | if switch: 157 | data_fact = cls()._switch_freq(data_=data, name=func_name) 158 | else: 159 | data_fact = None 160 | 161 | data = data.reset_index() 162 | 163 | F = FactorInfo() 164 | F.data_raw = data[[SN.ANN_DATE.value, KN.STOCK_ID.value, SN.REPORT_DATE.value, func_name]] 165 | F.data = data_fact 166 | F.factor_type = data['type'][0] 167 | F.factor_category = cls().__class__.__name__ 168 | F.factor_name = func_name 169 | 170 | return F 171 | 172 | @classmethod 173 | def Profit027(cls, 174 | data: pd.DataFrame, 175 | net_profit_in: str = FISN.Net_Pro_In.value, 176 | operator_income: str = FISN.Op_Income.value, 177 | switch: bool = False): 178 | """ 179 | 净利润率(TTM)(NP_TTM) = 净利润 / 主营业务收入 180 | :param data: 181 | :param net_profit_in: 182 | :param operator_income: 183 | :param switch: 184 | :return: 185 | """ 186 | 187 | func_name = sys._getframe().f_code.co_name 188 | data.set_index([SN.REPORT_DATE.value, KN.STOCK_ID.value], inplace=True) 189 | data.sort_index(inplace=True) 190 | 191 | data[func_name] = data[net_profit_in] / data[operator_income] 192 | data[np.isinf(data[func_name])] = 0 193 | 194 | if switch: 195 | data_fact = cls()._switch_freq(data_=data, name=func_name) 196 | else: 197 | data_fact = None 198 | 199 | data = data.reset_index() 200 | 201 | F = FactorInfo() 202 | F.data_raw = data[[SN.ANN_DATE.value, KN.STOCK_ID.value, SN.REPORT_DATE.value, func_name]] 203 | F.data = data_fact 204 | F.factor_type = data['type'][0] 205 | F.factor_category = cls().__class__.__name__ 206 | F.factor_name = func_name 207 | 208 | return F 209 | 210 | @classmethod 211 | def Profit028(cls, 212 | data: pd.DataFrame, 213 | total_operator_income: str = FISN.Total_Op_Income.value, 214 | operator_profit: str = FISN.Op_Pro.value, 215 | switch: bool = False): 216 | """ 217 | 营业利润率(OPM) = 营业利润 / 总营业收入 218 | :param data: 219 | :param total_operator_income: 220 | :param operator_profit: 221 | :param switch: 222 | :return: 223 | """ 224 | func_name = sys._getframe().f_code.co_name 225 | data.set_index([SN.REPORT_DATE.value, KN.STOCK_ID.value], inplace=True) 226 | data.sort_index(inplace=True) 227 | 228 | data[func_name] = data[operator_profit] / data[total_operator_income] 229 | data[np.isinf(data[func_name])] = 0 230 | 231 | if switch: 232 | data_fact = cls()._switch_freq(data_=data, name=func_name) 233 | else: 234 | data_fact = None 235 | 236 | data = data.reset_index() 237 | 238 | F = FactorInfo() 239 | F.data_raw = data[[SN.ANN_DATE.value, KN.STOCK_ID.value, SN.REPORT_DATE.value, func_name]] 240 | F.data = data_fact 241 | F.factor_type = data['type'][0] 242 | F.factor_category = cls().__class__.__name__ 243 | F.factor_name = func_name 244 | 245 | return F 246 | 247 | @classmethod 248 | def Profit029(cls, 249 | data: pd.DataFrame, 250 | total_operator_income: str = FISN.Total_Op_Income.value, 251 | operator_profit: str = FISN.Op_Pro.value, 252 | switch: bool = False): 253 | """ 254 | 营业利润率(TTM)(OPM_TTM) = 营业利润 / 总营业收入 255 | :param data: 256 | :param total_operator_income: 257 | :param operator_profit: 258 | :param switch: 259 | :return: 260 | """ 261 | 262 | func_name = sys._getframe().f_code.co_name 263 | data.set_index([SN.REPORT_DATE.value, KN.STOCK_ID.value], inplace=True) 264 | data.sort_index(inplace=True) 265 | 266 | data[func_name] = data[operator_profit] / data[total_operator_income] 267 | data[np.isinf(data[func_name])] = 0 268 | 269 | if switch: 270 | data_fact = cls()._switch_freq(data_=data, name=func_name) 271 | else: 272 | data_fact = None 273 | 274 | data = data.reset_index() 275 | 276 | F = FactorInfo() 277 | F.data_raw = data[[SN.ANN_DATE.value, KN.STOCK_ID.value, SN.REPORT_DATE.value, func_name]] 278 | F.data = data_fact 279 | F.factor_type = data['type'][0] 280 | F.factor_category = cls().__class__.__name__ 281 | F.factor_name = func_name 282 | 283 | return F 284 | 285 | @classmethod 286 | def Profit031(cls, 287 | data: pd.DataFrame, 288 | net_profit_in: str = FISN.Net_Pro_In.value, 289 | operator_income: str = FISN.Op_Income.value, 290 | switch: bool = False): 291 | """ 292 | 净利润率(同比)(NPM_T) = 本期净利润 / 本期主营业务收入 - 上期净利润 / 上期主营业务收入 293 | :param data: 294 | :param net_profit_in: 295 | :param operator_income: 296 | :param switch: 297 | :return: 298 | """ 299 | 300 | func_name = sys._getframe().f_code.co_name 301 | data.set_index([SN.REPORT_DATE.value, KN.STOCK_ID.value], inplace=True) 302 | data.sort_index(inplace=True) 303 | 304 | data['NP'] = data[net_profit_in] / data[operator_income] 305 | data[np.isinf(data['NP'])] = 0 306 | data[func_name] = data['NP'].groupby(KN.STOCK_ID.value).diff(1) 307 | 308 | if switch: 309 | data_fact = cls()._switch_freq(data_=data, name=func_name) 310 | else: 311 | data_fact = None 312 | 313 | data = data.reset_index() 314 | 315 | F = FactorInfo() 316 | F.data_raw = data[[SN.ANN_DATE.value, KN.STOCK_ID.value, SN.REPORT_DATE.value, func_name]] 317 | F.data = data_fact 318 | F.factor_type = data['type'][0] 319 | F.factor_category = cls().__class__.__name__ 320 | F.factor_name = func_name 321 | 322 | return F 323 | 324 | #################################################################################################################### 325 | @classmethod 326 | def Profit013_data_raw(cls, 327 | sta: int = 20130101, 328 | end: int = 20200401, 329 | f_type: str = '408001000'): 330 | """ 331 | 332 | :param end: 333 | :param sta: 334 | :param f_type: 408001000 or 408006000 335 | :return: 336 | """ 337 | 338 | sql_keys = {"BST": {"TOT_ASSETS": f"\"{FBSN.Total_Asset.value}\""}, 339 | "IST": {"NET_PROFIT_INCL_MIN_INT_INC": f"\"{FISN.Net_Pro_In.value}\""} 340 | } 341 | sql_ = cls().Q.finance_SQL(sql_keys, sta, end, f_type) 342 | financial_data = cls().Q.query(sql_) 343 | # 过滤未上市公司 344 | data_ = pd.merge(financial_data, cls().list_date, on=[KN.STOCK_ID.value], how='left') 345 | financial_data = data_[data_[KN.TRADE_DATE.value] >= data_[KN.LIST_DATE.value]] 346 | 347 | # TTM 348 | financial_clean = cls()._switch_ttm(financial_data, FISN.Net_Pro_In.value) 349 | 350 | financial_data.set_index([SN.REPORT_DATE.value, KN.STOCK_ID.value], inplace=True) 351 | financial_data[FISN.Net_Pro_In.value] = financial_clean 352 | 353 | financial_data.reset_index(inplace=True) 354 | return financial_data 355 | 356 | @classmethod 357 | def Profit032_data_raw(cls, 358 | sta: int = 20130101, 359 | end: int = 20200401, 360 | f_type: str = '408001000'): 361 | return cls.Profit013_data_raw(sta=sta, end=end, f_type=f_type) 362 | 363 | @classmethod 364 | def Profit025_data_raw(cls, 365 | sta: int = 20130101, 366 | end: int = 20200401, 367 | f_type: str = '408001000'): 368 | 369 | sql_keys = {"IST": {"NET_PROFIT_INCL_MIN_INT_INC": f"\"{FISN.Net_Pro_In.value}\""}, 370 | "BST": {"SURPLUS_RSRV": f"\"{FBSN.Surplus_Reserves.value}\"", 371 | "UNDISTRIBUTED_PROFIT": f"\"{FBSN.Undistributed_Profit.value}\""} 372 | } 373 | 374 | sql_ = cls().Q.finance_SQL(sql_keys, sta, end, f_type) 375 | financial_data = cls().Q.query(sql_) 376 | 377 | # 过滤未上市公司 378 | data_ = pd.merge(financial_data, cls().list_date, on=[KN.STOCK_ID.value], how='left') 379 | 380 | financial_data = data_[data_[KN.TRADE_DATE.value] >= data_[KN.LIST_DATE.value]] 381 | # TTM 382 | net_profit_in = cls()._switch_ttm(financial_data, FISN.Net_Pro_In.value) 383 | surplus_reserves = cls()._switch_ttm(financial_data, FBSN.Surplus_Reserves.value) 384 | undistributed_profit = cls()._switch_ttm(financial_data, FBSN.Undistributed_Profit.value) 385 | 386 | financial_data.set_index([SN.REPORT_DATE.value, KN.STOCK_ID.value], inplace=True) 387 | 388 | financial_data[FISN.Net_Pro_In.value] = net_profit_in 389 | financial_data[FBSN.Surplus_Reserves.value] = surplus_reserves 390 | financial_data[FBSN.Undistributed_Profit.value] = undistributed_profit 391 | 392 | financial_data.reset_index(inplace=True) 393 | 394 | return financial_data 395 | 396 | @classmethod 397 | def Profit027_data_raw(cls, 398 | sta: int = 20130101, 399 | end: int = 20200401, 400 | f_type: str = '408001000'): 401 | 402 | sql_keys = {"IST": {"NET_PROFIT_INCL_MIN_INT_INC": f"\"{FISN.Net_Pro_In.value}\"", 403 | "OPER_REV": f"\"{FISN.Op_Income.value}\""}, 404 | } 405 | 406 | sql_ = cls().Q.finance_SQL(sql_keys, sta, end, f_type) 407 | financial_data = cls().Q.query(sql_) 408 | 409 | # 过滤未上市公司 410 | data_ = pd.merge(financial_data, cls().list_date, on=[KN.STOCK_ID.value], how='left') 411 | 412 | financial_data = data_[data_[KN.TRADE_DATE.value] >= data_[KN.LIST_DATE.value]] 413 | 414 | net_profit_in = cls()._switch_ttm(financial_data, FISN.Net_Pro_In.value) 415 | op_income = cls()._switch_ttm(financial_data, FISN.Op_Income.value) 416 | 417 | financial_data.set_index([SN.REPORT_DATE.value, KN.STOCK_ID.value], inplace=True) 418 | financial_data[FISN.Net_Pro_In.value] = net_profit_in 419 | financial_data[FISN.Op_Income.value] = op_income 420 | 421 | financial_data.reset_index(inplace=True) 422 | 423 | return financial_data 424 | 425 | @classmethod 426 | def Profit026_data_raw(cls, 427 | sta: int = 20130101, 428 | end: int = 20200401, 429 | f_type: str = '408001000'): 430 | 431 | sql_keys = {"IST": {"NET_PROFIT_INCL_MIN_INT_INC": f"\"{FISN.Net_Pro_In.value}\"", 432 | "OPER_REV": f"\"{FISN.Op_Income.value}\""}, 433 | } 434 | 435 | sql_ = cls().Q.finance_SQL(sql_keys, sta, end, f_type) 436 | financial_data = cls().Q.query(sql_) 437 | 438 | # 过滤未上市公司 439 | data_ = pd.merge(financial_data, cls().list_date, on=[KN.STOCK_ID.value], how='left') 440 | 441 | financial_data = data_[data_[KN.TRADE_DATE.value] >= data_[KN.LIST_DATE.value]] 442 | 443 | return financial_data 444 | 445 | @classmethod 446 | def Profit028_data_raw(cls, 447 | sta: int = 20130101, 448 | end: int = 20200401, 449 | f_type: str = '408001000'): 450 | 451 | sql_keys = {"IST": {"TOT_OPER_REV": f"\"{FISN.Total_Op_Income.value}\"", 452 | "OPER_PROFIT": f"\"{FISN.Op_Pro.value}\""}, 453 | } 454 | 455 | sql_ = cls().Q.finance_SQL(sql_keys, sta, end, f_type) 456 | financial_data = cls().Q.query(sql_) 457 | 458 | # 过滤未上市公司 459 | data_ = pd.merge(financial_data, cls().list_date, on=[KN.STOCK_ID.value], how='left') 460 | 461 | financial_data = data_[data_[KN.TRADE_DATE.value] >= data_[KN.LIST_DATE.value]] 462 | 463 | return financial_data 464 | 465 | @classmethod 466 | def Profit029_data_raw(cls, 467 | sta: int = 20130101, 468 | end: int = 20200401, 469 | f_type: str = '408001000'): 470 | 471 | sql_keys = {"IST": {"TOT_OPER_REV": f"\"{FISN.Total_Op_Income.value}\"", 472 | "OPER_PROFIT": f"\"{FISN.Op_Pro.value}\""}, 473 | } 474 | 475 | sql_ = cls().Q.finance_SQL(sql_keys, sta, end, f_type) 476 | financial_data = cls().Q.query(sql_) 477 | 478 | # 过滤未上市公司 479 | data_ = pd.merge(financial_data, cls().list_date, on=[KN.STOCK_ID.value], how='left') 480 | financial_data = data_[data_[KN.TRADE_DATE.value] >= data_[KN.LIST_DATE.value]] 481 | 482 | total_op_income = cls()._switch_ttm(financial_data, FISN.Total_Op_Income.value) 483 | operator_profit = cls()._switch_ttm(financial_data, FISN.Op_Pro.value) 484 | 485 | financial_data.set_index([SN.REPORT_DATE.value, KN.STOCK_ID.value], inplace=True) 486 | financial_data[FISN.Total_Op_Income.value] = total_op_income 487 | financial_data[FISN.Op_Pro.value] = operator_profit 488 | 489 | financial_data.reset_index(inplace=True) 490 | 491 | return financial_data 492 | 493 | @classmethod 494 | def Profit031_data_raw(cls, 495 | sta: int = 20130101, 496 | end: int = 20200401, 497 | f_type: str = '408001000'): 498 | 499 | return cls.Profit027_data_raw(sta=sta, end=end, f_type=f_type) 500 | -------------------------------------------------------------------------------- /FactorCalculation/FundamentalQualityFactor.py: -------------------------------------------------------------------------------- 1 | # -*-coding:utf-8-*- 2 | # @Time: 2020/8/26 20:07 3 | # @Author: FC 4 | # @Email: 18817289038@163.com 5 | 6 | import pandas as pd 7 | import numpy as np 8 | import sys 9 | 10 | from FactorCalculation.FactorBase import FactorBase 11 | from Object import FactorInfo 12 | from constant import ( 13 | KeyName as KN, 14 | SpecialName as SN, 15 | FinancialBalanceSheetName as FBSN, 16 | FinancialIncomeSheetName as FISN, 17 | FinancialCashFlowSheetName as FCFSN 18 | ) 19 | 20 | 21 | # 收益质量因子 22 | class FundamentalQualityFactor(FactorBase): # TODO 修改 23 | """408001000: 合并报表; 408006000:母公司报表 """ 24 | 25 | @classmethod 26 | def Quality010(cls, 27 | data: pd.DataFrame, 28 | cash_sales: str = FCFSN.Cash_From_Sales.value, 29 | operator_income: str = FISN.Op_Income.value, 30 | switch: bool = False): 31 | """ 32 | 收现比(CSR) = 销售商品提供劳务收到的现金 / 营业收入 33 | :param data: 34 | :param cash_sales: 35 | :param operator_income: 36 | :param switch: 37 | :return: 38 | """ 39 | func_name = sys._getframe().f_code.co_name 40 | data.set_index([SN.REPORT_DATE.value, KN.STOCK_ID.value], inplace=True) 41 | data.sort_index(inplace=True) 42 | 43 | data[func_name] = data[cash_sales] / data[operator_income] 44 | data[func_name][np.isinf(data[func_name])] = np.nan 45 | 46 | if switch: 47 | data_fact = cls()._switch_freq(data_=data, name=func_name, limit=120) 48 | else: 49 | data_fact = None 50 | 51 | data = data.reset_index() 52 | 53 | F = FactorInfo() 54 | F.data_raw = data[[SN.ANN_DATE.value, KN.STOCK_ID.value, SN.REPORT_DATE.value, func_name]] 55 | F.data = data_fact 56 | F.factor_type = data['type'][0] 57 | F.factor_category = cls().__class__.__name__ 58 | F.factor_name = func_name 59 | 60 | return F 61 | 62 | @classmethod 63 | def Quality011(cls, 64 | data: pd.DataFrame, 65 | cash_sales: str = FCFSN.Cash_From_Sales.value, 66 | operator_income: str = FISN.Op_Income.value, 67 | switch: bool = False): 68 | """ 69 | 收现比变动(CSRD) 70 | """ 71 | func_name = sys._getframe().f_code.co_name 72 | data.set_index([SN.REPORT_DATE.value, KN.STOCK_ID.value], inplace=True) 73 | data.sort_index(inplace=True) 74 | 75 | CSR = data[cash_sales] / data[operator_income] 76 | CSR[np.isinf(CSR)] = np.nan 77 | data[func_name] = CSR.diff(1) 78 | 79 | if switch: 80 | data_fact = cls()._switch_freq(data_=data, name=func_name, limit=120) 81 | else: 82 | data_fact = None 83 | 84 | data = data.reset_index() 85 | 86 | F = FactorInfo() 87 | F.data_raw = data[[SN.ANN_DATE.value, KN.STOCK_ID.value, SN.REPORT_DATE.value, func_name]] 88 | F.data = data_fact 89 | F.factor_type = data['type'][0] 90 | F.factor_category = cls().__class__.__name__ 91 | F.factor_name = func_name 92 | 93 | return F 94 | 95 | @classmethod 96 | def Quality008(cls, 97 | data: pd.DataFrame, 98 | op_net_cash_flow: str = FCFSN.Op_Net_CF.value, 99 | operator_profit: str = FISN.Op_Pro.value, 100 | switch: bool = False): 101 | """ 102 | 应计利润占比(APR) = 应计利润 / 营业利润 103 | 应计利润 = 营业利润 - 经营性现金流量净额 104 | :param data: 105 | :param op_net_cash_flow: 106 | :param operator_profit: 107 | :param switch: 108 | :return: 109 | """ 110 | func_name = sys._getframe().f_code.co_name 111 | data.set_index([SN.REPORT_DATE.value, KN.STOCK_ID.value], inplace=True) 112 | data.sort_index(inplace=True) 113 | 114 | # 缺失科目填补为0 115 | data[op_net_cash_flow].fillna(0, inplace=True) 116 | data[func_name] = (data[operator_profit] - data[op_net_cash_flow]) / data[operator_profit] 117 | data[func_name][np.isinf(data[func_name])] = np.nan 118 | 119 | if switch: 120 | data_fact = cls()._switch_freq(data_=data, name=func_name, limit=120) 121 | else: 122 | data_fact = None 123 | 124 | data = data.reset_index() 125 | 126 | F = FactorInfo() 127 | F.data_raw = data[[SN.ANN_DATE.value, KN.STOCK_ID.value, SN.REPORT_DATE.value, func_name]] 128 | F.data = data_fact 129 | F.factor_type = data['type'][0] 130 | F.factor_category = cls().__class__.__name__ 131 | F.factor_name = func_name 132 | 133 | return F 134 | 135 | @classmethod 136 | def Quality009(cls, 137 | data: pd.DataFrame, 138 | op_net_cash_flow: str = FCFSN.Op_Net_CF.value, 139 | operator_profit: str = FISN.Op_Pro.value, 140 | switch: bool = False): 141 | """ 142 | 应计利润占比变动(APRD) 143 | """ 144 | func_name = sys._getframe().f_code.co_name 145 | data.set_index([SN.REPORT_DATE.value, KN.STOCK_ID.value], inplace=True) 146 | data.sort_index(inplace=True) 147 | 148 | # 缺失科目填补为0 149 | data[op_net_cash_flow].fillna(0, inplace=True) 150 | data["APR"] = (data[operator_profit] - data[op_net_cash_flow]) / data[operator_profit] 151 | data["APR"][np.isinf(data["APR"])] = np.nan 152 | data[func_name] = data["APR"].groupby(KN.STOCK_ID.value).diff(1) 153 | 154 | if switch: 155 | data_fact = cls()._switch_freq(data_=data, name=func_name, limit=120) 156 | else: 157 | data_fact = None 158 | 159 | data = data.reset_index() 160 | 161 | F = FactorInfo() 162 | F.data_raw = data[[SN.ANN_DATE.value, KN.STOCK_ID.value, SN.REPORT_DATE.value, func_name]] 163 | F.data = data_fact 164 | F.factor_type = data['type'][0] 165 | F.factor_category = cls().__class__.__name__ 166 | F.factor_name = func_name 167 | 168 | return F 169 | 170 | #################################################################################################################### 171 | @classmethod 172 | def Quality010_data_raw(cls, 173 | sta: int = 20130101, 174 | end: int = 20200401, 175 | f_type: str = '408001000'): 176 | sql_keys = {"IST": {"OPER_PROFIT": f"\"{FISN.Op_Income.value}\""}, 177 | "CFT": {"CASH_RECP_SG_AND_RS": f"\"{FCFSN.Cash_From_Sales.value}\""} 178 | } 179 | 180 | sql_ = cls().Q.finance_SQL(sql_keys, sta, end, f_type) 181 | financial_data = cls().Q.query(sql_) 182 | 183 | # 过滤未上市公司 184 | data_ = pd.merge(financial_data, cls().list_date, on=[KN.STOCK_ID.value], how='left') 185 | financial_data = data_[data_[KN.TRADE_DATE.value] >= data_[KN.LIST_DATE.value]] 186 | 187 | # TTM 188 | operator_income = cls()._switch_ttm(financial_data, FISN.Op_Income.value) 189 | cash_sales = cls()._switch_ttm(financial_data, FCFSN.Cash_From_Sales.value) 190 | financial_data.set_index([SN.REPORT_DATE.value, KN.STOCK_ID.value], inplace=True) 191 | financial_data[FISN.Op_Income.value] = operator_income 192 | financial_data[FCFSN.Cash_From_Sales.value] = cash_sales 193 | 194 | financial_data.reset_index(inplace=True) 195 | return financial_data 196 | 197 | @classmethod 198 | def Quality011_data_raw(cls, 199 | sta: int = 20130101, 200 | end: int = 20200401, 201 | f_type: str = '408001000'): 202 | return cls.Quality010_data_raw(sta, end, f_type) 203 | 204 | @classmethod 205 | def Quality008_data_raw(cls, 206 | sta: int = 20130101, 207 | end: int = 20200401, 208 | f_type: str = '408001000'): 209 | 210 | sql_keys = {"IST": {"OPER_REV": f"\"{FISN.Op_Pro.value}\""}, 211 | "CFT": {"NET_CASH_FLOWS_OPER_ACT": f"\"{FCFSN.Op_Net_CF.value}\""}, 212 | } 213 | 214 | sql_ = cls().Q.finance_SQL(sql_keys, sta, end, f_type) 215 | financial_data = cls().Q.query(sql_) 216 | 217 | # 过滤未上市公司 218 | data_ = pd.merge(financial_data, cls().list_date, on=[KN.STOCK_ID.value], how='left') 219 | financial_data = data_[data_[KN.TRADE_DATE.value] >= data_[KN.LIST_DATE.value]] 220 | 221 | # TTM 222 | operator_profit = cls()._switch_ttm(financial_data, FISN.Op_Pro.value) 223 | cash_operator = cls()._switch_ttm(financial_data, FCFSN.Op_Net_CF.value) 224 | 225 | financial_data.set_index([SN.REPORT_DATE.value, KN.STOCK_ID.value], inplace=True) 226 | financial_data[FISN.Op_Pro.value] = operator_profit 227 | financial_data[FCFSN.Op_Net_CF.value] = cash_operator 228 | 229 | financial_data.reset_index(inplace=True) 230 | return financial_data 231 | 232 | @classmethod 233 | def Quality009_data_raw(cls, 234 | sta: int = 20130101, 235 | end: int = 20200401, 236 | f_type: str = '408001000'): 237 | return cls.Quality008_data_raw(sta, end, f_type) 238 | 239 | 240 | if __name__ == '__main__': 241 | pass 242 | -------------------------------------------------------------------------------- /FactorCalculation/HighFrequencyHighFreqFactor.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import os 3 | import time 4 | import scipy.stats as st 5 | import datetime as dt 6 | import numpy as np 7 | import sys 8 | from pyfinance.ols import PandasRollingOLS 9 | 10 | from FactorCalculation.FactorBase import FactorBase 11 | from Object import FactorInfo 12 | from constant import ( 13 | KeyName as KN, 14 | PriceVolumeName as PVN, 15 | SpecialName as SN, 16 | FilePathName as FPN 17 | ) 18 | 19 | """ 20 | 高频数据 21 | 高频数据需要合成中间过程 22 | 然后利用中间过程计算因子 23 | 数据命名以合成的数据名为主,没有统一定义 24 | 25 | 1分钟频率收益率计算采用收盘价比上开盘价(分钟数据存在缺失,采用开盘或者收盘直接计算容易发生跳空现象) 26 | 2h 数据存在异常,在原数据中进行剔除 27 | 若不做特殊说明, 分钟级别数据运算会包含集合竞价信息 28 | """ 29 | 30 | 31 | class HighFrequencyHighFreqFactor(FactorBase): 32 | """ 33 | 高频因子 34 | """ 35 | 36 | def __init__(self): 37 | super(HighFrequencyHighFreqFactor, self).__init__() 38 | 39 | 40 | 41 | 42 | if __name__ == '__main__': 43 | pass 44 | -------------------------------------------------------------------------------- /FactorCalculation/HighFrequencyVolPriceFactor.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import os 3 | import time 4 | import scipy.stats as st 5 | import datetime as dt 6 | import numpy as np 7 | import sys 8 | from pyfinance.ols import PandasRollingOLS 9 | 10 | from FactorCalculation.FactorBase import FactorBase 11 | from Object import FactorInfo 12 | from constant import ( 13 | KeyName as KN, 14 | PriceVolumeName as PVN, 15 | SpecialName as SN, 16 | FilePathName as FPN 17 | ) 18 | 19 | """ 20 | 高频数据 21 | 高频数据需要合成中间过程 22 | 然后利用中间过程计算因子 23 | 数据命名以合成的数据名为主,没有统一定义 24 | 25 | 1分钟频率收益率计算采用收盘价比上开盘价(分钟数据存在缺失,采用开盘或者收盘直接计算容易发生跳空现象) 26 | 2h 数据存在异常,在原数据中进行剔除 27 | 若不做特殊说明, 分钟级别数据运算会包含集合竞价信息 28 | """ 29 | 30 | 31 | class HighFrequencyVolPriceFactor(FactorBase): 32 | """ 33 | 高频因子 34 | """ 35 | 36 | def __init__(self): 37 | super(HighFrequencyVolPriceFactor, self).__init__() 38 | 39 | @classmethod 40 | def VolPrice008(cls, 41 | data: pd.DataFrame, 42 | **kwargs): 43 | """大单驱动涨幅(MOM_bigOrder)""" 44 | 45 | F = FactorInfo() 46 | F.data = data 47 | F.factor_type = 'HFD' 48 | F.factor_category = cls().__class__.__name__ 49 | F.factor_name = data.name 50 | 51 | return F 52 | 53 | @classmethod 54 | def VolPrice009(cls, 55 | data: pd.DataFrame, 56 | **kwargs): 57 | """改进反转(Rev_improve)""" 58 | 59 | F = FactorInfo() 60 | F.data = data 61 | F.factor_type = 'HFD' 62 | F.factor_category = cls().__class__.__name__ 63 | F.factor_name = data.name 64 | 65 | return F 66 | 67 | @classmethod 68 | def VolPrice011(cls, 69 | data: pd.DataFrame, 70 | **kwargs): 71 | """聪明钱因子(SmartQ)""" 72 | 73 | F = FactorInfo() 74 | F.data = data 75 | F.factor_type = 'HFD' 76 | F.factor_category = cls().__class__.__name__ 77 | F.factor_name = data.name 78 | 79 | return F 80 | 81 | @classmethod 82 | def VolPrice012(cls, 83 | data: pd.DataFrame, 84 | **kwargs): 85 | """高频反转因子(HFD_Rev)""" 86 | 87 | F = FactorInfo() 88 | F.data = data 89 | F.factor_type = 'HFD' 90 | F.factor_category = cls().__class__.__name__ 91 | F.factor_name = data.name 92 | 93 | return F 94 | 95 | @classmethod 96 | def VolPrice013(cls, 97 | data: pd.DataFrame, 98 | **kwargs): 99 | """轨迹非流动因子(Illiq_Track)""" 100 | 101 | F = FactorInfo() 102 | F.data = data 103 | F.factor_type = 'HFD' 104 | F.factor_category = cls().__class__.__name__ 105 | F.factor_name = data.name 106 | 107 | return F 108 | 109 | @classmethod 110 | def VolPrice014(cls, 111 | data: pd.DataFrame, 112 | **kwargs): 113 | """ 114 | 加权收盘价比(Close_Weight) 115 | 默认一分钟频率 116 | """ 117 | 118 | F = FactorInfo() 119 | F.data = data 120 | F.factor_type = 'HFD' 121 | F.factor_category = cls().__class__.__name__ 122 | F.factor_name = data.name 123 | 124 | return F 125 | 126 | @classmethod 127 | def VolPrice015(cls, 128 | data: pd.DataFrame, 129 | **kwargs): 130 | """结构化反转因子(Rev_struct)""" 131 | 132 | F = FactorInfo() 133 | F.data = data 134 | F.factor_type = 'HFD' 135 | F.factor_category = cls().__class__.__name__ 136 | F.factor_name = data.name 137 | 138 | return F 139 | 140 | @classmethod 141 | def VolPrice016(cls, 142 | data: pd.DataFrame, 143 | **kwargs): 144 | """聪明钱因子改进(SmartQ_ln)""" 145 | 146 | F = FactorInfo() 147 | F.data = data 148 | F.factor_type = 'HFD' 149 | F.factor_category = cls().__class__.__name__ 150 | F.factor_name = data.name 151 | 152 | return F 153 | 154 | @classmethod 155 | def VolPrice017(cls, 156 | data: pd.DataFrame, 157 | n: int = 20, 158 | **kwargs): 159 | """PMA 特殊""" 160 | factor_name = sys._getframe().f_code.co_name 161 | data.set_index([KN.TRADE_DATE.value, KN.STOCK_ID.value], inplace=True) 162 | data.sort_index(inplace=True) 163 | 164 | # Calculate AM and PM returns on stocks 165 | data['am_ret_stock'] = data['2hPrice'] / data['open'] - 1 166 | data['pm_ret_stock'] = data['4hPrice'] / data['2hPrice'] - 1 167 | 168 | # filter momentum 169 | data[factor_name] = data.groupby(KN.TRADE_DATE.value, 170 | group_keys=False).apply(lambda x: cls._reg(x, 'am_ret_stock', 'pm_ret_stock')) 171 | # data['mean'] = data['res'].groupby(KN.STOCK_ID.value, 172 | # group_keys=False).rolling(n, min_periods=1).apply(np.nanmean) 173 | # data['std'] = data['res'].groupby(KN.STOCK_ID.value, 174 | # group_keys=False).rolling(n, min_periods=1).apply(np.nanstd) 175 | # data[factor_name] = data['mean'] / data['std'] 176 | # data[factor_name][np.isinf(data[factor_name])] = 0 177 | data[factor_name] = data['pm_ret_stock'] 178 | 179 | F = FactorInfo() 180 | F.data = data[factor_name] 181 | F.factor_type = 'HFD' 182 | F.factor_category = cls().__class__.__name__ 183 | F.factor_name = factor_name 184 | 185 | return F 186 | 187 | @classmethod 188 | def VolPrice018(cls, 189 | data: pd.DataFrame, 190 | **kwargs): 191 | """ 192 | 订单失衡(HFD_VOI) 193 | 日频转月频需要采用衰减加权的方式 194 | """ 195 | 196 | F = FactorInfo() 197 | F.data = data 198 | F.factor_type = 'HFD' 199 | F.factor_category = cls().__class__.__name__ 200 | F.factor_name = data.name 201 | 202 | return F 203 | 204 | @classmethod 205 | def VolPrice019(cls, 206 | data: pd.DataFrame, 207 | **kwargs): 208 | """ 209 | 订单失衡率(HFD_OLR) 210 | 日频转月频需要采用衰减加权的方式 211 | """ 212 | 213 | F = FactorInfo() 214 | F.data = data 215 | F.factor_type = 'HFD' 216 | F.factor_category = cls().__class__.__name__ 217 | F.factor_name = data.name 218 | 219 | return F 220 | 221 | @classmethod 222 | def VolPrice020(cls, 223 | data: pd.DataFrame, 224 | **kwargs): 225 | """ 226 | 市价偏离率(HFD_MPB) 227 | 日频转月频需要采用衰减加权的方式 228 | 剔除开盘集合竞价 229 | 集合竞价会存在0盘口,用前值填充 230 | """ 231 | 232 | F = FactorInfo() 233 | F.data = data 234 | F.factor_type = 'HFD' 235 | F.factor_category = cls().__class__.__name__ 236 | F.factor_name = data.name 237 | 238 | return F 239 | 240 | #################################################################################################################### 241 | @classmethod 242 | def VolPrice008_data_raw(cls, 243 | n: int = 20, 244 | q: float = 0.2, 245 | **kwargs): 246 | """大单驱动涨幅(MOM_bigOrder)""" 247 | factor_name = sys._getframe().f_code.co_name[: -9] + f"_{q}q_{n}days" 248 | 249 | def func(data: pd.DataFrame): 250 | data["ret"] = data.groupby(KN.STOCK_ID.value, group_keys=False)[PVN.CLOSE.value].pct_change() 251 | data['amt_per_min'] = data[PVN.AMOUNT.value] / data['tradenum'] 252 | r = data.groupby(KN.STOCK_ID.value).apply( 253 | lambda x: (x[x['amt_per_min'] >= x['amt_per_min'].quantile(1 - q)]['ret'] + 1).prod(min_count=1)) 254 | return r 255 | 256 | Q = cls().csv_HFD_data(data_name=[PVN.CLOSE.value, PVN.OPEN.value, PVN.AMOUNT.value, 'tradenum'], 257 | func=func, 258 | file_path=FPN.HFD_Stock_M.value) 259 | res_raw = pd.concat(Q) 260 | res = res_raw.groupby(KN.STOCK_ID.value, group_keys=False).rolling(n, min_periods=min(n, 2)).mean() 261 | res.index.names = [KN.TRADE_DATE.value, KN.STOCK_ID.value] 262 | res.name = factor_name 263 | 264 | return res 265 | 266 | @classmethod 267 | def VolPrice009_data_raw(cls, 268 | n: int = 20, 269 | **kwargs): 270 | """改进反转(Rev_improve)""" 271 | factor_name = sys._getframe().f_code.co_name[: -9] + f"_{n}days" 272 | 273 | def func(data: pd.DataFrame): 274 | data.dropna(inplace=True) 275 | data_sub = data[data['time'] >= '10:00:00'] 276 | data_sub["ret"] = data_sub.groupby(KN.STOCK_ID.value, group_keys=False)[PVN.CLOSE.value].pct_change() 277 | data_sub['ret'] += 1 278 | r = data_sub.groupby(KN.STOCK_ID.value, group_keys=False)["ret"].prod(min_count=2) 279 | return r - 1 280 | 281 | Q = cls().csv_HFD_data(data_name=[PVN.CLOSE.value], 282 | func=func, 283 | file_path=FPN.HFD_Stock_M.value) 284 | res_raw = pd.concat(Q) 285 | res = res_raw.groupby(KN.STOCK_ID.value, group_keys=False).rolling(n, min_periods=min(n, 2)).mean() 286 | res.index.names = [KN.TRADE_DATE.value, KN.STOCK_ID.value] 287 | res.name = factor_name 288 | 289 | return res 290 | 291 | @classmethod # TODO 滚动10个交易日 292 | def VolPrice011_data_raw(cls, 293 | **kwargs): 294 | """聪明钱因子(SmartQ)""" 295 | factor_name = sys._getframe().f_code.co_name[: -9] 296 | 297 | def func(data: pd.DataFrame): 298 | r = data.groupby([KN.STOCK_ID.value]).apply(cls.func_M_sqrt) 299 | return r 300 | 301 | Q = cls().csv_HFD_data(data_name=[PVN.CLOSE.value, PVN.VOLUME.value], func=func) 302 | res = pd.concat(Q) 303 | res.index.names = [KN.TRADE_DATE.value, KN.STOCK_ID.value] 304 | res.name = factor_name 305 | return res 306 | 307 | @classmethod # TODO 308 | def VolPrice012_data_raw(cls, 309 | minute: int = 5, 310 | n: int = 21, 311 | **kwargs): 312 | """高频反转因子(HFD_Rev)""" 313 | factor_name = sys._getframe().f_code.co_name[: -9] + f"_{minute}min_{n}days" 314 | 315 | def func(data: pd.DataFrame): 316 | r1 = data.groupby(KN.STOCK_ID.value).apply( 317 | lambda x: (np.log(x[PVN.CLOSE.value] / x[PVN.CLOSE.value].shift(1)) * x[PVN.VOLUME.value] / x[ 318 | PVN.VOLUME.value].sum()).sum()) 319 | r2 = data.groupby(KN.STOCK_ID.value)[PVN.VOLUME.value].sum() 320 | r = pd.concat([r1, r2], axis=1) 321 | r.columns = ['rev_d', 'volume_d'] 322 | return r 323 | 324 | Q = cls().csv_HFD_data(data_name=[PVN.CLOSE.value, PVN.VOLUME.value], 325 | func=func, 326 | file_path=FPN.HFD.value, 327 | sub_file=f"{minute}minute") 328 | res = pd.concat(Q) 329 | res['rev_vol'] = res['rev_d'] * res['volume_d'] 330 | res_sub = res[['rev_vol', 'volume_d']].groupby(KN.STOCK_ID.value, 331 | group_keys=False).rolling(n, min_periods=min(n, 2)).sum() 332 | res[factor_name] = res_sub['rev_vol'] / res_sub['volume_d'] 333 | 334 | res.index.names = [KN.TRADE_DATE.value, KN.STOCK_ID.value] 335 | res.name = factor_name 336 | return res 337 | 338 | @classmethod 339 | def VolPrice013_data_raw(cls, 340 | minute: int = 5, 341 | n: int = 21, 342 | **kwargs): 343 | """轨迹非流动因子(Illiq_Track)""" 344 | factor_name = sys._getframe().f_code.co_name[: -9] + f"_{minute}min_{n}days" 345 | 346 | def func(data: pd.DataFrame): 347 | r1 = data.groupby([KN.STOCK_ID.value]).apply( 348 | lambda x: np.log(1 + abs(np.log(x[PVN.CLOSE.value] / x[PVN.CLOSE.value].shift(1)))).sum()) 349 | r2 = data.groupby([KN.STOCK_ID.value])[PVN.AMOUNT.value].sum() 350 | r = pd.concat([r1, r2], axis=1) 351 | r.columns = ['ret_d', 'volume_d'] 352 | return r 353 | 354 | Q = cls().csv_HFD_data(data_name=[PVN.CLOSE.value, PVN.AMOUNT.value], 355 | func=func, 356 | file_path=FPN.HFD.value, 357 | sub_file=f"{minute}minute") 358 | res = pd.concat(Q) 359 | res = res.groupby(KN.STOCK_ID.value, group_keys=False).rolling(n, min_periods=min(n, 2)).sum() 360 | res[factor_name] = res['ret_d'] / res['volume_d'] 361 | res.index.names = [KN.TRADE_DATE.value, KN.STOCK_ID.value] 362 | return res[factor_name] 363 | 364 | @classmethod 365 | def VolPrice014_data_raw(cls, 366 | minute: int = 5, 367 | n: int = 21, 368 | **kwargs): 369 | """加权收盘价比(Close_Weight)""" 370 | factor_name = sys._getframe().f_code.co_name[: -9] + f"_{minute}min_{n}days" 371 | 372 | def func(data: pd.DataFrame): 373 | r = data.groupby(KN.STOCK_ID.value, 374 | group_keys=False).apply( 375 | lambda x: (x[PVN.CLOSE.value] * x[PVN.AMOUNT.value]).sum() / ( 376 | (x[PVN.CLOSE.value]).sum() * (x[PVN.AMOUNT.value]).sum()) * len(x)) 377 | return r 378 | 379 | Q = cls().csv_HFD_data(data_name=[PVN.CLOSE.value, PVN.AMOUNT.value], func=func) 380 | res = pd.concat(Q) 381 | res = res.groupby(KN.STOCK_ID.value, group_keys=False).rolling(n, min_periods=min(n, 2)).sum() 382 | res.index.names = [KN.TRADE_DATE.value, KN.STOCK_ID.value] 383 | res.name = factor_name 384 | 385 | return res 386 | 387 | @classmethod # TODO 可能需要滚动 388 | def VolPrice015_data_raw(cls, 389 | minute: int = 5, 390 | ratio: float = 0.1, 391 | **kwargs): 392 | """结构化反转因子(Rev_struct)""" 393 | factor_name = sys._getframe().f_code.co_name[: -9] + f"_{minute}min_{ratio}R" 394 | 395 | def func(data: pd.DataFrame): 396 | data['ret'] = data.groupby(KN.STOCK_ID.value, group_keys=False)[PVN.CLOSE.value].pct_change() 397 | data.dropna(inplace=True) 398 | 399 | rev_struct = data.groupby(KN.STOCK_ID.value).apply(cls.func_Structured_reversal, ratio) 400 | return rev_struct 401 | 402 | Q = cls().csv_HFD_data(data_name=[PVN.CLOSE.value, PVN.VOLUME.value], 403 | func=func, 404 | file_path=FPN.HFD.value, 405 | sub_file=f"{minute}minute") 406 | res = pd.concat(Q) 407 | res.index.names = [KN.TRADE_DATE.value, KN.STOCK_ID.value] 408 | res.name = factor_name 409 | return res 410 | 411 | @classmethod # TODO 滚动10个交易日 412 | def VolPrice016_data_raw(cls, 413 | **kwargs): 414 | """聪明钱因子改进(SmartQ_ln)""" 415 | factor_name = sys._getframe().f_code.co_name[: -9] 416 | 417 | def func(data: pd.DataFrame): 418 | r = data.groupby(KN.STOCK_ID.value).apply(cls.func_M_ln) 419 | return r 420 | 421 | Q = cls().csv_HFD_data(data_name=[PVN.CLOSE.value, PVN.VOLUME.value], func=func) 422 | res = pd.concat(Q) 423 | res.index.names = [KN.TRADE_DATE.value, KN.STOCK_ID.value] 424 | res.name = factor_name 425 | return res 426 | 427 | @classmethod 428 | def VolPrice017_data_raw(cls, 429 | **kwargs): 430 | """PMA 特殊""" 431 | data = cls()._csv_data(data_name=[PVN.OPEN.value, '2hPrice', '4hPrice'], 432 | file_path=FPN.HFD_Stock_Depth.value, 433 | file_name='VwapFactor', 434 | stock_id='code') 435 | data.rename(columns={'code': 'stock_id'}, inplace=True) 436 | 437 | # 2h 数据存在异常 438 | data_s = data[~((data['2hPrice'] == 0) | (np.isnan(data['2hPrice'])))] 439 | 440 | return data_s 441 | 442 | @classmethod 443 | def VolPrice018_data_raw(cls, 444 | depth: int = 5, 445 | n: int = 20, 446 | **kwargs): 447 | """ 448 | 订单失衡(HFD_VOI) 449 | 日频转月频需要采用衰减加权的方式 450 | 剔除开盘集合竞价 451 | """ 452 | factor_name = sys._getframe().f_code.co_name[: -9] + f"_{depth}depth_{n}days" 453 | 454 | bidvolume = [f'bidvolume{i}' for i in range(1, depth + 1)] 455 | askvolume = [f'askvolume{i}' for i in range(1, depth + 1)] 456 | 457 | def func(data: pd.DataFrame): 458 | data_sub = data[data['time'] >= '09:30:00'] 459 | data_sub['bid_Vol_weight'] = data_sub[bidvolume] @ cls.weight_attenuation(depth) 460 | data_sub['ask_Vol_weight'] = data_sub[askvolume] @ cls.weight_attenuation(depth) 461 | data_sub[['diff_bidprice1', 'diff_askprice1', 462 | 'diff_bid_Vol', 'diff_ask_Vol']] = data_sub.groupby(KN.STOCK_ID.value, 463 | group_keys=False).apply( 464 | lambda x: x[['bidprice1', 'askprice1', 'bid_Vol_weight', 'ask_Vol_weight']].diff(1)) 465 | data_sub.dropna(inplace=True) 466 | 467 | data_sub[['bid_judge', 'ask_judge']] = np.sign(data_sub[['diff_bidprice1', 'diff_askprice1']]) 468 | 469 | bid_equal = data_sub[data_sub['bid_judge'] == 0]['diff_bid_Vol'] 470 | bid_small = pd.Series(data=0, index=data_sub[data_sub['bid_judge'] < 0]['diff_bid_Vol'].index, 471 | name='diff_bid_Vol') 472 | bid_more = data_sub[data_sub['bid_judge'] > 0]['bid_Vol_weight'] 473 | 474 | ask_equal = data_sub[data_sub['ask_judge'] == 0]['diff_ask_Vol'] 475 | ask_small = pd.Series(data=0, index=data_sub[data_sub['ask_judge'] > 0]['diff_ask_Vol'].index, 476 | name='diff_ask_Vol') 477 | ask_more = data_sub[data_sub['ask_judge'] < 0]['ask_Vol_weight'] 478 | data_sub['delta_V_bid'] = pd.concat([bid_equal, bid_small, bid_more]) 479 | data_sub['delta_V_ask'] = pd.concat([ask_equal, ask_small, ask_more]) 480 | data_sub['VOI'] = data_sub['delta_V_bid'] - data_sub['delta_V_ask'] 481 | 482 | # 截面标准化 483 | data_sub['VOI_stand'] = data_sub.groupby('time', 484 | group_keys=False).apply( 485 | lambda x: (x['VOI'] - x['VOI'].mean()) / x['VOI'].std()) 486 | 487 | data_sub['VOI_stand'][np.isinf(data_sub['VOI_stand'])] = 0 488 | 489 | # 转日频 490 | r = data_sub.groupby(KN.STOCK_ID.value)['VOI_stand'].mean() 491 | 492 | return r 493 | 494 | Q = cls().csv_HFD_data(data_name=['bidprice1', 'askprice1'] + bidvolume + askvolume, 495 | func=func, 496 | file_path=FPN.HFD_Stock_Depth_1min.value) 497 | res = pd.concat(Q) 498 | # 滚动 499 | res = res.groupby(KN.STOCK_ID.value, 500 | group_keys=False).rolling(n, min_periods=min(n, 2)).apply( 501 | lambda x: x @ cls.weight_attenuation(len(x))) 502 | res.index.names = [KN.TRADE_DATE.value, KN.STOCK_ID.value] 503 | res.name = factor_name 504 | return res 505 | 506 | @classmethod 507 | def VolPrice019_data_raw(cls, 508 | depth: int = 5, 509 | n: int = 20, 510 | **kwargs): 511 | """ 512 | 订单失衡率(HFD_OLR) 513 | 日频转月频需要采用衰减加权的方式 514 | 剔除开盘集合竞价 515 | """ 516 | factor_name = sys._getframe().f_code.co_name[: -9] + f"_{depth}depth_{n}days" 517 | 518 | bidvolume = [f'bidvolume{i}' for i in range(1, depth + 1)] 519 | askvolume = [f'askvolume{i}' for i in range(1, depth + 1)] 520 | 521 | def func(data: pd.DataFrame): 522 | data_sub = data[data['time'] >= '09:30:00'] 523 | data_sub['bid_Vol_weight'] = data_sub[bidvolume] @ cls.weight_attenuation(depth) 524 | data_sub['ask_Vol_weight'] = data_sub[askvolume] @ cls.weight_attenuation(depth) 525 | 526 | data_sub['OIR'] = (data_sub['bid_Vol_weight'] - data_sub['ask_Vol_weight']) / ( 527 | data_sub['bid_Vol_weight'] + data_sub['ask_Vol_weight']) 528 | 529 | # 截面标准化 530 | data_sub['OIR_stand'] = data_sub.groupby('time', 531 | group_keys=False).apply( 532 | lambda x: (x['OIR'] - x['OIR'].mean()) / x['OIR'].std()) 533 | 534 | data_sub['OIR_stand'][np.isinf(data_sub['OIR_stand'])] = 0 535 | 536 | # 转日频 537 | r = data_sub.groupby(KN.STOCK_ID.value)['OIR_stand'].mean() 538 | return r 539 | 540 | Q = cls().csv_HFD_data(data_name=bidvolume + askvolume, 541 | func=func, 542 | file_path=FPN.HFD_Stock_Depth_1min.value) 543 | res = pd.concat(Q) 544 | # 滚动 545 | res = res.groupby(KN.STOCK_ID.value, 546 | group_keys=False).rolling(n, min_periods=min(n, 2)).apply( 547 | lambda x: x @ cls.weight_attenuation(len(x))) 548 | 549 | res.index.names = [KN.TRADE_DATE.value, KN.STOCK_ID.value] 550 | res.name = factor_name 551 | return res 552 | 553 | @classmethod 554 | def VolPrice020_data_raw(cls, 555 | n: int = 20, 556 | **kwargs): 557 | """ 558 | 市价偏离率(HFD_MPB) 559 | 日频转月频需要采用衰减加权的方式 560 | 剔除开盘集合竞价 561 | 集合竞价会存在0盘口,用前值填充 562 | """ 563 | factor_name = sys._getframe().f_code.co_name[: -9] + f"_{n}days" 564 | 565 | def func(data: pd.DataFrame): 566 | data_sub = data[data['time'] >= '09:30:00'] 567 | 568 | data_sub['TP'] = data_sub[PVN.AMOUNT.value] / data_sub[PVN.VOLUME.value] 569 | data_sub['TP'] = data_sub.groupby(KN.STOCK_ID.value, group_keys=False)['TP'].ffill() 570 | 571 | data_sub['MP'] = (data_sub['bidprice1'] + data_sub['askprice1']) / 2 572 | data_sub['MP'][data_sub['MP'] == 0] = np.nan 573 | data_sub['MP'] = data_sub.groupby(KN.STOCK_ID.value, group_keys=False)['MP'].ffill() 574 | 575 | data_sub['delta_MP'] = data_sub[[KN.STOCK_ID.value, 'MP']].groupby(KN.STOCK_ID.value, 576 | group_keys=False).rolling(2).mean() 577 | 578 | data_sub['MPB'] = data_sub['TP'] - data_sub['delta_MP'] 579 | 580 | # 截面标准化 581 | data_sub['MPB_stand'] = data_sub.groupby('time', 582 | group_keys=False).apply( 583 | lambda x: (x['MPB'] - x['MPB'].mean()) / x['MPB'].std()) 584 | 585 | data_sub['MPB_stand'][np.isinf(data_sub['MPB_stand'])] = 0 586 | 587 | # 转日频 588 | r = data_sub.groupby(KN.STOCK_ID.value)['MPB_stand'].mean() 589 | return r 590 | 591 | Q = cls().csv_HFD_data( 592 | data_name=[PVN.CLOSE.value, PVN.VOLUME.value, PVN.AMOUNT.value, 'bidprice1', 'askprice1'], 593 | func=func, 594 | file_path=FPN.HFD_Stock_Depth_1min.value) 595 | res = pd.concat(Q) 596 | # 滚动 597 | res = res.groupby(KN.STOCK_ID.value, 598 | group_keys=False).rolling(n, min_periods=min(n, 2)).apply( 599 | lambda x: x @ cls.weight_attenuation(len(x))) 600 | 601 | res.index.names = [KN.TRADE_DATE.value, KN.STOCK_ID.value] 602 | res.name = factor_name 603 | return res 604 | 605 | @staticmethod 606 | def func_Structured_reversal(data: pd.DataFrame, 607 | ratio: float): 608 | print(data['stock_id'].iloc[0]) 609 | data_copy = data.copy(deep=True) 610 | data_copy.sort_values(PVN.VOLUME.value, ascending=True, inplace=True) 611 | data_copy['cum_volume'] = data_copy[PVN.VOLUME.value].cumsum() / data_copy[PVN.VOLUME.value].sum() 612 | # momentum 613 | data_copy_mom = data_copy[data_copy['cum_volume'] <= ratio] 614 | rev_mom = (data_copy_mom['ret'] * (1 / data_copy_mom[PVN.VOLUME.value])).sum() / ( 615 | 1 / data_copy_mom[PVN.VOLUME.value]).sum() 616 | # Reverse 617 | data_copy_rev = data_copy[data_copy['cum_volume'] > ratio] 618 | rev_rev = (data_copy_rev['ret'] * (data_copy_rev[PVN.VOLUME.value])).sum() / ( 619 | data_copy_rev[PVN.VOLUME.value]).sum() 620 | rev_struct = rev_rev - rev_mom 621 | if np.isnan(rev_struct): 622 | print("Nan error!") 623 | return rev_struct 624 | 625 | @staticmethod 626 | def func_M_ln(data: pd.DataFrame): 627 | data_copy = data.copy(deep=True) 628 | data_copy['S'] = abs(data_copy[PVN.CLOSE.value].pct_change()) / np.log(data_copy['volume']) 629 | VWAP = (data_copy[PVN.CLOSE.value] * data_copy[PVN.VOLUME.value] / (data_copy[PVN.VOLUME.value]).sum()).sum() 630 | data_copy.sort_values('S', ascending=False, inplace=True) 631 | data_copy['cum_volume_R'] = data_copy[PVN.VOLUME.value].cumsum() / (data_copy[PVN.VOLUME.value]).sum() 632 | data_copy_ = data_copy[data_copy['cum_volume_R'] <= 0.2] 633 | res = (data_copy_[PVN.CLOSE.value] * data_copy_[PVN.VOLUME.value] / ( 634 | data_copy_[PVN.VOLUME.value]).sum()).sum() / VWAP 635 | 636 | return res 637 | 638 | @staticmethod 639 | def func_M_sqrt(data: pd.DataFrame): 640 | data_copy = data.copy(deep=True) 641 | # 可能存在分钟线丢失 642 | data_copy['S'] = abs(data_copy[PVN.CLOSE.value].pct_change()) / np.sqrt(data_copy[PVN.VOLUME.value]) 643 | VWAP = (data_copy[PVN.CLOSE.value] * data_copy[PVN.VOLUME.value] / (data_copy[PVN.VOLUME.value]).sum()).sum() 644 | data_copy.sort_values('S', ascending=False, inplace=True) 645 | data_copy['cum_volume_R'] = data_copy[PVN.VOLUME.value].cumsum() / (data_copy[PVN.VOLUME.value]).sum() 646 | data_copy_ = data_copy[data_copy['cum_volume_R'] <= 0.2] 647 | res = (data_copy_[PVN.CLOSE.value] * data_copy_[PVN.VOLUME.value] / ( 648 | data_copy_[PVN.VOLUME.value]).sum()).sum() / VWAP 649 | 650 | return res 651 | 652 | @staticmethod 653 | def _reg(d: pd.DataFrame, 654 | x_name: str, 655 | y_name: str) -> pd.Series: 656 | """!!!不排序回归结果会不一样!!!""" 657 | d_sub_ = d.dropna(how='any').sort_index() 658 | 659 | if d_sub_.shape[0] < d_sub_.shape[1]: 660 | Residual = pd.Series(data=np.nan, index=d.index) 661 | else: 662 | X, Y = d_sub_[x_name].to_frame(), d_sub_[y_name] 663 | reg = np.linalg.lstsq(X, Y) 664 | Residual = Y - (reg[0] * X).sum(axis=1) 665 | return Residual 666 | 667 | @staticmethod 668 | def weight_attenuation(n: int = 5): 669 | W_sum = sum(i for i in range(1, n + 1)) 670 | W = [i / W_sum for i in range(1, n + 1)] 671 | return W 672 | 673 | 674 | if __name__ == '__main__': 675 | pass 676 | -------------------------------------------------------------------------------- /FactorCalculation/TechnicalLiquidityFactor.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import sys 4 | 5 | 6 | class LiquidationFactor(object): 7 | """ 8 | 流动性因子 9 | """ 10 | 11 | def turnover(self, 12 | data: pd.DataFrame, 13 | amount_name: str = 'amount', 14 | mv_name: str = 'mv', 15 | n: int = 1): 16 | """ 17 | N日换手率 18 | :return: 19 | """ 20 | data['amount_{}'.format(n)] = data[amount_name].rolling(n).sum() 21 | data['mv_{}'.format(n)] = data[mv_name].rolling(n).mean() 22 | data['turnover_{}'.format(n)] = data['amount_{}'.format(n)] / data['mv_{}'.format(n)] 23 | 24 | result = self.data_filter1(data[['code', 'turnover_{}'.format(n)]], rolling=n, factor_name='turnover_{}'.format(n)) 25 | return result 26 | 27 | 28 | if __name__ == '__main__': 29 | # df_stock = pd.read_csv("D:\\Quant\\SecuritySelect\\Data\\AStockData.csv") 30 | # 31 | # # Data cleaning:Restoration stock price [open, high, low, close] 32 | # price_columns = ['open', 'close', 'high', 'low'] 33 | # df_stock.set_index('date', inplace=True) 34 | # df_stock[price_columns] = df_stock[price_columns].multiply(df_stock['adjfactor'], axis=0) 35 | # A = MomentFactor() 36 | # A.momentum_in_day(df_stock) 37 | pass 38 | -------------------------------------------------------------------------------- /FactorCalculation/TechnicalSizeFactor.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import sys 4 | 5 | from FactorCalculation.FactorBase import FactorBase 6 | from Object import FactorInfo 7 | 8 | from constant import ( 9 | KeyName as KN, 10 | SpecialName as SN, 11 | PriceVolumeName as PVN 12 | ) 13 | 14 | 15 | class TechnicalSizeFactor(FactorBase): 16 | 17 | @classmethod 18 | def Size001(cls, 19 | data: pd.DataFrame, 20 | liq_mv: str = PVN.LIQ_MV.value): 21 | """ 22 | 流动市值 23 | :return: 24 | """ 25 | func_name = sys._getframe().f_code.co_name 26 | data.set_index([SN.REPORT_DATE.value, KN.STOCK_ID.value], inplace=True) 27 | data.sort_index(inplace=True) 28 | 29 | data[func_name] = data[liq_mv] 30 | result = data[['code', func_name]] 31 | return result 32 | 33 | def total_market_value(self, 34 | data: pd.DataFrame, 35 | market_name: str = 'mv'): 36 | """ 37 | 总市值对数 38 | :return: 39 | """ 40 | factor_name = sys._getframe().f_code.co_name 41 | data[factor_name] = np.log(data[market_name]) 42 | result = data[['code', factor_name]] 43 | return result 44 | 45 | @classmethod 46 | def Size001_data_raw(cls, 47 | sta: int = 20130101, 48 | end: int = 20200401): 49 | price_data = cls()._csv_data(data_name=[PVN.LIQ_MV.value]) 50 | return price_data 51 | 52 | 53 | if __name__ == '__main__': 54 | # df_stock = pd.read_csv("D:\\Quant\\SecuritySelect\\Data\\AStockData.csv") 55 | # 56 | # # Data cleaning:Restoration stock price [open, high, low, close] 57 | # price_columns = ['open', 'close', 'high', 'low'] 58 | # df_stock[price_columns] = df_stock[price_columns].multiply(df_stock['adjfactor'], axis=0) 59 | # A = MomentFactor(df_stock[price_columns]) 60 | 61 | pass 62 | -------------------------------------------------------------------------------- /FactorCalculation/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | import inspect 3 | import importlib 4 | 5 | 6 | class FactorPool(object): 7 | def __init__(self): 8 | self.factor, self.method = self.load_factor_function() 9 | 10 | def load_factor_function(self, ): 11 | """ 12 | Load strategy class from source code. 13 | """ 14 | factor_folder = os.path.dirname(os.path.abspath(__file__)) 15 | Factor_class = self.load_factor_class_from_folder(factor_folder) 16 | Factor_function = self.load_factor_function_from_class(Factor_class) 17 | # self.load_strategy_class_from_folder(path2, "strategies") 18 | return Factor_function 19 | 20 | # 导入因子类 21 | def load_factor_class_from_folder(self, path: str): 22 | """ 23 | Load strategy class from certain folder. 24 | """ 25 | for dirpath, dirnames, filenames in os.walk(path): 26 | Factor_class = {} 27 | for filename in filenames: 28 | # 剔除自己本身 29 | if filename.startswith('__'): 30 | continue 31 | class_name = filename[:-3] 32 | module = importlib.import_module("FactorCalculation." + class_name) 33 | for class_name in dir(module): 34 | value = getattr(module, class_name) 35 | if isinstance(value, type): 36 | Factor_class[value.__name__] = value 37 | return Factor_class 38 | 39 | # 导入因子属性 40 | def load_factor_function_from_class(self, Factor_class: dict): 41 | """ 42 | Load strategy class from module file. 43 | """ 44 | Factor_function, Method_function = {}, {} 45 | for factor_class in Factor_class.values(): 46 | for func_name in dir(factor_class): 47 | if func_name.startswith('__'): 48 | continue 49 | method_ = getattr(factor_class, func_name) 50 | if inspect.ismethod(method_): 51 | Factor_function[func_name] = method_ 52 | elif inspect.isfunction(method_): 53 | Method_function[func_name] = method_ 54 | return Factor_function, Method_function 55 | 56 | 57 | if __name__ == '__main__': 58 | A = FactorPool() 59 | -------------------------------------------------------------------------------- /FactorProcess/FactorProcess.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import os 4 | import json 5 | import statsmodels.api as sm 6 | import copy 7 | import time 8 | import datetime as dt 9 | from sklearn.decomposition import PCA 10 | from sklearn import linear_model 11 | 12 | from Optimization import MaxOptModel 13 | from constant import ( 14 | KeyName as KN, 15 | PriceVolumeName as PVN, 16 | FilePathName as FPN, 17 | SpecialName as SN 18 | ) 19 | 20 | 21 | # 因子预处理 22 | class FactorProcess(object): 23 | """ 24 | 去极值,标准化,中性化,分组 25 | """ 26 | data_name = {'mv': 'MV.csv', 27 | 'industry': 'IndustryLabel.csv'} 28 | 29 | def __init__(self): 30 | self.fact_name = '' 31 | self.raw_data = {} 32 | 33 | # *去极值* 34 | def remove_outliers(self, 35 | data: pd.Series, 36 | method='before_after_3%') -> pd.Series: 37 | 38 | self.fact_name = data.name 39 | 40 | method_dict = { 41 | "before_after_3%": self.before_after_n, 42 | "before_after_3sigma": self.before_after_3sigma, 43 | "mad": self.mad 44 | } 45 | if method is None: 46 | return data 47 | else: 48 | res = data.groupby(KN.TRADE_DATE.value).apply(method_dict[method]) 49 | return res 50 | 51 | # *中性化* 52 | def neutralization(self, 53 | data: pd.Series, 54 | method: str = 'industry+mv') -> pd.Series: 55 | """ 56 | 若同时纳入行业因子和市值因子需要加上截距项,若仅纳入行业因子则回归方程不带截距项! 57 | :param data: 因子数据 58 | :param method: 中心化方法 59 | :return: 剔除行业因素和市值因素后的因子 60 | """ 61 | 62 | self.fact_name = data.name 63 | 64 | # regression 65 | def _reg(data_: pd.DataFrame) -> pd.Series: 66 | """!!!不排序回归结果会不一样!!!""" 67 | data_sub_ = data_.dropna(how='any').sort_index() 68 | 69 | if data_sub_.shape[0] < data_sub_.shape[1]: 70 | fact_neu = pd.Series(data=np.nan, index=data_.index) 71 | else: 72 | X = pd.get_dummies(data_sub_.loc[:, data_sub_.columns != self.fact_name], 73 | columns=[SN.INDUSTRY_FLAG.value]) 74 | Y = data_sub_[self.fact_name] 75 | reg = np.linalg.lstsq(X, Y) 76 | fact_neu = Y - (reg[0] * X).sum(axis=1) 77 | fact_neu.name = self.fact_name 78 | return fact_neu 79 | 80 | # read mv and industry data 81 | if 'mv' in method: 82 | if self.raw_data.get('mv', None) is None: 83 | mv_data = pd.read_csv(os.path.join(FPN.label_pool_path.value, self.data_name['mv']), 84 | index_col=['date', 'stock_id'], 85 | usecols=['date', 'stock_id', 'liq_mv']) 86 | self.raw_data['mv'] = mv_data 87 | else: 88 | mv_data = self.raw_data['mv'].copy(deep=True) 89 | 90 | else: 91 | mv_data = pd.DataFrame() 92 | 93 | if 'industry' in method: 94 | if self.raw_data.get('industry', None) is None: 95 | industry_data = pd.read_csv(os.path.join(FPN.label_pool_path.value, self.data_name['industry']), 96 | index_col=['date', 'stock_id']) 97 | self.raw_data['industry'] = industry_data 98 | else: 99 | industry_data = self.raw_data['industry'].copy(deep=True) 100 | else: 101 | industry_data = pd.DataFrame() 102 | 103 | # merge data 104 | neu_factor = pd.concat([data, mv_data, industry_data], axis=1, join='inner') 105 | 106 | # neutralization 107 | 108 | res = neu_factor.groupby(KN.TRADE_DATE.value, group_keys=False).apply(_reg) 109 | return res 110 | 111 | # *标准化* 112 | def standardization(self, 113 | data: pd.Series, 114 | method='z_score') -> pd.Series: 115 | 116 | method_dict = {"range01": self.range01, 117 | "z_score": self.z_score, 118 | "mv": self.market_value_weighting 119 | } 120 | self.fact_name = data.name 121 | 122 | if method is None: 123 | return data 124 | elif method == 'mv': 125 | if self.raw_data.get('mv', None) is None: 126 | mv_data = pd.read_csv(os.path.join(FPN.label_pool_path.value, self.data_name['mv']), 127 | index_col=['date', 'stock_id'], 128 | usecols=['date', 'stock_id', 'liq_mv']) 129 | self.raw_data['mv'] = mv_data 130 | else: 131 | mv_data = self.raw_data['mv'] 132 | 133 | stand_data = pd.concat([data, mv_data], axis=1, join='inner') 134 | else: 135 | stand_data = data 136 | 137 | res = stand_data.groupby(KN.TRADE_DATE.value, group_keys=False).apply(method_dict[method]) 138 | return res 139 | 140 | # # *正交化* 141 | # @staticmethod 142 | # def orthogonal(factor_df, method='schimidt'): 143 | # # 固定顺序的施密特正交化 144 | # def schimidt(): 145 | # 146 | # col_name = factor_df.columns 147 | # factors1 = factor_df.values 148 | # 149 | # R = np.zeros((factors1.shape[1], factors1.shape[1])) 150 | # Q = np.zeros(factors1.shape) 151 | # for k in range(0, factors1.shape[1]): 152 | # R[k, k] = np.sqrt(np.dot(factors1[:, k], factors1[:, k])) 153 | # Q[:, k] = factors1[:, k] / R[k, k] 154 | # for j in range(k + 1, factors1.shape[1]): 155 | # R[k, j] = np.dot(Q[:, k], factors1[:, j]) 156 | # factors1[:, j] = factors1[:, j] - R[k, j] * Q[:, k] 157 | # 158 | # Q = pd.DataFrame(Q, columns=col_name, index=factor_df.index) 159 | # return Q 160 | # 161 | # # 规范正交 162 | # def canonial(): 163 | # factors1 = factor_df.values 164 | # col_name = factor_df.columns 165 | # D, U = np.linalg.eig(np.dot(factors1.T, factors1)) 166 | # S = np.dot(U, np.diag(D ** (-0.5))) 167 | # 168 | # Fhat = np.dot(factors1, S) 169 | # Fhat = pd.DataFrame(Fhat, columns=col_name, index=factor_df.index) 170 | # 171 | # return Fhat 172 | # 173 | # # 对称正交 174 | # def symmetry(): 175 | # col_name = factor_df.columns 176 | # factors1 = factor_df.values 177 | # D, U = np.linalg.eig(np.dot(factors1.T, factors1)) 178 | # S = np.dot(U, np.diag(D ** (-0.5))) 179 | # 180 | # Fhat = np.dot(factors1, S) 181 | # Fhat = np.dot(Fhat, U.T) 182 | # Fhat = pd.DataFrame(Fhat, columns=col_name, index=factor_df.index) 183 | # 184 | # return Fhat 185 | # 186 | # method_dict = { 187 | # "schimidt": schimidt(), 188 | # "canonial": canonial(), 189 | # "symmetry": symmetry() 190 | # } 191 | # 192 | # return method_dict[method] 193 | # 194 | 195 | # 因子预处理 196 | def main(self, 197 | factor: pd.Series, 198 | outliers: str, 199 | neutralization: str, 200 | standardization: str) -> pd.Series: 201 | 202 | df_factor = copy.deepcopy(factor) 203 | 204 | if outliers != '': 205 | print(f"{dt.datetime.now().strftime('%X')}: processing outlier") 206 | df_factor = self.remove_outliers(df_factor, outliers) 207 | if neutralization != '': 208 | print(f"{dt.datetime.now().strftime('%X')}: neutralization") 209 | df_factor = self.neutralization(df_factor, neutralization) 210 | if standardization != '': 211 | print(f"{dt.datetime.now().strftime('%X')}: standardization") 212 | df_factor = self.standardization(df_factor, standardization) 213 | 214 | # TODO 因子填充??? 215 | return df_factor 216 | 217 | """去极值""" 218 | 219 | # 前后3% 220 | @staticmethod 221 | def before_after_n(data: pd.Series, n: int = 3): 222 | length = len(data) 223 | sort_values = data.sort_values() 224 | threshold_top = sort_values.iloc[int(length * n / 100)] 225 | threshold_down = sort_values.iloc[-(int(length * n / 100) + 1)] 226 | data[data <= threshold_top] = threshold_top 227 | data[data >= threshold_down] = threshold_down 228 | return data 229 | 230 | # 3倍标准差外 231 | @staticmethod 232 | def before_after_3sigma(data: pd.Series) -> pd.Series: 233 | miu = data.mean() 234 | sigma = data.std() 235 | threshold_down = miu - 3 * sigma 236 | threshold_up = miu + 3 * sigma 237 | data[data.ge(threshold_up)] = threshold_up 238 | data[data.le(threshold_down)] = threshold_down 239 | return data 240 | 241 | # 绝对中位偏差法 242 | @staticmethod 243 | def mad(data): 244 | median = data.median() 245 | MAD = (data - median).abs().median() 246 | threshold_up = median + 3 * 1.483 * MAD 247 | threshold_down = median - 3 * 1.483 * MAD 248 | data[data >= threshold_up] = threshold_up 249 | data[data <= threshold_down] = threshold_down 250 | return data 251 | 252 | """标准化""" 253 | 254 | # 标准分数法 255 | @staticmethod 256 | def z_score(data: pd.Series): 257 | """ 258 | :param data: 259 | :return: 260 | """ 261 | miu = data.mean() 262 | sigma = data.std() 263 | stand = (data - miu) / sigma 264 | return stand 265 | 266 | @staticmethod 267 | def range01(data: pd.Series): 268 | result = (data - data.min()) / (data.max() - data.min()) 269 | return result 270 | 271 | # 市值加权标准化 272 | def market_value_weighting(self, data) -> pd.Series: 273 | data_sub = data.dropna(how='any') 274 | 275 | if data_sub.empty: 276 | stand = pd.Series(data=np.nan, index=data.index) 277 | else: 278 | 279 | factor = data_sub[self.fact_name] 280 | mv = data_sub[PVN.LIQ_MV.value] 281 | 282 | sum_mv = sum(mv) 283 | miu = sum(data_sub.prod(axis=1, skipna=False)) / sum_mv 284 | 285 | sigma = factor.std() 286 | stand = (factor - miu) / sigma 287 | stand.name = self.fact_name 288 | return stand 289 | 290 | # 分组 291 | @staticmethod 292 | def grouping(data: pd.DataFrame, n): 293 | """ 294 | 1.假设样本量为M,将因子分成N组,前N-1组有效样本量为int(M/N),最后一组有效样本量为M-(N-1)*int(M/*N); 295 | 2.无效样本不参与计算; 296 | 3.相同排序定义为同一组; 297 | 4.相同排序后下一元素连续不跳级 298 | 5.升序排列 299 | :param data: 300 | :param n:分组个数 301 | :return: 302 | """ 303 | rank_data = data.rank(axis=1, ascending=True, method='dense') 304 | effect_data = rank_data.max(axis=1) 305 | amount_each_group = effect_data // n 306 | data_group = rank_data.floordiv(amount_each_group, axis=0) + np.sign(rank_data.mod(amount_each_group, axis=0)) 307 | data_group[data_group > n] = n 308 | return data_group 309 | 310 | 311 | # 基本面因子处理 312 | class SpecialProcess(object): 313 | 314 | def switch_freq(self, data: pd.DataFrame, switch: str): 315 | pass 316 | 317 | pass 318 | 319 | 320 | # 因子共线性检验和处理方法 321 | class Multicollinearity(object): 322 | 323 | class OPT(MaxOptModel): 324 | 325 | """ 326 | 默认: 327 | 1.目标函数为最大化收益比上波动 328 | 2.权重介于0到1之间 329 | 3.权重和为1 330 | 4.最大迭代次数为300 331 | 5.容忍度为1e-7 332 | """ 333 | 334 | def __init__(self, data: pd.DataFrame, n: int): 335 | super().__init__(data, n) 336 | 337 | # 目标函数 338 | def object_func(self, w): 339 | """ 340 | 目标函数默认为夏普比最大化模型,通过前面加上负号转化为最小化模型 341 | :param w: 342 | :return: 343 | """ 344 | mean = np.array(self.data.mean()) 345 | cov = np.array(self.data.cov()) # 协方差 346 | func = - np.dot(w, mean) / np.sqrt(np.dot(w, np.dot(w, cov))) 347 | return func 348 | 349 | # 约束条件 350 | def _constraint1(self, w, **kwargs): 351 | return sum(w) - 1 352 | 353 | # 约束条件函数集 354 | def _constraints(self, **kwargs): 355 | limit = {'type': 'eq', 'fun': self._constraint1} 356 | return limit 357 | 358 | def __init__(self): 359 | self.fp = FactorProcess() 360 | 361 | # 相关性检验 362 | def correlation(self, data: pd.DataFrame) -> dict: 363 | """ 364 | 每期有效数据过少不计算相关性 365 | :param data: 366 | :return: 367 | """ 368 | 369 | df_cor = data.groupby(KN.TRADE_DATE.value).apply(lambda x: x.dropna().corr()) 370 | 371 | cor_GroupBy = df_cor.groupby(pd.Grouper(level=-1)) 372 | cor_dict = {"cor": df_cor, 373 | "mean": cor_GroupBy.mean(), 374 | "median": cor_GroupBy.median(), 375 | "std": cor_GroupBy.std(), 376 | "ttest": cor_GroupBy.apply(lambda x: (abs(x) - x.mean()) / x.std() * pow(len(x) - 1, 0.5)), 377 | } 378 | 379 | return cor_dict 380 | 381 | # 因子复合 382 | def composite(self, 383 | factor: pd.DataFrame, 384 | method: str = 'equal', 385 | **kwargs) -> pd.DataFrame: 386 | """ 387 | 部分权重会用到未来数据,所以需要对权重进行平移与相应的因子值进行匹配 388 | :param factor: 389 | :param method: 390 | :param kwargs: 391 | :return: 392 | """ 393 | 394 | method_dict = {"Equal": self.equal_weight, 395 | "Ret": self.return_weight, 396 | "MAX_IC": self.MAX_IC_IR} 397 | 398 | res = method_dict[method](factor, **kwargs) 399 | return res 400 | 401 | """因子合成""" 402 | 403 | # 等权法 404 | @staticmethod 405 | def equal_weight(fact: pd.DataFrame, 406 | **kwargs): 407 | fact_comp = fact.groupby(KN.TRADE_DATE.value, group_keys=False).apply(lambda x: x.mean(axis=1)) 408 | return fact_comp 409 | 410 | # TODO Test 411 | def return_weight(self, 412 | fact: pd.DataFrame, 413 | fact_ret: pd.DataFrame = None, 414 | hp: int = 1, 415 | rp: int = 20, 416 | algorithm='mean') -> [pd.Series, None]: 417 | """ 418 | 由于该地方的权重(Pearson相关性和Spearman相关性)权重都是作为标签参与了运算, 419 | 因此相对于截面当期该数据为未来数据,需要进行平移后与相应的因子进行匹配才能作为当期截面因子的历史权重, 420 | 系统默认计算收益率采用open价格,所以,若调仓周期为N天,则需要平移 N + 1 + 1天。 421 | :param fact: 标准化后的因子 422 | :param fact_ret: 因子收益率 423 | :param rp: 权重滚动计算周期 424 | :param hp: 标的持有周期(调仓周期) 425 | :param algorithm: 权重计算方法 426 | :return: 427 | """ 428 | 429 | fact_weight = abs(self._weight(fact_ret, rp, algorithm)) 430 | 431 | # 权重归一化 432 | fact_weight_std = fact_weight.div(fact_weight.sum(axis=1), axis=0) 433 | # 权重与因子值匹配 434 | fact_weight_std = fact_weight_std.shift(hp + 1) # TODO 不同的价格平移周期不一样 435 | # 复合因子 436 | fact_comp = fact.mul(fact_weight_std).sum(axis=1) 437 | 438 | return fact_comp 439 | 440 | # def IC_weight(self, 441 | # fact: pd.DataFrame, 442 | # fact_IC: pd.DataFrame, 443 | # rp: int = 20, 444 | # hp: int = 1, 445 | # algorithm='mean'): 446 | # 447 | # return self.return_weight(fact, fact_IC, rp, hp, algorithm) 448 | 449 | def MAX_IC_IR(self, 450 | fact: pd.DataFrame, 451 | fact_ret: pd.DataFrame = None, 452 | hp: int = 1, 453 | rp: int = 20, 454 | way='IC_IR', 455 | comp_name: str = 'comp_factor'): 456 | 457 | # 对收益率进行调整 458 | ret_real = fact_ret.shift(hp).dropna() 459 | 460 | w_list = [] 461 | for i in range(rp, ret_real.shape[0] + 1): 462 | df_ = ret_real.iloc[i - rp: i, :] 463 | opt = self.OPT(df_) 464 | 465 | if way == 'IC': 466 | opt.data_cov = np.array(fact.loc[df_.index].cov()) 467 | 468 | res_ = opt.solve() 469 | weight_ = res_.x 470 | w_s = pd.Series(weight_, index=df_.columns, name=df_.index[-1]) 471 | w_list.append(w_s) 472 | 473 | w_df = pd.DataFrame(w_list) 474 | # W = w_df.shift(hp) 475 | fact_comp = fact.mul(w_df).sum(axis=1) 476 | fact_comp.name = fact_comp 477 | return fact_comp 478 | 479 | def PCA(self, 480 | fact: pd.DataFrame, 481 | rp: int = 20): 482 | 483 | w_list = [] 484 | for i in range(rp, fact.shape[0] + 1): 485 | df_ = fact.iloc[i - rp: i, :] 486 | 487 | pca = PCA(n_components=1) 488 | pca.fit(np.array(df_)) 489 | weight = pca.components_[0] 490 | w_s = pd.Series(data=weight, index=df_.columns, name=df_.index[-1]) 491 | w_list.append(w_s) 492 | w_df = pd.DataFrame(w_list) 493 | 494 | fact_comp = fact.mul(w_df).sum(axis=1) 495 | fact_comp.name = fact_comp 496 | 497 | return fact_comp 498 | 499 | # *正交化* 500 | @staticmethod 501 | def orthogonal(factor_df, method='schimidt'): 502 | # 固定顺序的施密特正交化 503 | def schimidt(): 504 | 505 | col_name = factor_df.columns 506 | factors1 = factor_df.values 507 | 508 | R = np.zeros((factors1.shape[1], factors1.shape[1])) 509 | Q = np.zeros(factors1.shape) 510 | for k in range(0, factors1.shape[1]): 511 | R[k, k] = np.sqrt(np.dot(factors1[:, k], factors1[:, k])) 512 | Q[:, k] = factors1[:, k] / R[k, k] 513 | for j in range(k + 1, factors1.shape[1]): 514 | R[k, j] = np.dot(Q[:, k], factors1[:, j]) 515 | factors1[:, j] = factors1[:, j] - R[k, j] * Q[:, k] 516 | 517 | Q = pd.DataFrame(Q, columns=col_name, index=factor_df.index) 518 | return Q 519 | 520 | # 规范正交 521 | def canonial(): 522 | factors1 = factor_df.values 523 | col_name = factor_df.columns 524 | D, U = np.linalg.eig(np.dot(factors1.T, factors1)) 525 | S = np.dot(U, np.diag(D ** (-0.5))) 526 | 527 | Fhat = np.dot(factors1, S) 528 | Fhat = pd.DataFrame(Fhat, columns=col_name, index=factor_df.index) 529 | 530 | return Fhat 531 | 532 | # 对称正交 533 | def symmetry(): 534 | col_name = factor_df.columns 535 | factors1 = factor_df.values 536 | D, U = np.linalg.eig(np.dot(factors1.T, factors1)) 537 | S = np.dot(U, np.diag(D ** (-0.5))) 538 | 539 | Fhat = np.dot(factors1, S) 540 | Fhat = np.dot(Fhat, U.T) 541 | Fhat = pd.DataFrame(Fhat, columns=col_name, index=factor_df.index) 542 | 543 | return Fhat 544 | 545 | method_dict = { 546 | "schimidt": schimidt(), 547 | "canonial": canonial(), 548 | "symmetry": symmetry() 549 | } 550 | 551 | return method_dict[method] 552 | 553 | def _weight(self, 554 | data: pd.DataFrame = None, 555 | rp: int = 60, 556 | algorithm: str = 'mean') -> [pd.DataFrame, None]: 557 | 558 | if algorithm == 'mean': 559 | data_weight = data.rolling(rp).mean() 560 | elif algorithm == 'Half_time': 561 | weight_list = self._Half_time(rp) 562 | data_weight = data.rolling(rp).apply(lambda x: np.dot(x, weight_list)) 563 | else: 564 | data_weight = None 565 | 566 | return data_weight 567 | 568 | # 半衰权重 569 | @staticmethod 570 | def _Half_time(period: int, decay: int = 2) -> list: 571 | 572 | weight_list = [pow(2, (i - period - 1) / decay) for i in range(1, period + 1)] 573 | 574 | weight_1 = [i / sum(weight_list) for i in weight_list] 575 | 576 | return weight_1 577 | 578 | 579 | if __name__ == '__main__': 580 | A = Multicollinearity() 581 | data_array = np.random.rand(1000).reshape(200, 5) 582 | IC = pd.DataFrame(data_array) 583 | A.PCA(IC) 584 | # A.neutralization('s', method='industry+mv') 585 | # df_stock = pd.read_csv("D:\\Quant\\SecuritySelect\\Data\\AStockData.csv") 586 | # 587 | # # Data cleaning:Restoration stock price [open, high, low, close] 588 | # price_columns = ['open', 'close', 'high', 'low'] 589 | # df_stock[price_columns] = df_stock[price_columns].multiply(df_stock['adjfactor'], axis=0) 590 | # 591 | # A = FactorProcessing() 592 | # A.remove_outliers(df_stock['close']) 593 | pass 594 | -------------------------------------------------------------------------------- /Forecast/ReturnForecast.py: -------------------------------------------------------------------------------- 1 | # -*-coding:utf-8-*- 2 | # @Time: 2020/9/4 14:11 3 | # @Author: FC 4 | # @Email: 18817289038@163.com 5 | 6 | import pandas as pd 7 | import numpy as np 8 | import statsmodels.api as sm 9 | import time 10 | from statsmodels.tsa.arima_model import ARMA 11 | 12 | from FactorProcess.FactorProcess import FactorProcess 13 | from constant import ( 14 | KeyName as KN, 15 | SpecialName as SN, 16 | PriceVolumeName as PVN, 17 | timer 18 | ) 19 | 20 | 21 | class ReturnModel(object): 22 | def __init__(self): 23 | pass 24 | 25 | # 等权 26 | def equal_weight(self, 27 | data: pd.DataFrame, 28 | rolling: int = 20, 29 | **kwargs): 30 | """ 31 | 因子收益预测--等权法:过去一段时间收益的等权平均作为下一期因子收益的预测 32 | :param data: 因子收益序列 33 | :param rolling: 滚动周期 34 | :return: 35 | """ 36 | fore_ret = data.rolling(rolling).mean().dropna() 37 | return fore_ret 38 | 39 | # 指数加权移动平均法 40 | def EWMA(self, 41 | data: pd.DataFrame, 42 | alpha: float = 0.5, 43 | **kwargs): 44 | """ 45 | pd.ewm中com与alpha的关系为 1 / alpha - 1 = com 46 | pd.ewm中adjust参数需要设置为False 47 | :param data: 48 | :param alpha: 当期权重,前一期权重为1-alpha 49 | :return: 50 | """ 51 | fore_ret = data.ewm(com=1 / alpha - 1, adjust=False).mean() 52 | return fore_ret 53 | 54 | # 时间序列模型 55 | def Time_series(self, 56 | data: pd.DataFrame, 57 | rolling: int = 20, 58 | AR_q: int = 1, 59 | MA_p: int = 1, 60 | **kwargs): 61 | fore_ret = data.rolling(rolling).apply(lambda x: self._ARMA(x, AR_q, MA_p)) 62 | return fore_ret 63 | 64 | # TODO 待研究 65 | def _ARMA(self, data: pd.Series, AR_q: int = 1, MA_p: int = 1): 66 | try: 67 | ar_ma = ARMA(data, order=(AR_q, MA_p)).fit(disp=0) 68 | except Exception as e: 69 | print(e) 70 | print("尝试采用其他滞后阶数") 71 | forecast = np.nan 72 | else: 73 | forecast = ar_ma.predict()[-1] 74 | 75 | return forecast 76 | 77 | def KML(self, data: pd.DataFrame): 78 | pass 79 | 80 | -------------------------------------------------------------------------------- /Forecast/RiskForecast.py: -------------------------------------------------------------------------------- 1 | # -*-coding:utf-8-*- 2 | # @Time: 2020/10/19 20:04 3 | # @Author: FC 4 | # @Email: 18817289038@163.com 5 | 6 | import pandas as pd 7 | import numpy as np 8 | import statsmodels.api as sm 9 | from functools import reduce 10 | from constant import ( 11 | PriceVolumeName as PVN, 12 | ) 13 | 14 | 15 | class RiskModel(object): 16 | 17 | def __init__(self): 18 | pass 19 | 20 | # 因子协方差矩阵估计 21 | def forecast_cov_fact(self, 22 | fact_ret: pd.DataFrame, 23 | decay: int = 2, 24 | order: int = 2, 25 | annual: int = 1): 26 | """ 27 | 28 | :param fact_ret: 因子收益序列 29 | :param decay: 指数加权衰减系数 30 | :param order: 自相关之后阶数 31 | :param annual: "年化"参数 32 | :return: 33 | """ 34 | # 指数加权协方差矩阵 35 | F_Raw = self.exp_weight_cov(fact_ret, decay=decay) 36 | 37 | # Newey-West adjustment 38 | matrix_orders = np.zeros(shape=(fact_ret.shape[1], fact_ret.shape[1])) 39 | for order_ in range(1, order + 1): 40 | w = 1 - order_ / (order + 1) 41 | # 滞后order阶的自相关协方差矩阵 42 | matrix_order = self.auto_cor_cov(fact_ret, order=order, decay=decay) 43 | matrix_orders += w * (matrix_order + matrix_order.T) 44 | 45 | # Eigenvalue adjustment 46 | F_NW = annual * (F_Raw + matrix_orders) 47 | 48 | # 特征值调整 49 | F_Eigen = self.eigenvalue_adj(F_NW, period=120, M=100) 50 | 51 | # Volatility bias adjustment TODO 52 | # F = self.vol_bias_adj(F_Eigen) 53 | F = F_Eigen 54 | return F 55 | 56 | # 特异性收益协方差矩阵预测 57 | def forecast_cov_spec(self, 58 | spec_ret: pd.DataFrame, 59 | fact_exp: pd.DataFrame, 60 | liq_mv: pd.DataFrame, 61 | liq_mv_name: str = PVN.LIQ_MV.value, 62 | decay: int = 2, 63 | order: int = 5, 64 | annual: int = 1): 65 | """ 66 | 67 | :param spec_ret: 个股特异性收益 68 | :param fact_exp: 因子暴露 69 | :param liq_mv: 流通市值 70 | :param liq_mv_name: 流通市值名称 71 | :param decay: 指数加权衰减周期 72 | :param order: Newey-West调整最大滞后阶数 73 | :param annual: 调仓期:对协方差矩阵进行"年化"调整 74 | :return: 75 | """ 76 | # 删除无效资产 77 | eff_asset = spec_ret.iloc[-1, :].dropna().index 78 | spec_ret_eff = spec_ret[eff_asset] 79 | 80 | # Calculate the weighted covariance of the specific return index 81 | F_Raw = self.exp_weight_cov(spec_ret_eff, decay=decay) 82 | 83 | # Newey-West adjustment: 自由度设为n-1 84 | matrix_orders = np.zeros(shape=(spec_ret_eff.shape[1], spec_ret_eff.shape[1])) 85 | for order_ in range(1, order + 1): 86 | w = 1 - order_ / (order + 1) 87 | matrix_order = self.auto_cor_cov(spec_ret_eff, order=order_, decay=decay) 88 | matrix_orders += w * (matrix_order + matrix_order.T) 89 | 90 | # Eigenvalue adjustment 91 | F_NW = annual * (F_Raw + matrix_orders) 92 | 93 | # Structural adjustment 94 | F_STR = self.structural_adj(F_NW, spec_ret_eff, fact_exp, liq_mv.iloc[:, 0], liq_mv_name) 95 | 96 | # Bayesian compression adjustment 97 | F_SH = self.Bayesian_compression(F_STR, liq_mv.iloc[:, 0], liq_mv_name) 98 | 99 | # 波动率偏误调整 TODO 100 | 101 | # 非对角矩阵替换为0 102 | 103 | return F_SH 104 | 105 | # 指数加权协方差矩阵计算 106 | def exp_weight_cov(self, 107 | data: pd.DataFrame, 108 | decay: int = 2) -> pd.DataFrame: 109 | # Exponentially weighted index volatility: Half-Life attenuation 110 | 111 | w_list = self.Half_time(period=data.shape[0], decay=decay) 112 | w_list = sorted(w_list, reverse=False) # 升序排列 113 | 114 | cov_w = pd.DataFrame(np.cov(data.T, aweights=w_list), index=data.columns, columns=data.columns) 115 | 116 | return cov_w 117 | 118 | # 自相关协方差矩阵 119 | def auto_cor_cov(self, 120 | data: pd.DataFrame, 121 | order: int = 2, 122 | decay: int = 2) -> pd.DataFrame: 123 | """ 124 | 矩阵与矩阵相关性计算: 125 | A = np.array([[a11,a21],[a12,a22]]) 126 | B = np.array([[b11,b21],[b12,b22]]) 127 | 128 | matrix = [[cov([a11,a21], [a11,a21]), cov([a11,a21], [a12,a22]), cov([a11,a21], [b11,b21]), cov([a11,a21], [b12,b22])], 129 | [cov([a12,a22], [a11,a21]), cov([a12,a22], [a12,a22]), cov([a12,a22], [b11,b21]), cov([a12,a22], [b12,b22])], 130 | [cov([b11,b21], [a11,a21]), cov([b11,b21], [a12,a22]), cov([b11,b21], [b11,b21]), cov([b11,b21], [b12,b22])], 131 | [cov([b12,b22], [a11,a21]), cov([b12,b22], [a12,a22]), cov([b12,b22], [b11,b21]), cov([b12,b22], [b12,b22])]] 132 | 133 | 自相关协方差矩阵为: 134 | matrix_at_cor_cov = [[cov([a11,a21], [b11,b21]), cov([a11,a21], [b12,b22])], 135 | [cov([a12,a22], [b11,b21]), cov([a12,a22], [b12,b22])] 136 | 137 | 注: 138 | 输入pd.DataFrame格式的数据计算协方差会以行为单位向量进行计算 139 | 计算出来的协方差矩阵中右上角order*order矩阵才是自相关矩阵 140 | 协方差矩阵:横向为当期与各因子滞后阶数的协方差;纵向为滞后阶数与当期各因子的协方差 141 | :param data: 142 | :param order: 143 | :param decay: 144 | :return: 145 | """ 146 | 147 | # order matrix 148 | matrix_order = data.shift(order).dropna(axis=0, how='all') 149 | matrix = data.iloc[order:, :].copy(deep=True) 150 | 151 | w_list = self.Half_time(period=matrix.shape[0], decay=decay) 152 | w_list = sorted(w_list, reverse=False) # 升序排列 153 | 154 | covs = np.cov(matrix.T, matrix_order.T, aweights=w_list) # 需要再测试 155 | cov_order = pd.DataFrame(covs[: -matrix.shape[1], -matrix.shape[1]:], 156 | index=matrix.columns, 157 | columns=matrix.columns) 158 | 159 | return cov_order 160 | 161 | # 特征值调整 162 | def eigenvalue_adj(self, 163 | data: np.array, 164 | period: int = 120, 165 | M: int = 3000, 166 | alpha: float = 1.5): 167 | """ 168 | 169 | :param data:Newey-West调整后的协方差矩阵 170 | :param period: 蒙特卡洛模拟收益期数 171 | :param M: 蒙特卡洛模拟次数 172 | :param alpha: 173 | :return: 174 | """ 175 | 176 | # 矩阵奇异值分解 177 | e_vals, U0 = np.linalg.eig(data) 178 | 179 | # 对角矩阵 180 | D0 = np.diag(e_vals) 181 | 182 | # 蒙特卡洛模拟 183 | eigenvalue_bias = [] 184 | for i in range(M): 185 | S = np.random.randn(len(e_vals), period) # 模拟的特征组合收益率矩阵, 收益期数怎么定 TODO 186 | f = np.dot(U0, S) # 模拟的收益率矩阵 187 | F = np.cov(f) # 模拟的收益率协方差矩阵 188 | e_vas_S, U1 = np.linalg.eig(F) # 对模拟的协方差矩阵进行奇异值分解 189 | D1 = np.diag(e_vas_S) # 生成模拟协方差矩阵特征值的对角矩阵 190 | D1_real = np.dot(np.dot(U1.T, data), U1) 191 | 192 | D1_real = np.diag(np.diag(D1_real)) # 转化为对角矩阵 193 | 194 | lam = D1_real / D1 # 特征值偏误 195 | eigenvalue_bias.append(lam) 196 | 197 | gam_ = reduce(lambda x, y: x + y, eigenvalue_bias) 198 | gam = (np.sqrt(gam_ / M) - 1) * alpha + 1 199 | gam[np.isnan(gam)] = 0 200 | 201 | F_Eigen = pd.DataFrame(np.dot(np.dot(U0, np.dot(gam ** 2, D0)), np.linalg.inv(U0)), 202 | index=data.columns, 203 | columns=data.columns) 204 | 205 | return F_Eigen 206 | 207 | # 结构化调整 208 | def structural_adj(self, 209 | cov: pd.DataFrame, 210 | spec_ret: pd.DataFrame, 211 | fact_exp: pd.DataFrame, 212 | liq_mv: pd.DataFrame, 213 | liq_mv_name: PVN.LIQ_MV.value, 214 | time_window: int = 120): 215 | """ 216 | 217 | :param cov: 经Newey-West调整的个股特异收益矩阵 218 | :param spec_ret: 个股特异收益序列 219 | :param fact_exp: 因子暴露 220 | :param liq_mv: 流通市值 221 | :param liq_mv_name: 流通市值名称 222 | :param time_window: 个股特异收益的时间窗口(后面考虑改为特异收益序列的长度) 223 | :return: 224 | """ 225 | # 计算协调参数 226 | h_n = spec_ret.count() # 非空数量 227 | V_n = (h_n - 20 / 4) / 20 * 2 # 数据缺失程度(先用20测试) 228 | 229 | sigma_n = spec_ret.std().fillna(1) # 样本等权标准差(无法计算的标准差记为1) TODO 230 | 231 | sigma_n_steady = (spec_ret.quantile(.75) - spec_ret.quantile(0.25)) / 1.35 # 样本稳健估计标准差 232 | 233 | Z_n = abs((sigma_n - sigma_n_steady) / sigma_n_steady) # 数据肥尾程度 234 | 235 | # 将无限大值替换为0 236 | Z_n[np.isinf(Z_n)] = 0 237 | Z_n.fillna(0, inplace=True) 238 | 239 | left_, right_ = V_n.where(V_n > 0, 0), np.exp(1 - Z_n) 240 | 241 | left_, right_ = left_.where(left_ < 1, 1), right_.where(right_ < 1, 1) 242 | gam_n = left_ * right_ # 个股协调参数[0,1] 243 | 244 | reg_data = pd.concat([np.log(sigma_n), liq_mv, gam_n, fact_exp], axis=1) 245 | reg_data.columns = ['sigma', liq_mv_name, 'gam_n'] + fact_exp.columns.tolist() 246 | 247 | ref_data_com = reg_data[reg_data['gam_n'] == 1] 248 | 249 | # 加权(流通市值)最小二乘法用优质股票估计因子对特异波动的贡献值 250 | model = sm.WLS(ref_data_com['sigma'], ref_data_com[fact_exp.columns], weights=ref_data_com['gam_n']).fit() 251 | 252 | # 个股结构化特异波动预测值 253 | sigma_STR = pd.DataFrame(np.diag(np.exp(np.dot(fact_exp, model.params)) * 1.05), 254 | index=fact_exp.index, 255 | columns=fact_exp.index) 256 | 257 | # 对特异收益矩阵进行结构化调整 258 | F_STR = sigma_STR.mul((1 - gam_n), axis=0) + cov.mul(gam_n, axis=0) 259 | 260 | return F_STR 261 | 262 | # 贝叶斯压缩 263 | def Bayesian_compression(self, 264 | cov: pd.DataFrame, 265 | liq_mv: pd.DataFrame, 266 | liq_mv_name: PVN.LIQ_MV.value, 267 | group_num: int = 10, 268 | q: int = 1 269 | ): 270 | """ 271 | 𝜎_𝑛_𝑆𝐻 = 𝑣_𝑛*𝜎_𝑛 + (1 − 𝑣_𝑛)*𝜎_𝑛^ 272 | 273 | :param cov: 经结构化调整的特异收益矩阵 274 | :param liq_mv: 流通市值 275 | :param liq_mv_name: 流通市值名称 276 | :param group_num: 分组个数 277 | :param q: 压缩系数,该系数越大,先验风险矩阵所占权重越大 278 | :return: 279 | """ 280 | df_ = pd.DataFrame({"sigma_n": np.diag(cov), liq_mv_name: liq_mv}) 281 | # 按流通市值分组 282 | df_['Group'] = pd.cut(df_['sigma_n'], group_num, labels=[f'Group_{i}' for i in range(1, group_num + 1)]) 283 | 284 | # 各组特异风险市值加权均值 285 | df_['weight'] = df_.groupby('Group', group_keys=False).apply(lambda x: x[liq_mv_name] / x[liq_mv_name].sum()) 286 | sigma_n_weight = df_.groupby('Group').apply(lambda x: x['weight'] @ x['sigma_n']) 287 | sigma_n_weight.name = 'sigma_n_weight' 288 | 289 | df_N1 = pd.merge(df_, sigma_n_weight, left_on=['Group'], right_index=True, how='left') 290 | 291 | # 个股所属分组特异波动的标准差 292 | 293 | try: 294 | delta_n = df_N1.groupby('Group').apply( 295 | lambda x: np.nan if x.empty else pow(sum((x['sigma_n'] - x['sigma_n_weight']) ** 2) / x.shape[0], 0.5)) 296 | except Exception as e: 297 | delta_n = df_N1.groupby('Group').apply( 298 | lambda x: np.nan if x.empty else pow(sum((x['sigma_n'] - x['sigma_n_weight']) ** 2) / x.shape[0], 0.5)) 299 | print(e) 300 | 301 | delta_n.name = 'delta' 302 | 303 | df_N2 = pd.merge(df_N1, delta_n, left_on=['Group'], right_index=True, how='left') 304 | 305 | # 压缩系数 306 | df_N2['V_n'] = q * abs(df_N2['sigma_n'] - df_N2['sigma_n_weight']) / (df_N2['delta'] + q * abs(df_N2['sigma_n'] - df_N2['sigma_n_weight'])) 307 | 308 | # 调整后的特异波动 309 | sigma_SH = df_N2['V_n'] * df_N2['sigma_n_weight'] + (1 - df_N2['V_n']) * df_N2['sigma_n'] 310 | F_SH = pd.DataFrame(np.diag(np.array(sigma_SH)), index=sigma_SH.index, columns=sigma_SH.index) 311 | 312 | return F_SH 313 | 314 | # 半衰权重 315 | @staticmethod 316 | def Half_time(period: int, decay: int = 2) -> list: 317 | 318 | weight_list = [pow(2, (i - period - 1) / decay) for i in range(1, period + 1)] 319 | 320 | weight_1 = [i / sum(weight_list) for i in weight_list] 321 | 322 | return weight_1 323 | -------------------------------------------------------------------------------- /Forecast/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaotfeng/SecuritySelect/6c4653b6bdd6b46118a5fd332d741b542daf9c31/Forecast/__init__.py -------------------------------------------------------------------------------- /Forecast/test.py: -------------------------------------------------------------------------------- 1 | # -*-coding:utf-8-*- 2 | # @Time: 2020/9/22 21:05 3 | # @Author: FC 4 | # @Email: 18817289038@163.com 5 | import pandas as pd 6 | import numpy as np 7 | from scipy.optimize import minimize 8 | 9 | 10 | # class MaxOptModel(object): 11 | # """ 12 | # 最优化求解: 13 | # Object Function: 组合超额收益的评估比率最大 14 | # Subject To1:资产权重和为1 15 | # Subject To2:组合收益大于阈值 16 | # """ 17 | # 18 | # def __init__(self, 19 | # data_mean: np.array, 20 | # data_cov: np.array, 21 | # n: int): 22 | # """ 23 | # """ 24 | # self.data_mean = data_mean 25 | # self.data_cov = data_cov 26 | # self.n = n 27 | # self.bonds = ((0., 1.),) * n 28 | # self.params = {} 29 | # 30 | # # 目标函数 31 | # def object_func(self, w): 32 | # mean = self.data_mean 33 | # cov = self.data_cov 34 | # func = np.dot(w, np.dot(w, cov)) 35 | # return func 36 | # 37 | # # 约束1 38 | # def _constraint1(self, w, **kwargs): 39 | # return sum(w) - 1 40 | # 41 | # def _constraints(self, **kwargs): 42 | # limit = {'type': 'eq', 'fun': self._constraint1} 43 | # return limit 44 | # 45 | # # 求解算法 46 | # def optimal_solution(self, 47 | # object_function, 48 | # bounds, 49 | # constraints, 50 | # ftol: float = 1e-7, 51 | # maxiter: int = 30): 52 | # # 初始权重 53 | # w0 = np.array([1 / self.n] * self.n) 54 | # 55 | # result = minimize(object_function, w0, 56 | # method='SLSQP', 57 | # bounds=bounds, 58 | # constraints=constraints, 59 | # options={'disp': False, 60 | # 'ftol': ftol, 61 | # 'maxiter': maxiter}) 62 | # return result 63 | # 64 | # # solve 65 | # def solve(self, ftol: float = 1e-7, maxiter: int = 30): 66 | # solution = self.optimal_solution(self.object_func, 67 | # self.bonds, 68 | # self._constraints(**self.params), 69 | # ftol=ftol, 70 | # maxiter=maxiter) 71 | # 72 | # if not solution.success: 73 | # print("Optimization of failure") 74 | # return solution 75 | 76 | 77 | 78 | if __name__ == '__main__': 79 | import pandas as pd 80 | 81 | data = pd.read_excel(r'C:\Users\User\Desktop\网格策略\test_reg.xlsx', 82 | sheet_name='X') 83 | y = pd.read_excel(r'C:\Users\User\Desktop\网格策略\test_reg.xlsx', 84 | sheet_name='Y') 85 | 86 | 87 | # reg_1 = np.linalg.lstsq(data.iloc, Y) 88 | 89 | data_ = pd.merge(data, y, on='stock_id') 90 | 91 | reg_1 = np.linalg.lstsq(data_.iloc[:, 1:-1].values, data_.iloc[:, -1].values) 92 | reg_2 = np.linalg.lstsq(data_.iloc[:, 1:-1].values, data_.iloc[:, -1].values) 93 | print(reg_1[0] == reg_2[0]) 94 | -------------------------------------------------------------------------------- /LabelPool/Labelpool.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import os 4 | import datetime as dt 5 | import time 6 | import sys 7 | from Data.GetData import SQL 8 | from constant import ( 9 | KeyName as KN, 10 | PriceVolumeName as PVN, 11 | FilePathName as FPN, 12 | SpecialName as SN 13 | ) 14 | 15 | 16 | class LabelPool(object): 17 | PATH = {"price": os.path.join(FPN.label_pool_path.value, 'StockPrice.csv'), 18 | "industry": os.path.join(FPN.label_pool_path.value, 'IndustryLabel.csv'), 19 | "composition": os.path.join(FPN.label_pool_path.value, 'IndexStockWeight.csv'), 20 | "index_weight": os.path.join(FPN.label_pool_path.value, 'IndexStockWeight.csv'), 21 | "mv": os.path.join(FPN.label_pool_path.value, 'MV.csv'), } 22 | 23 | def __init__(self): 24 | self.Q = SQL() 25 | 26 | def stock_return(self, 27 | stock_price: pd.DataFrame, 28 | return_type: str = PVN.OPEN.value, 29 | label: bool = True) -> pd.Series: 30 | """ 31 | 收益率作为预测标签需放置到前一天, 默认每个交易日至少存在一只股票价格,否则会出现收益率跳空计算现象 32 | :param stock_price: 股票价格表 33 | :param return_type: 计算收益率用到的股票价格 34 | :param label: 是否作为标签 35 | :return: 36 | """ 37 | stock_price.sort_index(inplace=True) 38 | if label: 39 | if return_type == PVN.OPEN.value: 40 | result = stock_price[return_type].groupby(as_index=True, 41 | level=KN.STOCK_ID.value).apply( 42 | lambda x: x.shift(-2) / x.shift(-1) - 1) 43 | else: 44 | result = stock_price[return_type].groupby(as_index=True, 45 | level=KN.STOCK_ID.value).apply(lambda x: x.shift(-1) / x - 1) 46 | else: 47 | if return_type == PVN.OPEN.value: 48 | result = stock_price[return_type].groupby(as_index=True, 49 | level=KN.STOCK_ID.value).apply(lambda x: x.shift(-1) / x - 1) 50 | else: 51 | result = stock_price[return_type].groupby(as_index=True, 52 | level=KN.STOCK_ID.value).apply(lambda x: x / x.shift(1) - 1) 53 | 54 | result = round(result, 6) 55 | result.name = KN.STOCK_RETURN.value + '_' + return_type 56 | return result 57 | 58 | def industry_weight(self, 59 | index_weight: pd.Series, 60 | industry_exposure: pd.Series, 61 | index_name: str = SN.CSI_500_INDUSTRY_WEIGHT.value) -> pd.Series: 62 | """ 63 | 生成行业权重 64 | 如果某个行业权重为零则舍弃掉 65 | """ 66 | data_ = pd.concat([index_weight[index_name], industry_exposure], axis=1, join='inner') 67 | # industry weight 68 | ind_weight = data_.groupby([KN.TRADE_DATE.value, SN.INDUSTRY_FLAG.value]).sum() 69 | index_ = industry_exposure.index.get_level_values(KN.TRADE_DATE.value).drop_duplicates() 70 | ind_weight_new = ind_weight.unstack().reindex(index_).fillna(method='ffill').stack(dropna=False) 71 | 72 | # fill weight and industry 73 | res_ = pd.merge(ind_weight_new.reset_index(), industry_exposure.reset_index(), 74 | on=[KN.TRADE_DATE.value, SN.INDUSTRY_FLAG.value], how='right') 75 | res_.set_index(['date', 'stock_id'], inplace=True) 76 | 77 | return res_[index_name] 78 | 79 | def industry_mv(self, 80 | index_weight: pd.Series, 81 | industry_exposure: pd.Series, 82 | mv: pd.Series, 83 | index_name: str = SN.CSI_300_INDUSTRY_WEIGHT.value, 84 | mv_name: str = PVN.LIQ_MV.value) -> pd.Series: 85 | 86 | weight_mv_name = index_name.replace('weight', 'mv') 87 | 88 | data_ = pd.concat([index_weight[index_name], mv[mv_name], industry_exposure], axis=1, join='inner') 89 | data_[weight_mv_name] = data_[mv_name] * data_[index_name] 90 | 91 | # industry weight 92 | ind_mv = data_[[weight_mv_name, 93 | SN.INDUSTRY_FLAG.value]].groupby([KN.TRADE_DATE.value, 94 | SN.INDUSTRY_FLAG.value]).sum() 95 | index_ = industry_exposure.index.get_level_values(KN.TRADE_DATE.value).drop_duplicates() 96 | ind_weight_new = ind_mv.unstack().reindex(index_).fillna(method='ffill').stack(dropna=False) 97 | 98 | # fill weight and industry 99 | res_ = pd.merge(ind_weight_new.reset_index(), industry_exposure.reset_index(), 100 | on=[KN.TRADE_DATE.value, SN.INDUSTRY_FLAG.value], how='right') 101 | res_.set_index(['date', 'stock_id'], inplace=True) 102 | # 去除无效市值 103 | res_ = res_[res_[weight_mv_name] != 0] 104 | 105 | return res_[weight_mv_name] 106 | 107 | def merge_labels(self, **kwargs) -> pd.DataFrame: 108 | """ 109 | :param kwargs: 股票标签数据 110 | :return: 111 | """ 112 | 113 | res = pd.concat(kwargs.values(), axis=1) 114 | 115 | return res 116 | 117 | def LabelPool1(self): 118 | 119 | result_path = os.path.join(FPN.label_pool_path.value, sys._getframe().f_code.co_name + '_result.csv') 120 | if os.path.exists(result_path): 121 | category_label = pd.read_csv(result_path, index_col=[KN.TRADE_DATE.value, KN.STOCK_ID.value]) 122 | else: 123 | # read data 124 | print(f"{dt.datetime.now().strftime('%X')}: Read the data of label") 125 | 126 | price_data = pd.read_csv(self.PATH["price"]) 127 | industry_data = pd.read_csv(self.PATH["industry"]) 128 | composition_data = pd.read_csv(self.PATH["composition"]) 129 | industry_weight_data = pd.read_csv(self.PATH["index_weight"]) 130 | stock_mv_data = pd.read_csv(self.PATH["mv"]) 131 | 132 | # set MultiIndex 133 | price_data.set_index([KN.TRADE_DATE.value, KN.STOCK_ID.value], inplace=True) 134 | industry_data.set_index([KN.TRADE_DATE.value, KN.STOCK_ID.value], inplace=True) 135 | composition_data.set_index([KN.TRADE_DATE.value, KN.STOCK_ID.value], inplace=True) 136 | industry_weight_data.set_index([KN.TRADE_DATE.value, KN.STOCK_ID.value], inplace=True) 137 | stock_mv_data.set_index([KN.TRADE_DATE.value, KN.STOCK_ID.value], inplace=True) 138 | 139 | # adj price 140 | price_data[[PVN.OPEN.value, 141 | PVN.CLOSE.value]] = price_data[[PVN.OPEN.value, 142 | PVN.CLOSE.value]].mul(price_data[PVN.ADJ_FACTOR.value], axis=0) 143 | 144 | # switch name 145 | composition_data.rename(columns={SN.CSI_50_INDUSTRY_WEIGHT.value: SN.CSI_50.value, 146 | SN.CSI_300_INDUSTRY_WEIGHT.value: SN.CSI_300.value, 147 | SN.CSI_500_INDUSTRY_WEIGHT.value: SN.CSI_500.value}, inplace=True) 148 | 149 | print(f"{dt.datetime.now().strftime('%X')}: calculate stock daily return label") 150 | stock_return_close = self.stock_return(price_data, return_type=PVN.CLOSE.value) 151 | stock_return_open = self.stock_return(price_data, return_type=PVN.OPEN.value) 152 | 153 | print(f"{dt.datetime.now().strftime('%X')}: Generate the {SN.CSI_500_INDUSTRY_WEIGHT.value}") 154 | industry_weight = self.industry_weight(industry_weight_data, industry_data, 155 | index_name=SN.CSI_500_INDUSTRY_WEIGHT.value) 156 | ############################################################################################################ 157 | # merge labels 158 | print(f"{dt.datetime.now().strftime('%X')}: Merge labels") 159 | category_label = self.merge_labels( 160 | data_ret_close=stock_return_close, 161 | data_ret_open=stock_return_open, 162 | composition=composition_data, 163 | industry_exposure=industry_data, 164 | index_weight=industry_weight, 165 | mv=stock_mv_data[PVN.LIQ_MV.value] 166 | ) 167 | 168 | # sort 169 | category_label.sort_index(inplace=True) 170 | 171 | category_label.to_csv(result_path) 172 | return category_label 173 | 174 | def BenchMark(self, 175 | bm_index: str = '000300.SH', 176 | sta: str = '20130101', 177 | end: str = '20200401', 178 | price: str = 'open'): 179 | """ 180 | 返回基准当天收益 181 | :param bm_index: 182 | :param sta: 183 | :param end: 184 | :param price: 185 | :return: 186 | """ 187 | sql_ = self.Q.stock_index_SQL(bm_index=bm_index, date_sta=sta, date_end=end) 188 | index_ = self.Q.query(sql_) 189 | index_.set_index(KN.TRADE_DATE.value, inplace=True) 190 | result = index_[price].shift(-1) / index_[price] - 1 191 | return result 192 | 193 | 194 | if __name__ == '__main__': 195 | # df_index = pd.read_csv(r"A:\数据\LabelPool\IndexStockWeight.csv") 196 | # df_industry = pd.read_csv(r"A:\数据\LabelPool\IndustryLabel.csv") 197 | # df_mv = pd.read_csv(r"A:\数据\LabelPool\MV.csv") 198 | # 199 | # df_industry.set_index([KN.TRADE_DATE.value, KN.STOCK_ID.value], inplace=True) 200 | # df_index.set_index([KN.TRADE_DATE.value, KN.STOCK_ID.value], inplace=True) 201 | # df_mv.set_index([KN.TRADE_DATE.value, KN.STOCK_ID.value], inplace=True) 202 | 203 | A = LabelPool() 204 | A.LabelPool1() 205 | pass 206 | -------------------------------------------------------------------------------- /Object.py: -------------------------------------------------------------------------------- 1 | # -*-coding:utf-8-*- 2 | # @Time: 2020/9/11 10:17 3 | # @Author: FC 4 | # @Email: 18817289038@163.com 5 | 6 | from dataclasses import dataclass 7 | from datetime import datetime 8 | import pandas as pd 9 | import yagmail 10 | 11 | 12 | @dataclass 13 | class GroupData(object): 14 | """ 15 | Candlestick bar data of a certain trading period. 16 | """ 17 | 18 | stock_id: str = '' 19 | industry: str = '' 20 | date: datetime = None 21 | datetime_update: datetime = None 22 | group: int = None 23 | 24 | stock_return: float = None 25 | holding_period: int = None 26 | factor_name: str = None 27 | factor_name_chinese: str = None 28 | factor_value: float = None 29 | factor_type: str = None # 因子类型 30 | 31 | 32 | @dataclass 33 | class FactorRetData(object): 34 | """ 35 | Candlestick bar data of a certain trading period. 36 | """ 37 | 38 | date: datetime = None 39 | datetime_update: datetime = None 40 | 41 | factor_return: float = None 42 | holding_period: int = None 43 | factor_T: float = None 44 | factor_name: str = None 45 | factor_name_chinese: str = None 46 | ret_type: str = None # 因子收益类型 47 | 48 | 49 | @dataclass 50 | class FactorData(object): 51 | """ 52 | Candlestick bar data of a certain trading period. 53 | """ 54 | 55 | stock_id: str = '' 56 | date_report: datetime = None # 报告期 57 | date: datetime = None # 公布期(数据实际获得的日期) 58 | datetime_update: datetime = None 59 | 60 | factor_name: str = None 61 | factor_name_chinese: str = None 62 | factor_category: str = None 63 | factor_value: float = None 64 | factor_type: str = None # 因子类型 65 | 66 | 67 | # 因子数据的存储 68 | @dataclass 69 | class FactorInfo(object): 70 | """ 71 | 对于交易日产生的数据计算出来的因子报告期等于公布期, 72 | 对于采用财务数据计算出来的因子有公布期和报告期之分 73 | 公布期属于财务会计年度日期,存在未来数据 74 | 报告期数据数据实际公布日期,正常数据 75 | """ 76 | 77 | #:param data: 该数据用来进行后续的数据分析 78 | #:param data_raw: 该数据用来进行数据存储 79 | 80 | data_raw: pd.DataFrame = None # 因子[股票ID,公布期,因子值, 报告期] 81 | data: pd.Series = None # 因子[双索引[股票ID, 交易日]:因子值] 82 | 83 | factor_category: str = None 84 | factor_name: str = None 85 | factor_type: str = None # 因子类型 86 | 87 | 88 | # 发送邮件 89 | def send_email(email, theme, contents): 90 | """ 91 | 92 | :param email: 93 | {"person_name": {"user": "email_address", 94 | "password": "password", 95 | "host": "smtp.qq.com"}} 96 | :param theme: email theme 97 | :param contents: email contents 98 | :return: 99 | """ 100 | 101 | for person in email.keys(): 102 | user = email[person]['user'] 103 | password = email[person]['password'] 104 | host = email[person]['host'] 105 | try: 106 | yag = yagmail.SMTP(user=user, 107 | password=password, 108 | host=host) 109 | 110 | yag.send([user], theme, contents) 111 | except: 112 | # Alternate mailbox 113 | yag = yagmail.SMTP(user="18817289038@163.com", password="excejuxyyuthbiaa", 114 | host="smtp.qq.com") 115 | yag.send([user], theme, contents) 116 | # @dataclass 117 | # class FactorData(object): 118 | # """ 119 | # Candlestick bar data of a certain trading period. 120 | # 回归结果 121 | # """ 122 | # 123 | # stock_id: str = '' 124 | # industry: str = '' 125 | # date: datetime = None 126 | # datetime_update: datetime = None 127 | # group: int = None 128 | # 129 | # stock_return: float = None 130 | # factor_name: str = None 131 | # factor_category: str = None 132 | # factor_name_chinese: str = None 133 | # factor_value: float = None 134 | -------------------------------------------------------------------------------- /Optimization.py: -------------------------------------------------------------------------------- 1 | # -*-coding:utf-8-*- 2 | # @Time: 2020/9/18 16:07 3 | # @Author: FC 4 | # @Email: 18817289038@163.com 5 | 6 | # 最优化求解:组合的绝对收益/收益波动最大 7 | import numpy as np 8 | import pandas as pd 9 | from scipy.optimize import minimize, linprog 10 | import numba as nb 11 | from abc import ABC, abstractmethod 12 | 13 | 14 | class MaxOptModel(ABC): 15 | """ 16 | 最小二乘法求解优化模型 17 | 当矩阵长度超过2000时迭代速度大幅下降,对于线性规划问题考虑采用单纯行法进行优化求解 18 | """ 19 | 20 | """ 21 | 默认: 22 | 1.目标函数为最大化收益比上波动 23 | 2.权重介于0到1之间 24 | 3.权重和为1 25 | 4.最大迭代次数为300 26 | 5.容忍度为1e-7 27 | """ 28 | 29 | def __init__(self, obj_type: str = 'MAX_RET'): 30 | self.data_mean = None # 收益率矩阵 31 | self.data_cov = None # 协方差矩阵 32 | 33 | self.n = None # 解个数 34 | self.maxiter = 300 # 最大迭代次数 35 | self.ftol = 1e-8 # 容忍度 36 | self.eps = 1e-8 # 学习效率 37 | self.obj_func = self.objectFunction(obj_type) 38 | self.bonds = None # 最优解约束边界 39 | self.limit = [] # 约束条件 40 | 41 | def objectFunction(self, obj_type: str = 'MAX_RET'): 42 | if obj_type == 'MAX_RET': 43 | return self.object_func1 44 | elif obj_type == 'MIN_RISK': 45 | return self.object_func2 46 | elif obj_type == 'MAX_RET/RISK': 47 | return self.object_func3 48 | pass 49 | 50 | # 目标函数 51 | def object_func1(self, w): 52 | """ 53 | 目标函数默认为夏普比最大化模型,通过前面加上负号转化为最小化模型 54 | :param w: 55 | :return: 56 | """ 57 | func = - np.dot(w, np.array(self.data_mean)) 58 | return func 59 | 60 | def object_func2(self, w): 61 | """ 62 | :param w: 63 | :return: 64 | """ 65 | func = np.dot(w, np.dot(w, np.array(self.data_cov))) 66 | return func 67 | 68 | def object_func3(self, w): 69 | """ 70 | :param w: 71 | :return: 72 | """ 73 | func = - np.dot(w, np.array(self.data_mean)) / np.sqrt(np.dot(w, np.dot(w, np.array(self.data_cov)))) 74 | return func 75 | 76 | # 约束条件 77 | def _constraint(self): 78 | self.limit.append({'type': 'eq', 'fun': lambda w: sum(w) - 1}) 79 | 80 | # solve 81 | def solve(self): 82 | # 初始权重 83 | w0 = np.array([1 / self.n] * self.n) 84 | 85 | result = minimize(fun=self.obj_func, 86 | x0=w0, 87 | method='SLSQP', 88 | bounds=self.bonds, 89 | constraints=self.limit, 90 | options={'disp': False, 91 | 'ftol': self.ftol, 92 | 'maxiter': self.maxiter, 93 | 'eps': self.eps}) 94 | 95 | if not result.success: 96 | print("Optimization of failure") 97 | return result 98 | 99 | 100 | class OptimizeSLSQP(object): 101 | """ 102 | 最小二乘法求解优化模型 103 | 当矩阵长度超过2000时迭代速度大幅下降,对于线性规划问题考虑采用单纯行法进行优化求解 104 | """ 105 | 106 | """ 107 | 默认: 108 | 1.目标函数为最大化收益比上波动 109 | 2.权重介于0到1之间 110 | 3.权重和为1 111 | 4.最大迭代次数为300 112 | 5.容忍度为1e-7 113 | """ 114 | 115 | def __init__(self, obj_type: str = 'MAX_RET'): 116 | self.data_mean = None # 收益率矩阵 117 | self.data_cov = None # 协方差矩阵 118 | 119 | self.n = None # 解个数 120 | self.maxiter = 300 # 最大迭代次数 121 | self.ftol = 1e-8 # 容忍度 122 | self.eps = 1e-8 # 学习效率 123 | self.obj_func = self.objectFunction(obj_type) 124 | self.bonds = None # 最优解约束边界 125 | self.limit = [] # 约束条件 126 | 127 | def objectFunction(self, obj_type: str = 'MAX_RET'): 128 | if obj_type == 'MAX_RET': 129 | return self.object_func1 130 | elif obj_type == 'MIN_RISK': 131 | return self.object_func2 132 | elif obj_type == 'MAX_RET/RISK': 133 | return self.object_func3 134 | pass 135 | 136 | # 目标函数 137 | def object_func1(self, w): 138 | """ 139 | 目标函数默认为夏普比最大化模型,通过前面加上负号转化为最小化模型 140 | :param w: 141 | :return: 142 | """ 143 | func = - np.dot(w, np.array(self.data_mean)) 144 | return func 145 | 146 | def object_func2(self, w): 147 | """ 148 | :param w: 149 | :return: 150 | """ 151 | func = np.dot(w, np.dot(w, np.array(self.data_cov))) 152 | return func 153 | 154 | def object_func3(self, w): 155 | """ 156 | :param w: 157 | :return: 158 | """ 159 | func = - np.dot(w, np.array(self.data_mean)) / np.sqrt(np.dot(w, np.dot(w, np.array(self.data_cov)))) 160 | return func 161 | 162 | # 约束条件 163 | def _constraint(self): 164 | self.limit.append({'type': 'eq', 'fun': lambda w: sum(w) - 1}) 165 | 166 | # solve 167 | def solve(self): 168 | # 初始权重 169 | w0 = np.array([1 / self.n] * self.n) 170 | 171 | result = minimize(fun=self.obj_func, 172 | x0=w0, 173 | method='SLSQP', 174 | bounds=self.bonds, 175 | constraints=self.limit, 176 | options={'disp': False, 177 | 'ftol': self.ftol, 178 | 'maxiter': self.maxiter, 179 | 'eps': self.eps}) 180 | 181 | if not result.success: 182 | print("Optimization of failure") 183 | return result 184 | 185 | 186 | class OptimizeLinear(object): 187 | """ 188 | Minimize:: 189 | 190 | c @ x 191 | Such That:: 192 | 193 | A_ub @ x <= b_ub 194 | A_eq @ x == b_eq 195 | lb <= x <= ub 196 | 197 | Example: 198 | self.obj = np.array([c1, c2, c3], ndmin=1) 199 | self.limit = [{'type': 'eq', 'coef': np.array([[b1, b2, b3]], ndmin=2)', 'const': np.array([b], ndimn=1)}, 200 | {'type': 'ineq', 'coef': np.array([[b1, b2, b3]], ndmin=2)', 'const': np.array([b], ndimn=1)}] 201 | self.bonds = ((0, 1), (None, 0), (1, None)) 202 | """ 203 | 204 | def __init__(self): 205 | self.obj: np.array = None # 目标方程 206 | self.bonds: tuple = () # 最优解约束边界 207 | self.limit: list = [] # 约束条件 208 | 209 | self.maxiter: int = 300 # 最大迭代次数 210 | 211 | def Const(self) -> tuple: 212 | """ 213 | :return: 等式约束系数矩阵,不等式约束系数矩阵,等式约束上限,不等式约束上限 214 | """ 215 | if not self.limit: 216 | print("Linear programming requires constraints!") 217 | return None, None, None, None 218 | 219 | # Constraint factor and constraint matrix 220 | M_eq_list, M_ineq_list, b_eq_list, b_ineq_list = [], [], [], [] 221 | for const_ in self.limit: 222 | if const_['type'] == 'eq': 223 | if const_['coef'].ndim != 2: 224 | print("The coefficient matrix dimension must be 2") 225 | return None, None, None, None 226 | M_eq_list.append(const_['coef']) 227 | if const_['const'].ndim != 1: 228 | print("Constraint matrix dimension must be 1") 229 | return None, None, None, None 230 | b_eq_list.append(const_['const']) 231 | 232 | elif const_['type'] == 'ineq': 233 | if const_['coef'].ndim != 2: 234 | print("The coefficient matrix dimension must be 2") 235 | return None, None, None, None 236 | M_ineq_list.append(const_['coef']) 237 | if const_['const'].ndim != 1: 238 | print("Constraint matrix dimension must be 1") 239 | return None, None, None, None 240 | b_ineq_list.append(const_['const']) 241 | 242 | else: 243 | print("Constraints type error!") 244 | return None, None, None, None 245 | 246 | M_eq, M_ineq = np.concatenate(M_eq_list), np.concatenate(M_ineq_list) 247 | b_eq, b_ineq = np.concatenate(b_eq_list), np.concatenate(b_ineq_list) 248 | return M_eq, M_ineq, b_eq, b_ineq 249 | 250 | def solve(self): 251 | if self.obj is not None: 252 | self.bonds = ((0, 1),) * len(self.obj) if self.obj == () else self.bonds 253 | M_eq, M_ineq, b_eq, b_ineq = self.Const() 254 | if M_eq is None: 255 | return 256 | else: 257 | # simple method 258 | # sta = time.time() 259 | solution = linprog(self.obj, 260 | M_ineq, b_ineq, M_eq, b_eq, 261 | bounds=self.bonds, 262 | options={"maxiter": self.maxiter, 263 | "disp": False}) 264 | if not solution.success: 265 | print("Optimization of failure") 266 | return solution 267 | else: 268 | print("Please input object function coefficient!") 269 | return None 270 | 271 | 272 | if __name__ == '__main__': 273 | data_ = np.random.rand(100).reshape(20, 5) 274 | IC = pd.DataFrame(data=data_) 275 | 276 | # A = Test(IC, 5) 277 | # A.params = {"threshold": 2} 278 | # res = A.solve() 279 | -------------------------------------------------------------------------------- /Run/FactorCalculate.py: -------------------------------------------------------------------------------- 1 | # -*-coding:utf-8-*- 2 | # @Time: 2020/10/12 14:23 3 | # @Author: FC 4 | # @Email: 18817289038@163.com 5 | 6 | import os 7 | import pandas as pd 8 | import time 9 | from multiprocessing import Pool 10 | from FactorAnalysis.FactorAnalysis import * 11 | 12 | DATABASE_NAME = {"Group": "分组数据保存", 13 | "Fin": "基本面因子保存", 14 | "PV": "价量易因子保存", 15 | "GenPro": "遗传规划算法挖掘因子保存"} 16 | 17 | 18 | # 因子计算存储 19 | def cal_factor(params_dict: dict): 20 | A = FactorValidityCheck() 21 | 22 | factor_name = params_dict['factor_name'] 23 | factor_params = params_dict['factor_params'] 24 | 25 | A.load_factor(fact_name=factor_name, 26 | factor_params=factor_params, 27 | cal=params_dict['cal']) 28 | 29 | A.factor_to_csv() 30 | 31 | 32 | def main(): 33 | factors_name = { 34 | FCN.Val.value: ['EP_ttm', 'EP_LR', 'EP_cut_ttm', 'E2P_ttm', 'PEG_ttm', 'BP_LR', 'BP_ttm', 'SP_ttm', 35 | 'SP_LR', 'NCFP_ttm', 'OCFP_ttm', 'FCFP_LR', 'FCFP_ttm', 'DP_ttm'], 36 | FCN.Gro.value: ['BPS_G_LR', 'EPS_G_ttm', 'ROA_G_ttm', 'TA_G_LR', 'TA_G_ttm', 'LA_G_LR', 'LA_G_ttm', 37 | 'ILA_G_LR', 'ILA_G_ttm', 'TA_G_LR_std', 'TA_G_ttm_std', 'LA_G_LR_std', 'LA_G_ttm_std', 38 | 'ILA_G_LR_std', 'ILA_G_ttm_std', 'NP_Acc', 'NP_Stable', 'NP_SD', 'OP_Acc', 'OP_Stable', 39 | 'OP_SD', 'OR_Acc', 'OR_Stable', 'OR_SD'], 40 | FCN.Pro.value: ['ROA_ttm', 'DPR_ttm', 'NP', 'NP_ttm', 'OPM', 'OPM_ttm'], 41 | FCN.Sol.value: ['Int_to_Asset', 'ShortDebt1_CFPA', 'ShortDebt2_CFPA', 'ShortDebt3_CFPA', 42 | 'ShortDebt1_CFPA_qoq', 'ShortDebt2_CFPA_qoq', 'ShortDebt3_CFPA_qoq', 43 | 'ShortDebt1_CFPA_qoq_abs', 'ShortDebt2_CFPA_qoq_abs', 'ShortDebt3_CFPA_qoq_abs', 44 | 'ShortDebt1_CFPA_std', 'ShortDebt2_CFPA_std', 'ShortDebt3_CFPA_std', 45 | 'IT_qoq_Z', 'PTCF_qoq_Z', 'OT_qoq_Z', 'OT2NP_qoq_Z', 'PT2NA_Z'], 46 | 47 | FCN.Ope.value: ['RROC_N', 'OCFA', 'TA_Turn_ttm'], 48 | FCN.EQ.value: ['CSR', 'CSRD', 'APR', 'APRD']} 49 | 50 | for j, j_v in factors_name.items(): 51 | if j in [FCN.Val.value]: 52 | continue 53 | print(f"开始计算{j}因子") 54 | for v_ in j_v: 55 | # if v_ in ['EPS_G_ttm', 'ROA_G_ttm', 'TA_G_LR']: 56 | # continue 57 | factor_dict = {"factor_category": j, 58 | "factor_name": v_, 59 | "factor_params": {"switch": False}, 60 | 'factor': None, 61 | 'cal': True, 62 | 'save_type': 'raw' # 保存原始因子数据, switch:保留频率转换后的数据 63 | } 64 | 65 | print(f"\033[1;31m{dt.datetime.now().strftime('%X')}: {factor_dict['factor_name']}\033[0m") 66 | db = 'Fin' 67 | cal_factor(factor_dict, db) 68 | pass 69 | 70 | 71 | def main1(fact_dict): 72 | 73 | # print(f"\033[1;31m{dt.datetime.now().strftime('%X')}: " 74 | # f"{factor_dict['factor_name']}-{factor_dict['factor_params']['n']}\033[0m") 75 | print(f"\033[1;31m{dt.datetime.now().strftime('%X')}: {fact_dict['factor_name']}\033[0m") 76 | cal_factor(fact_dict) 77 | 78 | # factor_dict = {"factor_category": factor_category, 79 | # "factor_name": factor, 80 | # "factor_params": {}, 81 | # 'factor': None, 82 | # 'cal': True, 83 | # 'save_type': 'switch' # 保存原始因子数据, switch:保留频率转换后的数据 84 | # } 85 | # 86 | # print(f"\033[1;31m{dt.datetime.now().strftime('%X')}: {factor_dict['factor_name']}\033[0m") 87 | # db = 'Fin' 88 | # cal_factor(factor_dict, db) 89 | 90 | 91 | def cal(fact_list): 92 | for fac in fact_list: 93 | factor_dict = {"factor_name": fac, 94 | "factor_params": {}, 95 | 'factor': None, 96 | 'cal': True 97 | } 98 | main1(factor_dict) 99 | 100 | 101 | def cal_pa(fact_list, pa): 102 | for fac in fact_list: 103 | for p_ in pa: 104 | factor_dict = {"factor_name": fac, 105 | "factor_params": {'period': p_, 106 | "n": 1}, 107 | 'factor': None, 108 | 'cal': True 109 | } 110 | print(p_) 111 | main1(factor_dict) 112 | 113 | 114 | def main_M(): 115 | fac_dict = {"Q": ["HighFreq035", "HighFreq056", "HighFreq057", "HighFreq058", "HighFreq059", "HighFreq060", "HighFreq062", "HighFreq080"], 116 | "W": ["HighFreq038", "HighFreq039", "HighFreq040", "HighFreq041", "HighFreq042", "HighFreq043", ], 117 | "E": ["HighFreq044", "HighFreq045", "HighFreq046", "HighFreq047", "HighFreq076", "HighFreq077", "HighFreq078"], 118 | "R": ["HighFreq071", "HighFreq072", "HighFreq073", "HighFreq074", "HighFreq075"], 119 | "T": ["HighFreq036", "HighFreq037"]} 120 | pam = [5, 15, 30, 60] 121 | 122 | pool = Pool(processes=4) 123 | for key_, value_ in fac_dict.items(): 124 | pool.apply_async(cal, (value_, pam)) 125 | # pool.apply_async(cal_pa, (dd, pam)) 126 | pool.close() 127 | pool.join() 128 | 129 | 130 | if __name__ == '__main__': 131 | # for i in range(6, 28): 132 | # if i in [10, 11]: 133 | # continue 134 | # factor = 'Momentum{:0>3}'.format(i) 135 | # f_list = ['FundFlow020'] 136 | p = Pool(2) 137 | p.apply_async(cal_pa, (['FundFlow020'], ['all'])) 138 | p.apply_async(cal_pa, (['FundFlow020'], ['open'])) 139 | p.apply_async(cal_pa, (['FundFlow020'], ['between'])) 140 | p.apply_async(cal_pa, (['FundFlow020'], ['close'])) 141 | # p.apply_async(cal_pa, (['VolPrice020'], [5, 10])) 142 | # p.apply_async(cal_pa, (['FundFlow019'], ['close'])) 143 | # p.apply_async(cal_pa, (['FundFlow020'], ['all', 'open', 'between', 'close'])) 144 | # cal_pa(['FundFlow019'], ['close']) 145 | # cal_pa(['FundFlow020'], ['all', 'open', 'between', 'close']) 146 | p.close() 147 | p.join() 148 | # cal(['FundFlow047']) 149 | -------------------------------------------------------------------------------- /Run/FactorCollinearity.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pandas as pd 3 | import time 4 | from multiprocessing import Pool 5 | from FactorAnalysis.FactorAnalysis import * 6 | 7 | DATABASE_NAME = {"Group": "分组数据保存", 8 | "Fin": "基本面因子保存", 9 | "PV": "价量易因子保存", 10 | "GenPro": "遗传规划算法挖掘因子保存"} 11 | 12 | factor_effect = {FCN.Val.value: {"BP_LR", "BP_ttm", "DP_ttm", "E2P_ttm", "EP_cut_ttm", 13 | "EP_LR", "EP_ttm", "SP_LR", "SP_ttm"}, 14 | 15 | FCN.Gro.value: {"BPS_G_LR", "EPS_G_ttm", "ROA_G_ttm", "MAR_G", 16 | "NP_Stable", "OP_Stable", "OR_Stable", 17 | "ILA_G_ttm_std", "TA_G_LR_std"}, 18 | 19 | FCN.Pro.value: {"NPM_T", "ROA_ttm"}, 20 | 21 | FCN.Ope.value: {"RROC_N", "TA_Turn_ttm_T"}, 22 | 23 | FCN.Sol.value: {"IT_qoq_Z", "OT2NP_qoq_Z", 24 | "ShortDebt2_CFPA_qoq_abs", "ShortDebt3_CFPA_qoq_abs", 25 | "ShortDebt3_CFPA_std"}, 26 | FCN.EQ.value: {}} 27 | 28 | factor_comp = {FCN.Val.value: [{"name": 'VF1', 29 | "factor_name": {'BP_LR', 'DP_ttm', 'EP_ttm'}}], 30 | FCN.Gro.value: [{"name": 'GF1', 31 | "factor_name": {'NP_Stable', 'OP_Stable', 'OR_Stable'}}, 32 | {"name": 'GF2', 33 | "factor_name": {'ILA_G_ttm_std', 'TA_G_LR_std'}}], 34 | FCN.Sol.value: [{"name": 'SF1', 35 | "factor_name": {'IT_qoq_Z', 'OT2NP_qoq_Z'}}, 36 | {"name": 'SF2', 37 | "factor_name": {'ShortDebt2_CFPA_qoq_abs', 'ShortDebt3_CFPA_qoq_abs'}}]} 38 | 39 | 40 | # 相关性检验 41 | def main(): 42 | # FPN.FactorSwitchFreqData.value 43 | A = FactorCollinearity() 44 | 45 | for factor_name_, factor_info in factor_effect.items(): 46 | if factor_name_ != FCN.Ope.value: 47 | continue 48 | # A.get_data(factor_name_, factor_info) # 49 | try: 50 | L = [] 51 | for i in ['MAR_G', "NPM_T", "SP_LR", "TA_Turn_ttm_T", "VF1_comp", "currentdebttodebt"]: 52 | m = pd.read_csv(f"A:\\SecuritySelectData\\FactorPool\\FactorEffective\\{i}.csv") 53 | m.set_index(['date', 'stock_id'], inplace=True) 54 | L.append(m[f"{i}"]) 55 | op = pd.concat(L, axis=1, join='inner') 56 | op.reset_index(inplace=True) 57 | A.get_data('', {}, op) 58 | A.correctionTest() 59 | except Exception as e: 60 | print(e) 61 | 62 | 63 | # 因子合成 64 | def main1(): 65 | Equal_dict = {} 66 | 67 | Ret_dict = {"fact_ret": None, 68 | "rp": 60, 69 | "hp": 6, 70 | "algorithm": "Half_time"} 71 | MAX_IC_dict = {"fact_ret": None, 72 | "rp": 60, 73 | "hp": 6, 74 | "way": "IC_IR"} 75 | factor_D = {"OCFA": '+', 76 | "RROC_N": '+', 77 | "TA_Turn_ttm": '+'} 78 | 79 | # FPN.FactorSwitchFreqData.value 80 | A = FactorCollinearity() 81 | 82 | for factor_name_, factor_info in factor_comp.items(): 83 | for factor_info_ in factor_info: 84 | 85 | comp_name = factor_info_['name'] + '_comp' 86 | A.get_data(factor_name_, factor_info_["factor_name"]) # 87 | try: 88 | comp_factor = A.factor_synthetic(method='Equal', 89 | factor_D=factor_D, 90 | stand_method='mv', 91 | ret_type='Pearson', 92 | **Equal_dict) 93 | except Exception as e: 94 | print(e) 95 | else: 96 | comp_factor.name = comp_name 97 | comp_factor.to_csv(os.path.join(FPN.factor_comp.value, comp_name + '.csv'), header=True) 98 | 99 | 100 | if __name__ == '__main__': 101 | 102 | main() 103 | -------------------------------------------------------------------------------- /Run/MultiFactorTest.py: -------------------------------------------------------------------------------- 1 | # -*-coding:utf-8-*- 2 | # @Time: 2020/10/15 16:05 3 | # @Author: FC 4 | # @Email: 18817289038@163.com 5 | import os 6 | import pandas as pd 7 | import time 8 | import yagmail 9 | from multiprocessing import Pool 10 | from FactorAnalysis.FactorAnalysis import * 11 | 12 | DATABASE_NAME = {"Group": "分组数据保存", 13 | "Fin": "基本面因子保存", 14 | "PV": "价量易因子保存", 15 | "GenPro": "遗传规划算法挖掘因子保存"} 16 | 17 | email = {"FC": {"user": "18817289038@163.com", 18 | "password": "PFBMGFCIDJJGRRCK", 19 | "host": "smtp.163.com"}, 20 | } 21 | 22 | 23 | # 多因子测试 24 | def Multiple_factor_test(fact_dicts: dict, process: dict, hp): 25 | 26 | for fact, fact_p in fact_dicts.items(): 27 | try: 28 | fact_value = pd.read_csv(f"D:\\Data\\{fact_p['factor_category']}\\{fact}.csv") 29 | # fact_value = pd.read_csv(f'D:\\Quant\\SecuritySelect\\Data\\{fact}.csv') 30 | fact_p['factor_value'] = fact_value 31 | A = FactorValidityCheck() 32 | 33 | print(f"加载因子:{fact}") 34 | # load pool data 35 | A.load_pool_data(stock_pool_name="StockPool1", # StockPool1 36 | label_pool_name="LabelPool1") 37 | 38 | # load factor data 39 | A.load_factor(**fact_p) 40 | 41 | # integration data and process factor 42 | print(f"因子处理和数据整合") 43 | A.integration(**process) 44 | 45 | # Factor validity test 46 | print(f"开始测试因子:{fact}") 47 | A.effectiveness(ret_period=hp, save=True) 48 | 49 | # send_email(email, f'{fact}因子检验没有问题', f'{fact}因子检验没有问题') 50 | except Exception as e: 51 | send_email(email, f'{fact}因子检验存在问题', e.__str__()) 52 | print(">" * 40 + "time:{}".format(time.ctime()) + "<" * 40) 53 | # time.sleep(60) 54 | print('Over!') 55 | 56 | 57 | if __name__ == '__main__': 58 | 59 | factors_name = { 60 | FCN.Val.value: ['EP_ttm', 'EP_LR', 'EP_cut_ttm', 'E2P_ttm', 'PEG_ttm', 'BP_LR', 'BP_ttm', 'SP_ttm', 61 | 'SP_LR', 'NCFP_ttm', 'OCFP_ttm', 'FCFP_LR', 'FCFP_ttm', 'DP_ttm'], 62 | 63 | FCN.Gro.value: ['BPS_G_LR', 'EPS_G_ttm', 'ROA_G_ttm', 'TA_G_LR', 'TA_G_ttm', 'LA_G_LR', 'LA_G_ttm', 64 | 'ILA_G_LR', 'ILA_G_ttm', 'TA_G_LR_std', 'TA_G_ttm_std', 'LA_G_LR_std', 'LA_G_ttm_std', 65 | 'ILA_G_LR_std', 'ILA_G_ttm_std', 'NP_Acc', 'NP_Stable', 'NP_SD', 'OP_Acc', 'OP_Stable', 66 | 'OP_SD', 'OR_Acc', 'OR_Stable', 'OR_SD'], 67 | 68 | FCN.Pro.value: ['ROA_ttm', 'DPR_ttm', 'NP', 'NP_ttm', 'OPM', 'OPM_ttm'], 69 | 70 | FCN.Sol.value: ['Int_to_Asset', 'ShortDebt1_CFPA', 'ShortDebt2_CFPA', 'ShortDebt3_CFPA', 71 | 'ShortDebt1_CFPA_qoq', 'ShortDebt2_CFPA_qoq', 'ShortDebt3_CFPA_qoq', 72 | 'ShortDebt1_CFPA_qoq_abs', 'ShortDebt2_CFPA_qoq_abs', 'ShortDebt3_CFPA_qoq_abs', 73 | 'ShortDebt1_CFPA_std', 'ShortDebt2_CFPA_std', 'ShortDebt3_CFPA_std', 74 | 'IT_qoq_Z', 'PTCF_qoq_Z', 'OT_qoq_Z', 'OT2NP_qoq_Z', 'PT2NA_Z'], 75 | 76 | FCN.Ope.value: ['RROC_N', 'OCFA', 'TA_Turn_ttm'], 77 | 78 | FCN.EQ.value: ['CSR', 'CSRD', 'APR', 'APRD'] 79 | } 80 | # factors_ef = { 81 | # FCN.Val.value: ['EP_ttm', 'EP_LR', 'EP_cut_ttm', 'E2P_ttm', 'PEG_ttm', 'BP_LR', 'BP_ttm', 'SP_ttm', 82 | # 'SP_LR', 'NCFP_ttm', 'OCFP_ttm', 'FCFP_LR', 'FCFP_ttm', 'DP_ttm'], 83 | # 84 | # FCN.Gro.value: ['BPS_G_LR', 'EPS_G_ttm', 'ROA_G_ttm', 'TA_G_LR_std', 'TA_G_ttm_std', 'ILA_G_ttm_std', 85 | # 'NP_Stable', 'OP_Stable', 'OR_Stable'], 86 | # 87 | # FCN.Pro.value: ['ROA_ttm', 'DPR_ttm', 'NP', 'NP_ttm', 'OPM', 'OPM_ttm'], 88 | # 89 | # FCN.Sol.value: ['Int_to_Asset', 'ShortDebt1_CFPA', 'ShortDebt2_CFPA', 'ShortDebt3_CFPA', 90 | # 'ShortDebt1_CFPA_qoq', 'ShortDebt2_CFPA_qoq', 'ShortDebt3_CFPA_qoq', 91 | # 'ShortDebt1_CFPA_qoq_abs', 'ShortDebt2_CFPA_qoq_abs', 'ShortDebt3_CFPA_qoq_abs', 92 | # 'ShortDebt1_CFPA_std', 'ShortDebt2_CFPA_std', 'ShortDebt3_CFPA_std', 93 | # 'IT_qoq_Z', 'PTCF_qoq_Z', 'OT_qoq_Z', 'OT2NP_qoq_Z', 'PT2NA_Z'], 94 | # 95 | # FCN.Ope.value: ['RROC_N', 'OCFA', 'TA_Turn_ttm'], 96 | # 97 | # FCN.EQ.value: ['CSR', 'CSRD', 'APR', 'APRD'] 98 | # } 99 | s = 0 100 | while True: 101 | if True: 102 | # if dt.datetime.now() > dt.datetime(2020, 10, 22, 7, 30) and s == 0:# dt.datetime.now() > dt.datetime(2020, 10, 22, 7, 30) 103 | send_email(email, "开始进行因子有效性检验", f'{dt.datetime.now()}') 104 | for fact_c, fact_names in factors_name.items(): 105 | fact_dict = {} 106 | for fact_name in fact_names: 107 | if fact_name in ['EP_ttm']: 108 | continue 109 | # if fact_c in ['估值', '成长'] or fact_name in ['BPS_G_LR', 'EPS_G_ttm']: 110 | # continue 111 | factor_p = {"fact_name": fact_name, 112 | "factor_category": fact_c, 113 | "factor_params": {"switch": False}, 114 | 'db': 'Fin', 115 | 'factor_value': None, 116 | 'cal': False} 117 | 118 | factor_process = {"outliers": '', # mad 119 | "neu": '', # mv+industry 120 | "stand": '', # mv 121 | "switch_freq": False, 122 | "limit": 120} 123 | fact_dict[fact_name] = factor_p 124 | 125 | Multiple_factor_test(fact_dict, factor_process, hp=6) 126 | s = 1 127 | else: 128 | print('Cycle') 129 | time.sleep(60 * 10) 130 | -------------------------------------------------------------------------------- /Run/SingleFactorTest.py: -------------------------------------------------------------------------------- 1 | # -*-coding:utf-8-*- 2 | # @Time: 2020/9/14 11:26 3 | # @Author: FC 4 | # @Email: 18817289038@163.com 5 | 6 | import os 7 | import pandas as pd 8 | import time 9 | import yagmail 10 | from multiprocessing import Pool 11 | from FactorAnalysis.FactorAnalysis import * 12 | 13 | DATABASE_NAME = {"Group": "分组数据保存", 14 | "Fin": "基本面因子保存", 15 | "PV": "价量易因子保存", 16 | "GenPro": "遗传规划算法挖掘因子保存"} 17 | 18 | 19 | # 单因子测试 20 | def Single_factor_test(params: dict, 21 | process: dict, 22 | hp: int = 1, 23 | save: bool = False): 24 | """ 25 | 26 | :param params:因子参数 27 | :param process: 因子处理参数 28 | :param hp: 持有周期 29 | :param save: 是否保存检验结果 30 | :return: 31 | """ 32 | A = FactorValidityCheck() 33 | 34 | # load pool data 35 | A.load_pool_data(stock_pool_name="StockPool1", # StockPool1 36 | label_pool_name="LabelPool1") 37 | 38 | # load factor data 39 | A.load_factor(**params) 40 | 41 | A.integration(**process) 42 | 43 | # Factor validity test 44 | A.effectiveness(hp=hp, 45 | save=save) 46 | print('Stop') 47 | 48 | 49 | def main1(factor_name, 50 | hp, 51 | save: bool = False): 52 | 53 | df = pd.read_csv(f"A:\\DataBase\\SecuritySelectData\\FactorPool\\FactorRawData\\TechnicalHighFrequencyFactor\\" 54 | f"{factor_name}.csv", header=None) 55 | df.columns = ['date', 'stock_id', factor_name] 56 | factor_p = {"fact_name": factor_name, 57 | "factor_params": {"switch": False}, 58 | 'db': 'HFD', 59 | 'factor_value': df, 60 | 'cal': False} 61 | factor_process = {"outliers": '', # mad 62 | "neu": '', # mv+industry 63 | "stand": '', # mv 64 | "switch_freq": False, 65 | "limit": 120} 66 | 67 | print(f"\033[1;31m{dt.datetime.now().strftime('%X')}: {factor_name}\033[0m") 68 | 69 | Single_factor_test(params=factor_p, 70 | process=factor_process, 71 | hp=hp, 72 | save=save) 73 | 74 | 75 | def main2(factor_name, hp, save: bool = False): 76 | fact_value = None 77 | 78 | factor_p = {"fact_name": factor_name, 79 | "factor_params": {"n": 21}, 80 | 'db': 'HFD', 81 | 'factor_value': fact_value, 82 | 'cal': True} 83 | 84 | factor_process = {"outliers": '', # mad 85 | "neu": '', # mv+industry 86 | "stand": '', # mv 87 | "switch_freq": False, 88 | "limit": 120} 89 | # factor_process1 = {"outliers": 'mad', # mad 90 | # "neu": 'mv+industry', # mv+industry 91 | # "stand": 'mv', # mv 92 | # "switch_freq": False, 93 | # "limit": 120} 94 | 95 | # print(f"\033[1;31m{dt.datetime.now().strftime('%X')}: " 96 | # f"{factor_name}-{factor_p['factor_params']['n']}-{hp}days\033[0m") 97 | print(f"\033[1;31m{dt.datetime.now().strftime('%X')}: {factor_name}-{hp}days\033[0m") 98 | Single_factor_test(params=factor_p, 99 | process=factor_process, 100 | hp=hp, 101 | save=save) 102 | 103 | # Single_factor_test(params=factor_p, 104 | # process=factor_process1, 105 | # hp=hp, 106 | # save=save) 107 | 108 | 109 | if __name__ == '__main__': 110 | 111 | # for i in range(6, 28): 112 | # if i in [10, 11]: 113 | # continue 114 | # factor = 'Momentum{:0>3}'.format(i) 115 | # main2(factor, 1) 116 | 117 | factor = 'HighFreq062' 118 | main1(factor, hp=5, save=False) 119 | 120 | -------------------------------------------------------------------------------- /Run/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaotfeng/SecuritySelect/6c4653b6bdd6b46118a5fd332d741b542daf9c31/Run/__init__.py -------------------------------------------------------------------------------- /StockPool/StockPool.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import datetime as dt 3 | import numpy as np 4 | import os 5 | import sys 6 | import time 7 | from functools import reduce 8 | 9 | from constant import ( 10 | KeyName as KN, 11 | FilePathName as FPN, 12 | PriceVolumeName as PVN, 13 | ) 14 | 15 | 16 | class StockPool(object): 17 | """ 18 | 股票池属性名称最好与文件名名称保持一致 19 | 股票池返回数据类型为pandas.Index 20 | """ 21 | 22 | def __init__(self): 23 | self.index_list = [] 24 | pass 25 | 26 | # ST股标识 27 | def ST(self, 28 | data: pd.DataFrame, 29 | st: str = 'isst') -> pd.Series(bool): 30 | """ 31 | 对ST,*ST股进行标记 32 | 合格股票标记为True 33 | """ 34 | data_sub = data.copy(deep=True) 35 | res = data_sub[st].groupby(KN.STOCK_ID.value, group_keys=False).shift(1).fillna(False) 36 | self.index_list.append(res[~res].index) 37 | return ~ res 38 | 39 | # 成立年限 40 | def established(self, 41 | data: pd.DataFrame, 42 | listdate: str = 'period2list', 43 | days: int = 90) -> pd.Series(bool): 44 | """ 45 | 以真实成立时间进行筛选 46 | 合格股票标记为True 47 | """ 48 | data_sub = data.copy(deep=True) 49 | # dt_now = dt.datetime.now() 50 | # data_sub[listdate].fillna(dt_now.date().__str__(), inplace=True) 51 | # list_days = data_sub[listdate].apply(lambda x: (dt_now - dt.datetime.strptime(x, "%Y-%m-%d")).days) 52 | J = data_sub[listdate] > days 53 | 54 | res = J.groupby(KN.STOCK_ID.value, group_keys=False).shift(1).fillna(True) 55 | self.index_list.append(res[res].index) 56 | return res 57 | 58 | # 交易规模 59 | def liquidity(self, 60 | data: pd.DataFrame, 61 | amount_name: str = PVN.AMOUNT.value, 62 | days: int = 5, 63 | proportion: float = 0.05) -> pd.Series(bool): 64 | """ 65 | 默认标记过去5个交易日日均成交额占比在后5%的股票 66 | 合格股票标记为True 67 | """ 68 | data_sub = data.copy(deep=True) 69 | amount_mean = data_sub[amount_name].groupby(KN.STOCK_ID.value, 70 | group_keys=False).rolling(days).mean() 71 | # 空值不参与计算 72 | J = amount_mean.groupby(KN.TRADE_DATE.value).apply(lambda x: x.gt(x.quantile(proportion))) 73 | 74 | res = J.groupby(KN.STOCK_ID.value, group_keys=False).shift(1).fillna(True) 75 | self.index_list.append(res[res].index) 76 | return res 77 | 78 | # 停牌 79 | def suspension(self, 80 | data: pd.DataFrame, 81 | amount_name: str = PVN.AMOUNT.value, 82 | days: int = 5, 83 | frequency: int = 3) -> pd.Series(bool): 84 | """ 85 | 1.当天停牌,下一天不交易 86 | 2.连续5天发生3天停牌不交易 87 | 以成交额为空作为停牌标识 88 | 如果当天下午发生停牌,该标识有误 89 | 前days天以第days天标识为主:若第days天为5,则前days天都为5 90 | 合格股票标记为True 91 | """ 92 | data_sub = data.copy(deep=True) 93 | trade_days = data_sub[amount_name].groupby(KN.STOCK_ID.value, 94 | group_keys=False).rolling(days, min_periods=days).count().bfill() 95 | J1 = trade_days > days - frequency 96 | J2 = data_sub[amount_name] != 0 97 | res = pd.DataFrame({"J1": J1, "J2": J2}) 98 | res = res.groupby(KN.STOCK_ID.value, group_keys=False).shift(1).fillna(True) 99 | 100 | self.index_list.append(res[res['J1'] & res['J2']].index) 101 | return res 102 | 103 | def price_limit(self, 104 | data: pd.DataFrame, 105 | up_down: PVN.Up_Down.value): 106 | """ 107 | 当天涨跌停,下一天停止交易 108 | 若标识为空,则默认为涨跌停股票(该类股票一般为退市股或ST股等异常股) 109 | 合格股票标记为True 110 | :param up_down: 111 | :param data: 112 | :return: 113 | """ 114 | data_sub = data.copy(deep=True) 115 | 116 | J = data_sub[up_down].fillna(1) == 0 117 | res = J.groupby(KN.STOCK_ID.value, group_keys=False).shift(1).fillna(True) 118 | self.index_list.append(res[res].index) 119 | return res 120 | 121 | def StockPool1(self) -> pd.Index: 122 | """ 123 | 1.剔除ST股:是ST为True 124 | 2.剔除成立年限小于6个月的股票:成立年限小于6个月为False 125 | 3.过去5天成交额占比排名最后5%:成交额占比在最后5%为False 126 | 4.过去5天停牌天数超过3天:停牌数超过阈值为False 127 | 128 | 注意:函数名需要与数据源文件名对应,保持一致防止出错,可自行修改 129 | :return: 130 | """ 131 | result_path = os.path.join(FPN.stock_pool_path.value, sys._getframe().f_code.co_name + '_result.csv') 132 | if os.path.exists(result_path): # os.path.exists(result_path) 133 | index_effect_stock = pd.read_csv(result_path, index_col=[KN.TRADE_DATE.value, KN.STOCK_ID.value]).index 134 | else: 135 | # get data file path 136 | data_address = os.path.join(FPN.stock_pool_path.value, sys._getframe().f_code.co_name + '.csv') 137 | 138 | # read data 139 | print(f"{dt.datetime.now().strftime('%X')}: Read the data of stock pool") 140 | data_input = pd.read_csv(data_address) 141 | data_input.set_index([KN.TRADE_DATE.value, KN.STOCK_ID.value], inplace=True) 142 | self.established(data=data_input, days=225) 143 | # get filter condition 144 | print(f"{dt.datetime.now().strftime('%X')}: Weed out ST stock") 145 | self.ST(data_input) 146 | 147 | print(f"{dt.datetime.now().strftime('%X')}: Weed out Price Up_Down limit stock") 148 | self.price_limit(data_input, PVN.Up_Down.value) 149 | 150 | print(f"{dt.datetime.now().strftime('%X')}: Weed out stock established in less than 3 months") 151 | self.established(data=data_input, days=225) 152 | 153 | print(f"{dt.datetime.now().strftime('%X')}: Weed out stock illiquidity") 154 | self.liquidity(data_input) 155 | 156 | print(f"{dt.datetime.now().strftime('%X')}: Weed out suspension stock") 157 | self.suspension(data_input) 158 | 159 | # Filter 160 | index_effect_stock = reduce(lambda x, y: x.intersection(y), self.index_list) 161 | 162 | # Sort 163 | index_effect_stock = index_effect_stock.sort_values() 164 | # to_csv 165 | index_effect_stock.to_frame().to_csv(result_path, index=False) 166 | return index_effect_stock 167 | 168 | 169 | if __name__ == '__main__': 170 | path = "A:\\数据\\StockPool" 171 | # stock_pool = pd.read_csv("A:\\数据\\StockPool.csv") 172 | 173 | # Data cleaning:Restoration stock price [open, high, low, close] 174 | # price_columns = ['open', 'close', 'high', 'low'] 175 | # stock_pool[price_columns] = stock_pool[price_columns].multiply(stock_pool['adjfactor'], axis=0) 176 | # df_stock.set_index('date', inplace=True) 177 | A = StockPool() 178 | A.StockPool1() 179 | -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- 1 | # -*-coding:utf-8-*- 2 | # @Time: 2020/9/11 10:50 3 | # @Author: FC 4 | # @Email: 18817289038@163.com 5 | -------------------------------------------------------------------------------- /constant.py: -------------------------------------------------------------------------------- 1 | # -*-coding:utf-8-*- 2 | # @Time: 2020/9/1 16:56 3 | # @Author: FC 4 | # @Email: 18817289038@163.com 5 | 6 | from enum import Enum, unique 7 | import time 8 | import psutil 9 | import datetime as dt 10 | 11 | mem = psutil.virtual_memory() 12 | 13 | 14 | @unique 15 | class FilePathName(Enum): 16 | factor_info = 'Z:\\Database\\' # 因子信息路径 17 | stock_pool_path = 'A:\\DataBase\\SecuritySelectData\\StockPool' # 股票池数据 18 | label_pool_path = 'A:\\DataBase\\SecuritySelectData\\LabelPool' # 标签池数据 19 | process_path = 'A:\\DataBase\\SecuritySelectData\\Process' # 因子预处理所需数据 20 | 21 | factor_pool_path = 'A:\\DataBase\\SecuritySelectData\\FactorPool\\' # 因子池 22 | factor_inputData = 'A:\\DataBase\\SecuritySelectData\\FactorPool\\Factor_InputData\\' # 因子计算所需数据 23 | FactorSwitchFreqData = "A:\\DataBase\\SecuritySelectData\\FactorPool\\FactorSwitchFreqData\\" # 频率转换后的因子集 24 | FactorRawData = "A:\\DataBase\\SecuritySelectData\\FactorPool\\FactorRawData\\" # 原始因子集(未经任何处理) 25 | factor_test_res = "A:\\DataBase\\SecuritySelectData\\FactorPool\\FactorsTestResult\\" # 因子检验结果保存 26 | 27 | factor_ef = "A:\\DataBase\\SecuritySelectData\\FactorPool\\FactorEffective\\" # 筛选有效因子集 28 | factor_comp = "A:\\DataBase\\SecuritySelectData\\FactorPool\\FactorEffective\\FactorComp\\" # 复合因子数据集 29 | 30 | Trade_Date = 'A:\\DataBase\\TradeDate' # 交易日 31 | List_Date = 'A:\\DataBase\\ListDate' # 成立日 32 | 33 | HFD_Stock_M = 'Y:\\合成数据\\逐笔1min\\逐笔1min' # 高频分钟数据 34 | HFD_Stock_Depth = 'Y:\\合成数据\\十档Vwap' # 高频十档盘口数据 35 | HFD_Stock_Depth_1min = 'Y:\\合成数据\\十档1min\\因子数据' # 高频十档分钟数据 36 | HFD_Stock_CF = 'Y:\\合成数据\\逐笔资金流向' # 逐笔资金流向 37 | HFD = 'A:\\DataBase\\HFD' # 高频数据存储地址 38 | 39 | 40 | @unique 41 | class KeyName(Enum): 42 | STOCK_ID = 'stock_id' 43 | TRADE_DATE = 'date' 44 | LIST_DATE = 'list_date' 45 | STOCK_RETURN = 'return' 46 | 47 | 48 | @unique 49 | class SpecialName(Enum): 50 | GROUP = 'group' 51 | 52 | CSI_300 = 'HS300' 53 | CSI_50 = 'SZ50' 54 | CSI_500 = 'ZZ500' 55 | WI_A = 'Wind_A' 56 | 57 | INDUSTRY_FLAG = 'industry_flag' 58 | CSI_300_INDUSTRY_WEIGHT = 'csi_300_weight' 59 | CSI_500_INDUSTRY_WEIGHT = 'csi_500_weight' 60 | CSI_50_INDUSTRY_WEIGHT = 'csi_50_weight' 61 | 62 | CSI_300_INDUSTRY_MV = 'csi_300_mv' 63 | CSI_500_INDUSTRY_MV = 'csi_500_mv' 64 | CSI_50_INDUSTRY_MV = 'csi_50_mv' 65 | ANN_DATE = 'date' 66 | REPORT_DATE = 'report_date' 67 | 68 | 69 | @unique 70 | class PriceVolumeName(Enum): 71 | 72 | CLOSE = 'close' 73 | OPEN = 'open' 74 | HIGH = 'high' 75 | LOW = 'low' 76 | 77 | Up_Down = 'up_down_limit' 78 | 79 | AMOUNT = 'amount' 80 | VOLUME = 'volume' 81 | 82 | ADJ_FACTOR = 'adjfactor' 83 | 84 | LIQ_MV = 'liq_mv' 85 | TOTAL_MV = 'total_mv' 86 | 87 | 88 | @unique 89 | class ExchangeName(Enum): 90 | SSE = 'SSE' 91 | SZSE = 'SZSE' 92 | 93 | 94 | @unique 95 | class FinancialBalanceSheetName(Enum): 96 | Total_Asset = 'total_asset' # 总资产 97 | Liq_Asset = 'liq_asset' # 流动性资产 98 | ILiq_Asset = 'iliq_asset' # 非流动性资产 99 | Fixed_Asset = 'fixed_asset' # 固定资产 100 | 101 | Currency = 'money' # 货币资金 102 | Tradable_Asset = 'tradable_asset' # 可交易金融资产 103 | 104 | ST_Borrow = 'st_borrow' # 短期借款 105 | ST_Bond_Payable = 'st_Bond_P' # 短期应付债券 106 | ST_IL_LB_1Y = 'st_lb' # 一年内到期的非流动负债 107 | LT_Borrow = 'lt_borrow' # 长期借款 108 | 109 | Tax_Payable = 'tax_patable' # 应交税费 110 | 111 | Total_Lia = 'total_liability' # 总负债 112 | 113 | Actual_Capital = 'actual_capital' # 总股本 114 | Surplus_Reserves = 'surplus_reserves' # 盈余公积 115 | Undistributed_Profit = 'undistributed_profit' # 未分配利润 116 | 117 | Net_Asset_Ex = 'shareholder_equity_ex' # (不含少数股东权益)净资产 118 | Net_Asset_In = 'shareholder_equity_in' # (含少数股东权益)净资产 119 | 120 | 121 | @unique 122 | class FinancialIncomeSheetName(Enum): 123 | Net_Pro_In = 'net_profit_in' # 净利润(包含少数股东权益) 124 | Net_Pro_Ex = 'net_profit_ex' # 净利润(不包含少数股东权益) 125 | Net_Pro_Cut = 'net_profit_cut' # 净利润(扣除非经常性损益) 126 | 127 | Total_Op_Income = 'total_op_ic' # 营业总收入 128 | Op_Total_Cost = 'op_total_cost' # 营业总成本 129 | 130 | Op_Income = 'op_ic' # 营业收入 131 | Op_Pro = 'op_pro' # 营业利润 132 | Op_Cost = 'op_cost' # 营业成本 133 | 134 | Tax = 'tax' # 所得税 135 | Tax_Surcharges = 'tax_surcharges' # 税金及附加 136 | 137 | 138 | @unique 139 | class FinancialCashFlowSheetName(Enum): 140 | Net_CF = 'net_cash_flow' # 净现金流 141 | Op_Net_CF = 'op_net_cash_flow' # 经营性活动产生的现金流量净额 142 | All_Tax = 'tax_all' # 支付的各项税费 143 | 144 | Cash_From_Sales = 'cash_sales' # 销售商品、提供劳务收到的现金 145 | 146 | Free_Cash_Flow = 'FCFF' # 自由现金流 147 | 148 | 149 | @unique 150 | class FactorCategoryName(Enum): 151 | Val = 'ValuationFactor' 152 | Gro = 'GrowthFactors' 153 | Pro = 'ProfitFactor' 154 | Sol = 'SolvencyFactor' 155 | Ope = 'OperateFactor' 156 | EQ = 'QualityFactor' 157 | Size = 'SizeFactor' 158 | MTM = 'MomentumFactor' 159 | HFD = 'HighFrequencyFactor' 160 | 161 | 162 | @unique 163 | class StrategyName(Enum): 164 | pass 165 | 166 | 167 | def timer(func): 168 | def wrapper(*args, **kw): 169 | func_name = func.__name__ 170 | 171 | sta = time.time() 172 | # mem_start = mem.used 173 | print(f"\033[1;31m{dt.datetime.now().strftime('%X')}: Start run the method of \033[0m" 174 | f"\033[1;33m\'{func_name}\'\033[0m") 175 | 176 | func(*args, **kw) 177 | 178 | end = time.time() 179 | # mem_end = mem.used 180 | 181 | rang_time = round((end - sta) / 60, 4) 182 | # range_mem = round((mem_start - mem_end) / 1024 / 1024 / 1024, 4) 183 | 184 | print(f"\033[1;31m{dt.datetime.now().strftime('%X')}: It takes\033[0m " 185 | f"\033[1;33m{rang_time}Min\033[0m " 186 | f"\033[1;31mto run func\033[0m" 187 | f" \033[1;33m\'{func_name}\'\033[0m") 188 | 189 | return wrapper 190 | 191 | 192 | # def memory_cal(func): 193 | # mem = psutil.virtual_memory() 194 | # mem_start = mem.used / 1024 / 1024 / 1024 195 | # f = func() 196 | # mem_used = mem.used / 1024 / 1024 / 1024 - mem_start 197 | # return f 198 | 199 | 200 | if __name__ == '__main__': 201 | print('s') 202 | -------------------------------------------------------------------------------- /factor_direction.json: -------------------------------------------------------------------------------- 1 | { 2 | "Category1": "1.成长类因子", 3 | 4 | "BPS_G_LR": "+", 5 | "EPS_G_ttm": "+", 6 | "ROA_G_ttm": "+", 7 | "TA_G_LR": "", 8 | "TA_G_ttm": "", 9 | "LA_G_LR": "", 10 | "LA_G_ttm": "", 11 | "ILA_G_LR": "", 12 | "ILA_G_ttm": "", 13 | "TA_G_LR_std": "-", 14 | "TA_G_ttm_std": "-", 15 | "LA_G_LR_std": "", 16 | "LA_G_ttm_std": "", 17 | "ILA_G_LR_std": "", 18 | "ILA_G_ttm_std": "-", 19 | 20 | "NP_Acc": "", 21 | "NP_Stable": "+", 22 | "NP_SD": "", 23 | 24 | "OP_Acc": "", 25 | "OP_Stable": "+", 26 | "OP_SD": "", 27 | 28 | "OR_Acc": "", 29 | "OR_Stable": "+", 30 | "OR_SD": "", 31 | 32 | 33 | "Category2": "2.估值类因子", 34 | 35 | "E2P_ttm": "+", 36 | "EP_ttm": "+", 37 | "EP_LR": "+", 38 | "EP_cut_ttm": "+", 39 | "PEG_ttm": "", 40 | "BP_LR": "+", 41 | "BP_ttm": "+", 42 | "FCFP_LR": "", 43 | "FCFP_ttm": "", 44 | "NCFP_ttm": "", 45 | "OCFP_ttm": "", 46 | "DP_ttm": "+", 47 | "SP_ttm": "+", 48 | "SP_LR": "+", 49 | "EV2EBITDA_LR": "", 50 | 51 | "Category3":"3.营运能力类因子", 52 | 53 | "RROC_N": "+", 54 | "OCFA": "", 55 | "TA_Turn_ttm": "", 56 | 57 | "Category4": "4.偿债能力因子", 58 | 59 | "Int_to_Asset": "-", 60 | 61 | "ShortDebt1_CFPA": "", 62 | "ShortDebt1_CFPA_qoq": "", 63 | "ShortDebt1_CFPA_qoq_abs": "", 64 | "ShortDebt1_CFPA_std": "", 65 | 66 | "ShortDebt2_CFPA": "", 67 | "ShortDebt2_CFPA_qoq": "", 68 | "ShortDebt2_CFPA_qoq_abs": "+", 69 | "ShortDebt2_CFPA_std": "", 70 | 71 | "ShortDebt3_CFPA": "", 72 | "ShortDebt3_CFPA_qoq": "", 73 | "ShortDebt3_CFPA_qoq_abs": "+", 74 | "ShortDebt3_CFPA_std": "+", 75 | 76 | "IT_qoq_Z": "+", 77 | "PTCF_qoq_Z": "", 78 | "OT_qoq_Z": "", 79 | "OT2NP_qoq_Z": "-", 80 | "PT2NA_Z": "", 81 | 82 | "Category5": "5.收益质量因子", 83 | 84 | "CSR": "", 85 | "CSRD": "", 86 | "APR": "", 87 | "APRD": "", 88 | 89 | "Category6": "6.动量因子", 90 | 91 | "MTM_gen_1": "", 92 | "MTM_gen_2": "", 93 | "MTM_gen_3": "", 94 | "MTM_gen_20": "", 95 | "MTM_gen_60": "", 96 | 97 | "MTM_bt_day_1": "", 98 | "MTM_bt_day_2": "", 99 | "MTM_bt_day_3": "", 100 | "MTM_bt_day_20": "", 101 | "MTM_bt_day_60": "", 102 | 103 | "MTM_in_day_1": "", 104 | "MTM_in_day_2": "", 105 | "MTM_in_day_3": "", 106 | "MTM_in_day_20": "", 107 | "MTM_in_day_60": "", 108 | 109 | "MTM_N_P_1": "", 110 | "MTM_N_P_2": "", 111 | "MTM_N_P_3": "", 112 | "MTM_N_P_20": "", 113 | "MTM_N_P_60": "", 114 | 115 | 116 | "Category7": "7.盈利能力因子", 117 | 118 | "ROA_ttm": "+", 119 | "DPR_ttm": "", 120 | "NP": "", 121 | "NP_ttm": "", 122 | "OPM": "", 123 | "OPM_ttm": "" 124 | } -------------------------------------------------------------------------------- /factor_name.json: -------------------------------------------------------------------------------- 1 | { 2 | "Category1": "1.成长类因子", 3 | 4 | "BPS_G_LR": "每股净资产增长率(最新财报)", 5 | "EPS_G_ttm": "每股盈余增长率(TTM)", 6 | "ROA_G_ttm": "总资产增长率(TTM)", 7 | "TA_G_LR": "总资产增长率(最新财报)", 8 | "TA_G_ttm": "总资产增长率(TTM)", 9 | "LA_G_LR": "流动资产增长率(最新财报)", 10 | "LA_G_ttm": "流动资产增长率(TTM)", 11 | "ILA_G_LR": "非流动资产增长率(最新财报)", 12 | "ILA_G_ttm": "非流动资产增长率(TTM)", 13 | "TA_G_LR_std": "总资产增长率波动率(最新财报)", 14 | "TA_G_ttm_std": "总资产增长率波动率(TTM)", 15 | "LA_G_LR_std": "流动资产增长率波动率(最新财报)", 16 | "LA_G_ttm_std": "流动资产增长率波动率(TTM)", 17 | "ILA_G_LR_std": "非流动资产增长率波动率(最新财报)", 18 | "ILA_G_ttm_std": "非流动资产增长率波动率(TTM)", 19 | 20 | "NP_Acc": "净利润加速度", 21 | "NP_Stable": "净利润稳健增速", 22 | "NP_SD": "净利润稳健加速度", 23 | 24 | "OP_Acc": "营业利润加速度", 25 | "OP_Stable": "营业利润稳健增速", 26 | "OP_SD": "营业利润稳健加速度", 27 | 28 | "OR_Acc": "营业收入加速度", 29 | "OR_Stable": "营业收入稳健增速", 30 | "OR_SD": "营业收入稳健加速度", 31 | 32 | "MAR_G": "销售毛利率增长率", 33 | "NP_G": "净利润增长率", 34 | 35 | "Category2": "2.估值类因子", 36 | 37 | "E2P_ttm": "市盈率(不包含少数股东权益,TTM)", 38 | "EP_ttm": "市盈率(包含少数股东权益,TTM)", 39 | "EP_LR": "市盈率(包含少数股东权益,最新财报)", 40 | "EP_cut_ttm": "市盈率(扣除非经常性损益,TTM)", 41 | "PEG_ttm": "市盈率(考虑利润的同比增长,TTM)", 42 | "BP_LR": "市净率(最新财报)", 43 | "BP_ttm": "市净率(TTM)", 44 | "FCFP_LR": "市现率(企业自由现金流,最新财报)", 45 | "FCFP_ttm": "市现率(企业自由现金流,TTM)", 46 | "NCFP_ttm": "市现率(净现金流)", 47 | "OCFP_ttm": "市现率(经营现金流)", 48 | "DP_ttm": "股息率(近12个月现金红利和,TTM)", 49 | "SP_ttm": "市销率(TTM)", 50 | "SP_LR": "市销率(最新财报)", 51 | "EV2EBITDA_LR": "企业价值倍数(扣除现金,最新财报)", 52 | 53 | "Category3":"3.营运能力类因子", 54 | 55 | "RROC_N": "营业能力改善", 56 | "OCFA": "产能利用率", 57 | "TA_Turn_ttm": "总资产周转率(TTM)", 58 | "TA_Turn_ttm_T": "总资产周转率(TTM, 同比)", 59 | 60 | 61 | "Category4": "4.偿债能力因子", 62 | 63 | "Int_to_Asset": "有息负债率", 64 | 65 | "ShortDebt1_CFPA": "短期偿债能力1", 66 | "ShortDebt1_CFPA_qoq": "短期偿债能力1变化率", 67 | "ShortDebt1_CFPA_qoq_abs": "短期偿债能力1稳定性1", 68 | "ShortDebt1_CFPA_std": "短期偿债能力1稳定性2", 69 | 70 | "ShortDebt2_CFPA": "短期偿债能力2", 71 | "ShortDebt2_CFPA_qoq": "短期偿债能力2变化率", 72 | "ShortDebt2_CFPA_qoq_abs": "短期偿债能力2稳定性1", 73 | "ShortDebt2_CFPA_std": "短期偿债能力2稳定性2", 74 | 75 | "ShortDebt3_CFPA": "短期偿债能力3", 76 | "ShortDebt3_CFPA_qoq": "短期偿债能力3变化率", 77 | "ShortDebt3_CFPA_qoq_abs": "短期偿债能力3稳定性1", 78 | "ShortDebt3_CFPA_std": "短期偿债能力3稳定性2", 79 | 80 | "IT_qoq_Z": "所得税变化稳定性", 81 | "PTCF_qoq_Z": "各项税费变化稳定性", 82 | "OT_qoq_Z": "税金及附加变化稳定性", 83 | "OT2NP_qoq_Z": "税金及附加占净利润环比变化稳定性", 84 | "PT2NA_Z": "净应交税费率", 85 | 86 | "Category5": "5.收益质量因子", 87 | 88 | "CSR": "收现比", 89 | "CSRD": "收现比变动", 90 | "APR": "应计利润占比", 91 | "APD": "应计利润占比变动", 92 | 93 | 94 | "Category7": "6.盈利能力因子", 95 | 96 | "ROA_ttm": "总资产收益率(TTM)", 97 | "DPR_ttm": "股利支付率(TTM)", 98 | "NP": "当期净利润率", 99 | "NP_ttm": "净利润率(TTM)", 100 | "OPM": "当期营业利润率", 101 | "OPM_ttm": "营业利润率(TTM)", 102 | "NPM_T": "净利润率(同比)", 103 | 104 | "Category6": "7.动量因子", 105 | 106 | "MTM_gen_1": "动量因子(收盘价1)", 107 | "MTM_gen_2": "动量因子(收盘价2)", 108 | "MTM_gen_3": "动量因子(收盘价3)", 109 | "MTM_gen_20": "动量因子(收盘价20)", 110 | "MTM_gen_60": "动量因子(收盘价60)", 111 | 112 | "MTM_bt_day_1": "动量因子(日间1)", 113 | "MTM_bt_day_2": "动量因子(日间2)", 114 | "MTM_bt_day_3": "动量因子(日间3)", 115 | "MTM_bt_day_20": "动量因子(日间20)", 116 | "MTM_bt_day_60": "动量因子(日间60)", 117 | 118 | "MTM_in_day_1": "动量因子(日内1)", 119 | "MTM_in_day_2": "动量因子(日内2)", 120 | "MTM_in_day_3": "动量因子(日内3)", 121 | "MTM_in_day_20": "动量因子(日内20)", 122 | "MTM_in_day_60": "动量因子(日内60)", 123 | 124 | "MTM_N_P_1": "趋势动量因子(收盘价1)", 125 | "MTM_N_P_2": "趋势动量因子(收盘价2)", 126 | "MTM_N_P_3": "趋势动量因子(收盘价3)", 127 | "MTM_N_P_20": "趋势动量因子(收盘价20)", 128 | "MTM_N_P_60": "趋势动量因子(收盘价60)" 129 | } --------------------------------------------------------------------------------