├── readme_pic └── tree.png ├── __init__.py ├── utils.py ├── fitness.py ├── data_trans.py ├── README.md ├── .idea └── workspace.xml ├── functions.py ├── example.py ├── _program.py └── genetic.py /readme_pic/tree.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ACEACEjasonhuang/gplearnplus/HEAD/readme_pic/tree.png -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | ------------------------------------------------- 4 | # @Project :gplearnplus 5 | # @File :__init__.py 6 | # @Date :2022/12/1 0001 13:36 7 | # @Author :Junzhe Huang 8 | # @Email :acejasonhuang@163.com 9 | # @Software :PyCharm 10 | ------------------------------------------------- 11 | """ 12 | __version__ = '1.5.9' 13 | 14 | __all__ = ['genetic', 'functions', 'fitness', 'example'] 15 | -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | ------------------------------------------------- 4 | # @Project :gplearnplus 5 | # @File :utils 6 | # @Date :2022/12/1 0001 13:38 7 | # @Author :Junzhe Huang 8 | # @Email :acejasonhuang@163.com 9 | # @Software :PyCharm 10 | ------------------------------------------------- 11 | """ 12 | 13 | import numbers 14 | 15 | import numpy as np 16 | from joblib import cpu_count 17 | 18 | 19 | # seed 转换为 np.random.RandomState对象 20 | def check_random_state(seed): 21 | """Turn seed into a np.random.RandomState instance 22 | 23 | Parameters 24 | ---------- 25 | seed : None | int | instance of RandomState 26 | If seed is None, return the RandomState singleton used by np.random. 27 | If seed is an int, return a new RandomState instance seeded with seed. 28 | If seed is already a RandomState instance, return it. 29 | Otherwise raise ValueError. 30 | 31 | """ 32 | if seed is None or seed is np.random: 33 | return np.random.mtrand._rand 34 | if isinstance(seed, (numbers.Integral, np.integer)): 35 | return np.random.RandomState(seed) 36 | if isinstance(seed, np.random.RandomState): 37 | return seed 38 | raise ValueError('%r cannot be used to seed a numpy.random.RandomState' 39 | ' instance' % seed) 40 | 41 | 42 | # n_jobs转换 -1为全部 43 | def _get_n_jobs(n_jobs): 44 | """Get number of jobs for the computation. 45 | 46 | This function reimplements the logic of joblib to determine the actual 47 | number of jobs depending on the cpu count. If -1 all CPUs are used. 48 | If 1 is given, no parallel computing code is used at all, which is useful 49 | for debugging. For n_jobs below -1, (n_cpus + 1 + n_jobs) are used. 50 | Thus for n_jobs = -2, all CPUs but one are used. 51 | 52 | Parameters 53 | ---------- 54 | n_jobs : int 55 | Number of jobs stated in joblib convention. 56 | 57 | Returns 58 | ------- 59 | n_jobs : int 60 | The actual number of jobs as positive integer. 61 | 62 | """ 63 | if n_jobs < 0: 64 | return max(cpu_count() + 1 + n_jobs, 1) 65 | elif n_jobs == 0: 66 | raise ValueError('Parameter n_jobs == 0 has no meaning.') 67 | else: 68 | return n_jobs 69 | 70 | 71 | # 将estimator 分配到每一个job上 72 | # 返回 进程数, 每个进程任务数,累计任务数 73 | def _partition_estimators(n_estimators, n_jobs): 74 | """Private function used to partition estimators between jobs.""" 75 | # Compute the number of jobs 76 | n_jobs = min(_get_n_jobs(n_jobs), n_estimators) 77 | 78 | # 给进程分配任务 79 | # Partition estimators between jobs 80 | n_estimators_per_job = (n_estimators // n_jobs) * np.ones(n_jobs, 81 | dtype=int) 82 | n_estimators_per_job[:n_estimators % n_jobs] += 1 83 | starts = np.cumsum(n_estimators_per_job) 84 | 85 | return n_jobs, n_estimators_per_job.tolist(), [0] + starts.tolist() 86 | -------------------------------------------------------------------------------- /fitness.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | ------------------------------------------------- 4 | # @Project :gplearnplus 5 | # @File :fitness 6 | # @Date :2022/12/5 0005 7:25 7 | # @Author :Junzhe Huang 8 | # @Email :acejasonhuang@163.com 9 | # @Software :PyCharm 10 | ------------------------------------------------- 11 | """ 12 | 13 | import numbers 14 | 15 | import numpy as np 16 | from joblib import wrap_non_picklable_objects 17 | from scipy.stats import rankdata 18 | 19 | __all__ = ['make_fitness'] 20 | 21 | 22 | class _Fitness(object): 23 | 24 | """A metric to measure the fitness of a program. 25 | 26 | This object is able to be called with NumPy vectorized arguments and return 27 | a resulting floating point score quantifying the quality of the program's 28 | representation of the true relationship. 29 | 30 | Parameters 31 | ---------- 32 | function : callable 33 | A function with signature function(y, y_pred, sample_weight) that 34 | returns a floating point number. Where `y` is the input target y 35 | vector, `y_pred` is the predicted values from the genetic program, and 36 | sample_weight is the sample_weight vector. 37 | 38 | greater_is_better : bool 39 | Whether a higher value from `function` indicates a better fit. In 40 | general this would be False for metrics indicating the magnitude of 41 | the error, and True for metrics indicating the quality of fit. 42 | 43 | """ 44 | 45 | def __init__(self, function, greater_is_better): 46 | self.function = function 47 | self.greater_is_better = greater_is_better 48 | self.sign = 1 if greater_is_better else -1 49 | 50 | def __call__(self, *args): 51 | return self.function(*args) 52 | 53 | 54 | def make_fitness(*, function, greater_is_better, wrap=True): 55 | """Make a fitness measure, a metric scoring the quality of a program's fit. 56 | 57 | This factory function creates a fitness measure object which measures the 58 | quality of a program's fit and thus its likelihood to undergo genetic 59 | operations into the next generation. The resulting object is able to be 60 | called with NumPy vectorized arguments and return a resulting floating 61 | point score quantifying the quality of the program's representation of the 62 | true relationship. 63 | 64 | Parameters 65 | ---------- 66 | function : callable 67 | A function with signature function(y, y_pred, sample_weight) that 68 | returns a floating point number. Where `y` is the input target y 69 | vector, `y_pred` is the predicted values from the genetic program, and 70 | sample_weight is the sample_weight vector. 71 | 72 | greater_is_better : bool 73 | Whether a higher value from `function` indicates a better fit. In 74 | general this would be False for metrics indicating the magnitude of 75 | the error, and True for metrics indicating the quality of fit. 76 | 77 | wrap : bool, optional (default=True) 78 | When running in parallel, pickling of custom metrics is not supported 79 | by Python's default pickler. This option will wrap the function using 80 | cloudpickle allowing you to pickle your solution, but the evolution may 81 | run slightly more slowly. If you are running single-threaded in an 82 | interactive Python session or have no need to save the model, set to 83 | `False` for faster runs. 84 | 85 | """ 86 | if not isinstance(greater_is_better, bool): 87 | raise ValueError('greater_is_better must be bool, got %s' 88 | % type(greater_is_better)) 89 | if not isinstance(wrap, bool): 90 | raise ValueError('wrap must be an bool, got %s' % type(wrap)) 91 | if function.__code__.co_argcount != 3: 92 | raise ValueError('function requires 3 arguments (y, y_pred, w),' 93 | ' got %d.' % function.__code__.co_argcount) 94 | if not isinstance(function(np.array([1, 1]), 95 | np.array([2, 2]), 96 | np.array([1, 1])), numbers.Number): 97 | raise ValueError('function must return a numeric.') 98 | 99 | if wrap: 100 | return _Fitness(function=wrap_non_picklable_objects(function), 101 | greater_is_better=greater_is_better) 102 | return _Fitness(function=function, 103 | greater_is_better=greater_is_better) 104 | 105 | 106 | def _weighted_pearson(y, y_pred, w): 107 | """Calculate the weighted Pearson correlation coefficient.""" 108 | with np.errstate(divide='ignore', invalid='ignore'): 109 | y_pred_demean = y_pred - np.average(y_pred, weights=w) 110 | y_demean = y - np.average(y, weights=w) 111 | corr = ((np.sum(w * y_pred_demean * y_demean) / np.sum(w)) / 112 | np.sqrt((np.sum(w * y_pred_demean ** 2) * 113 | np.sum(w * y_demean ** 2)) / 114 | (np.sum(w) ** 2))) 115 | if np.isfinite(corr): 116 | return np.abs(corr) 117 | return 0. 118 | 119 | 120 | def _weighted_spearman(y, y_pred, w): 121 | """Calculate the weighted Spearman correlation coefficient.""" 122 | y_pred_ranked = np.apply_along_axis(rankdata, 0, y_pred) 123 | y_ranked = np.apply_along_axis(rankdata, 0, y) 124 | return _weighted_pearson(y_pred_ranked, y_ranked, w) 125 | 126 | 127 | def _mean_absolute_error(y, y_pred, w): 128 | """Calculate the mean absolute error.""" 129 | return np.average(np.abs(y_pred - y), weights=w) 130 | 131 | 132 | def _mean_square_error(y, y_pred, w): 133 | """Calculate the mean square error.""" 134 | return np.average(((y_pred - y) ** 2), weights=w) 135 | 136 | 137 | def _root_mean_square_error(y, y_pred, w): 138 | """Calculate the root mean square error.""" 139 | return np.sqrt(np.average(((y_pred - y) ** 2), weights=w)) 140 | 141 | 142 | def _log_loss(y, y_pred, w): 143 | """Calculate the log loss.""" 144 | eps = 1e-15 145 | inv_y_pred = np.clip(1 - y_pred, eps, 1 - eps) 146 | y_pred = np.clip(y_pred, eps, 1 - eps) 147 | score = y * np.log(y_pred) + (1 - y) * np.log(inv_y_pred) 148 | return np.average(-score, weights=w) 149 | 150 | 151 | weighted_pearson = _Fitness(function=_weighted_pearson, 152 | greater_is_better=True) 153 | weighted_spearman = _Fitness(function=_weighted_spearman, 154 | greater_is_better=True) 155 | mean_absolute_error = _Fitness(function=_mean_absolute_error, 156 | greater_is_better=False) 157 | mean_square_error = _Fitness(function=_mean_square_error, 158 | greater_is_better=False) 159 | root_mean_square_error = _Fitness(function=_root_mean_square_error, 160 | greater_is_better=False) 161 | log_loss = _Fitness(function=_log_loss, 162 | greater_is_better=False) 163 | 164 | _fitness_map = {'pearson': weighted_pearson, 165 | 'spearman': weighted_spearman, 166 | 'mean absolute error': mean_absolute_error, 167 | 'mse': mean_square_error, 168 | 'rmse': root_mean_square_error, 169 | 'log loss': log_loss} -------------------------------------------------------------------------------- /data_trans.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Time : 2024/2/7 3 | # @Author : Junzhe Huang 4 | # @Email : huangjz01@igoldenbeta.com 5 | # @File : data_trans 6 | # @Software : gplearnplus 7 | import pandas as pd 8 | 9 | 10 | # todo 移植部分 11 | def data_transform(X, y, data_type, number_feature_list, category_feature_list=None, 12 | security_index=None, time_series_index=None): 13 | # 检查数据类型 14 | if data_type not in ('section', 'time_series', 'panel'): 15 | raise ValueError('Valid data_type methods include ' 16 | '"section", "time_series" and "panel". Given %s.' 17 | % data_type) 18 | 19 | # X必须为pd.DataFrame 20 | if not isinstance(X, pd.DataFrame): 21 | raise ValueError('Data structure must be DataFrame') 22 | 23 | # 验证y的长度是否与X相同 24 | if len(X) != len(y): 25 | raise ValueError('X and y must have same length') 26 | 27 | # 检查column 是否包含category_feature_list 和 number_feature_list 28 | # 将category_feature_list 调整至前 number_feature_list 调整至后 29 | # 找出X的columns与category_feature_list的交集列表 30 | if category_feature_list is not None: 31 | if not isinstance(category_feature_list, list): 32 | raise ValueError('category_feature_list must be list') 33 | category_feature_list_inX = [col for col in X.columns if col not in category_feature_list] 34 | else: 35 | category_feature_list_inX = [] 36 | # 找出X的columns与number_feature_list的交集列表 37 | if not isinstance(number_feature_list, list): 38 | raise ValueError('number_feature_list must be list') 39 | number_feature_list_inX = [col for col in X.columns if col not in number_feature_list] 40 | # 重构顺序,将分类类型放在前面, 并把第一列设为常数1,column为 const_1 41 | X['const_1'] = 1 42 | feature_names = category_feature_list_inX + number_feature_list_inX 43 | X_trans = X[['const_1'] + feature_names].copy() 44 | 45 | # 若存在security_index和time_series_index,插入X_trans最后,默认先插入security_index再插入time_series_index 46 | if security_index is not None: 47 | # 若security_index在X的columns中,或者为X.index,将其插入到X_trans最后 48 | if security_index in X.columns: 49 | X_trans[security_index] = X[security_index] 50 | elif X.index.name == security_index: 51 | X_trans[security_index] = X.index.get_level_values(security_index) 52 | else: 53 | # 若security_index不在X_trans的columns中,也不再index中,报错 54 | raise ValueError('Can not fund security_index {} in both columns and index' 55 | .format(security_index)) 56 | if time_series_index is not None: 57 | # 若time_series_index在X的columns 58 | if time_series_index in X.columns: 59 | X_trans[time_series_index] = X[time_series_index] 60 | elif X.index.name == time_series_index: 61 | 62 | 63 | 64 | # 检查时间index和个股index, 对于截面,时序和面板数据分别检查 65 | if data_type == 'section': 66 | if time_series_index is not None: 67 | raise ValueError('For Section Data, time_series_index should be None') 68 | if security_index is not None: 69 | # 在index和columns中寻找security_index 70 | # 判断是否有重复个股 71 | if len(X[security_index].unique()) < len(X[security_index]): 72 | raise ValueError('For Section Data, security data should be unique') 73 | elif data_type == 'time_series': 74 | if security_index is not None: 75 | raise ValueError('For time_series Data, security_index should be None') 76 | if time_series_index is not None: 77 | # 在index和columns中寻找time_series_index 78 | if time_series_index not in X.columns and \ 79 | (X.index.name is None or time_series_index not in X.index.name): 80 | raise ValueError('Can not fund time_series_index {} in both columns and index' 81 | .format(time_series_index)) 82 | elif time_series_index in X.columns: 83 | X.set_index(time_series_index, inplace=True) 84 | # 判断是否有重复时间 85 | if len(X.index.drop_duplicates()) < len(X): 86 | raise ValueError('For time_series Data, time_series data should be unique') 87 | X_combine = X.copy() 88 | X_combine['_label'] = y.values if isinstance(y, pd.Series) else y 89 | X_combine.sort_index(inplace=True) 90 | X, y = X_combine.loc[:, self.feature_names], X_combine.loc[:, '_label'] 91 | # debug 92 | 93 | time_series_data = X.index.values 94 | 95 | else: 96 | if self.time_series_index is None: 97 | raise ValueError('For panel Data, time_series_index should NOT be None') 98 | if self.security_index is None: 99 | raise ValueError('For panel Data, security_index should NOT be None') 100 | 101 | # security time_series 进入index 102 | if self.time_series_index not in X.columns and \ 103 | (X.index.name is None or self.time_series_index not in X.index.name): 104 | raise ValueError('Can not fund time_series_index {} in both columns and index' 105 | .format(self.time_series_index)) 106 | elif self.security_index not in X.columns and \ 107 | (X.index.name is None or self.security_index not in X.index.name): 108 | raise ValueError('Can not fund security_index {} in both columns and index' 109 | .format(self.security_index)) 110 | elif self.time_series_index in X.columns and self.security_index in X.columns: 111 | X.set_index([self.time_series_index, self.security_index], inplace=True) 112 | elif self.time_series_index in X.columns: 113 | X.set_index(self.security_index, inplace=True, append=True) 114 | elif self.security_index in X.columns: 115 | X.set_index(self.time_series_index, inplace=True, append=True) 116 | 117 | # 判断没有重复 118 | if len(X.index) != len(X.index.drop_duplicates()): 119 | raise ValueError('For time_series Data, time_series data should be unique') 120 | 121 | 122 | X_combine = X.copy() 123 | X_combine['_label'] = y.values if isinstance(y, pd.Series) else y 124 | X_combine.sort_index(inplace=True) 125 | X, y = X_combine.loc[:, self.feature_names], X_combine.loc[:, '_label'] 126 | time_series_data = X.index.get_level_values(self.time_series_index).values 127 | security_data = X.index.get_level_values(self.security_index).values 128 | 129 | # 检查category_features是否与全包含在feature_names中 130 | # 当存在分类数据时,输入数据类型必须为pd。DataFrame 131 | if self.category_features is not None: 132 | if not isinstance(X, pd.DataFrame): 133 | raise ValueError('while there are category_features in X, X must be pd.DataFrame') 134 | if not isinstance(self.category_features, list): 135 | raise ValueError('category_features must be list') 136 | for cat_feature in self.category_features: 137 | if cat_feature not in self.feature_names: 138 | raise ValueError('Valid category_feature {} , not in feature_names'.format(cat_feature)) 139 | # 处理分类数据,转换为整型 140 | label_encoder = LabelEncoder() 141 | X[self.category_features] = X[self.category_features].apply(label_encoder.fit_transform) 142 | # 重构顺序,将分类类型放在前面 143 | self.feature_names = \ 144 | [self.category_features + [_col for _col in self.feature_names if _col not in self.category_features]] 145 | X = X[self.feature_names] 146 | 147 | # Check arrays 148 | if sample_weight is not None: 149 | sample_weight = _check_sample_weight(sample_weight, X) 150 | 151 | # 检查数据内容 152 | if isinstance(self, ClassifierMixin): 153 | # 验证y是否为分类数据, X, y强转ndarray 154 | # todo 分类场景的处理有待优化,暂时不处理 155 | X, y = self._validate_data(X, y, y_numeric=False) 156 | check_classification_targets(y) 157 | 158 | if self.class_weight: 159 | if sample_weight is None: 160 | sample_weight = 1. 161 | # modify the sample weights with the corresponding class weight 162 | sample_weight = (sample_weight * 163 | compute_sample_weight(self.class_weight, y)) 164 | 165 | self.classes_, y = np.unique(y, return_inverse=True) 166 | n_trim_classes = np.count_nonzero(np.bincount(y, sample_weight)) 167 | if n_trim_classes != 2: 168 | raise ValueError("y contains %d class after sample_weight " 169 | "trimmed classes with zero weights, while 2 " 170 | "classes are required." 171 | % n_trim_classes) 172 | self.n_classes_ = len(self.classes_) 173 | 174 | else: 175 | # 验证y是否为数值数据, X, y强转ndarray 176 | X, y = self._validate_data(X, y, y_numeric=True) -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # gplearnplus 2 | 对gplearn进行升级,适应时序数据和面板数据,适用于更多的场景 3 | 且在函数参数中区分分类数据和数值型数据,可兼容类似于groupby等操作 4 | 5 | # 文件描述 6 | 7 | ## `_Program.py` 8 | 构建,调用公式树模块, 9 | 对象为`_Program` 10 | 属性`program`为栈形式的公式树 11 | ### 公式树形式 12 | ![alt text](https://github.com/ACEACEjasonhuang/gplearnplus/readme_pic/tree.png) 13 | 14 | 该公式表达是为 15 | 16 | $$ 17 | ((X_0 \times X_0) - (3.0 \times X_1)) + 0.5 18 | $$ 19 | 20 | program结果栈为: 21 | `['add', 'sub', 'mul', '0', '0', 'mul', 3.0, '1', 0.5]` 22 | 23 | ### 初始化方法 24 | `build_program` 25 | 通过stack对树进行深度优先搜索构建 26 | ```mermaid 27 | graph TB 28 | Z[选择根函数,必
须返回数值向量] --> A[(工作栈stack中插入
根函数参数列表)] 29 | A-->B([结果栈program中
插入根函数]) 30 | B-->C{工作栈stack非空} 31 | C-->|Y|D{判断工作栈中最后一个
函数的第一个参数
1.该节点必须接受向量
2.当前深度比最大深度低
3.随机种子选中了函数
或者模式为'full'} 32 | C-->|N|L[异常,工作栈不得为空] 33 | D-->|Y|E[插入函数
成为子树节点] 34 | D-->|N|F[插入向量或标量
成为叶子节点] 35 | E-->G{该节点是否可接受
分类函数和数值函数} 36 | G-->|Y|H[所有函数中随机挑选] 37 | G-->|N|I[相应的分类或
数值函数中随机挑选] 38 | H-->J[(工作栈stack中插入
相应函数参数列表)] 39 | I-->J 40 | J-->K[(结果栈program中
插入相应函数)] 41 | 42 | F-->M{若参数不接受标量或
const_range为空或
随机数选中向量} 43 | M-->|Y|N{存在分类向量且
参数接受分类向量且
随机数选中分类向量} 44 | N-->|Y|O([结果栈program中
插入该分类向量,
类型为字符串数字]) 45 | N-->|N|P([结果栈program中
插入该数值向量,
类型为字符串数字]) 46 | M-->|N|R{若该节点接受
浮点类型标量} 47 | R-->|Y|S([结果栈program中
插入范围内随机浮点
标量,类型为浮点]) 48 | R-->|N|T([结果栈program中
插入范围内随机整型
标量,类型为整型]) 49 | O-->U[(工作栈stack中弹出
最后一个函数的
最后一个参数节点)] 50 | P-->U 51 | R-->U 52 | S-->U 53 | T-->U 54 | U-->Q{工作栈stack
最后一个函数
参数列表为空} 55 | Q-->|Y|V[(工作栈stack中弹出
最后一个函数的W{工作栈为空} 57 | W-->|Y|X([返回结果栈program
公式树初始化完成]) 58 | W-->|N|Y[(工作栈stack中弹出
最后一个函数的
最后一个参数节点)] 59 | Y-->Q 60 | Q-->|N|C 61 | K-->C 62 | ``` 63 | ### 树的检验 64 | `validate_program` 65 | 对树一次深度优先搜索,保证所有节点完备,即每一个函数参数量足够 66 | 67 | `_depth` 68 | 深度优先搜索的同时记录最大深度 69 | 70 | `_length` 71 | 返回program长度,即树的节点数量 72 | 73 | ### 树的打印 74 | `__str__`:打印树 75 | `export_graphviz`:可视化整个树 76 | 77 | 78 | ### 公式树的计算 79 | `execute`:接受pandas或者二位nd_array,shape = [n_samples, n_features] 80 | 执行过程中,将program中的字符串和常数处理成可接受参数 81 | - 常数需要广播成常向量 82 | - 字符串转换为输入X中对应的列 83 | - 若数据类型为面板数据`panel`,X中需要额外输入证券列和时间列, 84 | 85 | `raw_fitness`:原始适应度 86 | 1. 由公式树计算出$\hat{y}$ 87 | 2. 对$\hat{y}$进行调整 88 | 3. 计算$y$与$\hat{y}$的适应度`metric` 89 | 90 | `fitness`:带惩罚项适应度 91 | $$ 92 | penalty=p\_coef \times program\_len \times sign(metric) 93 | $$ 94 | ### 样本选择(防止过拟合) 95 | 为了防止过拟合,仅选择部分样本 96 | 97 | `get_all_indices` 输入总样本量和抽样样本量 98 | 99 | 返回抽样内样本index和抽样外样本index 100 | 101 | ### 公式树的截取 102 | 103 | `get_subtree(random_state, start, program=None)`:获取指定子树 104 | 获取根节点为start的指定子树 105 | 106 | 107 | `get_random_subtree(random_state, program=None, return_type=None):`获取随机子树 108 | 根据需要设定获取**数值型子树**还是**分类型子树** 109 | 返回子树和子树类型 110 | 111 | ### 公式树的交叉变异 112 | 113 | `crossover(donor, random_state)` 114 | 与公式树`donor`交叉,要求同返回类型 115 | 116 | `subtree_mutation(random_state)` 117 | 随机生成一颗公式树,与父树交叉 118 | 119 | `hoist_mutation(random_state)` 120 | 首先寻找可以hoist的节点,要求该节点下存在子节点与自己类型相同 121 | 把一颗子树的同类型子树上提 122 | 123 | `point_mutation(random_state)` 124 | 点变异 125 | 对随机选中的点进行点变异 126 | 点变异保证函数合法 127 | 128 | 129 | ## `fitness.py` 130 | 131 | 定义适应度函数,和自定义适应函数的方法 132 | 133 | 定义函数对象`_Fitness` 134 | 135 | 包含是属性: 136 | 137 | `function` 138 | 139 | 必须接受三个变量`(y, y_perd, w)` 140 | 141 | `greater_is_better` 142 | 143 | 144 | ## `function.py` 145 | 146 | 自定义函数和构建方法 147 | 定义函数对象`_Function` 148 | 包含是属性: 149 | 150 | `function`:可调用函数 151 | 152 | `name`:函数名 153 | 154 | `arity`:参数个数 155 | 156 | `param_type`: 157 | 参数类型列表,长度与arity一致,**默认不接受分类类型** 158 | 该设计是本项目最重要的升级,影响公式树的构建 159 | ```python 160 | [{ 161 | 'vector': {'category': (None, None), 'number': (None, None)}, 162 | 'scalar': {'int': (None, None), 'float': (None, None)} 163 | },] 164 | ``` 165 | 166 | `function_type`:函数类型 默认 'all' 167 | 'all', 'section', 'time_series‘ 168 | 169 | `return_type`:返回类型 默认'number' 170 | 'number', 'category' 171 | 172 | 包含的方法: 173 | 174 | `__call__` 175 | 调用函数特殊处理, 176 | 参数仅接受标量,却传入向量 177 | 则取向量第一个值为标量 178 | 179 | `add_range`: 180 | 181 | 替换掉参数中没有约束的范围,给所有标量限制范围 182 | 183 | 若没有const_range, 则表明所有函数不接收常数, 去掉所有的const type 184 | 185 | `is_point_mutation(candidate_func)` 186 | 187 | 检验某个待替换函数是否可以替换 188 | 189 | 外部函数: 190 | `make_function(*, function, name, arity, param_type=None, wrap=True, return_type='number', function_type='all')` 191 | 将函数处理为_Funtion对象 192 | 主要进行合法性检验和测试 193 | 194 | ## `genetic.py` 195 | 196 | 模型接口,包括由工厂类派生出,回归,分类器和特征工程工具类,应用于不同场景 197 | 198 | ### '_parallel_evolve(n_programs, parents, X, y, sample_weight, seeds, params)' 199 | 200 | 进行一次种群进化 201 | n_programs为种群数量 202 | 203 | 204 | ### `BaseSymbolic` 205 | 206 | 207 | 208 | 209 | ## `utils.py` 210 | 211 | 支持函数 212 | 213 | 214 | 215 | `test.py` 216 | 217 | 218 | `data_trans.py` 219 | 220 | 221 | 222 | 223 | 自定义函数样例 224 | 225 | 226 | 227 | # 函数文档 228 | 229 | ## 全局函数 230 | 231 | ### gpelarn自带全局函数 232 | | 函数名 | 实现逻辑 | 参数要求 | 输出类型 | 233 | |-----------|-----------------|---------------| ------------ | 234 | | add(X, Y) | 向量相加 | 【数值向量】,【数值向量】 | 【数值向量】 | 235 | | sub(X, Y) | 向量相减 | 【数值向量】,【数值向量】 | 【数值向量】 | 236 | | mul(X, Y) | 向量相乘 | 【数值向量】,【数值向量】 | 【数值向量】 | 237 | | div(X, Y) | 向量相除(极小值替换分母0) | 【数值向量】,【数值向量】 | 【数值向量】 | 238 | | sqrt(X) | 开平方(负数处理为绝对值) | 【数值向量】 | 【数值向量】 | 239 | | log(X) | 取自然对数(同理处理0和负数) | 【数值向量】 | 【数值向量】 | 240 | | neg(X) | 取反数 | 【数值向量】 | 【数值向量】 | 241 | | inv(X) | 取倒数 (极小值处理0) | 【数值向量】 | 【数值向量】 | 242 | | abs(X) | 取绝对值 | 【数值向量】 | 【数值向量】 | 243 | | max(X, Y) | 向量取孰大值 | 【数值向量】,【数值向量】 | 【数值向量】 | 244 | | min(X, Y) | 向量取孰小值 | 【数值向量】,【数值向量】 | 【数值向量】 | 245 | | sin(X) | 取向量正弦 | 【数值向量】, | 【数值向量】 | 246 | | cos(X) | 取向量余弦 | 【数值向量】 | 【数值向量】 | 247 | | tan(X) | 取向量正切 | 【数值向量】 | 【数值向量】 | 248 | | sig(X) | 逻辑斯蒂函数 | 【数值向量】 | 【数值向量】 | 249 | ### 自定义全局函数 250 | | 函数名 | 实现逻辑 | 参数要求 | 输出类型 | 251 | | --------------- |------------| -------------------------- | ------------ | 252 | | combine(X, Y) | 两个分类变量笛卡尔积 | 【分类向量】,【分类向量】 | 【分类向量】 | 253 | 254 | 255 | ## 时间序列函数 256 | 257 | ### 基本时间序列函数 258 | | 函数名 | 实现逻辑 | 参数要求 | 输出类型 | 259 | |--------------------|-----------|----------------------| ------------ | 260 | | delay(X,d) | 时序变量滞后 | 【数值向量】,【整形常量】 | 【数值向量】 | 261 | | delta(X,d) | 与滞后项作差 | 【数值向量】,【整形常量】 | 【数值向量】 | 262 | | ts_min(X,d) | 时间窗口最小值 | 【数值向量】,【整形常量】 | 【数值向量】 | 263 | | ts_max(X,d) | 时间窗口最大值 | 【数值向量】,【整形常量】 | 【数值向量】 | 264 | | ts_argmax(X,d) | 时间窗口最小值位置 | 【数值向量】,【整形常量】 | 【数值向量】 | 265 | | ts_argmin(X,d) | 时间窗口最大值位置 | 【数值向量】,【整形常量】 | 【数值向量】 | 266 | | ts_rank(X, d) | 时间窗口排序值 | 【数值向量】,【整形常量】 | 【数值向量】 | 267 | | ts_sum(X, d) | 时间窗口求和 | 【数值向量】,【整形常量】 | 【数值向量】 | 268 | | ts_stddev(X, d) | 时间窗口标准差 | 【数值向量】,【整形常量】 | 【数值向量】 | 269 | | ts_corr(X,Y,d) | 时间窗口相关系数 | 【数值向量】,【数值向量】,【整形常量】 | 【数值向量】 | 270 | | ts_mean(X, d) | 时间窗口均值 | 【数值向量】,【整形常量】 | 【数值向量】 | 271 | | ts_neutralize(X, d) | 时间窗口Z分数 | 【数值向量】,【整形常量】 | 【数值向量】 | 272 | | ts_freq(X, d) | 时间窗口Z分数 | 【分类向量】,【整形常量】 | 【数值向量】 | 273 | 274 | ### 技术指标函数 275 | 276 | | 函数名 | 实现逻辑 | 参数要求 | 输出类型 | 277 | |---------------------|------------| ------------------- | ------------ | 278 | | EMA(X,d) | 指数平滑均线 | 【数值向量】,【整形常量】 | 【数值向量】 | 279 | | DEMA(X,d) | 二重指数平滑线 | 【数值向量】,【整形常量】 | 【数值向量】 | 280 | | MA(X,d) | 均线 | 【数值变量】,【整形常量】 | 【数值向量】 | 281 | | KAMA(X,d) | 库夫曼自适应移动均线 | 【数值向量】,【整形常量】 | 【数值向量】 | 282 | | MIDPOINT(X,d) | 中间点 | 【数值向量】,【整形向量】 | 【数值向量】 | 283 | | BETA(X,Y,d) | 回归系数 | 【数值向量】,【数值向量】,【整形常量】 | 【数值向量】 | 284 | | LINEARREG_SLOPE(X, d) | 回归斜率 | 【数值向量】,【整形常量】 | 【数值向量】 | 285 | | LINEARREG_ANGLE(X, d) | 回归角度 | 【数值向量】,【整形常量】 | 【数值向量】 | 286 | | LINEARREG_INTERCEPT(X, d) | 回归截距 | 【数值向量】,【整形常量】 | 【数值向量】 | 287 | 288 | ## 截面函数 289 | ### 基本截面函数 290 | | 函数名 | 实现逻辑 | 参数要求 | 输出类型 | 291 | |------------------------------------|----------|------------------|--------| 292 | | MAX_SECTION(X) | 最大值填充 | 【数值向量】 | 【数值向量】 | 293 | | MIN_SECTION(X) | 最小值填充 | 【数值向量】 | 【数值向量】 | 294 | | MEAN_SECTION(X) | 均值填充 | 【数值向量】 | 【数值向量】 | 295 | | MEDIAN_SECTION(X) | 中位数填充 | 【数值向量】 | 【数值向量】 | 296 | | STD_SECTION(X) | 标准差填充 | 【数值向量】 | 【数值向量】 | 297 | | RANK_SECTION(X) | 序数填充 | 【数值向量】 | 【数值向量】 | 298 | | NEUTRALIZE_SECTION(X) | Z分数填充 | 【数值向量】 | 【数值向量】 | 299 | | FREQ_SECTION(X) | 频数填充 | 【分类向量】 | 【数值向量】 | 300 | | CUT_EQUAL_DISTANCE(X, d) | 等距分组 | 【数值向量】,【整形标量】 | 【分类向量】 | 301 | | CUT_EQUAL_AMOUNT(X, d) | 等量分组 | 【数值向量】,【整形标量】 | 【分类向量】 | 302 | 303 | ### 截面分类聚合函数 304 | 305 | | 函数名 | 实现逻辑 | 参数要求 | 输出类型 | 306 | |------------------------------------|----------|------------------|--------| 307 | | GROUPBYTHENMAX(gbx, X) | 分组后取最大值 | 【分类向量】,【数值向量】 | 【数值向量】 | 308 | | GROUPBYTHENMIN(gbx, X) | 分组后取最小值 | 【分类向量】,【数值向量】 | 【数值向量】 | 309 | | GROUPBYTHENMEAN(gbx, X) | 分组后取均值 | 【分类向量】,【数值向量】 | 【数值向量】 | 310 | | GROUPBYTHENMEDIAN(gbx, X) | 分组后取中位数 | 【分类向量】,【数值向量】 | 【数值向量】 | 311 | | GROUPBYTHENSTD(gbx, X) | 分组后取标准差 | 【分类向量】,【数值向量】 | 【数值向量】 | 312 | | GROUPBYTHENRANK(gbx, X) | 分组后取序数 | 【分类向量】,【数值向量】 | 【数值向量】 | 313 | | GROUPBYTHENNEUTRALIZE(gbx, X) | 分组后取Z分数 | 【分类向量】,【数值向量】 | 【数值向量】 | 314 | | GROUPBYTHEN_CUT_EQ_DIST(gbx, X, d) | 分组后取等距分组 | 【分类向量】,【数值向量】,【整形常量】 | 【分类向量】 | 315 | | GROUPBYTHEN_CUT_EQ_AMT(gbx, X, d) | 分组后取等量分组 | 【分类向量】,【数值向量】,【整形常量】 | 【分类向量】 | 316 | | GROUPBYTHENFREQ(gbx, X) | 分组后取取频数 | 【分类向量】,【分类向量】 | 【数值向量】 | 317 | # 更新记录 318 | 319 | ## v1.0 320 | 321 | 未调试完全, 有bug 322 | 323 | ## v1.1 324 | 325 | 处理完funtions模块的问题 326 | 调试成功,对于时序自定义函数中的常数参数,需要在函数中做去广播判定 327 | 328 | ## v1.2 329 | 330 | test中加入了自定义函数的定义方法,需要忽略运行时的RuntimeWarning 331 | 332 | ## v1.3 333 | 334 | functions中去掉了对于function.__code__.co_argument的限制 335 | 增强对函数修饰器的兼容 336 | 337 | ## v1.4 338 | test.py debug 339 | 函数定义考虑特殊参数情况 340 | 341 | ## v1.5 342 | 新增面板数据支持功能 343 | 将场景分位截面,时序和面板 344 | 数据定义要求更新 345 | 346 | 函数定义要求更新 347 | 348 | 更新适应度惩罚计算 349 | 350 | 修改遗传规划中的特征筛选逻辑 351 | (当最大相关系数绝对值低于某一阈值时,直接按fitness筛选) -------------------------------------------------------------------------------- /.idea/workspace.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 18 | 19 | 24 | 25 | 26 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 49 | 50 | 51 | 52 | 55 | { 56 | "keyToString": { 57 | "RunOnceActivity.OpenProjectViewOnStart": "true", 58 | "RunOnceActivity.ShowReadmeOnStart": "true", 59 | "WebServerToolWindowFactoryState": "false", 60 | "last_opened_file_path": "D:/software/python38/Lib/site-packages/gplearn", 61 | "node.js.detected.package.eslint": "true", 62 | "node.js.detected.package.tslint": "true", 63 | "node.js.selected.package.eslint": "(autodetect)", 64 | "node.js.selected.package.tslint": "(autodetect)", 65 | "nodejs_package_manager_path": "npm", 66 | "settings.editor.selected.configurable": "com.jetbrains.python.configuration.PyActiveSdkModuleConfigurable", 67 | "vue.rearranger.settings.migration": "true" 68 | } 69 | } 70 | 71 | 72 | 73 | 93 | 94 | 95 | 115 | 116 | 117 | 137 | 138 | 139 | 159 | 160 | 161 | 181 | 182 | 183 | 203 | 204 | 205 | 206 | 207 | 208 | 209 | 210 | 211 | 212 | 213 | 214 | 215 | 217 | 218 | 219 | 220 | 1669872968278 221 | 245 | 246 | 1680227722018 247 | 252 | 253 | 1680603735322 254 | 259 | 260 | 1680775226047 261 | 266 | 267 | 1681880171652 268 | 273 | 274 | 1681896821625 275 | 280 | 281 | 1681983650112 282 | 287 | 288 | 1682070356913 289 | 294 | 295 | 1707125947309 296 | 301 | 304 | 305 | 307 | 308 | 317 | 318 | 319 | 328 | 329 | 330 | 331 | 332 | 333 | 334 | 335 | 336 | 337 | 339 | 340 | 341 | 342 | 343 | file://$PROJECT_DIR$/../../../py39/Lib/site-packages/sklearn/base.py 344 | 5 345 | 347 | 348 | 349 | 350 | 351 | 352 | 353 | 354 | 355 | 356 | 357 | 358 | -------------------------------------------------------------------------------- /functions.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | ------------------------------------------------- 4 | # @Project :gplearnplus 5 | # @File :function 6 | # @Date :2022/12/1 0001 13:46 7 | # @Author :Junzhe Huang 8 | # @Email :acejasonhuang@163.com 9 | # @Software :PyCharm 10 | ------------------------------------------------- 11 | """ 12 | import numpy as np 13 | from joblib import wrap_non_picklable_objects 14 | 15 | NoneType = type(None) 16 | 17 | __all__ = ['make_function', 'raw_function_list'] 18 | 19 | 20 | class _Function(object): 21 | """ 22 | 函数对象,参数至少有一个为向量 23 | 默认函数类型为,all,既可用于时序也可用于截面 24 | 默认返回类型为数值, 25 | 默认输入类型,数值向量或者标量 26 | 27 | Parameters 28 | ---------- 29 | function : callable 30 | A function with signature function(x1, *args) that returns a Numpy 31 | array of the same shape as its arguments. 32 | 33 | name : str 34 | The name for the function as it should be represented in the program 35 | and its visualizations. 36 | 37 | arity : int 38 | The number of arguments that the ``function`` takes. 39 | 40 | param_type : [{ 41 | 'vector': {'category': (None, None), 'number': (None, None)}, 42 | 'scalar': {'int': (int, int), 'float': (float, float)} 43 | },] 44 | function_type : 'all', 'section', 'time_series‘ 45 | return_type: 'number', 'category' 46 | 47 | """ 48 | 49 | def __init__(self, function, name, arity, param_type=None, return_type='number', function_type='all'): 50 | self.function = function 51 | self.name = name 52 | self.arity = arity 53 | if param_type is None: 54 | # 默认不接受分类类型 55 | param_type = arity * [{'vector': {'number': (None, None)}, 56 | 'scalar': {'int': (None, None), 'float': (None, None)}}] 57 | else: 58 | # 防止长度不一 59 | if len(param_type) != arity: 60 | raise ValueError( 61 | "length of param_type should be equal to arity, it should be {}, not {}" 62 | .format(arity, len(param_type))) 63 | self.param_type = param_type 64 | if (return_type != 'number') and (return_type != 'category'): 65 | raise ValueError("return_type of function {} should be number or category, NOT {}" 66 | .format(name, return_type)) 67 | self.return_type = return_type 68 | self.function_type = function_type 69 | 70 | def __call__(self, *args): 71 | """ 72 | 调用函数特殊处理, 73 | 参数仅接受标量,却传入向量 74 | 则取向量第一个值为标量 75 | """ 76 | for _param, _param_type in zip(args, self.param_type): 77 | if len(_param_type) == 1 and 'scalar' in _param_type and isinstance(_param, (list, np.ndarray)): 78 | _param = _param[0] 79 | return self.function(*args) 80 | 81 | def add_range(self, const_range): 82 | # 作用:替换掉参数中没有约束的范围,给所有标量限制范围 83 | # 若没有const_range, 则表明所有函数不接收常数, 去掉所有的const type 84 | if const_range is None: 85 | for i, _dict in enumerate(self.param_type): 86 | if 'vector' not in _dict: 87 | raise ValueError("for None const range, vector type should in all function param") 88 | if 'scalar' in _dict: 89 | self.param_type[i].pop('scalar') 90 | return 91 | if not isinstance(const_range, tuple): 92 | raise ValueError('const_range must be an tuple') 93 | _min, _max = const_range 94 | if not isinstance(_min, (int, float)): 95 | raise ValueError('const_range left must be an int, float') 96 | if not isinstance(_max, (int, float)): 97 | raise ValueError('const_range right must be an int, float') 98 | if _min > _max: 99 | raise ValueError('const_range left should le right') 100 | 101 | for i, _dict in enumerate(self.param_type): 102 | if 'scalar' in _dict: 103 | _scalar_range = _dict['scalar'] 104 | if 'int' in _scalar_range: 105 | _l = int(_min) if _scalar_range['int'][0] is None else int(_scalar_range['int'][0]) 106 | _r = int(_max) if _scalar_range['int'][1] is None else int(_scalar_range['int'][1]) 107 | self.param_type[i]['scalar']['int'] = (_l, _r) 108 | if 'float' in _scalar_range: 109 | _l = float(_min) if _scalar_range['float'][0] is None else float(_scalar_range['float'][0]) 110 | _r = float(_max) if _scalar_range['float'][1] is None else float(_scalar_range['float'][1]) 111 | self.param_type[i]['scalar']['float'] = (_l, _r) 112 | 113 | return 114 | 115 | def is_point_mutation(self, candidate_func): 116 | # 检验某个待替换函数是否可以替换 117 | if not isinstance(candidate_func, _Function): 118 | raise ValueError("wrong type, it should be _Function style") 119 | # 带替换函数是否与该函数参数长度一致 120 | if len(candidate_func.param_type) != len(self.param_type): 121 | return False 122 | if self.return_type != candidate_func.return_type: 123 | return False 124 | 125 | # candidate函数的参数必须为待替换函数参数的子集 126 | # 要求替换和,函数的所有参数仍然合法 127 | for dict_self, dict_candi in zip(self.param_type, candidate_func.param_type): 128 | if len(dict_candi) <= len(dict_self): 129 | return False 130 | for upper_type in dict_self: 131 | if upper_type not in dict_candi: 132 | return False 133 | else: 134 | for lower_type in dict_self: 135 | if lower_type not in dict_candi[upper_type]: 136 | return False 137 | else: 138 | if upper_type == 'scalar': 139 | if (dict_candi['scalar'][lower_type][0] > dict_self['scalar'][lower_type][0]) or ( 140 | dict_candi['scalar'][lower_type][1] > dict_candi['scalar'][lower_type][1]): 141 | return False 142 | return True 143 | 144 | 145 | 146 | # warp 用于多进程序列化,会降低进化效率 147 | def make_function(*, function, name, arity, param_type=None, wrap=True, return_type='number', function_type='all'): 148 | """ 149 | Parameters 150 | ---------- 151 | function : callable 152 | 153 | name : str 154 | 155 | arity : int 156 | 157 | param_type : [{type: (, ), type: (, )}, ........] 158 | 159 | wrap : bool, optional (default=True) 160 | """ 161 | 162 | if not isinstance(arity, int): 163 | raise ValueError('arity must be an int, got %s' % type(arity)) 164 | if not isinstance(name, str): 165 | raise ValueError('name must be a string, got %s' % type(name)) 166 | if not isinstance(wrap, bool): 167 | raise ValueError('wrap must be an bool, got %s' % type(wrap)) 168 | 169 | # check out param_type vector > scalar int > float 170 | if param_type is None: 171 | param_type = [None] * arity 172 | if not isinstance(param_type, list): 173 | raise ValueError('param_type must be list') 174 | if len(param_type) != arity: 175 | raise ValueError('len of param_type must be arity') 176 | # 保证函数中至少有一个向量 177 | vector_flag = False 178 | for i, _dict in enumerate(param_type): 179 | # 转换None type 180 | # 标记某一个参数是否可接受向量 181 | non_vector_param = True 182 | if _dict is None: 183 | param_type[i] = {'vector': {'category': (None, None), 'number': (None, None)}, 184 | 'scalar': {'int': (None, None), 'float': (None, None)}} 185 | elif not isinstance(_dict, dict): 186 | raise ValueError('element in param_type {} must be dict'.format(i + 1)) 187 | if len(_dict) > 2: 188 | raise ValueError('len of element in param_type {} must be 1, 2'.format(i + 1)) 189 | for upper_type in _dict: 190 | if upper_type == 'vector': 191 | if not isinstance(_dict['vector'], dict): 192 | raise ValueError('type of element in param_type {} must be {upper_type: {lower_type:( , )}}}' 193 | .format(i + 1)) 194 | if len(_dict['vector']) == 0: 195 | raise ValueError('length of upper_type dict in param_type {} should not be 0'.format(i + 1)) 196 | vector_flag = True 197 | non_vector_param = False 198 | for lower_type in _dict['vector']: 199 | if lower_type not in ['number', 'category']: 200 | raise ValueError('key of vector in param_type {} must be number or category'.format(i + 1)) 201 | param_type[i]['vector'][lower_type] = (None, None) 202 | 203 | elif upper_type == 'scalar': 204 | if not isinstance(_dict['scalar'], dict): 205 | raise ValueError('type of element in param_type {} must be {upper_type: {lower_type:( , )}}}' 206 | .format(i + 1)) 207 | if len(_dict['scalar']) == 0: 208 | raise ValueError('length of upper_type dict in param_type {} should not be 0'.format(i + 1)) 209 | for lower_type in _dict['scalar']: 210 | if lower_type == 'int': 211 | if not isinstance(_dict['scalar']['int'], tuple): 212 | raise ValueError('structure of lower_type in param_type {} must be {type: ( , )}}' 213 | .format(i + 1)) 214 | if len(_dict['scalar']['int']) != 2: 215 | raise ValueError("len of lower_type's structure in param_type {} must be 2".format(i + 1)) 216 | if not isinstance(_dict['scalar']['int'][0], (int, NoneType)): 217 | raise ValueError("the first element in lower_type's structure in param_type {} " 218 | "must be None, int or float".format(i + 1)) 219 | if not isinstance(_dict['scalar']['int'][1], (int, NoneType)): 220 | raise ValueError("the second element in lower_type's structure in param_type {} " 221 | "must be None, int or float".format(i + 1)) 222 | if isinstance(_dict['scalar']['int'][0], int) and isinstance(_dict['scalar']['int'][1], int) \ 223 | and _dict['scalar']['int'][1] < _dict['scalar']['int'][0]: 224 | raise ValueError('the second element should ge the first element in param_type {}' 225 | .format(i + 1)) 226 | 227 | elif lower_type == 'float': 228 | if not isinstance(_dict['scalar']['float'], tuple): 229 | raise ValueError('structure of lower_type in param_type {} must be {type: ( , )}}' 230 | .format(i + 1)) 231 | if len(_dict['scalar']['float']) != 2: 232 | raise ValueError("len of lower_type's structure in param_type {} must be 2".format(i + 1)) 233 | if not isinstance(_dict['scalar']['float'][0], (float, int, NoneType)): 234 | raise ValueError("the first element in lower_type's structure in param_type {} " 235 | "must be None, int or float".format(i + 1)) 236 | if not isinstance(_dict['scalar']['float'][1], (float, int, NoneType)): 237 | raise ValueError("the second element in lower_type's structure in param_type {} " 238 | "must be None, int or float".format(i + 1)) 239 | if isinstance(_dict['scalar']['float'][0], (int, float)) and \ 240 | isinstance(_dict['scalar']['float'][1], (int, float)) \ 241 | and _dict['scalar']['float'][1] < _dict['scalar']['float'][0]: 242 | raise ValueError('the second element should ge the first element in param_type {}' 243 | .format(i + 1)) 244 | else: 245 | raise ValueError('key of scalar in param_type {} must be int or float'.format(i + 1)) 246 | else: 247 | raise ValueError('key of element in param_type {} must be vector or scalar'.format(i + 1)) 248 | 249 | if not vector_flag: 250 | raise ValueError('there is at least 1 vector in param_type {}'.format(i + 1)) 251 | 252 | # Check output shape 253 | # 生成测试数据 254 | args = [] 255 | for _dict in param_type: 256 | if 'vector' in _dict: 257 | if 'number' in _dict['vector']: 258 | args.append(np.ones(10)) 259 | else: 260 | args.append(np.array([1] * 10)) 261 | elif 'scalar' in _dict: 262 | if 'int' in _dict['scalar']: 263 | args.append(((0 if _dict['scalar']['int'][1] is None else _dict['scalar']['int'][1]) + 264 | (0 if _dict['scalar']['int'][0] is None else _dict['scalar']['int'][0])) // 2) 265 | else: 266 | args.append(((0 if _dict['scalar']['float'][1] is None else _dict['scalar']['float'][1]) + 267 | (0 if _dict['scalar']['float'][0] is None else _dict['scalar']['float'][0])) // 2) 268 | 269 | try: 270 | function(*args) 271 | except (ValueError, TypeError): 272 | print(args) 273 | raise ValueError('supplied function %s does not support arity of %d.' 274 | % (name, arity)) 275 | if not hasattr(function(*args), 'shape'): 276 | raise ValueError('supplied function %s does not return a numpy array.' 277 | % name) 278 | if function(*args).shape != (10,): 279 | raise ValueError('supplied function %s does not return same shape as ' 280 | 'input vectors.' % name) 281 | if function(*args).dtype.type is np.float_ and return_type == 'category': 282 | raise ValueError('the return type should be category not {}'.format(function(*args).dtype.type)) 283 | elif function(*args).dtype not in [np.float, np.int, np.int64] and return_type == 'number': 284 | raise ValueError('the return type should be category not {}'.format(function(*args).dtype.type)) 285 | 286 | # Check closure for zero & negative input arguments 287 | args2 = [] 288 | args3 = [] 289 | for _dict in param_type: 290 | if 'vector' in _dict: 291 | # 兼容category向量 292 | args2.append(np.zeros(10)) 293 | args3.append(-1 * np.ones(10)) 294 | elif 'scalar' in _dict: 295 | if 'int' in _dict['scalar']: 296 | 297 | _temp = (((0 if _dict['scalar']['int'][1] is None else _dict['scalar']['int'][1]) + 298 | (0 if _dict['scalar']['int'][0] is None else _dict['scalar']['int'][0])) // 2) 299 | args2.append(_temp) 300 | args3.append(_temp) 301 | else: 302 | _temp = (((0 if _dict['scalar']['float'][1] is None else _dict['scalar']['float'][1]) + 303 | (0 if _dict['scalar']['float'][0] is None else _dict['scalar']['float'][0])) // 2) 304 | args2.append(_temp) 305 | args3.append(_temp) 306 | 307 | 308 | if not np.all(np.isnan(function(*args2)) | np.isfinite(function(*args2))): 309 | raise ValueError('supplied function %s does not have closure against ' 310 | 'zeros in argument vectors.' % name) 311 | 312 | if not np.all(np.isnan(function(*args3)) | np.isfinite(function(*args3))): 313 | raise ValueError('supplied function %s does not have closure against ' 314 | 'negatives in argument vectors.' % name) 315 | if wrap: 316 | return _Function(function=wrap_non_picklable_objects(function), 317 | name=name, 318 | arity=arity, 319 | param_type=param_type, 320 | return_type=return_type, 321 | function_type=function_type) 322 | return _Function(function=function, 323 | name=name, 324 | arity=arity, 325 | param_type=param_type, 326 | return_type=return_type, 327 | function_type=function_type) 328 | 329 | 330 | def _protected_division(x1, x2): 331 | """Closure of division (x1/x2) for zero denominator.""" 332 | with np.errstate(divide='ignore', invalid='ignore'): 333 | return np.where(np.abs(x2) > 0.001, np.divide(x1, x2), 1.) 334 | 335 | 336 | def _protected_sqrt(x1): 337 | """Closure of square root for negative arguments.""" 338 | return np.sqrt(np.abs(x1)) 339 | 340 | 341 | def _protected_log(x1): 342 | """Closure of log for zero and negative arguments.""" 343 | with np.errstate(divide='ignore', invalid='ignore'): 344 | return np.where(np.abs(x1) > 0.001, np.log(np.abs(x1)), 0.) 345 | 346 | 347 | def _protected_inverse(x1): 348 | """Closure of inverse for zero arguments.""" 349 | with np.errstate(divide='ignore', invalid='ignore'): 350 | return np.where(np.abs(x1) > 0.001, 1. / x1, 0.) 351 | 352 | 353 | def _sigmoid(x1): 354 | """Special case of logistic function to transform to probabilities.""" 355 | with np.errstate(over='ignore', under='ignore'): 356 | return 1 / (1 + np.exp(-x1)) 357 | 358 | def _groupby(gbx, func, *args, **kwargs): 359 | indices = np.argsort(gbx) 360 | gbx_sorted = gbx[indices] 361 | X = np.column_stack((np.arange(len(gbx)), gbx_sorted, *args)) 362 | splits = np.split(X, np.unique(gbx_sorted, return_index=True)[1][1:]) 363 | result_list = [func(*(split[:, 2:].T), **kwargs) for split in splits] 364 | result = np.hstack(result_list) 365 | return result[indices.argsort()] 366 | 367 | 368 | add2 = _Function(function=np.add, name='add', arity=2) 369 | sub2 = _Function(function=np.subtract, name='sub', arity=2) 370 | mul2 = _Function(function=np.multiply, name='mul', arity=2) 371 | div2 = _Function(function=_protected_division, name='div', arity=2) 372 | sqrt1 = _Function(function=_protected_sqrt, name='sqrt', arity=1) 373 | log1 = _Function(function=_protected_log, name='log', arity=1) 374 | neg1 = _Function(function=np.negative, name='neg', arity=1) 375 | inv1 = _Function(function=_protected_inverse, name='inv', arity=1) 376 | abs1 = _Function(function=np.abs, name='abs', arity=1) 377 | max2 = _Function(function=np.maximum, name='max', arity=2) 378 | min2 = _Function(function=np.minimum, name='min', arity=2) 379 | sin1 = _Function(function=np.sin, name='sin', arity=1) 380 | cos1 = _Function(function=np.cos, name='cos', arity=1) 381 | tan1 = _Function(function=np.tan, name='tan', arity=1) 382 | sig1 = _Function(function=_sigmoid, name='sig', arity=1) 383 | 384 | _function_map = {'add': add2, 385 | 'sub': sub2, 386 | 'mul': mul2, 387 | 'div': div2, 388 | 'sqrt': sqrt1, 389 | 'log': log1, 390 | 'abs': abs1, 391 | 'neg': neg1, 392 | 'inv': inv1, 393 | 'max': max2, 394 | 'min': min2, 395 | 'sin': sin1, 396 | 'cos': cos1, 397 | 'tan': tan1} 398 | 399 | raw_function_list = ['add', 'sub', 'mul', 'div', 'sqrt', 400 | 'sqrt', 'log', 'abs', 'neg', 'inv', 401 | 'max', 'min', 'sin', 'cos', 'tan'] 402 | 403 | all_function = raw_function_list.copy() 404 | 405 | section_function = [] 406 | 407 | time_series_function = [] 408 | 409 | if __name__ == '__main__': 410 | # def ff(a, b, c): 411 | # return a * b + c 412 | # 413 | # param_type = [{'vector':{'number': (None, None)}}, {'scalar': {'int':(None, 1)}}, {'scalar': {'float': (-1, None)}}] 414 | # f_m = make_function(function=ff, name='ff', arity=3, param_type=param_type, wrap=True, return_type='number') 415 | # f_m.add_range((-1, 1)) 416 | # print(f_m.param_type) 417 | a = np.array([1, 2, 2, 1, np.nan]) 418 | b = np.array([1, 2, 3, 4, 5]) 419 | print(_groupby(a, max, b)) 420 | 421 | -------------------------------------------------------------------------------- /example.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | ------------------------------------------------- 4 | # @Project :gplearn 5 | # @File :example.py 6 | # @Date :2023/3/31 0013 17:37 7 | # @Author :Junzhe Huang 8 | # @Email :acejasonhuang@163.com 9 | # @Software :PyCharm 10 | ------------------------------------------------- 11 | """ 12 | ##### 13 | # 目录 14 | # 1. ALL FUNCTION 全局函数 15 | # 2. TIME SERIES FUNCTION 一般时间序列函数 16 | # 3. TA FUNCTION 技术指标函数 17 | # 4. SECTION FUNCTION 截面函数 18 | # 5. SECTION GROUPBY FUNCTION 截面分类聚合函数 19 | # 20 | # 21 | ### 22 | import numpy as np 23 | from typing import Any 24 | import numba as nb 25 | from copy import copy 26 | from numba import jit 27 | from gplearnplus import functions 28 | from functools import wraps 29 | from functions import _groupby 30 | 31 | 32 | def no_numpy_warning(func): 33 | @wraps(func) 34 | def warp(*args, **kwargs): 35 | with np.errstate(all='ignore'): 36 | _res = func(*args, **kwargs) 37 | return _res 38 | return warp 39 | 40 | @nb.jit(nopython=True) 41 | def handle_nan(X): 42 | # 前值填充 43 | X = np.copy(X) 44 | _temp = np.nan 45 | na_len = 0 46 | for i in range(len(X)): 47 | if np.isnan(X[i]): 48 | X[i] = _temp 49 | na_len += 1 50 | else: 51 | _temp = X[i] 52 | return X, na_len 53 | 54 | #### ALL FUNCTION ##### 55 | 56 | @jit(nopython=True) 57 | def _combine(X, Y): 58 | p1 = 15485863 59 | p2 = 32416190071 60 | p3 = 100000007 61 | return np.mod(X * p1 + Y * p2, p3) 62 | 63 | combine = functions.make_function(function=_combine, name='combine', arity=2, return_type='category', 64 | param_type=[{'vector': {'category': (None, None)}}, 65 | {'vector': {'category': (None, None)}}]) 66 | 67 | #### TIME SERIES FUNCTION ##### 68 | 69 | @jit(nopython=True) 70 | def _delay(X, d): 71 | res = np.empty_like(X) 72 | res.fill(np.nan) 73 | end = len(X) - d 74 | for i in range(d, len(X)): 75 | res[i] = X[i - d] 76 | return res 77 | 78 | delay = functions.make_function(function=_delay, name='delay', arity=2, function_type='time_series', 79 | param_type=[{'vector': {'number': (None, None)}}, 80 | {'scalar': {'int':(3, 30)}}]) 81 | 82 | @jit(nopython=True) 83 | def _delta(X, d): 84 | res = np.empty_like(X) 85 | res.fill(np.nan) 86 | end = len(X) - d 87 | for i in range(d, len(X)): 88 | res[i] = X[i] - X[i - d] 89 | return res 90 | 91 | delta = functions.make_function(function=_delta, name='delta', arity=2, function_type='time_series', 92 | param_type=[{'vector': {'number': (None, None)}}, 93 | {'scalar': {'int':(3, 30)}}]) 94 | @jit(nopython=True) 95 | def _ts_min(X, d): 96 | d = len(X) - 1 if d >= len(X) else d 97 | shape = (X.size - d + 1, d) 98 | res = np.empty(X.size, dtype=X.dtype) 99 | res.fill(np.nan) 100 | for i in range(len(X) - d + 1): 101 | res[i + d - 1] = np.nanmin(X[i:i + d]) 102 | return res 103 | 104 | ts_min = functions.make_function(function=_ts_min, name='ts_min', arity=2, function_type='time_series', 105 | param_type=[{'vector': {'number': (None, None)}}, {'scalar': {'int':(3, 30)}}]) 106 | 107 | @jit(nopython=True) 108 | def _ts_max(X, d): 109 | d = len(X) - 1 if d >= len(X) else d 110 | shape = (X.size - d + 1, d) 111 | res = np.empty(X.size, dtype=X.dtype) 112 | res.fill(np.nan) 113 | for i in range(len(X) - d + 1): 114 | res[i + d - 1] = np.nanmax(X[i:i + d]) 115 | return res 116 | 117 | ts_max = functions.make_function(function=_ts_max, name='ts_max', arity=2, function_type='time_series', 118 | param_type=[{'vector': {'number': (None, None)}}, {'scalar': {'int':(3, 30)}}]) 119 | 120 | @jit(nopython=True) 121 | def _ts_argmax(X, d): 122 | d = len(X) - 1 if d >= len(X) else d 123 | res = np.empty(len(X), dtype=np.float64) 124 | res[:d - 1] = np.nan 125 | for i in range(len(X) - d + 1): 126 | res[i + d - 1] = np.argmax(X[i:i + d]) 127 | return res 128 | 129 | ts_argmax = functions.make_function(function=_ts_argmax, name='ts_argmax', arity=2, function_type='time_series', 130 | param_type=[{'vector': {'number': (None, None)}}, {'scalar': {'int':(3, 30)}}]) 131 | 132 | @jit(nopython=True) 133 | def _ts_argmin(X, d): 134 | n = len(X) 135 | d = n - 1 if d >= n else d 136 | res = np.full(n, np.nan) 137 | for i in range(n - d + 1): 138 | res[i + d - 1] = np.argmax(X[i:i + d]) 139 | return res 140 | ts_argmin = functions.make_function(function=_ts_argmin, name='ts_argmax', arity=2, function_type='time_series', 141 | param_type=[{'vector': {'number': (None, None)}}, 142 | {'scalar': {'int':(3, 30)}}]) 143 | 144 | @jit(nopython=True) 145 | def _ts_rank(X, d): 146 | n = len(X) 147 | d = n - 1 if d >= n else d 148 | res = np.full(n, np.nan) 149 | for i in range(n - d + 1): 150 | rank = np.argsort(X[i:i + d]).argsort()[-1] + 1 151 | res[i + d - 1] = rank / d 152 | return res 153 | 154 | ts_rank = functions.make_function(function=_ts_rank, name='ts_rank', arity=2, function_type='time_series', 155 | param_type=[{'vector': {'number': (None, None)}}, {'scalar': {'int':(3, 30)}}]) 156 | 157 | @jit(nopython=True) 158 | def _ts_sum(X, d): 159 | n = len(X) 160 | d = n - 1 if d >= n else d 161 | res = np.full(n, np.nan) 162 | cumsum = np.nancumsum(X) 163 | res[d - 1:n] = cumsum[d - 1:] - cumsum[:-d] 164 | return res 165 | 166 | ts_sum = functions.make_function(function=_ts_sum, name='ts_sum', arity=2, function_type='time_series', 167 | param_type=[{'vector': {'number': (None, None)}}, {'scalar': {'int':(3, 30)}}]) 168 | 169 | @jit(nopython=True) 170 | def _ts_stddev(X, d): 171 | d = len(X) - 1 if d >= len(X) else d 172 | res = np.empty(len(X)) 173 | res[:] = np.nan 174 | for i in range(d - 1, len(X)): 175 | res[i] = np.nanstd(X[i - d + 1:i + 1]) 176 | return res 177 | 178 | ts_stddev = functions.make_function(function=_ts_stddev, name='ts_stddev', arity=2, function_type='time_series', 179 | param_type=[{'vector': {'number': (None, None)}}, {'scalar': {'int':(3, 30)}}]) 180 | 181 | @jit(nopython=True) 182 | def _ts_corr(X, Y, d): 183 | d = len(X) - 1 if d >= len(X) else d 184 | res = np.empty(len(X)) 185 | res[:d-1] = np.nan 186 | for i in range(len(X) - d + 1): 187 | X_ = X[i:i+d] 188 | Y_ = Y[i:i+d] 189 | X_ = X_[~(np.isnan(X_) | np.isnan(Y_))] 190 | Y_ = Y_[~(np.isnan(X_) | np.isnan(Y_))] 191 | if len(X_) <= 2: 192 | res[i+d-1] = np.nan 193 | else: 194 | res[i+d-1] = np.corrcoef(X_, Y_)[0][1] 195 | return res 196 | 197 | ts_corr = functions.make_function(function=_ts_corr, name='ts_corr', arity=3, function_type='time_series', 198 | param_type=[{'vector': {'number': (None, None)}}, 199 | {'vector': {'number': (None, None)}}, 200 | {'scalar': {'int':(3, 30)}}]) 201 | 202 | @jit(nopython=True) 203 | def _ts_mean(X, d): 204 | d = len(X) - 1 if d >= len(X) else d 205 | res = np.full(len(X), np.nan) 206 | s = np.sum(X[:d]) 207 | for i in range(d - 1, len(X)): 208 | res[i] = s / d 209 | s += X[i + 1] - X[i - d + 1] 210 | return res 211 | 212 | ts_mean = functions.make_function(function=_ts_mean, name='ts_mean', arity=2, 213 | function_type='time_series', 214 | param_type=[{'vector': {'number': (None, None)}}, 215 | {'scalar': {'int':(3, 30)}}]) 216 | 217 | @jit(nopython=True) 218 | def _ts_neutralize(X, d): 219 | N = len(X) 220 | d = len(X) - 1 if d >= len(X) else d 221 | mov_mean = np.empty(N - d + 1) 222 | mov_std = np.empty(N - d + 1) 223 | res = np.empty(N) 224 | 225 | for i in nb.prange(N - d + 1): 226 | mov_mean[i] = np.mean(X[i:i + d]) 227 | mov_std[i] = np.sqrt(np.mean((X[i:i + d] - mov_mean[i]) ** 2)) 228 | mov_std[i] = mov_std[i] if mov_std[i] > 0.001 else 0.001 229 | 230 | for i in nb.prange(N): 231 | if i < d - 1: 232 | res[i] = np.nan 233 | else: 234 | res[i] = (X[i] - mov_mean[i - d + 1]) / mov_std[i - d + 1] 235 | 236 | return res 237 | 238 | ts_neutralize = functions.make_function(function=_ts_neutralize, name='ts_neutralize', arity=2, 239 | function_type='time_series', 240 | param_type=[{'vector': {'number': (None, None)}}, 241 | {'scalar': {'int':(3, 30)}}]) 242 | 243 | @nb.jit(nopython=True) 244 | def _ts_freq(X, d): 245 | d = len(X) - 1 if d >= len(X) else d 246 | res = np.empty(len(X), dtype=np.float64) 247 | res[:d - 1] = np.nan 248 | for i in range(d - 1, len(X)): 249 | subarr = X[i - d + 1:i + 1] 250 | res[i] = sum(subarr == X[i]) 251 | return res 252 | 253 | ts_freq = functions.make_function(function=_ts_freq, name='ts_freq', arity=2, 254 | function_type='time_series', 255 | param_type=[{'vector': {'category': (None, None)}}, 256 | {'scalar': {'int':(3, 30)}}]) 257 | 258 | #### TIME SERIES TA FUNCTION #### 259 | 260 | @nb.jit(nopython=True) 261 | def _EMA(X, d): 262 | d = len(X) - 1 if d >= len(X) else d 263 | X, _l = handle_nan(X) 264 | X = X[_l:] 265 | if len(X) < d: 266 | return np.array([np.nan] * (len(X) + _l)) 267 | kt = 2 / (d + 1) 268 | pre_ma = np.mean(X[:d]) 269 | __res = np.array([np.nan] * (len(X) + _l)) 270 | __res[_l + d - 1] = pre_ma 271 | for i in range(d, len(X)): 272 | pre_ma += (X[i] - pre_ma) * kt 273 | __res[_l + i] = pre_ma 274 | return __res 275 | 276 | EMA = functions.make_function(function=_EMA, name='EMA', arity=2, function_type='time_series', 277 | param_type=[{'vector': {'number': (None, None)}}, {'scalar': {'int':(3, 30)}}]) 278 | 279 | @jit(nopython=True) 280 | def _DEMA(X, d): 281 | d = d if len(X) > 2 * d - 2 else len(X) // 2 - 1 282 | _ema = _EMA(X, d) 283 | _eema = _EMA(_ema, d) 284 | __res = 2 * _ema - _eema 285 | return __res 286 | 287 | DEMA = functions.make_function(function=_DEMA, name='DEMA', arity=2, function_type='time_series', 288 | param_type=[{'vector': {'number': (None, None)}}, {'scalar': {'int':(3, 30)}}]) 289 | 290 | @jit(nopython=True) 291 | def _MA(X, d): 292 | d = len(X) - 1 if d >= len(X) else d 293 | X, _l = handle_nan(X) 294 | X = X[_l:] 295 | if len(X) < d: 296 | return np.array([np.nan] * (len(X) + _l)) 297 | __res = [np.nan] * (_l + d - 1) + [np.mean(X[i:i + d]) for i in range(len(X) - d + 1)] 298 | return np.array(__res) 299 | 300 | MA = functions.make_function(function=_MA, name='MA', arity=2, function_type='time_series', 301 | param_type=[{'vector': {'number': (None, None)}}, {'scalar': {'int':(3, 30)}}]) 302 | 303 | @jit(nopython=True) 304 | def _KAMA(X, d): 305 | d = len(X) - 1 if d >= len(X) else d 306 | X, _l = handle_nan(X) 307 | X = X[_l:] 308 | if len(X) < d: 309 | return np.array([np.nan] * (len(X) + _l)) 310 | _af = 2 / (2 + 1) 311 | _as = 2 / (30 + 1) 312 | __res = np.array([np.nan] * (len(X) + _l)) 313 | for i in range(d, len(X)): 314 | period_roc = X[i] - X[i - d] 315 | sum_roc = np.sum(np.abs(np.diff(X[i - d: i + 1]))) 316 | _er = 1.0 if ((period_roc >= sum_roc) or (sum_roc == 0)) else abs(period_roc / sum_roc) 317 | _at = (_er * (_af - _as) + _as) ** 2 318 | __res[_l + i] = _at * X[i] + (1 - _at) * (__res[_l + i - 1] if i != d else X[i - 1]) 319 | return __res 320 | 321 | KAMA = functions.make_function(function=_KAMA, name='KAMA', arity=2, function_type='time_series', 322 | param_type=[{'vector': {'number': (None, None)}}, {'scalar': {'int':(3, 30)}}]) 323 | 324 | @nb.jit(nopython=True) 325 | def _MIDPOINT(X, d): 326 | d = len(X) - 1 if d >= len(X) else d 327 | res = np.empty(len(X)) 328 | res[:] = np.nan 329 | for i in range(d - 1, len(X)): 330 | res[i] = (np.nanmax(X[i-d+1:i+1]) + np.nanmin(X[i-d+1:i+1])) / 2 331 | return res 332 | 333 | MIDPOINT = functions.make_function(function=_MIDPOINT, name='MIDPOINT', arity=2, function_type='time_series', 334 | param_type=[{'vector': {'number': (None, None)}}, {'scalar': {'int':(3, 30)}}]) 335 | 336 | @nb.jit(nopython=True) 337 | def _BETA(X, Y, d): 338 | d = len(X) - 1 if d >= len(X) else d 339 | res = np.full(len(X), np.nan) 340 | for i in range(d - 1, len(X)): 341 | X_slice = X[i - d + 1: i + 1] 342 | Y_slice = Y[i - d + 1: i + 1] 343 | X_mean = np.mean(X_slice) 344 | Y_mean = np.mean(Y_slice) 345 | numerator = np.sum((X_slice - X_mean) * (Y_slice - Y_mean)) 346 | denominator = np.sum((X_slice - X_mean) ** 2) 347 | denominator = denominator if denominator > 0.001 else 0.001 348 | res[i] = numerator / denominator 349 | return res 350 | 351 | BETA = functions.make_function(function=_BETA, name='BETA', arity=3, function_type='time_series', 352 | param_type=[{'vector': {'number': (None, None)}}, 353 | {'vector': {'number': (None, None)}}, 354 | {'scalar': {'int':(3, 30)}}]) 355 | 356 | @nb.jit(nopython=True) 357 | def _LINEARREG_SLOPE(X, d): 358 | d = len(X) - 1 if d >= len(X) else d 359 | Y = np.arange(d) 360 | res = np.full(len(X), np.nan) 361 | for i in range(d - 1, len(X)): 362 | X_slice = X[i - d + 1: i + 1] 363 | Y_slice = Y[:len(X_slice)] 364 | X_mean = np.mean(X_slice) 365 | Y_mean = np.mean(Y_slice) 366 | numerator = np.sum((X_slice - X_mean) * (Y_slice - Y_mean)) 367 | denominator = np.sum((X_slice - X_mean) ** 2) 368 | denominator = denominator if denominator > 0.001 else 0.001 369 | res[i] = numerator / denominator 370 | return res 371 | 372 | LINEARREG_SLOPE = functions.make_function(function=_LINEARREG_SLOPE, name='LINEARREG_SLOPE', arity=2, 373 | function_type='time_series', 374 | param_type=[{'vector': {'number': (None, None)}}, 375 | {'scalar': {'int':(3, 30)}}]) 376 | 377 | @nb.jit(nopython=True) 378 | def _LINEARREG_ANGLE(X, d): 379 | d = len(X) - 1 if d >= len(X) else d 380 | Y = np.arange(d) 381 | res = np.full(len(X), np.nan) 382 | for i in range(d - 1, len(X)): 383 | X_slice = X[i - d + 1: i + 1] 384 | Y_slice = Y[:len(X_slice)] 385 | X_mean = np.mean(X_slice) 386 | Y_mean = np.mean(Y_slice) 387 | numerator = np.sum((X_slice - X_mean) * (Y_slice - Y_mean)) 388 | denominator = np.sum((X_slice - X_mean) ** 2) 389 | denominator = denominator if denominator > 0.001 else 0.001 390 | res[i] = np.arctan(numerator / denominator) * (180.0 / np.pi) 391 | return res 392 | 393 | LINEARREG_ANGLE = functions.make_function(function=_LINEARREG_ANGLE, name='LINEARREG_ANGLE', arity=2, 394 | function_type='time_series', 395 | param_type=[{'vector': {'number': (None, None)}}, 396 | {'scalar': {'int':(3, 30)}}]) 397 | 398 | @nb.jit(nopython=True) 399 | def _LINEARREG_INTERCEPT(X, d): 400 | d = len(X) - 1 if d >= len(X) else d 401 | Y = np.arange(d) 402 | res = np.full(len(X), np.nan) 403 | for i in range(d - 1, len(X)): 404 | X_slice = X[i - d + 1: i + 1] 405 | Y_slice = Y[:len(X_slice)] 406 | X_mean = np.mean(X_slice) 407 | Y_mean = np.mean(Y_slice) 408 | numerator = np.sum((X_slice - X_mean) * (Y_slice - Y_mean)) 409 | denominator = np.sum((X_slice - X_mean) ** 2) 410 | denominator = denominator if denominator > 0.001 else 0.001 411 | _temp = np.arctan(numerator / denominator) * (180.0 / np.pi) 412 | res[i] = np.sum(X_slice) - _temp * np.sum(Y_slice) 413 | return res 414 | 415 | LINEARREG_INTERCEPT = functions.make_function(function=_LINEARREG_INTERCEPT, name='LINEARREG_INTERCEPT', 416 | arity=2, function_type='time_series', 417 | param_type=[{'vector': {'number': (None, None)}}, 418 | {'scalar': {'int':(3, 30)}}]) 419 | 420 | #### SECTION FUNCTION #### 421 | 422 | @nb.jit(nopython=True) 423 | def _MAX_SECTION(X: np.ndarray) -> np.ndarray: 424 | return np.full_like(X, np.max(X)) 425 | 426 | sec_max = functions.make_function(function=_MAX_SECTION, name='sec_max', arity=1, function_type='section', 427 | param_type=[{'vector': {'number': (None, None)}}]) 428 | 429 | @nb.jit(nopython=True) 430 | def _MIN_SECTION(X): 431 | return np.full_like(X, np.min(X)) 432 | 433 | sec_min = functions.make_function(function=_MIN_SECTION, name='sec_min', arity=1, function_type='section', 434 | param_type=[{'vector': {'number': (None, None)}}]) 435 | 436 | @nb.jit(nopython=True) 437 | def _MEAN_SECTION(X): 438 | return np.full_like(X, np.mean(X)) 439 | 440 | sec_mean = functions.make_function(function=_MEAN_SECTION, name='sec_mean', arity=1, function_type='section', 441 | param_type=[{'vector': {'number': (None, None)}}]) 442 | 443 | @nb.jit(nopython=True) 444 | def _MEDIAN_SECTION(X): 445 | return np.full_like(X, np.median(X)) 446 | 447 | sec_median = functions.make_function(function=_MEDIAN_SECTION, name='sec_median', arity=1, function_type='section', 448 | param_type=[{'vector': {'number': (None, None)}}]) 449 | 450 | @nb.jit(nopython=True) 451 | def _STD_SECTION(X): 452 | return np.full_like(X, np.std(X)) 453 | 454 | sec_std = functions.make_function(function=_STD_SECTION, name='sec_std', arity=1, function_type='section', 455 | param_type=[{'vector': {'number': (None, None)}}]) 456 | 457 | @nb.jit(nopython=True) 458 | def _RANK_SECTION(X): 459 | idx = np.argsort(X) 460 | rank = np.empty_like(idx) 461 | for i in range(len(X)): 462 | rank[idx[i]] = i 463 | return rank 464 | 465 | sec_rank = functions.make_function(function=_RANK_SECTION, name='sec_rank', arity=1, function_type='section', 466 | param_type=[{'vector': {'number': (None, None)}}]) 467 | 468 | @nb.jit(nopython=True) 469 | def _NEUTRALIZE_SECTION(X): 470 | mean = np.mean(X) 471 | std = np.std(X) 472 | if std <= 0.001: 473 | std = 0.001 474 | return (X - mean) / np.repeat(std, len(X)) 475 | 476 | sec_neutralize = functions.make_function(function=_NEUTRALIZE_SECTION, name='sec_neutralize', arity=1, 477 | function_type='section', param_type=[{'vector': {'number': (None, None)}}]) 478 | 479 | @no_numpy_warning 480 | def _FREQ_SECTION(X): 481 | unique_values, counts = np.unique(X, return_counts=True) 482 | count_dict = dict(zip(unique_values, counts)) 483 | vectorized_func = np.vectorize(lambda x: count_dict[x]) 484 | return vectorized_func(X) 485 | 486 | freq = functions.make_function(function=_FREQ_SECTION, name='freq', arity=1, 487 | function_type='section', param_type=[{'vector': {'category': (None, None)}}]) 488 | 489 | @no_numpy_warning 490 | def _CUT_EQUAL_DISTANCE(X, d): 491 | ''' 492 | 等距分组 493 | Parameters 494 | ---------- 495 | X 496 | d 497 | 498 | Returns 499 | ------- 500 | 501 | ''' 502 | d = len(X) - 1 if d >= len(X) - 1 else d 503 | bins = [np.min(X) + i * (np.max(X) - np.min(X)) * 1.000001 / d for i in range(d + 1)] 504 | return np.digitize(X, bins) 505 | 506 | cut_equal_distance = functions.make_function(function=_CUT_EQUAL_DISTANCE, name='cut_eq_dist', arity=2, 507 | function_type='section', return_type='category', 508 | param_type=[{'vector': {'number': (None, None)}}, 509 | {'scalar': {'int': (2, 30)}}]) 510 | 511 | @no_numpy_warning 512 | def _CUT_EQUAL_AMOUNT(X, d): 513 | X_ = _RANK_SECTION(X) 514 | return _CUT_EQUAL_DISTANCE(X_, d) 515 | 516 | cut_equal_amount = functions.make_function(function=_CUT_EQUAL_AMOUNT, name='cut_eq_amt', arity=2, 517 | function_type='section', return_type='category', 518 | param_type=[{'vector': {'number': (None, None)}}, 519 | {'scalar': {'int': (2, 30)}}]) 520 | 521 | @no_numpy_warning 522 | def _GROUPBYTHENMAX(gbx, X): 523 | return _groupby(gbx, _MAX_SECTION, X) 524 | 525 | groupby_max = functions.make_function(function=_GROUPBYTHENMAX, name='gb_max', arity=2, function_type='section', 526 | param_type=[{'vector': {'category': (None, None)}}, 527 | {'vector': {'number': (None, None)}}]) 528 | 529 | @no_numpy_warning 530 | def _GROUPBYTHENMIN(gbx, X): 531 | return _groupby(gbx, _MIN_SECTION, X) 532 | 533 | groupby_min = functions.make_function(function=_GROUPBYTHENMIN, name='gb_min', arity=2, function_type='section', 534 | param_type=[{'vector': {'category': (None, None)}}, 535 | {'vector': {'number': (None, None)}}]) 536 | 537 | @no_numpy_warning 538 | def _GROUPBYTHENMEAN(gbx, X): 539 | return _groupby(gbx, _MEAN_SECTION, X) 540 | groupby_mean = functions.make_function(function=_GROUPBYTHENMEAN, name='gb_mean', arity=2, function_type='section', 541 | param_type=[{'vector': {'category': (None, None)}}, 542 | {'vector': {'number': (None, None)}}]) 543 | 544 | @no_numpy_warning 545 | def _GROUPBYTHENMEDIAN(gbx, X): 546 | return _groupby(gbx, _MEDIAN_SECTION, X) 547 | groupby_median = functions.make_function(function=_GROUPBYTHENMEDIAN, name='gb_median', 548 | arity=2, function_type='section', 549 | param_type=[{'vector': {'category': (None, None)}}, 550 | {'vector': {'number': (None, None)}}]) 551 | 552 | @no_numpy_warning 553 | def _GROUPBYTHENSTD(gbx, X): 554 | return _groupby(gbx, _STD_SECTION, X) 555 | groupby_std = functions.make_function(function=_GROUPBYTHENSTD, name='gb_std', arity=2, function_type='section', 556 | param_type=[{'vector': {'category': (None, None)}}, 557 | {'vector': {'number': (None, None)}}]) 558 | 559 | @no_numpy_warning 560 | def _GROUPBYTHENRANK(gbx, X): 561 | return _groupby(gbx, _RANK_SECTION, X) 562 | groupby_rank = functions.make_function(function=_GROUPBYTHENRANK, name='gb_rank', arity=2, function_type='section', 563 | param_type=[{'vector': {'category': (None, None)}}, 564 | {'vector': {'number': (None, None)}}]) 565 | 566 | @no_numpy_warning 567 | def _GROUPBYTHENNEUTRALIZE(gbx, X): 568 | return _groupby(gbx, _NEUTRALIZE_SECTION, X) 569 | groupby_neutralize = functions.make_function(function=_GROUPBYTHENNEUTRALIZE, name='gb_neu', arity=2, 570 | function_type='section', 571 | param_type=[{'vector': {'category': (None, None)}}, 572 | {'vector': {'number': (None, None)}}]) 573 | 574 | @no_numpy_warning 575 | def _GROUPBYTHEN_CUT_EQ_DIST(gbx, X, d): 576 | return _groupby(gbx, _CUT_EQUAL_DISTANCE, X, d=d) 577 | groupby_cut_equal_distance = functions.make_function(function=_GROUPBYTHEN_CUT_EQ_DIST, name='gb_cut_eq_dist', arity=3, 578 | function_type='section', return_type='category', 579 | param_type=[{'vector': {'category': (None, None)}}, 580 | {'vector': {'number': (None, None)}}, 581 | {'scalar': {'int': (2, 30)}}]) 582 | 583 | @no_numpy_warning 584 | def _GROUPBYTHEN_CUT_EQ_AMT(gbx, X, d): 585 | return _groupby(gbx, _CUT_EQUAL_AMOUNT, X, d=d) 586 | groupby_cut_equal_amount = functions.make_function(function=_GROUPBYTHEN_CUT_EQ_AMT, name='gb_cut_eq_amt', arity=3, 587 | function_type='section', return_type='category', 588 | param_type=[{'vector': {'category': (None, None)}}, 589 | {'vector': {'number': (None, None)}}, 590 | {'scalar': {'int': (2, 30)}}]) 591 | 592 | @no_numpy_warning 593 | def _GROUPBYTHENFREQ(gbx, X): 594 | return _groupby(gbx, _FREQ_SECTION, X) 595 | groupby_freq = functions.make_function(function=_GROUPBYTHENFREQ, name='gb_freq', arity=2, 596 | function_type='section', 597 | param_type=[{'vector': {'category': (None, None)}}, 598 | {'vector': {'category': (None, None)}}]) 599 | 600 | __all__ = ['delay', 'delta', 'sec_max', 'sec_min', 'sec_median', 'ts_min', 'ts_max', 'ts_sum', 'ts_corr', 'ts_rank', 601 | 'ts_stddev', 'ts_argmax', 'ts_argmin', 'ts_mean', 'EMA', 'DEMA', 'KAMA', 'MA', 'MIDPOINT', 602 | 'BETA', 'LINEARREG_ANGLE', 'LINEARREG_SLOPE', 'LINEARREG_INTERCEPT', 'sec_std', 'sec_rank', 'sec_mean', 603 | 'groupby_std', 'groupby_max', 'groupby_median', 'groupby_mean', 'groupby_rank', 'groupby_min', 604 | 'ts_neutralize', 'sec_neutralize', 'groupby_neutralize', 'cut_equal_amount', 'cut_equal_distance', 605 | 'groupby_cut_equal_amount', 'groupby_freq', 'groupby_cut_equal_distance', 'freq', 'ts_freq'] 606 | 607 | def test(): 608 | a = np.random.uniform(0.9, 1.1, 30) 609 | b = np.random.uniform(0.9, 1.1, 30) 610 | c = np.random.randint(0, 2, size=30) 611 | print(groupby_cut_equal_distance(c,a,3)) 612 | 613 | 614 | if __name__ == "__main__": 615 | test() 616 | -------------------------------------------------------------------------------- /_program.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | ------------------------------------------------- 4 | # @Project :gplearnplus 5 | # @File :_program 6 | # @Date :2022/12/1 0001 13:37 7 | # @Author :Junzhe Huang 8 | # @Email :acejasonhuang@163.com 9 | # @Software :PyCharm 10 | ------------------------------------------------- 11 | """ 12 | from copy import copy, deepcopy 13 | import numpy as np 14 | from sklearn.utils.random import sample_without_replacement 15 | 16 | from .functions import _Function, _groupby 17 | from .utils import check_random_state 18 | 19 | 20 | class _Program(object): 21 | ''' 22 | 23 | ''' 24 | def __init__(self, 25 | function_dict, 26 | arities, 27 | init_depth, 28 | init_method, 29 | n_features, 30 | const_range, 31 | metric, 32 | p_point_replace, 33 | parsimony_coefficient, 34 | random_state, 35 | data_type, 36 | n_cat_features, 37 | transformer=None, 38 | feature_names=None, 39 | program=None): 40 | ''' 41 | 42 | Parameters 43 | ---------- 44 | function_dict: 储存基础函数,原为function_set {'number': [], 'category': []} 45 | arities: 函数参数个数 46 | init_depth:初始深度, 接受元组(min_depth, max_depth) 47 | init_method:生成方式, 48 | n_features:特征个数 49 | const_range:常数范围, (-1, 1) 50 | metric:目标函数,’MAE‘,'MSE' 51 | p_point_replace:点变异概率 52 | parsimony_coefficient:惩罚系数,'auto'护着浮点数,默认0.01 53 | random_state:随机对象 54 | data_type:新增参数 截面,时序or面板, ’section‘, ’time_series', 'panel' 55 | n_cat_features:新增参数 分类特征个数 56 | transformer 57 | feature_names 58 | program 59 | ''' 60 | self.function_dict = function_dict 61 | self.arities = arities 62 | self.init_depth = (init_depth[0], init_depth[1] + 1) 63 | self.init_method = init_method 64 | self.n_features = n_features 65 | self.const_range = const_range 66 | self.metric = metric 67 | self.p_point_replace = p_point_replace 68 | self.parsimony_coefficient = parsimony_coefficient 69 | self.data_type = data_type 70 | self.transformer = transformer 71 | self.feature_names = feature_names 72 | self.program = program 73 | self.n_cat_features = n_cat_features 74 | 75 | self.num_func_number = len(self.function_dict['number']) 76 | self.cat_func_number = len(self.function_dict['category']) 77 | 78 | if self.program is not None: 79 | # 验证当下树是否完整 80 | if not self.validate_program(): 81 | raise ValueError('The supplied program is incomplete.') 82 | else: 83 | # Create a naive random program 84 | self.program = self.build_program(random_state) 85 | 86 | self.raw_fitness_ = None 87 | self.fitness_ = None 88 | self.parents = None 89 | self._n_samples = None 90 | self._max_samples = None 91 | self._indices_state = None 92 | 93 | def build_program(self, random_state, type='number'): 94 | """ 95 | 参数中无program 初始化方法 96 | # v1.55 修改数的生成逻辑 97 | :param random_state: RandomState 对象, 随机数生成器 98 | :param type: 生成树返回数值还是分类 99 | :return: list, 100 | """ 101 | if self.init_method == 'half and half': 102 | method = ('full' if random_state.randint(2) else 'grow') 103 | else: 104 | method = self.init_method 105 | max_depth = random_state.randint(*self.init_depth) 106 | 107 | # Start a program with a function to avoid degenerative programs 108 | # 公式树返回类型必须为数值类型,随机挑选一个返回数值向量的函数作为公式树的根节点 109 | _root_function_num = random_state.randint(len(self.function_dict['number'])) 110 | _root_function = self.function_dict['number'][_root_function_num] 111 | 112 | # 初始化公式树和工作栈,当前工作栈中仅有根节点,工作栈中存储参数类型列表,用于树的生成 113 | program = [_root_function] 114 | terminal_stack = [deepcopy(_root_function.param_type)] 115 | 116 | while terminal_stack: 117 | depth = len(terminal_stack) 118 | candidate_num = self.n_features + self.num_func_number + self.cat_func_number 119 | candidate_choice = random_state.randint(candidate_num) 120 | # Determine if we are adding a function or terminal 121 | # terminal_stack的元素必须是list 122 | if not isinstance(terminal_stack[-1], list): 123 | raise ValueError("element in terminal_stack should be list") 124 | # terminal_stack的元素的list内,元素须为dict 125 | if not isinstance(terminal_stack[-1][0], dict): 126 | raise ValueError("element in terminal_stack'element should be dict") 127 | 128 | # 深度优先的方式构建公式树,迭代处理工作栈中最后一个子树第一个子节点 129 | # 与gplearn主要不同点 130 | if ('vector' in terminal_stack[-1][0]) and (depth < max_depth) \ 131 | and (method == 'full' or candidate_choice < (self.num_func_number + self.cat_func_number)): 132 | # 插入函数的要求,1 该节点必须接受向量,2.当前深度比最大深度低, 3.随机种子选中了函数或者模式为‘full’ 133 | 134 | # 决定选择数值型函数 还是 分类型函数 135 | # 若该节点都可以接受,则随机决定插入的函数类型 136 | # 否则根据可接受类型插入相应函数 137 | _choice = random_state.randint(self.cat_func_number + self.num_func_number) 138 | if 'number' in terminal_stack[-1][0]['vector'] and 'category' in terminal_stack[-1][0]['vector']: 139 | key = 'number' if _choice < self.num_func_number else 'category' 140 | else: 141 | key = 'number' if 'number' in terminal_stack[-1][0]['vector'] else 'category' 142 | function_choice = self.function_dict[key][_choice % 143 | (self.num_func_number if key == 'number' else self.cat_func_number)] 144 | program.append(function_choice) 145 | terminal_stack.append(deepcopy(function_choice.param_type)) 146 | else: 147 | # 插入向量或者常量 148 | _choice = random_state.randint(self.n_features + 1) 149 | # 根据特殊情况调整_choice 150 | # 1.若const_range为None 或者 不接受标量类型,则默认插入向量 151 | # 2.若不接受向量类型,则默认插入标量 152 | # 3.其他情况按照随机数决定 153 | if _choice == self.n_features and \ 154 | ((self.const_range is None) or \ 155 | (('scalar') not in terminal_stack[-1][0])): 156 | # 只能插入向量的情况 157 | if 'vector' not in terminal_stack[-1][0]: 158 | raise ValueError('Error param type {}'.format(terminal_stack[-1][0])) 159 | 160 | _choice = random_state.randint(self.n_features) 161 | elif ('vector' not in terminal_stack[-1][0]): 162 | # 只能插入常量的情况 163 | _choice = self.n_features 164 | 165 | if _choice < self.n_features: 166 | # 插入向量 167 | if 'number' in terminal_stack[-1][0]['vector'] and 'category' in terminal_stack[-1][0][ 168 | 'vector']: 169 | # 可插入数值向量也可插入分类向量 170 | key = 'category' if _choice < self.n_cat_features else 'number' 171 | else: 172 | key = 'number' if 'number' in terminal_stack[-1][0]['vector'] else 'category' 173 | if self.n_cat_features == 0 and key == 'category': 174 | # 需要插入分类向量,特征中却没有分类向量的情况,插入常数分类向量1, 默认0 175 | candicate_var = 0 176 | else: 177 | candicate_var = (_choice % self.n_cat_features) + 1 if key == 'category' else \ 178 | ((_choice % (self.n_features - self.n_cat_features) + self.n_cat_features) + 1) 179 | program.append(str(candicate_var)) 180 | else: 181 | # 插入常量 182 | if 'float' in terminal_stack[-1][0]['scalar']: 183 | _choice = random_state.uniform(*terminal_stack[-1][0]['scalar']['float']) 184 | elif 'int' in terminal_stack[-1][0]['scalar']: 185 | _choice = random_state.randint(*terminal_stack[-1][0]['scalar']['int']) 186 | else: 187 | raise ValueError('Error param type {}'.format(terminal_stack[-1][0])) 188 | program.append(_choice) 189 | 190 | terminal_stack[-1].pop(0) 191 | while len(terminal_stack[-1]) == 0: 192 | terminal_stack.pop() 193 | if not terminal_stack: 194 | return program 195 | terminal_stack[-1].pop(0) 196 | # We should never get here 197 | return None 198 | 199 | # 检查函数是否可用,不包括类型检查 200 | def validate_program(self): 201 | """Rough check that the embedded program in the object is valid.""" 202 | terminals = [0] 203 | for node in self.program: 204 | if isinstance(node, _Function): 205 | terminals.append(node.arity) 206 | else: 207 | terminals[-1] -= 1 208 | while terminals[-1] == 0: 209 | terminals.pop() 210 | terminals[-1] -= 1 211 | return terminals == [-1] 212 | 213 | # 打印树 214 | def __str__(self): 215 | """Overloads `print` output of the object to resemble a LISP tree.""" 216 | terminals = [0] 217 | output = '' 218 | for i, node in enumerate(self.program): 219 | if isinstance(node, _Function): 220 | terminals.append(node.arity) 221 | output += node.name + '(' 222 | else: 223 | if isinstance(node, str): 224 | if self.feature_names is None: 225 | output += 'X%s' % node 226 | else: 227 | output += self.feature_names[int(node)] 228 | elif isinstance(node, int): 229 | output += '%d' % node 230 | elif isinstance(node, float): 231 | output += '%.3f' % node 232 | else: 233 | raise ValueError('Error param type {}'.format(node)) 234 | terminals[-1] -= 1 235 | while terminals[-1] == 0: 236 | terminals.pop() 237 | terminals[-1] -= 1 238 | output += ')' 239 | if i != len(self.program) - 1: 240 | output += ', ' 241 | return output 242 | 243 | # 可视化整个树 244 | def export_graphviz(self, fade_nodes=None): 245 | """Returns a string, Graphviz script for visualizing the program. 246 | 247 | Parameters 248 | ---------- 249 | fade_nodes : list, optional 250 | A list of node indices to fade out for showing which were removed 251 | during evolution. 252 | 253 | Returns 254 | ------- 255 | output : string 256 | The Graphviz script to plot the tree representation of the program. 257 | 258 | """ 259 | terminals = [] 260 | if fade_nodes is None: 261 | fade_nodes = [] 262 | output = 'digraph program {\nnode [style=filled]\n' 263 | for i, node in enumerate(self.program): 264 | fill = '#cecece' 265 | if isinstance(node, _Function): 266 | if i not in fade_nodes: 267 | fill = '#136ed4' 268 | terminals.append([node.arity, i]) 269 | output += ('%d [label="%s", fillcolor="%s"] ;\n' 270 | % (i, node.name, fill)) 271 | else: 272 | if i not in fade_nodes: 273 | fill = '#60a6f6' 274 | 275 | if isinstance(node, str): 276 | if self.feature_names is None: 277 | feature_name = 'X%s' % node 278 | else: 279 | feature_name = self.feature_names[int(node)] 280 | output += ('%d [label="%s", fillcolor="%s"] ;\n' 281 | % (i, feature_name, fill)) 282 | elif isinstance(node, int): 283 | output += ('%d [label="%d", fillcolor="%s"] ;\n' 284 | % (i, node, fill)) 285 | elif isinstance(node, int): 286 | output += ('%d [label="%.3f", fillcolor="%s"] ;\n' 287 | % (i, node, fill)) 288 | else: 289 | raise ValueError('Error param type {}'.format(node)) 290 | 291 | if i == 0: 292 | # A degenerative program of only one node 293 | return output + '}' 294 | terminals[-1][0] -= 1 295 | terminals[-1].append(i) 296 | while terminals[-1][0] == 0: 297 | output += '%d -> %d ;\n' % (terminals[-1][1], 298 | terminals[-1][-1]) 299 | terminals[-1].pop() 300 | if len(terminals[-1]) == 2: 301 | parent = terminals[-1][-1] 302 | terminals.pop() 303 | if not terminals: 304 | return output + '}' 305 | terminals[-1].append(parent) 306 | terminals[-1][0] -= 1 307 | 308 | # We should never get here 309 | return None 310 | 311 | # 计算树的深度 312 | def _depth(self): 313 | """Calculates the maximum depth of the program tree.""" 314 | terminals = [0] 315 | depth = 1 316 | for node in self.program: 317 | if isinstance(node, _Function): 318 | terminals.append(node.arity) 319 | depth = max(len(terminals), depth) 320 | else: 321 | terminals[-1] -= 1 322 | while terminals[-1] == 0: 323 | terminals.pop() 324 | terminals[-1] -= 1 325 | return depth - 1 326 | 327 | # 计算公式中函数和变量的数量 328 | def _length(self): 329 | """Calculates the number of functions and terminals in the program.""" 330 | return len(self.program) 331 | 332 | # 计算参数X的函数结果 333 | def execute(self, X): 334 | """Execute the program according to X. 335 | 336 | Parameters 337 | ---------- 338 | X : {array-like} 339 | 若数据类型为'section','time_series'则为[n_samples, n_features + 1] 340 | 若数据类型为'panel', 则为[n_samples, n_features + 3] 341 | 342 | Returns 343 | ------- 344 | y_hats : array-like, shape = [n_samples] 345 | The result of executing the program on X. 346 | 347 | """ 348 | # 检验X列数是否正确 349 | if self.data_type == 'panel' and X.shape[1] != self.n_features + 3: 350 | raise ValueError("For panel Data, the col number of X should be n_features + 3") 351 | elif self.data_type in ['section', 'time_series'] and X.shape[1] != self.n_features + 1: 352 | raise ValueError("For section or time_series Data, the col number of X should be n_features + 1") 353 | 354 | # Check for single-node programs 355 | node = self.program[0] 356 | # 常数 357 | if isinstance(node, (float, int)): 358 | return np.repeat(node, X.shape[0]) 359 | # 变量 360 | if isinstance(node, str): 361 | return X[:, int(node)] 362 | 363 | apply_stack = [] 364 | for node in self.program: 365 | 366 | if isinstance(node, _Function): 367 | apply_stack.append([node]) 368 | else: 369 | # Lazily evaluate later 370 | apply_stack[-1].append(node) 371 | 372 | while len(apply_stack[-1]) == apply_stack[-1][0].arity + 1: 373 | # Apply functions that have sufficient arguments 374 | function = apply_stack[-1][0] 375 | terminals = [np.repeat(t, X.shape[0]) if isinstance(t, (float, int)) 376 | else (X[:, int(t)] if isinstance(t, str) 377 | else t) for t in apply_stack[-1][1:]] 378 | # 对于时序和截面函数加入管道 379 | if self.data_type == 'panel' and function.function_type == 'section': 380 | time_series_data = X[:, -1] 381 | intermediate_result = _groupby(time_series_data, function, *terminals) 382 | elif self.data_type == 'panel' and function.function_type == 'time_series': 383 | security_data = X[:, -2] 384 | intermediate_result = _groupby(security_data, function, *terminals) 385 | else: 386 | intermediate_result = function(*terminals) 387 | if len(apply_stack) != 1: 388 | apply_stack.pop() 389 | apply_stack[-1].append(intermediate_result) 390 | else: 391 | return intermediate_result 392 | 393 | # We should never get here 394 | return None 395 | 396 | # 选择部分样本 397 | def get_all_indices(self, n_samples=None, max_samples=None, 398 | random_state=None): 399 | """Get the indices on which to evaluate the fitness of a program. 400 | 401 | Parameters 402 | ---------- 403 | n_samples : int 404 | The number of samples. 405 | 406 | max_samples : int 407 | The maximum number of samples to use. 408 | 409 | random_state : RandomState instance 410 | The random number generator. 411 | 412 | Returns 413 | ------- 414 | indices : array-like, shape = [n_samples] 415 | The in-sample indices. 416 | 抽样内index 417 | 418 | not_indices : array-like, shape = [n_samples] 419 | The out-of-sample indices. 420 | 抽样外index 421 | 422 | """ 423 | if self._indices_state is None and random_state is None: 424 | raise ValueError('The program has not been evaluated for fitness ' 425 | 'yet, indices not available.') 426 | 427 | if n_samples is not None and self._n_samples is None: 428 | self._n_samples = n_samples 429 | if max_samples is not None and self._max_samples is None: 430 | self._max_samples = max_samples 431 | if random_state is not None and self._indices_state is None: 432 | self._indices_state = random_state.get_state() 433 | 434 | indices_state = check_random_state(None) 435 | indices_state.set_state(self._indices_state) 436 | 437 | not_indices = sample_without_replacement( 438 | self._n_samples, 439 | self._n_samples - self._max_samples, 440 | random_state=indices_state) 441 | sample_counts = np.bincount(not_indices, minlength=self._n_samples) 442 | indices = np.where(sample_counts == 0)[0] 443 | 444 | return indices, not_indices 445 | 446 | # 获取衡量模型适应度的指标 447 | def _indices(self): 448 | """Get the indices used to measure the program's fitness.""" 449 | return self.get_all_indices()[0] 450 | 451 | # 原始适应度 452 | def raw_fitness(self, X, y, sample_weight): 453 | """Evaluate the raw fitness of the program according to X, y. 454 | 455 | Parameters 456 | ---------- 457 | X : {array-like} 458 | 若数据类型为'section','time_series'则为[n_samples, n_features + 1] 459 | 若数据类型为'panel', 则为[n_samples, n_features + 3] 460 | 461 | y : array-like, shape = [n_samples] 462 | Target values. 463 | 464 | sample_weight : array-like, shape = [n_samples] 465 | Weights applied to individual samples. 466 | 467 | Returns 468 | ------- 469 | raw_fitness : float 470 | The raw fitness of the program. 471 | 472 | """ 473 | if X.shape[0] != len(y): 474 | raise ValueError("The length of y should be equal to X") 475 | y_pred = self.execute(X) 476 | if self.transformer: 477 | y_pred = self.transformer(y_pred) 478 | raw_fitness = self.metric(y, y_pred, sample_weight) 479 | 480 | return raw_fitness 481 | 482 | # todo 引入非线性适应度 483 | # 惩罚后适应度 对函数长度进行惩罚 484 | def fitness(self, parsimony_coefficient=None): 485 | """Evaluate the penalized fitness of the program according to X, y. 486 | 487 | Parameters 488 | ---------- 489 | parsimony_coefficient : float, optional 490 | If automatic parsimony is being used, the computed value according 491 | to the population. Otherwise the initialized value is used. 492 | 493 | Returns 494 | ------- 495 | fitness : float 496 | The penalized fitness of the program. 497 | 498 | """ 499 | if parsimony_coefficient is None: 500 | parsimony_coefficient = self.parsimony_coefficient 501 | penalty = parsimony_coefficient * len(self.program) * self.metric.sign 502 | return self.raw_fitness_ - penalty 503 | 504 | # 此函数为获得指定子树 505 | def get_subtree(self, start, program=None): 506 | """ 507 | 508 | Parameters 509 | ---------- 510 | start: 子树的根节点位置 511 | program 512 | Returns 513 | ------- 514 | start 515 | end 子树截止位置 + 1 便于索引 516 | """ 517 | if program is None: 518 | program = self.program 519 | stack = 1 520 | end = start 521 | while stack > end - start: 522 | node = program[end] 523 | if isinstance(node, _Function): 524 | stack += node.arity 525 | end += 1 526 | 527 | if isinstance(program[start], _Function): 528 | return_type = _Function.return_type 529 | elif isinstance(program[start], str): 530 | if int(program[start]) == 0: 531 | raise ValueError("The return of sub_tree's root should not be const_1") 532 | return_type = 'category' if int(program[start]) <= self.n_cat_features else 'number' 533 | else: 534 | raise ValueError("The return type of sub_tree's root should be number or category") 535 | return start, end, return_type 536 | 537 | # 此函数为获得随机子树 538 | # 此处做了修改,不会选到标量 539 | # 需要考虑返回类型 540 | def get_random_subtree(self, random_state, program=None, return_type=None): 541 | """Get a random subtree from the program. 542 | 543 | Parameters 544 | ---------- 545 | random_state : RandomState instance 546 | The random number generator. 547 | 548 | program : list, optional (default=None) 549 | The flattened tree representation of the program. If None, the 550 | embedded tree in the object will be used. 551 | 552 | return_type: 子数的返回类型限定 默认 None, number 和 category都可以选择 553 | 554 | Returns 555 | ------- 556 | start, end : tuple of two ints 557 | The indices of the start and end of the random subtree. 558 | return_type: 子数返回类型,数值向量 还是 分类向量, 防止交叉时出现错误 559 | """ 560 | if program is None: 561 | program = self.program 562 | # Choice of crossover points follows Koza's (1992) widely used approach 563 | # 子数节点概率权重90%,向量叶子节点概率权重10%,标量叶包括常分类向量子节点概率权重0 564 | # 若type为number, 所有返回category的节点概率权重为0 565 | # 若type为category, 所有返回number的节点概率权重为0 566 | if return_type not in ['number', 'category', None]: 567 | raise ValueError("Type of sub_tree should be number, category or None") 568 | if return_type == 'number': 569 | probs = np.array([0.9 if isinstance(node, _Function) and node.return_type == 'number' 570 | else (0.1 if isinstance(node, str) and int(node) > self.n_cat_features else 0.0) 571 | for node in program]) 572 | elif return_type == 'category': 573 | probs = np.array([0.9 if isinstance(node, _Function) and node.return_type == 'category' 574 | else (0.1 if isinstance(node, str) and int(node) <= self.n_cat_features 575 | and int(node) != 0 else 0.0) 576 | for node in program]) 577 | else: 578 | probs = np.array([0.9 if isinstance(node, _Function) 579 | else (0.1 if isinstance(node, str) 580 | and int(node) != 0 else 0.0) 581 | for node in program]) 582 | probs = np.cumsum(probs / probs.sum()) 583 | start = np.searchsorted(probs, random_state.uniform()) 584 | return self.get_subtree(start, program) 585 | 586 | def reproduce(self): 587 | """Return a copy of the embedded program.""" 588 | return copy(self.program) 589 | 590 | def vaild_category(self, program=None): 591 | """验证公式树中是否包含分类向量或子树, 不包括常数分类向量""" 592 | if program is None: 593 | program = self.program 594 | for node in program: 595 | if isinstance(node, _Function) and node.return_type == 'category': 596 | return True 597 | elif isinstance(node, str) and int(node) != 0 and int(node) <= self.n_cat_features: 598 | return True 599 | return False 600 | 601 | # 交换self 和 donor 的子树 602 | # 此处不会交换常数 603 | def crossover(self, donor, random_state): 604 | """Perform the crossover genetic operation on the program. 605 | 606 | Crossover selects a random subtree from the embedded program to be 607 | replaced. A donor also has a subtree selected at random and this is 608 | inserted into the original parent to form an offspring. 609 | 610 | Parameters 611 | ---------- 612 | donor : list 613 | The flattened tree representation of the donor program. 614 | 615 | random_state : RandomState instance 616 | The random number generator. 617 | 618 | Returns 619 | ------- 620 | program : list 621 | The flattened tree representation of the program. 622 | 623 | """ 624 | # Get a subtree to replace 625 | # 若都包含 626 | if self.vaild_category() and self.vaild_category(donor): 627 | start, end, self_return_type = self.get_random_subtree(random_state) 628 | else: 629 | start, end, self_return_type = self.get_random_subtree(random_state, return_type='number') 630 | removed = range(start, end) 631 | # Get a subtree to donate 632 | donor_start, donor_end, donor_return_type = self.get_random_subtree(random_state, donor, self_return_type) 633 | donor_removed = list(set(range(len(donor))) - 634 | set(range(donor_start, donor_end))) 635 | # Insert genetic material from donor 636 | return (self.program[:start] + 637 | donor[donor_start:donor_end] + 638 | self.program[end:]), removed, donor_removed 639 | 640 | # 此处不会选择常数 641 | # 子数变异 642 | def subtree_mutation(self, random_state): 643 | """Perform the subtree mutation operation on the program. 644 | 645 | Subtree mutation selects a random subtree from the embedded program to 646 | be replaced. A donor subtree is generated at random and this is 647 | inserted into the original parent to form an offspring. This 648 | implementation uses the "headless chicken" method where the donor 649 | subtree is grown using the initialization methods and a subtree of it 650 | is selected to be donated to the parent. 651 | 652 | Parameters 653 | ---------- 654 | random_state : RandomState instance 655 | The random number generator. 656 | 657 | Returns 658 | ------- 659 | program : list 660 | The flattened tree representation of the program. 661 | 662 | """ 663 | # Build a new naive program 664 | chicken = self.build_program(random_state) 665 | # Do subtree mutation via the headless chicken method! 666 | return self.crossover(chicken, random_state) 667 | 668 | def get_hoist_list(self, program=None): 669 | """ 670 | 判断公式树哪些节点可以做hoist变异, 该节点非叶子节点 且 存在与自身同类型的子树, 常分类向量不算分类向量的同类型 671 | Parameters 672 | ---------- 673 | program 674 | 675 | Returns 676 | ------- 677 | hoist_list 678 | """ 679 | if program is None: 680 | program = self.program 681 | 682 | apply_stack = [] 683 | hoist_list = [False] * len(program) 684 | # 深度优先搜索,压入栈中的元素是一个list,list第一个元素表示函数再program列表中的位置,第二个元素是函数对象,后面的元素是返回结果 685 | # 深搜结果为['number'], ['number','category'], ['category'], [], 表示该节点及其子节点所包含的类型集合 686 | for i, node in enumerate(program): 687 | if isinstance(node, _Function): 688 | apply_stack.append([i, node]) 689 | else: 690 | # Lazily evaluate later 691 | apply_stack[-1].append(node) 692 | while len(apply_stack[-1]) == apply_stack[-1][1].arity + 2: 693 | father_type = apply_stack[-1][1].return_type 694 | type_list = [t if isinstance(t, list) else 695 | (['number'] if isinstance(t, str) and int(t) > self.n_cat_features else 696 | (['category'] if isinstance(t, str) and int(t) <= self.n_cat_features and int(t) != 0 697 | else [])) 698 | for t in apply_stack[-1][2:] 699 | ] 700 | # 判断子树中是否存在与本节点同类型的节点,若存在表示可以hoist变异 701 | if father_type in list(set().union(*type_list)): 702 | hoist_list[apply_stack[-1][0]] = True 703 | # 函数返回类型加入列表 704 | type_list.append([father_type]) 705 | 706 | intermediate_result = list(set().union(*type_list)) 707 | if len(apply_stack) != 1: 708 | apply_stack.pop() 709 | apply_stack[-1].append(intermediate_result) 710 | else: 711 | return hoist_list 712 | return None 713 | 714 | # 将子树的子树变上提,简化公式 715 | # 由于子树不会选到常数,故符合条件 716 | # 子数不会选到分类变量 717 | def hoist_mutation(self, random_state): 718 | """Perform the hoist mutation operation on the program. 719 | 720 | Hoist mutation selects a random subtree from the embedded program to 721 | be replaced. A random subtree of that subtree is then selected and this 722 | is 'hoisted' into the original subtrees location to form an offspring. 723 | This method helps to control bloat. 724 | 725 | gplearnplus修改,由于引入了变量类型,需要先考哪些节点可以hosit变异的节点 726 | 要求 727 | 1. 该节点下存在于节点同类型的子树 728 | 729 | Parameters 730 | ---------- 731 | random_state : RandomState instance 732 | The random number generator. 733 | 734 | Returns 735 | ------- 736 | program : list 737 | The flattened tree representation of the program. 738 | 739 | """ 740 | # Get a subtree to replace 741 | hoist_list = self.get_hoist_list() 742 | if sum(hoist_list) == 0: 743 | return self.program 744 | # 随机选取可以hoist的节点 745 | hoist_root = random_state.choice(np.where(hoist_list)[0]) 746 | start, end, return_type = self.get_subtree(hoist_root) 747 | subtree = self.program[start:end] 748 | # Get a subtree of the subtree to hoist 749 | sub_start, sub_end, _ = self.get_random_subtree(random_state, subtree, return_type=return_type) 750 | hoist = subtree[sub_start:sub_end] 751 | # Determine which nodes were removed for plotting 752 | removed = list(set(range(start, end)) - 753 | set(range(start + sub_start, start + sub_end))) 754 | return self.program[:start] + hoist + self.program[end:], removed 755 | 756 | # 点变异完全修改 757 | # 要求函数满足is_point_mutation条件 758 | # 由于无法得知范围,常数不变异 759 | def point_mutation(self, random_state): 760 | """Perform the point mutation operation on the program. 761 | 762 | Point mutation selects random nodes from the embedded program to be 763 | replaced. Terminals are replaced by other terminals and functions are 764 | replaced by other functions that require the same number of arguments 765 | as the original node. The resulting tree forms an offspring. 766 | 767 | Parameters 768 | ---------- 769 | random_state : RandomState instance 770 | The random number generator. 771 | 772 | Returns 773 | ------- 774 | program : list 775 | The flattened tree representation of the program. 776 | 777 | """ 778 | program = copy(self.program) 779 | 780 | # Get the nodes to modify 781 | mutate = np.where(random_state.uniform(size=len(program)) < 782 | self.p_point_replace)[0] 783 | tag = np.array([True] * len(mutate)) 784 | for i, node in enumerate(mutate): 785 | if isinstance(program[node], _Function): 786 | arity = program[node].arity 787 | # Find a valid replacement with same arity 788 | replacement_list = [func_ for func_ in self.arities[arity] if program[node].is_point_mutation(func_)] 789 | if len(replacement_list) == 0: 790 | # 没有满足条件的变异 791 | tag[i] = False 792 | continue 793 | replacement = random_state.randint(len(replacement_list)) 794 | replacement = replacement_list[replacement] 795 | program[node] = replacement 796 | elif isinstance(program[node], str): 797 | # We've got a terminal, add a const or variable 798 | terminal = random_state.randint(1, self.n_features + 1) 799 | program[node] = str(terminal) 800 | else: 801 | # 常数不发生变异 802 | tag[i] = False 803 | if len(mutate): 804 | mutate = mutate[tag] 805 | return program, list(mutate) 806 | 807 | depth_ = property(_depth) 808 | length_ = property(_length) 809 | indices_ = property(_indices) 810 | -------------------------------------------------------------------------------- /genetic.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | ------------------------------------------------- 4 | # @Project :gplearnplus 5 | # @File :genetic 6 | # @Date :2022/12/5 0005 4:23 7 | # @Author :Junzhe Huang 8 | # @Email :acejasonhuang@163.com 9 | # @Software :PyCharm 10 | ------------------------------------------------- 11 | """ 12 | import itertools 13 | from abc import ABCMeta, abstractmethod 14 | from time import time 15 | from warnings import warn 16 | from copy import deepcopy 17 | 18 | import numpy as np 19 | import pandas as pd 20 | from joblib import Parallel, delayed 21 | from scipy.stats import rankdata 22 | from sklearn.base import BaseEstimator 23 | from sklearn.base import RegressorMixin, TransformerMixin, ClassifierMixin 24 | from sklearn.exceptions import NotFittedError 25 | from sklearn.utils import compute_sample_weight 26 | from sklearn.utils.validation import check_array, _check_sample_weight 27 | from sklearn.utils.multiclass import check_classification_targets 28 | from sklearn.preprocessing import LabelEncoder 29 | 30 | from ._program import _Program 31 | from .fitness import _fitness_map, _Fitness 32 | from .functions import _function_map, _Function, sig1 as sigmoid 33 | from .utils import _partition_estimators 34 | from .utils import check_random_state 35 | 36 | __all__ = ['SymbolicRegressor', 'SymbolicClassifier', 'SymbolicTransformer'] 37 | 38 | MAX_INT = np.iinfo(np.int32).max 39 | 40 | # 并行实现子树交叉,变异 41 | def _parallel_evolve(n_programs, parents, X, y, sample_weight, seeds, params): 42 | """ 43 | 44 | Parameters 45 | ---------- 46 | n_programs: 种群数量 47 | parents:父辈个体集合 48 | X:原始特征 49 | y:预测label 50 | sample_weight:抽样比例 51 | seeds:随机种子 52 | params:参数 53 | 54 | Returns 55 | ------- 56 | 57 | """ 58 | 59 | """Private function used to build a batch of programs within a job.""" 60 | n_samples, n_features = X.shape 61 | 62 | # Unpack parameters 63 | tournament_size = params['tournament_size'] 64 | function_dict = params['function_dict'] 65 | arities = params['arities'] 66 | init_depth = params['init_depth'] 67 | init_method = params['init_method'] 68 | const_range = params['const_range'] 69 | metric = params['_metric'] 70 | transformer = params['_transformer'] 71 | parsimony_coefficient = params['parsimony_coefficient'] 72 | method_probs = params['method_probs'] 73 | data_type = params['data_type'] 74 | p_point_replace = params['p_point_replace'] 75 | max_samples = params['max_samples'] # 最大样本比例 76 | feature_names = params['feature_names'] 77 | n_cat_features = params['cat_var_number'] 78 | 79 | # 处理不同类型的数据X 80 | if data_type == 'panel': 81 | n_features -= 3 82 | else: 83 | n_features -= 1 84 | 85 | max_samples = int(max_samples * n_samples) 86 | 87 | def _tournament(): 88 | # 从所有父代中随机选择tournament_size个,取其中最优的那一个子代 89 | """Find the fittest individual from a sub-population.""" 90 | contenders = random_state.randint(0, len(parents), tournament_size) 91 | fitness = [parents[p].fitness_ for p in contenders] 92 | if metric.greater_is_better: 93 | parent_index = contenders[np.argmax(fitness)] 94 | else: 95 | parent_index = contenders[np.argmin(fitness)] 96 | return parents[parent_index], parent_index 97 | 98 | # Build programs 99 | programs = [] 100 | 101 | for i in range(n_programs): 102 | 103 | random_state = check_random_state(seeds[i]) 104 | 105 | if parents is None: 106 | # 初代 107 | program = None 108 | genome = None 109 | else: 110 | method = random_state.uniform() 111 | # 在父辈个体集合中抽样选择一个最优的父辈 112 | parent, parent_index = _tournament() 113 | 114 | # 随机进行一种交叉 or 变异 115 | if method < method_probs[0]: 116 | # crossover 117 | donor, donor_index = _tournament() 118 | program, removed, remains = parent.crossover(donor.program, 119 | random_state) 120 | genome = {'method': 'Crossover', 121 | 'parent_idx': parent_index, 122 | 'parent_nodes': removed, 123 | 'donor_idx': donor_index, 124 | 'donor_nodes': remains} 125 | elif method < method_probs[1]: 126 | # subtree_mutation 127 | program, removed, _ = parent.subtree_mutation(random_state) 128 | genome = {'method': 'Subtree Mutation', 129 | 'parent_idx': parent_index, 130 | 'parent_nodes': removed} 131 | elif method < method_probs[2]: 132 | # hoist_mutation 133 | program, removed = parent.hoist_mutation(random_state) 134 | genome = {'method': 'Hoist Mutation', 135 | 'parent_idx': parent_index, 136 | 'parent_nodes': removed} 137 | elif method < method_probs[3]: 138 | # point_mutation 139 | program, mutated = parent.point_mutation(random_state) 140 | genome = {'method': 'Point Mutation', 141 | 'parent_idx': parent_index, 142 | 'parent_nodes': mutated} 143 | else: 144 | # reproduction 145 | program = parent.reproduce() 146 | genome = {'method': 'Reproduction', 147 | 'parent_idx': parent_index, 148 | 'parent_nodes': []} 149 | 150 | program = _Program(function_dict=function_dict, 151 | arities=arities, 152 | init_depth=init_depth, 153 | init_method=init_method, 154 | n_features=n_features, 155 | metric=metric, 156 | transformer=transformer, 157 | const_range=const_range, 158 | p_point_replace=p_point_replace, 159 | parsimony_coefficient=parsimony_coefficient, 160 | data_type=data_type, 161 | feature_names=feature_names, 162 | random_state=random_state, 163 | n_cat_features=n_cat_features, 164 | program=program) 165 | 166 | program.parents = genome 167 | 168 | # Draw samples, using sample weights, and then fit 169 | if sample_weight is None: 170 | curr_sample_weight = np.ones((n_samples,)) 171 | else: 172 | curr_sample_weight = sample_weight.copy() 173 | oob_sample_weight = curr_sample_weight.copy() 174 | 175 | indices, not_indices = program.get_all_indices(n_samples, 176 | max_samples, 177 | random_state) 178 | 179 | curr_sample_weight[not_indices] = 0 180 | oob_sample_weight[indices] = 0 181 | 182 | program.raw_fitness_ = program.raw_fitness(X, y, curr_sample_weight) 183 | if max_samples < n_samples: 184 | # Calculate OOB fitness 185 | program.oob_fitness_ = program.raw_fitness(X, y, oob_sample_weight) 186 | 187 | programs.append(program) 188 | 189 | return programs 190 | 191 | 192 | class BaseSymbolic(BaseEstimator, metaclass=ABCMeta): 193 | 194 | """Base class for symbolic regression / classification estimators. 195 | 196 | Warning: This class should not be used directly. 197 | Use derived classes instead. 198 | 199 | """ 200 | 201 | @abstractmethod 202 | def __init__(self, 203 | *, 204 | population_size=1000, 205 | hall_of_fame=None, 206 | n_components=None, 207 | generations=20, 208 | tournament_size=20, 209 | stopping_criteria=0.0, 210 | const_range=(-1., 1.), 211 | init_depth=(2, 6), 212 | init_method='half and half', 213 | function_set=('add', 'sub', 'mul', 'div'), 214 | transformer=None, 215 | metric='mean absolute error', 216 | parsimony_coefficient=0.001, 217 | p_crossover=0.9, 218 | p_subtree_mutation=0.01, 219 | p_hoist_mutation=0.01, 220 | p_point_mutation=0.01, 221 | p_point_replace=0.05, 222 | max_samples=1.0, 223 | tolerable_corr=0.0, 224 | class_weight=None, 225 | feature_names=None, 226 | time_series_index=None, 227 | security_index=None, 228 | category_features=None, 229 | warm_start=False, 230 | low_memory=False, 231 | n_jobs=1, 232 | verbose=0, 233 | data_type='section', 234 | random_state=None): 235 | 236 | self.population_size = population_size 237 | self.hall_of_fame = hall_of_fame 238 | self.n_components = n_components 239 | self.generations = generations 240 | self.tournament_size = tournament_size 241 | self.stopping_criteria = stopping_criteria 242 | self.const_range = const_range 243 | self.init_depth = init_depth 244 | self.init_method = init_method 245 | self.function_set = function_set 246 | self.transformer = transformer 247 | self.metric = metric 248 | self.parsimony_coefficient = parsimony_coefficient 249 | self.p_crossover = p_crossover 250 | self.p_subtree_mutation = p_subtree_mutation 251 | self.p_hoist_mutation = p_hoist_mutation 252 | self.p_point_mutation = p_point_mutation 253 | self.p_point_replace = p_point_replace 254 | self.max_samples = max_samples 255 | self.class_weight = class_weight 256 | self.feature_names = feature_names 257 | self.category_features = category_features 258 | self.time_series_index = time_series_index 259 | self.security_index = security_index 260 | self.warm_start = warm_start 261 | self.low_memory = low_memory 262 | self.n_jobs = n_jobs 263 | self.verbose = verbose 264 | self.random_state = random_state 265 | self.data_type = data_type 266 | self.tolerable_corr = tolerable_corr 267 | 268 | # 打印训练日志 269 | def _verbose_reporter(self, run_details=None): 270 | """A report of the progress of the evolution process. 271 | 272 | Parameters 273 | ---------- 274 | run_details : dict 275 | Information about the evolution. 276 | 277 | """ 278 | if run_details is None: 279 | print(' |{:^25}|{:^42}|'.format('Population Average', 280 | 'Best Individual')) 281 | print('-' * 4 + ' ' + '-' * 25 + ' ' + '-' * 42 + ' ' + '-' * 10) 282 | line_format = '{:>4} {:>8} {:>16} {:>8} {:>16} {:>16} {:>10}' 283 | print(line_format.format('Gen', 'Length', 'Fitness', 'Length', 284 | 'Fitness', 'OOB Fitness', 'Time Left')) 285 | 286 | else: 287 | # Estimate remaining time for run 288 | gen = run_details['generation'][-1] 289 | generation_time = run_details['generation_time'][-1] 290 | remaining_time = (self.generations - gen - 1) * generation_time 291 | if remaining_time > 60: 292 | remaining_time = '{0:.2f}m'.format(remaining_time / 60.0) 293 | else: 294 | remaining_time = '{0:.2f}s'.format(remaining_time) 295 | 296 | oob_fitness = 'N/A' 297 | line_format = '{:4d} {:8.2f} {:16g} {:8d} {:16g} {:>16} {:>10}' 298 | if self.max_samples < 1.0: 299 | oob_fitness = run_details['best_oob_fitness'][-1] 300 | line_format = '{:4d} {:8.2f} {:16g} {:8d} {:16g} {:16g} {:>10}' 301 | 302 | print(line_format.format(run_details['generation'][-1], 303 | run_details['average_length'][-1], 304 | run_details['average_fitness'][-1], 305 | run_details['best_length'][-1], 306 | run_details['best_fitness'][-1], 307 | oob_fitness, 308 | remaining_time)) 309 | 310 | # fit 的时候考虑时序问题 311 | # 转移出数据处理模块 312 | def fit(self, X, y, sample_weight=None): 313 | """Fit the Genetic Program according to X, y. 314 | 315 | Parameters 316 | ---------- 317 | X : array-like, shape = [n_samples, n_features] 318 | Training vectors, where n_samples is the number of samples and 319 | n_features is the number of features. 320 | 321 | y : array-like, shape = [n_samples] 322 | Target values. 323 | 324 | sample_weight : array-like, shape = [n_samples], optional 325 | Weights applied to individual samples. 326 | 327 | Returns 328 | ------- 329 | self : object 330 | Returns self. 331 | 332 | """ 333 | random_state = check_random_state(self.random_state) 334 | 335 | # 检查数据类型 336 | if self.data_type not in ('section', 'time_series', 'panel'): 337 | raise ValueError('Valid data_type methods include ' 338 | '"section", "time_series" and "panel". Given %s.' 339 | % self.data_type) 340 | 341 | # 检查数据结构 342 | # 若含有security或者timeindex 必须为DataFrame 343 | if self.security_index is not None or self.time_series_index is not None: 344 | if not isinstance(X, pd.DataFrame): 345 | raise ValueError('with security ot time index, data structure should be DataFrame') 346 | 347 | # 检查时间index和个股index, 对于截面,时序和面板数据分别检查 348 | security_data = None 349 | time_series_data = None 350 | if self.data_type == 'section': 351 | if self.time_series_index is not None: 352 | raise ValueError('For Section Data, time_series_index should be None') 353 | if self.security_index is not None: 354 | # 在index和columns中寻找security_index 355 | if self.security_index not in X.columns and \ 356 | (X.index.name is None or self.security_index not in X.index.name): 357 | raise ValueError('Can not fund security_index {} in both columns and index' 358 | .format(self.security_index)) 359 | elif self.security_index in X.columns: 360 | X.set_index(self.security_index, inplace=True) 361 | 362 | # 判断是否有重复个股 363 | if len(X[self.security_index].unique()) < len(X[self.security_index]): 364 | raise ValueError('For Section Data, security data should be unique') 365 | 366 | security_data = X.index.values 367 | 368 | elif self.data_type == 'time_series': 369 | if self.time_series_index is None: 370 | raise ValueError('For time_series Data, time_series_index should NOT be None') 371 | if self.security_index is not None: 372 | raise ValueError('For time_series Data, security_index should be None') 373 | if self.time_series_index not in X.columns and \ 374 | (X.index.name is None or self.time_series_index not in X.index.name): 375 | raise ValueError('Can not fund time_series_index {} in both columns and index' 376 | .format(self.time_series_index)) 377 | elif self.time_series_index in X.columns: 378 | X.set_index(self.time_series_index, inplace=True) 379 | 380 | # 判断是否有重复时间 381 | if len(X.index.drop_duplicates()) < len(X): 382 | raise ValueError('For time_series Data, time_series data should be unique') 383 | 384 | X_combine = X.copy() 385 | X_combine['_label'] = y.values if isinstance(y, pd.Series) else y 386 | X_combine.sort_index(inplace=True) 387 | X, y = X_combine.loc[:, self.feature_names], X_combine.loc[:, '_label'] 388 | time_series_data = X.index.values 389 | 390 | else: 391 | if self.time_series_index is None: 392 | raise ValueError('For panel Data, time_series_index should NOT be None') 393 | if self.security_index is None: 394 | raise ValueError('For panel Data, security_index should NOT be None') 395 | 396 | # security time_series 进入index 397 | if self.time_series_index not in X.columns and \ 398 | (X.index.name is None or self.time_series_index not in X.index.name): 399 | raise ValueError('Can not fund time_series_index {} in both columns and index' 400 | .format(self.time_series_index)) 401 | elif self.security_index not in X.columns and \ 402 | (X.index.name is None or self.security_index not in X.index.name): 403 | raise ValueError('Can not fund security_index {} in both columns and index' 404 | .format(self.security_index)) 405 | elif self.time_series_index in X.columns and self.security_index in X.columns: 406 | X.set_index([self.time_series_index, self.security_index], inplace=True) 407 | elif self.time_series_index in X.columns: 408 | X.set_index(self.security_index, inplace=True, append=True) 409 | elif self.security_index in X.columns: 410 | X.set_index(self.time_series_index, inplace=True, append=True) 411 | 412 | # 判断没有重复 413 | if len(X.index) != len(X.index.drop_duplicates()): 414 | raise ValueError('For time_series Data, time_series data should be unique') 415 | 416 | 417 | X_combine = X.copy() 418 | X_combine['_label'] = y.values if isinstance(y, pd.Series) else y 419 | X_combine.sort_index(inplace=True) 420 | X, y = X_combine.loc[:, self.feature_names], X_combine.loc[:, '_label'] 421 | time_series_data = X.index.get_level_values(self.time_series_index).values 422 | security_data = X.index.get_level_values(self.security_index).values 423 | 424 | # 检查category_features是否与全包含在feature_names中 425 | # 当存在分类数据时,输入数据类型必须为pd。DataFrame 426 | if self.category_features is not None: 427 | if not isinstance(X, pd.DataFrame): 428 | raise ValueError('while there are category_features in X, X must be pd.DataFrame') 429 | if not isinstance(self.category_features, list): 430 | raise ValueError('category_features must be list') 431 | for cat_feature in self.category_features: 432 | if cat_feature not in self.feature_names: 433 | raise ValueError('Valid category_feature {} , not in feature_names'.format(cat_feature)) 434 | # 处理分类数据,转换为整型 435 | label_encoder = LabelEncoder() 436 | X[self.category_features] = X[self.category_features].apply(label_encoder.fit_transform) 437 | # 重构顺序,将分类类型放在前面 438 | self.feature_names = \ 439 | [self.category_features + [_col for _col in self.feature_names if _col not in self.category_features]] 440 | X = X[self.feature_names] 441 | 442 | # Check arrays 443 | if sample_weight is not None: 444 | sample_weight = _check_sample_weight(sample_weight, X) 445 | 446 | # 检查数据内容 447 | if isinstance(self, ClassifierMixin): 448 | # 验证y是否为分类数据, X, y强转ndarray 449 | # todo 分类场景的处理有待优化,暂时不处理 450 | X, y = self._validate_data(X, y, y_numeric=False) 451 | check_classification_targets(y) 452 | 453 | if self.class_weight: 454 | if sample_weight is None: 455 | sample_weight = 1. 456 | # modify the sample weights with the corresponding class weight 457 | sample_weight = (sample_weight * 458 | compute_sample_weight(self.class_weight, y)) 459 | 460 | self.classes_, y = np.unique(y, return_inverse=True) 461 | n_trim_classes = np.count_nonzero(np.bincount(y, sample_weight)) 462 | if n_trim_classes != 2: 463 | raise ValueError("y contains %d class after sample_weight " 464 | "trimmed classes with zero weights, while 2 " 465 | "classes are required." 466 | % n_trim_classes) 467 | self.n_classes_ = len(self.classes_) 468 | 469 | else: 470 | # 验证y是否为数值数据, X, y强转ndarray 471 | X, y = self._validate_data(X, y, y_numeric=True) 472 | 473 | # check hall_of_fame and n_components ,if have 474 | hall_of_fame = self.hall_of_fame 475 | if hall_of_fame is None: 476 | hall_of_fame = self.population_size 477 | if hall_of_fame > self.population_size or hall_of_fame < 1: 478 | raise ValueError('hall_of_fame (%d) must be less than or equal to ' 479 | 'population_size (%d).' % (self.hall_of_fame, 480 | self.population_size)) 481 | n_components = self.n_components 482 | if n_components is None: 483 | n_components = hall_of_fame 484 | if n_components > hall_of_fame or n_components < 1: 485 | raise ValueError('n_components (%d) must be less than or equal to ' 486 | 'hall_of_fame (%d).' % (self.n_components, 487 | self.hall_of_fame)) 488 | 489 | # 检查feature_names是否与n_features_in_一致 490 | if self.feature_names is not None: 491 | if self.n_features_in_ != len(self.feature_names): 492 | raise ValueError('The supplied `feature_names` has different ' 493 | 'length to n_features. Expected %d, got %d.' 494 | % (self.n_features_in_, 495 | len(self.feature_names))) 496 | for feature_name in self.feature_names: 497 | if not isinstance(feature_name, str): 498 | raise ValueError('invalid type %s found in ' 499 | '`feature_names`.' % type(feature_name)) 500 | 501 | # 检查const_range 502 | if not ((isinstance(self.const_range, tuple) and 503 | len(self.const_range) == 2) or self.const_range is None): 504 | raise ValueError('const_range should be a tuple with length two, ' 505 | 'or None.') 506 | 507 | # 检查function, 稍作修改, 结合const_range到range里面, 并区分number func 和 cat function 508 | # 存放不同类型的函数(分类和数值) 509 | self._function_dict = {'number': [], 'category': []} 510 | # 检验是否存在接受分类变量参数的函数 511 | _cat_func_flag = False 512 | for function in self.function_set: 513 | # 类型检验 514 | if isinstance(function, str): 515 | if function not in _function_map: 516 | raise ValueError('invalid function name %s found in ' 517 | '`function_set`.' % function) 518 | function = deepcopy(_function_map[function]) 519 | function.add_range(self.const_range) 520 | self._function_dict['number'].append(function) 521 | elif isinstance(function, _Function): 522 | function = deepcopy(function) 523 | # 添加常数范围 524 | function.add_range(self.const_range) 525 | # 检验是否有仅接收分类变量的函数 526 | if not _cat_func_flag: 527 | for _param in function.param_type: 528 | if len(_param) == 1 and 'vector' in _param and \ 529 | len(_param['vector']) == 1 and 'category' in _param['vector']: 530 | _cat_func_flag = True 531 | if function.return_type == 'number': 532 | self._function_dict['number'].append(function) 533 | else: 534 | self._function_dict['category'].append(function) 535 | else: 536 | raise ValueError('invalid type %s found in `function_set`.' 537 | % type(function)) 538 | 539 | # number类型函数必须有 540 | if len(self._function_dict['number']) == 0: 541 | raise ValueError('No valid functions found in `function_set`.') 542 | 543 | # 当存在只接受分类变量参数的函数时(如groupby),category变量不能为空 544 | if _cat_func_flag and len(self.category_features) == 0: 545 | raise ValueError('There no category var in input features, but there are functions only get category param') 546 | 547 | # 点变异记录函数参数个数, 需要在点变异中再考察参数类型 548 | self._arities = {} 549 | for _type in ['number', 'category']: 550 | for function in self._function_dict[_type]: 551 | arity = function.arity 552 | self._arities[arity] = self._arities.get(arity, []) 553 | self._arities[arity].append(function) 554 | 555 | # 检查fitness 556 | if isinstance(self.metric, _Fitness): 557 | self._metric = self.metric 558 | elif isinstance(self, RegressorMixin): 559 | if self.metric not in ('mean absolute error', 'mse', 'rmse', 560 | 'pearson', 'spearman'): 561 | raise ValueError('Unsupported metric: %s' % self.metric) 562 | self._metric = _fitness_map[self.metric] 563 | elif isinstance(self, ClassifierMixin): 564 | if self.metric != 'log loss': 565 | raise ValueError('Unsupported metric: %s' % self.metric) 566 | self._metric = _fitness_map[self.metric] 567 | elif isinstance(self, TransformerMixin): 568 | if self.metric not in ('pearson', 'spearman'): 569 | raise ValueError('Unsupported metric: %s' % self.metric) 570 | self._metric = _fitness_map[self.metric] 571 | 572 | # 检查概率参数 573 | # todo 增加交叉变异方法后需要修改此处 574 | self._method_probs = np.array([self.p_crossover, 575 | self.p_subtree_mutation, 576 | self.p_hoist_mutation, 577 | self.p_point_mutation]) 578 | self._method_probs = np.cumsum(self._method_probs) 579 | if self._method_probs[-1] > 1: 580 | raise ValueError('The sum of p_crossover, p_subtree_mutation, ' 581 | 'p_hoist_mutation and p_point_mutation should ' 582 | 'total to 1.0 or less.') 583 | 584 | # 检查初始化模式 585 | if self.init_method not in ('half and half', 'grow', 'full'): 586 | raise ValueError('Valid program initializations methods include ' 587 | '"grow", "full" and "half and half". Given %s.' 588 | % self.init_method) 589 | 590 | # 检查初始化深度 591 | if (not isinstance(self.init_depth, tuple) or 592 | len(self.init_depth) != 2): 593 | raise ValueError('init_depth should be a tuple with length two.') 594 | if self.init_depth[0] > self.init_depth[1]: 595 | raise ValueError('init_depth should be in increasing numerical ' 596 | 'order: (min_depth, max_depth).') 597 | 598 | # 初始化transformer函数 599 | if self.transformer is not None: 600 | if isinstance(self.transformer, _Function): 601 | self._transformer = self.transformer 602 | elif self.transformer == 'sigmoid': 603 | self._transformer = sigmoid 604 | else: 605 | raise ValueError('Invalid `transformer`. Expected either ' 606 | '"sigmoid" or _Function object, got %s' % 607 | type(self.transformer)) 608 | if self._transformer.arity != 1: 609 | raise ValueError('Invalid arity for `transformer`. Expected 1, ' 610 | 'got %d.' % (self._transformer.arity)) 611 | 612 | params = self.get_params() 613 | params['_metric'] = self._metric 614 | if hasattr(self, '_transformer'): 615 | params['_transformer'] = self._transformer 616 | else: 617 | params['_transformer'] = None 618 | params['function_dict'] = self._function_dict 619 | params['arities'] = self._arities 620 | params['method_probs'] = self._method_probs 621 | params['cat_var_number'] = len(self.category_features) if self.category_features is not None else 0 622 | 623 | # 清空_program 624 | if not self.warm_start or not hasattr(self, '_programs'): 625 | # Free allocated memory, if any 626 | self._programs = [] 627 | self.run_details_ = {'generation': [], 628 | 'average_length': [], 629 | 'average_fitness': [], 630 | 'best_length': [], 631 | 'best_fitness': [], 632 | 'best_oob_fitness': [], 633 | 'generation_time': []} 634 | 635 | prior_generations = len(self._programs) 636 | n_more_generations = self.generations - prior_generations 637 | 638 | if n_more_generations < 0: 639 | raise ValueError('generations=%d must be larger or equal to ' 640 | 'len(_programs)=%d when warm_start==True' 641 | % (self.generations, len(self._programs))) 642 | elif n_more_generations == 0: 643 | fitness = [program.raw_fitness_ for program in self._programs[-1]] 644 | warn('Warm-start fitting without increasing n_estimators does not ' 645 | 'fit new programs.') 646 | 647 | if self.warm_start: 648 | # Generate and discard seeds that would have been produced on the 649 | # initial fit call. 650 | for i in range(len(self._programs)): 651 | _ = random_state.randint(MAX_INT, size=self.population_size) 652 | 653 | if self.verbose: 654 | # Print header fields 655 | self._verbose_reporter() 656 | 657 | for gen in range(prior_generations, self.generations): 658 | start_time = time() 659 | 660 | if gen == 0: 661 | parents = None 662 | else: 663 | try: 664 | parents = self._programs[gen - 1] 665 | except: 666 | print(len(self._programs)) 667 | print(gen) 668 | 669 | exit() 670 | # Parallel loop 671 | # 将population_size分配给n_job个进程 672 | n_jobs, n_programs, starts = _partition_estimators(self.population_size, self.n_jobs) 673 | seeds = random_state.randint(MAX_INT, size=self.population_size) 674 | 675 | population = Parallel(n_jobs=n_jobs, 676 | verbose=int(self.verbose > 1))( 677 | delayed(_parallel_evolve)(n_programs[i], 678 | parents, 679 | X, 680 | y, 681 | security_data, 682 | time_series_data, 683 | sample_weight, 684 | seeds[starts[i]:starts[i + 1]], 685 | params) 686 | for i in range(n_jobs)) 687 | 688 | # Reduce, maintaining order across different n_jobs 689 | population = list(itertools.chain.from_iterable(population)) 690 | 691 | fitness = [program.raw_fitness_ for program in population] 692 | length = [program.length_ for program in population] 693 | 694 | # 惩罚系数 695 | parsimony_coefficient = None 696 | if self.parsimony_coefficient == 'auto': 697 | parsimony_coefficient = (np.cov(length, fitness)[1, 0] / 698 | np.var(length)) 699 | for program in population: 700 | program.fitness_ = program.fitness(parsimony_coefficient) 701 | 702 | self._programs.append(population) 703 | 704 | # 去除没有进入下一代的父辈种群 705 | if not self.low_memory: 706 | for old_gen in np.arange(gen, 0, -1): 707 | indices = [] 708 | for program in self._programs[old_gen]: 709 | if program is not None: 710 | for idx in program.parents: 711 | if 'idx' in idx: 712 | indices.append(program.parents[idx]) 713 | indices = set(indices) 714 | for idx in range(self.population_size): 715 | if idx not in indices: 716 | self._programs[old_gen - 1][idx] = None 717 | elif gen > 0: 718 | # 在low_memory的情况下,去除所有 719 | self._programs[gen - 1] = None 720 | 721 | # 记录运行细节 722 | if self._metric.greater_is_better: 723 | best_program = population[np.argmax(fitness)] 724 | else: 725 | best_program = population[np.argmin(fitness)] 726 | 727 | self.run_details_['generation'].append(gen) 728 | self.run_details_['average_length'].append(np.mean(length)) 729 | self.run_details_['average_fitness'].append(np.mean(fitness)) 730 | self.run_details_['best_length'].append(best_program.length_) 731 | self.run_details_['best_fitness'].append(best_program.raw_fitness_) 732 | oob_fitness = np.nan 733 | if self.max_samples < 1.0: 734 | oob_fitness = best_program.oob_fitness_ 735 | self.run_details_['best_oob_fitness'].append(oob_fitness) 736 | generation_time = time() - start_time 737 | self.run_details_['generation_time'].append(generation_time) 738 | 739 | if self.verbose: 740 | self._verbose_reporter(self.run_details_) 741 | 742 | # 是否进入停止条件 743 | if self._metric.greater_is_better: 744 | best_fitness = fitness[np.argmax(fitness)] 745 | if best_fitness >= self.stopping_criteria: 746 | break 747 | else: 748 | best_fitness = fitness[np.argmin(fitness)] 749 | if best_fitness <= self.stopping_criteria: 750 | break 751 | 752 | # 特征工程专属模块 753 | if isinstance(self, TransformerMixin): 754 | # Find the best individuals in the final generation 755 | fitness = np.array(fitness) 756 | # 找出适应度最优的hall_of_fame个进入fitness 757 | if self._metric.greater_is_better: 758 | hall_of_fame = fitness.argsort()[::-1][:self.hall_of_fame] 759 | else: 760 | hall_of_fame = fitness.argsort()[:self.hall_of_fame] 761 | evaluation = np.array([gp.execute(X) for gp in 762 | [self._programs[-1][i] for 763 | i in hall_of_fame]]) 764 | if self.metric == 'spearman': 765 | evaluation = np.apply_along_axis(rankdata, 1, evaluation) 766 | 767 | with np.errstate(divide='ignore', invalid='ignore'): 768 | correlations = np.abs(np.corrcoef(evaluation)) 769 | np.fill_diagonal(correlations, 0.) 770 | components = list(range(self.hall_of_fame)) 771 | indices = list(range(self.hall_of_fame)) 772 | # Iteratively remove least fit individual of most correlated pair 773 | while len(components) > self.n_components: 774 | # 去除hall_of_fame - n_components个高度相关特征 775 | # 找到相关系数矩阵中相关系数绝对值最大的两个特征,删去其中fitness较低的那个 776 | # 相关性低于某一阈值时按照fitness筛选(gplearnplus新增) 777 | most_correlated = np.unravel_index(np.argmax(correlations), 778 | correlations.shape) 779 | # The correlation matrix is sorted by fitness, so identifying 780 | # the least fit of the pair is simply getting the higher index 781 | worst = max(most_correlated) 782 | components.pop(worst) 783 | indices.remove(worst) 784 | correlations = correlations[:, indices][indices, :] 785 | if np.max(correlations) < self.tolerable_corr: 786 | break 787 | indices = list(range(len(components))) 788 | # 余下的选出最优的self.n_components个 789 | components = components[:self.n_components] 790 | self._best_programs = [self._programs[-1][i] for i in 791 | hall_of_fame[components]] 792 | 793 | else: 794 | # Find the best individual in the final generation 795 | if self._metric.greater_is_better: 796 | self._program = self._programs[-1][np.argmax(fitness)] 797 | else: 798 | self._program = self._programs[-1][np.argmin(fitness)] 799 | 800 | return self 801 | 802 | 803 | class SymbolicRegressor(BaseSymbolic, RegressorMixin): 804 | def __init__(self, 805 | *, 806 | population_size=1000, 807 | generations=20, 808 | tournament_size=20, 809 | stopping_criteria=0.0, 810 | const_range=(-1., 1.), 811 | init_depth=(2, 6), 812 | init_method='half and half', 813 | function_set=('add', 'sub', 'mul', 'div'), 814 | metric='mean absolute error', 815 | parsimony_coefficient=0.001, 816 | p_crossover=0.9, 817 | p_subtree_mutation=0.01, 818 | p_hoist_mutation=0.01, 819 | p_point_mutation=0.01, 820 | p_point_replace=0.05, 821 | max_samples=1.0, 822 | feature_names=None, 823 | time_series_index=None, 824 | security_index=None, 825 | category_features=None, 826 | warm_start=False, 827 | low_memory=False, 828 | n_jobs=1, 829 | verbose=0, 830 | data_type='section', 831 | random_state=None): 832 | super(SymbolicRegressor, self).__init__( 833 | population_size=population_size, 834 | generations=generations, 835 | tournament_size=tournament_size, 836 | stopping_criteria=stopping_criteria, 837 | const_range=const_range, 838 | init_depth=init_depth, 839 | init_method=init_method, 840 | function_set=function_set, 841 | metric=metric, 842 | parsimony_coefficient=parsimony_coefficient, 843 | p_crossover=p_crossover, 844 | p_subtree_mutation=p_subtree_mutation, 845 | p_hoist_mutation=p_hoist_mutation, 846 | p_point_mutation=p_point_mutation, 847 | p_point_replace=p_point_replace, 848 | max_samples=max_samples, 849 | feature_names=feature_names, 850 | time_series_index=time_series_index, 851 | security_index=security_index, 852 | category_features=category_features, 853 | warm_start=warm_start, 854 | low_memory=low_memory, 855 | n_jobs=n_jobs, 856 | verbose=verbose, 857 | random_state=random_state, 858 | data_type=data_type) 859 | 860 | def __str__(self): 861 | """Overloads `print` output of the object to resemble a LISP tree.""" 862 | if not hasattr(self, '_program'): 863 | return self.__repr__() 864 | return self._program.__str__() 865 | 866 | def predict(self, X): 867 | """Perform regression on test vectors X. 868 | 869 | Parameters 870 | ---------- 871 | X : array-like, shape = [n_samples, n_features] 872 | Input vectors, where n_samples is the number of samples 873 | and n_features is the number of features. 874 | 875 | Returns 876 | ------- 877 | y : array, shape = [n_samples] 878 | Predicted values for X. 879 | 880 | """ 881 | if not hasattr(self, '_program'): 882 | raise NotFittedError('SymbolicRegressor not fitted.') 883 | 884 | X = check_array(X) 885 | _, n_features = X.shape 886 | if self.n_features_in_ != n_features: 887 | raise ValueError('Number of features of the model must match the ' 888 | 'input. Model n_features is %s and input ' 889 | 'n_features is %s.' 890 | % (self.n_features_in_, n_features)) 891 | 892 | y = self._program.execute(X) 893 | 894 | return y 895 | 896 | 897 | class SymbolicClassifier(BaseSymbolic, ClassifierMixin): 898 | def __init__(self, 899 | *, 900 | population_size=1000, 901 | generations=20, 902 | tournament_size=20, 903 | stopping_criteria=0.0, 904 | const_range=(-1., 1.), 905 | init_depth=(2, 6), 906 | init_method='half and half', 907 | function_set=('add', 'sub', 'mul', 'div'), 908 | transformer='sigmoid', 909 | metric='log loss', 910 | parsimony_coefficient=0.001, 911 | p_crossover=0.9, 912 | p_subtree_mutation=0.01, 913 | p_hoist_mutation=0.01, 914 | p_point_mutation=0.01, 915 | p_point_replace=0.05, 916 | max_samples=1.0, 917 | class_weight=None, 918 | feature_names=None, 919 | time_series_index=None, 920 | security_index=None, 921 | category_features=None, 922 | warm_start=False, 923 | low_memory=False, 924 | n_jobs=1, 925 | verbose=0, 926 | data_type='section', 927 | random_state=None): 928 | super(SymbolicClassifier, self).__init__( 929 | population_size=population_size, 930 | generations=generations, 931 | tournament_size=tournament_size, 932 | stopping_criteria=stopping_criteria, 933 | const_range=const_range, 934 | init_depth=init_depth, 935 | init_method=init_method, 936 | function_set=function_set, 937 | transformer=transformer, 938 | metric=metric, 939 | parsimony_coefficient=parsimony_coefficient, 940 | p_crossover=p_crossover, 941 | p_subtree_mutation=p_subtree_mutation, 942 | p_hoist_mutation=p_hoist_mutation, 943 | p_point_mutation=p_point_mutation, 944 | p_point_replace=p_point_replace, 945 | max_samples=max_samples, 946 | class_weight=class_weight, 947 | feature_names=feature_names, 948 | time_series_index=time_series_index, 949 | security_index=security_index, 950 | category_features=category_features, 951 | warm_start=warm_start, 952 | low_memory=low_memory, 953 | n_jobs=n_jobs, 954 | verbose=verbose, 955 | data_type=data_type, 956 | random_state=random_state) 957 | 958 | def __str__(self): 959 | """Overloads `print` output of the object to resemble a LISP tree.""" 960 | if not hasattr(self, '_program'): 961 | return self.__repr__() 962 | return self._program.__str__() 963 | 964 | def _more_tags(self): 965 | return {'binary_only': True} 966 | 967 | def predict_proba(self, X): 968 | # 输出概率 只支持二分类 969 | if not hasattr(self, '_program'): 970 | raise NotFittedError('SymbolicClassifier not fitted.') 971 | 972 | X = check_array(X) 973 | _, n_features = X.shape 974 | if self.n_features_in_ != n_features: 975 | raise ValueError('Number of features of the model must match the ' 976 | 'input. Model n_features is %s and input ' 977 | 'n_features is %s.' 978 | % (self.n_features_in_, n_features)) 979 | 980 | scores = self._program.execute(X) 981 | proba = self._transformer(scores) 982 | proba = np.vstack([1 - proba, proba]).T 983 | return proba 984 | 985 | def predict(self, X): 986 | # 输出预测结果 987 | proba = self.predict_proba(X) 988 | return self.classes_.take(np.argmax(proba, axis=1), axis=0) 989 | 990 | 991 | class SymbolicTransformer(BaseSymbolic, TransformerMixin): 992 | def __init__(self, 993 | *, 994 | population_size=1000, 995 | hall_of_fame=100, 996 | n_components=10, 997 | generations=20, 998 | tournament_size=20, 999 | stopping_criteria=1.0, 1000 | const_range=(-1., 1.), 1001 | init_depth=(2, 6), 1002 | init_method='half and half', 1003 | function_set=('add', 'sub', 'mul', 'div'), 1004 | metric='pearson', 1005 | parsimony_coefficient=0.001, 1006 | p_crossover=0.9, 1007 | p_subtree_mutation=0.01, 1008 | p_hoist_mutation=0.01, 1009 | p_point_mutation=0.01, 1010 | p_point_replace=0.05, 1011 | max_samples=1.0, 1012 | tolerable_corr=0.0, 1013 | feature_names=None, 1014 | time_series_index=None, 1015 | security_index=None, 1016 | category_features=None, 1017 | warm_start=False, 1018 | low_memory=False, 1019 | n_jobs=1, 1020 | verbose=0, 1021 | data_type='section', 1022 | random_state=None): 1023 | super(SymbolicTransformer, self).__init__( 1024 | population_size=population_size, 1025 | hall_of_fame=hall_of_fame, 1026 | n_components=n_components, 1027 | generations=generations, 1028 | tournament_size=tournament_size, 1029 | stopping_criteria=stopping_criteria, 1030 | const_range=const_range, 1031 | init_depth=init_depth, 1032 | init_method=init_method, 1033 | function_set=function_set, 1034 | metric=metric, 1035 | parsimony_coefficient=parsimony_coefficient, 1036 | p_crossover=p_crossover, 1037 | p_subtree_mutation=p_subtree_mutation, 1038 | p_hoist_mutation=p_hoist_mutation, 1039 | p_point_mutation=p_point_mutation, 1040 | p_point_replace=p_point_replace, 1041 | max_samples=max_samples, 1042 | tolerable_corr=tolerable_corr, 1043 | feature_names=feature_names, 1044 | time_series_index=time_series_index, 1045 | security_index=security_index, 1046 | category_features=category_features, 1047 | warm_start=warm_start, 1048 | low_memory=low_memory, 1049 | n_jobs=n_jobs, 1050 | verbose=verbose, 1051 | data_type=data_type, 1052 | random_state=random_state) 1053 | 1054 | def __len__(self): 1055 | """Overloads `len` output to be the number of fitted components.""" 1056 | if not hasattr(self, '_best_programs'): 1057 | return 0 1058 | return self.n_components 1059 | 1060 | def __getitem__(self, item): 1061 | """Return the ith item of the fitted components.""" 1062 | if item >= len(self): 1063 | raise IndexError 1064 | return self._best_programs[item] 1065 | 1066 | def __str__(self): 1067 | """Overloads `print` output of the object to resemble LISP trees.""" 1068 | if not hasattr(self, '_best_programs'): 1069 | return self.__repr__() 1070 | output = str([gp.__str__() for gp in self]) 1071 | return output.replace("',", ",\n").replace("'", "") 1072 | 1073 | def _more_tags(self): 1074 | return { 1075 | "_xfail_checks": { 1076 | "check_sample_weights_invariance": ( 1077 | "zero sample_weight is not equivalent to removing samples" 1078 | ), 1079 | } 1080 | } 1081 | 1082 | def transform(self, X): 1083 | # 将X转换成以及训练好的特征 1084 | if not hasattr(self, '_best_programs'): 1085 | raise NotFittedError('SymbolicTransformer not fitted.') 1086 | 1087 | X = check_array(X) 1088 | _, n_features = X.shape 1089 | if self.n_features_in_ != n_features: 1090 | raise ValueError('Number of features of the model must match the ' 1091 | 'input. Model n_features is %s and input ' 1092 | 'n_features is %s.' 1093 | % (self.n_features_in_, n_features)) 1094 | 1095 | X_new = np.array([gp.execute(X) for gp in self._best_programs]).T 1096 | 1097 | return X_new 1098 | 1099 | def fit_transform(self, X, y, sample_weight=None): 1100 | # 训练之后转换 1101 | return self.fit(X, y, sample_weight).transform(X) 1102 | 1103 | --------------------------------------------------------------------------------