├── readme_pic
│   └── tree.png
├── __init__.py
├── utils.py
├── fitness.py
├── data_trans.py
├── README.md
├── .idea
│   └── workspace.xml
├── functions.py
├── example.py
├── _program.py
└── genetic.py
/readme_pic/tree.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ACEACEjasonhuang/gplearnplus/HEAD/readme_pic/tree.png
--------------------------------------------------------------------------------
/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | -------------------------------------------------
4 | # @Project :gplearnplus
5 | # @File :__init__.py
6 | # @Date :2022/12/1 0001 13:36
7 | # @Author :Junzhe Huang
8 | # @Email :acejasonhuang@163.com
9 | # @Software :PyCharm
10 | -------------------------------------------------
11 | """
12 | __version__ = '1.5.9'
13 |
14 | __all__ = ['genetic', 'functions', 'fitness', 'example']
15 |
--------------------------------------------------------------------------------
/utils.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | -------------------------------------------------
4 | # @Project :gplearnplus
5 | # @File :utils
6 | # @Date :2022/12/1 0001 13:38
7 | # @Author :Junzhe Huang
8 | # @Email :acejasonhuang@163.com
9 | # @Software :PyCharm
10 | -------------------------------------------------
11 | """
12 |
13 | import numbers
14 |
15 | import numpy as np
16 | from joblib import cpu_count
17 |
18 |
19 | # Convert a seed into an np.random.RandomState instance
20 | def check_random_state(seed):
21 | """Turn seed into a np.random.RandomState instance
22 |
23 | Parameters
24 | ----------
25 | seed : None | int | instance of RandomState
26 | If seed is None, return the RandomState singleton used by np.random.
27 | If seed is an int, return a new RandomState instance seeded with seed.
28 | If seed is already a RandomState instance, return it.
29 | Otherwise raise ValueError.
30 |
31 | """
32 | if seed is None or seed is np.random:
33 | return np.random.mtrand._rand
34 | if isinstance(seed, (numbers.Integral, np.integer)):
35 | return np.random.RandomState(seed)
36 | if isinstance(seed, np.random.RandomState):
37 | return seed
38 | raise ValueError('%r cannot be used to seed a numpy.random.RandomState'
39 | ' instance' % seed)
40 |
41 |
42 | # Normalize n_jobs (-1 means use all CPUs)
43 | def _get_n_jobs(n_jobs):
44 | """Get number of jobs for the computation.
45 |
46 | This function reimplements the logic of joblib to determine the actual
47 | number of jobs depending on the cpu count. If -1 all CPUs are used.
48 | If 1 is given, no parallel computing code is used at all, which is useful
49 | for debugging. For n_jobs below -1, (n_cpus + 1 + n_jobs) are used.
50 | Thus for n_jobs = -2, all CPUs but one are used.
51 |
52 | Parameters
53 | ----------
54 | n_jobs : int
55 | Number of jobs stated in joblib convention.
56 |
57 | Returns
58 | -------
59 | n_jobs : int
60 | The actual number of jobs as positive integer.
61 |
62 | """
63 | if n_jobs < 0:
64 | return max(cpu_count() + 1 + n_jobs, 1)
65 | elif n_jobs == 0:
66 | raise ValueError('Parameter n_jobs == 0 has no meaning.')
67 | else:
68 | return n_jobs
69 |
70 |
71 | # Distribute the estimators across the jobs;
72 | # returns the number of jobs, the tasks per job, and the cumulative task offsets
73 | def _partition_estimators(n_estimators, n_jobs):
74 | """Private function used to partition estimators between jobs."""
75 | # Compute the number of jobs
76 | n_jobs = min(_get_n_jobs(n_jobs), n_estimators)
77 |
78 | # Partition estimators between jobs
80 | n_estimators_per_job = (n_estimators // n_jobs) * np.ones(n_jobs,
81 | dtype=int)
82 | n_estimators_per_job[:n_estimators % n_jobs] += 1
83 | starts = np.cumsum(n_estimators_per_job)
84 |
85 | return n_jobs, n_estimators_per_job.tolist(), [0] + starts.tolist()
86 |
--------------------------------------------------------------------------------
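A quick usage sketch of these helpers (not part of the file; assumes the package imports as `gplearnplus`):

```python
import numpy as np
from gplearnplus.utils import check_random_state, _partition_estimators

rng = check_random_state(42)                    # int seed -> RandomState instance
print(isinstance(rng, np.random.RandomState))   # True

# 10 estimators over 4 jobs: the remainder is spread over the first jobs.
n_jobs, per_job, starts = _partition_estimators(10, 4)
print(n_jobs, per_job, starts)                  # 4 [3, 3, 2, 2] [0, 3, 6, 8, 10]
```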
/fitness.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | -------------------------------------------------
4 | # @Project :gplearnplus
5 | # @File :fitness
6 | # @Date :2022/12/5 0005 7:25
7 | # @Author :Junzhe Huang
8 | # @Email :acejasonhuang@163.com
9 | # @Software :PyCharm
10 | -------------------------------------------------
11 | """
12 |
13 | import numbers
14 |
15 | import numpy as np
16 | from joblib import wrap_non_picklable_objects
17 | from scipy.stats import rankdata
18 |
19 | __all__ = ['make_fitness']
20 |
21 |
22 | class _Fitness(object):
23 |
24 | """A metric to measure the fitness of a program.
25 |
26 | This object is able to be called with NumPy vectorized arguments and return
27 | a resulting floating point score quantifying the quality of the program's
28 | representation of the true relationship.
29 |
30 | Parameters
31 | ----------
32 | function : callable
33 | A function with signature function(y, y_pred, sample_weight) that
34 | returns a floating point number. Where `y` is the input target y
35 | vector, `y_pred` is the predicted values from the genetic program, and
36 | sample_weight is the sample_weight vector.
37 |
38 | greater_is_better : bool
39 | Whether a higher value from `function` indicates a better fit. In
40 | general this would be False for metrics indicating the magnitude of
41 | the error, and True for metrics indicating the quality of fit.
42 |
43 | """
44 |
45 | def __init__(self, function, greater_is_better):
46 | self.function = function
47 | self.greater_is_better = greater_is_better
48 | self.sign = 1 if greater_is_better else -1
49 |
50 | def __call__(self, *args):
51 | return self.function(*args)
52 |
53 |
54 | def make_fitness(*, function, greater_is_better, wrap=True):
55 | """Make a fitness measure, a metric scoring the quality of a program's fit.
56 |
57 | This factory function creates a fitness measure object which measures the
58 | quality of a program's fit and thus its likelihood to undergo genetic
59 | operations into the next generation. The resulting object is able to be
60 | called with NumPy vectorized arguments and return a resulting floating
61 | point score quantifying the quality of the program's representation of the
62 | true relationship.
63 |
64 | Parameters
65 | ----------
66 | function : callable
67 | A function with signature function(y, y_pred, sample_weight) that
68 | returns a floating point number. Where `y` is the input target y
69 | vector, `y_pred` is the predicted values from the genetic program, and
70 | sample_weight is the sample_weight vector.
71 |
72 | greater_is_better : bool
73 | Whether a higher value from `function` indicates a better fit. In
74 | general this would be False for metrics indicating the magnitude of
75 | the error, and True for metrics indicating the quality of fit.
76 |
77 | wrap : bool, optional (default=True)
78 | When running in parallel, pickling of custom metrics is not supported
79 | by Python's default pickler. This option will wrap the function using
80 | cloudpickle allowing you to pickle your solution, but the evolution may
81 | run slightly more slowly. If you are running single-threaded in an
82 | interactive Python session or have no need to save the model, set to
83 | `False` for faster runs.
84 |
85 | """
86 | if not isinstance(greater_is_better, bool):
87 | raise ValueError('greater_is_better must be bool, got %s'
88 | % type(greater_is_better))
89 | if not isinstance(wrap, bool):
90 | raise ValueError('wrap must be a bool, got %s' % type(wrap))
91 | if function.__code__.co_argcount != 3:
92 | raise ValueError('function requires 3 arguments (y, y_pred, w),'
93 | ' got %d.' % function.__code__.co_argcount)
94 | if not isinstance(function(np.array([1, 1]),
95 | np.array([2, 2]),
96 | np.array([1, 1])), numbers.Number):
97 | raise ValueError('function must return a numeric.')
98 |
99 | if wrap:
100 | return _Fitness(function=wrap_non_picklable_objects(function),
101 | greater_is_better=greater_is_better)
102 | return _Fitness(function=function,
103 | greater_is_better=greater_is_better)
104 |
105 |
106 | def _weighted_pearson(y, y_pred, w):
107 | """Calculate the weighted Pearson correlation coefficient."""
108 | with np.errstate(divide='ignore', invalid='ignore'):
109 | y_pred_demean = y_pred - np.average(y_pred, weights=w)
110 | y_demean = y - np.average(y, weights=w)
111 | corr = ((np.sum(w * y_pred_demean * y_demean) / np.sum(w)) /
112 | np.sqrt((np.sum(w * y_pred_demean ** 2) *
113 | np.sum(w * y_demean ** 2)) /
114 | (np.sum(w) ** 2)))
115 | if np.isfinite(corr):
116 | return np.abs(corr)
117 | return 0.
118 |
119 |
120 | def _weighted_spearman(y, y_pred, w):
121 | """Calculate the weighted Spearman correlation coefficient."""
122 | y_pred_ranked = np.apply_along_axis(rankdata, 0, y_pred)
123 | y_ranked = np.apply_along_axis(rankdata, 0, y)
124 | return _weighted_pearson(y_pred_ranked, y_ranked, w)
125 |
126 |
127 | def _mean_absolute_error(y, y_pred, w):
128 | """Calculate the mean absolute error."""
129 | return np.average(np.abs(y_pred - y), weights=w)
130 |
131 |
132 | def _mean_square_error(y, y_pred, w):
133 | """Calculate the mean square error."""
134 | return np.average(((y_pred - y) ** 2), weights=w)
135 |
136 |
137 | def _root_mean_square_error(y, y_pred, w):
138 | """Calculate the root mean square error."""
139 | return np.sqrt(np.average(((y_pred - y) ** 2), weights=w))
140 |
141 |
142 | def _log_loss(y, y_pred, w):
143 | """Calculate the log loss."""
144 | eps = 1e-15
145 | inv_y_pred = np.clip(1 - y_pred, eps, 1 - eps)
146 | y_pred = np.clip(y_pred, eps, 1 - eps)
147 | score = y * np.log(y_pred) + (1 - y) * np.log(inv_y_pred)
148 | return np.average(-score, weights=w)
149 |
150 |
151 | weighted_pearson = _Fitness(function=_weighted_pearson,
152 | greater_is_better=True)
153 | weighted_spearman = _Fitness(function=_weighted_spearman,
154 | greater_is_better=True)
155 | mean_absolute_error = _Fitness(function=_mean_absolute_error,
156 | greater_is_better=False)
157 | mean_square_error = _Fitness(function=_mean_square_error,
158 | greater_is_better=False)
159 | root_mean_square_error = _Fitness(function=_root_mean_square_error,
160 | greater_is_better=False)
161 | log_loss = _Fitness(function=_log_loss,
162 | greater_is_better=False)
163 |
164 | _fitness_map = {'pearson': weighted_pearson,
165 | 'spearman': weighted_spearman,
166 | 'mean absolute error': mean_absolute_error,
167 | 'mse': mean_square_error,
168 | 'rmse': root_mean_square_error,
169 | 'log loss': log_loss}
--------------------------------------------------------------------------------
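A minimal sketch of registering a custom metric through `make_fitness` (the WMAPE metric below is an illustration, not part of the library):

```python
import numpy as np
from gplearnplus.fitness import make_fitness

def _wmape(y, y_pred, w):
    # Weighted mean absolute percentage error, denominator guarded against zero.
    return np.average(np.abs((y - y_pred) / np.where(y == 0, 1e-15, y)), weights=w)

wmape = make_fitness(function=_wmape, greater_is_better=False, wrap=False)
print(wmape(np.array([1.0, 2.0]), np.array([1.1, 1.9]), np.array([1.0, 1.0])))  # 0.075
```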
/data_trans.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # @Time : 2024/2/7
3 | # @Author : Junzhe Huang
4 | # @Email : huangjz01@igoldenbeta.com
5 | # @File : data_trans
6 | # @Software : gplearnplus
7 | import numpy as np
8 | import pandas as pd
9 | from sklearn.preprocessing import LabelEncoder
10 | # TODO: being ported; the tail of this function still references `self` and is not yet runnable
11 | def data_transform(X, y, data_type, number_feature_list, category_feature_list=None,
12 | security_index=None, time_series_index=None):
13 | # Validate data_type
14 | if data_type not in ('section', 'time_series', 'panel'):
15 | raise ValueError('Valid data_type methods include '
16 | '"section", "time_series" and "panel". Given %s.'
17 | % data_type)
18 |
19 | # X must be a pd.DataFrame
20 | if not isinstance(X, pd.DataFrame):
21 | raise ValueError('Data structure must be DataFrame')
22 |
23 | # Check that y has the same length as X
24 | if len(X) != len(y):
25 | raise ValueError('X and y must have same length')
26 |
27 | # Check that the columns cover category_feature_list and number_feature_list,
28 | # then reorder so categorical features come first and numeric features after.
29 | # Intersection of X's columns with category_feature_list
30 | if category_feature_list is not None:
31 | if not isinstance(category_feature_list, list):
32 | raise ValueError('category_feature_list must be a list')
33 | category_feature_list_inX = [col for col in X.columns if col in category_feature_list]
34 | else:
35 | category_feature_list_inX = []
36 | # Intersection of X's columns with number_feature_list
37 | if not isinstance(number_feature_list, list):
38 | raise ValueError('number_feature_list must be a list')
39 | number_feature_list_inX = [col for col in X.columns if col in number_feature_list]
40 | # Reorder so categorical features come first, and prepend a constant column 'const_1' of ones
41 | X['const_1'] = 1
42 | feature_names = category_feature_list_inX + number_feature_list_inX
43 | X_trans = X[['const_1'] + feature_names].copy()
44 |
45 | # If security_index / time_series_index exist, append them at the end of X_trans (security_index first, then time_series_index)
46 | if security_index is not None:
47 | # If security_index is one of X's columns, or is X's index, append it to X_trans
48 | if security_index in X.columns:
49 | X_trans[security_index] = X[security_index]
50 | elif X.index.name == security_index:
51 | X_trans[security_index] = X.index.get_level_values(security_index)
52 | else:
53 | # security_index is in neither the columns nor the index
54 | raise ValueError('Cannot find security_index {} in either columns or index'
55 | .format(security_index))
56 | if time_series_index is not None:
57 | # If time_series_index is one of X's columns, or is X's index, append it to X_trans
58 | if time_series_index in X.columns:
59 | X_trans[time_series_index] = X[time_series_index]
60 | elif X.index.name == time_series_index:
61 | X_trans[time_series_index] = X.index.get_level_values(time_series_index)
62 | else:
63 | raise ValueError('Cannot find time_series_index {} in either columns or index'.format(time_series_index))
64 | # Validate the time index and the security index, separately for section, time-series and panel data
65 | if data_type == 'section':
66 | if time_series_index is not None:
67 | raise ValueError('For Section Data, time_series_index should be None')
68 | if security_index is not None:
69 | # Look up security_index in the index and columns
70 | # Securities must be unique in section data
71 | if len(X[security_index].unique()) < len(X[security_index]):
72 | raise ValueError('For Section Data, security data should be unique')
73 | elif data_type == 'time_series':
74 | if security_index is not None:
75 | raise ValueError('For time_series Data, security_index should be None')
76 | if time_series_index is not None:
77 | # Look up time_series_index in the index and columns
78 | if time_series_index not in X.columns and \
79 | (X.index.name is None or time_series_index not in X.index.name):
80 | raise ValueError('Cannot find time_series_index {} in either columns or index'
81 | .format(time_series_index))
82 | elif time_series_index in X.columns:
83 | X.set_index(time_series_index, inplace=True)
84 | # Timestamps must be unique
85 | if len(X.index.drop_duplicates()) < len(X):
86 | raise ValueError('For time_series Data, time_series data should be unique')
87 | X_combine = X.copy()
88 | X_combine['_label'] = y.values if isinstance(y, pd.Series) else y
89 | X_combine.sort_index(inplace=True)
90 | X, y = X_combine.loc[:, feature_names], X_combine.loc[:, '_label']
91 | 
92 | 
93 | time_series_data = X.index.values
94 |
95 | else:
96 | if time_series_index is None:
97 | raise ValueError('For panel Data, time_series_index should NOT be None')
98 | if security_index is None:
99 | raise ValueError('For panel Data, security_index should NOT be None')
100 | 
101 | # Move security and time_series into the index
102 | if time_series_index not in X.columns and \
103 | (X.index.name is None or time_series_index not in X.index.name):
104 | raise ValueError('Cannot find time_series_index {} in either columns or index'
105 | .format(time_series_index))
106 | elif security_index not in X.columns and \
107 | (X.index.name is None or security_index not in X.index.name):
108 | raise ValueError('Cannot find security_index {} in either columns or index'
109 | .format(security_index))
110 | elif time_series_index in X.columns and security_index in X.columns:
111 | X.set_index([time_series_index, security_index], inplace=True)
112 | elif time_series_index in X.columns:
113 | X.set_index(time_series_index, inplace=True, append=True)
114 | elif security_index in X.columns:
115 | X.set_index(security_index, inplace=True, append=True)
116 | 
117 | # (time, security) pairs must be unique
118 | if len(X.index) != len(X.index.drop_duplicates()):
119 | raise ValueError('For panel Data, (time, security) index pairs should be unique')
120 | 
121 | 
122 | X_combine = X.copy()
123 | X_combine['_label'] = y.values if isinstance(y, pd.Series) else y
124 | X_combine.sort_index(inplace=True)
125 | X, y = X_combine.loc[:, feature_names], X_combine.loc[:, '_label']
126 | time_series_data = X.index.get_level_values(time_series_index).values
127 | security_data = X.index.get_level_values(security_index).values
128 |
129 | # Check that every category feature is contained in feature_names.
130 | # When categorical data is present, X must be a pd.DataFrame
131 | if category_feature_list is not None:
132 | if not isinstance(X, pd.DataFrame):
133 | raise ValueError('while there are category features in X, X must be pd.DataFrame')
134 | if not isinstance(category_feature_list, list):
135 | raise ValueError('category_feature_list must be a list')
136 | for cat_feature in category_feature_list:
137 | if cat_feature not in feature_names:
138 | raise ValueError('Invalid category_feature {}, not in feature_names'.format(cat_feature))
139 | # Encode categorical data as integers
140 | label_encoder = LabelEncoder()
141 | X[category_feature_list] = X[category_feature_list].apply(label_encoder.fit_transform)
142 | # Reorder so that categorical features come first
143 | feature_names = \
144 | category_feature_list + [_col for _col in feature_names if _col not in category_feature_list]
145 | X = X[feature_names]
146 |
147 | # Check arrays
148 | # NOTE: everything below is still mid-port; it references `self` and `sample_weight`, which this function does not define yet
149 | if sample_weight is not None:
150 | sample_weight = _check_sample_weight(sample_weight, X)
151 | # Validate the data content
152 | if isinstance(self, ClassifierMixin):
153 | # Check that y is categorical; coerce X, y to ndarray
154 | # TODO: the classification path still needs work; left as-is for now
155 | X, y = self._validate_data(X, y, y_numeric=False)
156 | check_classification_targets(y)
157 |
158 | if self.class_weight:
159 | if sample_weight is None:
160 | sample_weight = 1.
161 | # modify the sample weights with the corresponding class weight
162 | sample_weight = (sample_weight *
163 | compute_sample_weight(self.class_weight, y))
164 |
165 | self.classes_, y = np.unique(y, return_inverse=True)
166 | n_trim_classes = np.count_nonzero(np.bincount(y, sample_weight))
167 | if n_trim_classes != 2:
168 | raise ValueError("y contains %d class after sample_weight "
169 | "trimmed classes with zero weights, while 2 "
170 | "classes are required."
171 | % n_trim_classes)
172 | self.n_classes_ = len(self.classes_)
173 |
174 | else:
175 | # Check that y is numeric; coerce X, y to ndarray
176 | X, y = self._validate_data(X, y, y_numeric=True)
--------------------------------------------------------------------------------
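For reference, a sketch of the input the ported head of `data_transform` expects (hypothetical section data; the tail of the function is still mid-port, so this only illustrates the column contract):

```python
import pandas as pd

# Hypothetical cross-section input for data_transform(X, y, 'section', ...)
X = pd.DataFrame({'industry': ['A', 'B', 'A'],   # categorical feature
                  'pe': [10.0, 12.5, 8.3],       # numeric features
                  'pb': [1.1, 2.0, 0.9]})
y = pd.Series([0.1, -0.2, 0.3])

# The ported head reorders the columns to ['const_1', 'industry', 'pe', 'pb'],
# where const_1 is a constant column of ones, categorical features come first,
# and numeric features follow.
```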
/README.md:
--------------------------------------------------------------------------------
1 | # gplearnplus
2 | An upgrade of gplearn that supports time-series and panel data, covering many more scenarios.
3 | Function parameters also distinguish categorical from numeric data, enabling groupby-style operations.
4 |
5 | # File descriptions
6 | 
7 | ## `_Program.py`
8 | Builds and evaluates the formula-tree module;
9 | the main object is `_Program`,
10 | whose `program` attribute stores the formula tree as a stack.
11 | ### Formula tree layout
12 | ![formula tree](readme_pic/tree.png)
13 | 
14 | This tree represents the expression
15 | 
16 | $$
17 | ((X_0 \times X_0) - (3.0 \times X_1)) + 0.5
18 | $$
19 | 
20 | The resulting `program` stack is:
21 | `['add', 'sub', 'mul', '0', '0', 'mul', 3.0, '1', 0.5]`
22 |
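To make the stack encoding concrete, here is a minimal evaluator sketch (illustration only; the library evaluates programs via `_Program.execute`):

```python
import numpy as np

OPS = {'add': np.add, 'sub': np.subtract, 'mul': np.multiply}

def eval_program(program, X):
    """Evaluate a prefix-form program stack against a feature matrix X."""
    def walk(i):
        node = program[i]
        if isinstance(node, str) and node in OPS:   # function node: recurse into both arguments
            left, i = walk(i + 1)
            right, i = walk(i + 1)
            return OPS[node](left, right), i
        if isinstance(node, str):                   # feature column, stored as its index string
            return X[:, int(node)], i
        return node, i                              # numeric constant (broadcasts)
    return walk(0)[0]

X = np.array([[1.0, 2.0], [3.0, 4.0]])
program = ['add', 'sub', 'mul', '0', '0', 'mul', 3.0, '1', 0.5]
print(eval_program(program, X))   # ((X0 * X0) - (3.0 * X1)) + 0.5 -> [-4.5 -2.5]
```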
23 | ### Initialization
24 | `build_program`
25 | Builds the tree depth-first using a working stack:
26 | ```mermaid
27 | graph TB
28 | Z["Pick a root function; it must return a numeric vector"] --> A[("Push the root function's parameter list onto the work stack")]
29 | A-->B(["Push the root function onto the result stack program"])
30 | B-->C{"Work stack non-empty?"}
31 | C-->|Y|D{"For the last function's first open parameter: 1. the node must accept vectors; 2. current depth is below the max depth; 3. the random draw picked a function, or the mode is 'full'"}
32 | C-->|N|L["Error: the work stack must not be empty"]
33 | D-->|Y|E["Insert a function, becoming an internal node"]
34 | D-->|N|F["Insert a vector or scalar, becoming a leaf node"]
35 | E-->G{"Does this node accept both categorical and numeric functions?"}
36 | G-->|Y|H["Pick at random among all functions"]
37 | G-->|N|I["Pick at random among the categorical or numeric functions respectively"]
38 | H-->J[("Push the chosen function's parameter list onto the work stack")]
39 | I-->J
40 | J-->K[("Push the chosen function onto the result stack program")]
41 | 
42 | F-->M{"Parameter rejects scalars, or const_range is empty, or the random draw picked a vector?"}
43 | M-->|Y|N{"A categorical vector exists, the parameter accepts categorical vectors, and the random draw picked one?"}
44 | N-->|Y|O(["Push that categorical vector onto the result stack, as its column-index string"])
45 | N-->|N|P(["Push a numeric vector onto the result stack, as its column-index string"])
46 | M-->|N|R{"Does this node accept a float scalar?"}
47 | R-->|Y|S(["Push a random float scalar within range onto the result stack"])
48 | R-->|N|T(["Push a random integer scalar within range onto the result stack"])
49 | O-->U[("Pop the last function's last parameter node from the work stack")]
50 | P-->U
51 | S-->U
52 | T-->U
53 | 
54 | U-->Q{"Is the last function's parameter list on the work stack now empty?"}
55 | Q-->|Y|V[("Pop that completed function from the work stack")]
56 | V-->W{"Work stack empty?"}
57 | W-->|Y|X(["Return the result stack program; tree construction complete"])
58 | W-->|N|Y[("Pop the last function's last parameter node from the work stack")]
59 | Y-->Q
60 | Q-->|N|C
61 | K-->C
62 | ```
63 | ### Tree validation
64 | `validate_program`
65 | One depth-first pass over the tree to ensure every node is complete, i.e. every function has enough arguments.
66 | 
67 | `_depth`
68 | Records the maximum depth while performing a depth-first search.
69 | 
70 | `_length`
71 | Returns the length of `program`, i.e. the number of nodes in the tree.
72 | 
73 | ### Printing the tree
74 | `__str__`: prints the tree
75 | `export_graphviz`: visualizes the whole tree
76 |
77 |
78 | ### Evaluating the formula tree
79 | `execute`: accepts pandas objects or a 2-D ndarray of shape [n_samples, n_features].
80 | During execution, the strings and constants in `program` are turned into usable arguments:
81 | - constants are broadcast into constant vectors
82 | - strings are mapped to the corresponding columns of the input X
83 | - for panel data (`panel`), X must additionally carry the security column and the time column
84 | 
85 | `raw_fitness`: raw fitness
86 | 1. compute $\hat{y}$ from the formula tree
87 | 2. adjust $\hat{y}$
88 | 3. compute the fitness `metric` between $y$ and $\hat{y}$
89 | 
90 | `fitness`: fitness with a length penalty
91 | $$
92 | penalty = p\_coef \times program\_len \times sign(metric)
93 | $$
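For example, with `p_coef = 0.001` and a 9-node program whose metric sign is positive, the penalty is 0.001 × 9 × 1 = 0.009 (illustrative numbers); longer programs are thus handicapped in proportion to their length.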
94 | ### Sample selection (guarding against overfitting)
95 | To reduce overfitting, only a subset of the samples is used.
96 | 
97 | `get_all_indices`: takes the total sample count and the subsample size,
98 | 
99 | and returns the indices of the in-sample and out-of-sample observations.
100 | 
101 | ### Extracting subtrees
102 | 
103 | `get_subtree(random_state, start, program=None)`: gets a specific subtree,
104 | namely the subtree rooted at `start`.
105 | 
106 | 
107 | `get_random_subtree(random_state, program=None, return_type=None)`: gets a random subtree.
108 | Whether a **numeric** or a **categorical** subtree is drawn can be specified as needed;
109 | returns the subtree and its type.
110 |
111 | ### Crossover and mutation
112 | 
113 | `crossover(donor, random_state)`
114 | Crosses with the formula tree `donor`; the exchanged subtrees must have the same return type.
115 | 
116 | `subtree_mutation(random_state)`
117 | Generates a random formula tree and crosses it with the parent tree.
118 | 
119 | `hoist_mutation(random_state)`
120 | First finds a hoistable node, i.e. one whose subtree contains a descendant of the same type as itself,
121 | then lifts that same-typed subtree up into its place.
122 | 
123 | `point_mutation(random_state)`
124 | Point mutation:
125 | mutates randomly selected points,
126 | while guaranteeing the resulting functions stay legal.
127 |
128 |
129 | ## `fitness.py`
130 | 
131 | Defines the fitness metrics and the method for creating custom ones.
132 | 
133 | Defines the metric object `_Fitness`
134 | 
135 | with attributes:
136 | 
137 | `function`
138 | 
139 | must accept three arguments `(y, y_pred, w)`
140 | 
141 | `greater_is_better`
142 |
143 |
144 | ## `functions.py`
145 | 
146 | Custom functions and how to build them.
147 | Defines the function object `_Function`
148 | with attributes:
149 | 
150 | `function`: the callable
151 | 
152 | `name`: the function name
153 | 
154 | `arity`: the number of parameters
155 | 
156 | `param_type`:
157 | a list of parameter types whose length equals `arity`; **categorical input is rejected by default**.
158 | This design is the most important upgrade in this project and drives how the formula tree is built.
159 | ```python
160 | [{
161 | 'vector': {'category': (None, None), 'number': (None, None)},
162 | 'scalar': {'int': (None, None), 'float': (None, None)}
163 | },]
164 | ```
165 | 
166 | `function_type`: the function type, default 'all';
167 | one of 'all', 'section', 'time_series'
168 | 
169 | `return_type`: the return type, default 'number';
170 | one of 'number', 'category'
171 | 
172 | Methods:
173 | 
174 | `__call__`
175 | Special-cases the call:
176 | if a parameter only accepts a scalar but a vector is passed in,
177 | the vector's first element is used as the scalar.
178 | 
179 | `add_range`:
180 | 
181 | Replaces unconstrained ranges in the parameters so that every scalar is bounded;
182 | 
183 | if there is no const_range, no function accepts constants, and all const types are removed.
184 | 
185 | `is_point_mutation(candidate_func)`
186 | 
187 | Checks whether a candidate replacement function is a legal substitute.
188 | 
189 | Module-level function:
190 | `make_function(*, function, name, arity, param_type=None, wrap=True, return_type='number', function_type='all')`
191 | Wraps a callable into a `_Function` object,
192 | mainly performing validity checks and tests.
193 |
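A minimal usage sketch (the `scale` function below is a made-up illustration following the signature above, not a library built-in):

```python
import numpy as np
from gplearnplus import functions

# Hypothetical custom function: multiply a numeric vector by an integer scalar d,
# with d constrained to [1, 10].
def _scale(X, d):
    return X * d

scale = functions.make_function(
    function=_scale, name='scale', arity=2, wrap=False,
    param_type=[{'vector': {'number': (None, None)}},
                {'scalar': {'int': (1, 10)}}])
```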
194 | ## `genetic.py`
195 | 
196 | The model interface: regressor, classifier and feature-engineering transformer classes are derived from a common factory base class for the different use cases.
197 | 
198 | ### `_parallel_evolve(n_programs, parents, X, y, sample_weight, seeds, params)`
199 | 
200 | Runs one round of population evolution;
201 | `n_programs` is the population size.
202 | 
203 | 
204 | ### `BaseSymbolic`
205 | 
206 | 
207 | 
208 | 
209 | ## `utils.py`
210 | 
211 | Helper functions
212 | 
213 | 
214 | 
215 | ## `test.py`
216 | 
217 | 
218 | ## `data_trans.py`
219 | 
220 | 
221 | 
222 | 
223 | Examples of custom function definitions
224 |
225 |
226 |
227 | # Function reference
228 | 
229 | ## Global functions
230 | 
231 | ### Built-in global functions from gplearn
232 | | Function | Logic | Parameters | Output |
233 | |-----------|-----------------|---------------| ------------ |
234 | | add(X, Y) | element-wise addition | [numeric vector], [numeric vector] | [numeric vector] |
235 | | sub(X, Y) | element-wise subtraction | [numeric vector], [numeric vector] | [numeric vector] |
236 | | mul(X, Y) | element-wise multiplication | [numeric vector], [numeric vector] | [numeric vector] |
237 | | div(X, Y) | element-wise division (zero denominators replaced by a tiny value) | [numeric vector], [numeric vector] | [numeric vector] |
238 | | sqrt(X) | square root (negatives handled via absolute value) | [numeric vector] | [numeric vector] |
239 | | log(X) | natural logarithm (zeros and negatives handled likewise) | [numeric vector] | [numeric vector] |
240 | | neg(X) | negation | [numeric vector] | [numeric vector] |
241 | | inv(X) | reciprocal (zeros replaced by a tiny value) | [numeric vector] | [numeric vector] |
242 | | abs(X) | absolute value | [numeric vector] | [numeric vector] |
243 | | max(X, Y) | element-wise maximum | [numeric vector], [numeric vector] | [numeric vector] |
244 | | min(X, Y) | element-wise minimum | [numeric vector], [numeric vector] | [numeric vector] |
245 | | sin(X) | sine | [numeric vector] | [numeric vector] |
246 | | cos(X) | cosine | [numeric vector] | [numeric vector] |
247 | | tan(X) | tangent | [numeric vector] | [numeric vector] |
248 | | sig(X) | logistic (sigmoid) function | [numeric vector] | [numeric vector] |
249 | ### Custom global functions
250 | | Function | Logic | Parameters | Output |
251 | | --------------- |------------| -------------------------- | ------------ |
252 | | combine(X, Y) | Cartesian product of two categorical variables | [categorical vector], [categorical vector] | [categorical vector] |
253 |
254 |
255 | ## Time-series functions
256 | 
257 | ### Basic time-series functions
258 | | Function | Logic | Parameters | Output |
259 | |--------------------|-----------|----------------------| ------------ |
260 | | delay(X,d) | lag the series | [numeric vector], [integer constant] | [numeric vector] |
261 | | delta(X,d) | difference from the lagged value | [numeric vector], [integer constant] | [numeric vector] |
262 | | ts_min(X,d) | rolling-window minimum | [numeric vector], [integer constant] | [numeric vector] |
263 | | ts_max(X,d) | rolling-window maximum | [numeric vector], [integer constant] | [numeric vector] |
264 | | ts_argmax(X,d) | position of the window maximum | [numeric vector], [integer constant] | [numeric vector] |
265 | | ts_argmin(X,d) | position of the window minimum | [numeric vector], [integer constant] | [numeric vector] |
266 | | ts_rank(X, d) | rank within the window | [numeric vector], [integer constant] | [numeric vector] |
267 | | ts_sum(X, d) | rolling-window sum | [numeric vector], [integer constant] | [numeric vector] |
268 | | ts_stddev(X, d) | rolling-window standard deviation | [numeric vector], [integer constant] | [numeric vector] |
269 | | ts_corr(X,Y,d) | rolling-window correlation | [numeric vector], [numeric vector], [integer constant] | [numeric vector] |
270 | | ts_mean(X, d) | rolling-window mean | [numeric vector], [integer constant] | [numeric vector] |
271 | | ts_neutralize(X, d) | rolling-window Z-score | [numeric vector], [integer constant] | [numeric vector] |
272 | | ts_freq(X, d) | frequency of the current value within the window | [categorical vector], [integer constant] | [numeric vector] |
273 |
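For example, `delay` as implemented in `example.py` shifts the series and pads the head with NaN; a small sketch (assuming the helpers in `example.py` are importable):

```python
import numpy as np
from gplearnplus.example import _delay  # assumption: example.py ships with the package

x = np.array([1.0, 2.0, 3.0, 4.0, 5.0])
print(_delay(x, 2))   # [nan nan  1.  2.  3.]
```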
274 | ### Technical-indicator functions
275 | 
276 | | Function | Logic | Parameters | Output |
277 | |---------------------|------------| ------------------- | ------------ |
278 | | EMA(X,d) | exponential moving average | [numeric vector], [integer constant] | [numeric vector] |
279 | | DEMA(X,d) | double exponential moving average | [numeric vector], [integer constant] | [numeric vector] |
280 | | MA(X,d) | simple moving average | [numeric vector], [integer constant] | [numeric vector] |
281 | | KAMA(X,d) | Kaufman adaptive moving average | [numeric vector], [integer constant] | [numeric vector] |
282 | | MIDPOINT(X,d) | midpoint over the window | [numeric vector], [integer constant] | [numeric vector] |
283 | | BETA(X,Y,d) | regression coefficient | [numeric vector], [numeric vector], [integer constant] | [numeric vector] |
284 | | LINEARREG_SLOPE(X, d) | regression slope | [numeric vector], [integer constant] | [numeric vector] |
285 | | LINEARREG_ANGLE(X, d) | regression angle | [numeric vector], [integer constant] | [numeric vector] |
286 | | LINEARREG_INTERCEPT(X, d) | regression intercept | [numeric vector], [integer constant] | [numeric vector] |
287 |
288 | ## Cross-section functions
289 | ### Basic cross-section functions
290 | | Function | Logic | Parameters | Output |
291 | |------------------------------------|----------|------------------|--------|
292 | | MAX_SECTION(X) | fill with the cross-sectional maximum | [numeric vector] | [numeric vector] |
293 | | MIN_SECTION(X) | fill with the cross-sectional minimum | [numeric vector] | [numeric vector] |
294 | | MEAN_SECTION(X) | fill with the cross-sectional mean | [numeric vector] | [numeric vector] |
295 | | MEDIAN_SECTION(X) | fill with the cross-sectional median | [numeric vector] | [numeric vector] |
296 | | STD_SECTION(X) | fill with the cross-sectional standard deviation | [numeric vector] | [numeric vector] |
297 | | RANK_SECTION(X) | cross-sectional rank | [numeric vector] | [numeric vector] |
298 | | NEUTRALIZE_SECTION(X) | cross-sectional Z-score | [numeric vector] | [numeric vector] |
299 | | FREQ_SECTION(X) | cross-sectional frequency count | [categorical vector] | [numeric vector] |
300 | | CUT_EQUAL_DISTANCE(X, d) | equal-width binning | [numeric vector], [integer scalar] | [categorical vector] |
301 | | CUT_EQUAL_AMOUNT(X, d) | equal-size binning | [numeric vector], [integer scalar] | [categorical vector] |
302 |
303 | ### Cross-section grouped aggregation functions
304 | 
305 | | Function | Logic | Parameters | Output |
306 | |------------------------------------|----------|------------------|--------|
307 | | GROUPBYTHENMAX(gbx, X) | group, then fill with the group maximum | [categorical vector], [numeric vector] | [numeric vector] |
308 | | GROUPBYTHENMIN(gbx, X) | group, then fill with the group minimum | [categorical vector], [numeric vector] | [numeric vector] |
309 | | GROUPBYTHENMEAN(gbx, X) | group, then fill with the group mean | [categorical vector], [numeric vector] | [numeric vector] |
310 | | GROUPBYTHENMEDIAN(gbx, X) | group, then fill with the group median | [categorical vector], [numeric vector] | [numeric vector] |
311 | | GROUPBYTHENSTD(gbx, X) | group, then fill with the group standard deviation | [categorical vector], [numeric vector] | [numeric vector] |
312 | | GROUPBYTHENRANK(gbx, X) | group, then rank within the group | [categorical vector], [numeric vector] | [numeric vector] |
313 | | GROUPBYTHENNEUTRALIZE(gbx, X) | group, then Z-score within the group | [categorical vector], [numeric vector] | [numeric vector] |
314 | | GROUPBYTHEN_CUT_EQ_DIST(gbx, X, d) | group, then equal-width binning | [categorical vector], [numeric vector], [integer constant] | [categorical vector] |
315 | | GROUPBYTHEN_CUT_EQ_AMT(gbx, X, d) | group, then equal-size binning | [categorical vector], [numeric vector], [integer constant] | [categorical vector] |
316 | | GROUPBYTHENFREQ(gbx, X) | group, then frequency counts | [categorical vector], [categorical vector] | [numeric vector] |
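These grouped functions can be built on top of the `_groupby` helper in `functions.py`, which sorts by the group key, splits, applies a reducer per group and restores the original row order. A sketch of the GROUPBYTHENMAX-style fill behaviour (the lambda reducer is an illustration, not the library's exact implementation):

```python
import numpy as np
from gplearnplus.functions import _groupby

gbx = np.array([0, 1, 0, 1, 0])          # categorical group keys
X = np.array([1.0, 2.0, 3.0, 4.0, 5.0])  # numeric values

# Broadcast each group's maximum back onto its rows.
print(_groupby(gbx, lambda x: np.full_like(x, x.max()), X))   # [5. 4. 5. 4. 5.]
```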
317 | # Changelog
318 | 
319 | ## v1.0
320 | 
321 | Not fully debugged; known bugs.
322 | 
323 | ## v1.1
324 | 
325 | Fixed the issues in the functions module.
326 | Debugging complete; constant parameters of custom time-series functions must be de-broadcast inside the function itself.
327 | 
328 | ## v1.2
329 | 
330 | Added custom-function definition examples to test; RuntimeWarnings raised at runtime should be ignored.
331 | 
332 | ## v1.3
333 | 
334 | Removed the function.__code__.co_argcount restriction in functions,
335 | improving compatibility with function decorators.
336 | 
337 | ## v1.4
338 | test.py debugging;
339 | function definitions now handle special parameter cases.
340 | 
341 | ## v1.5
342 | Added panel-data support;
343 | scenarios are now split into section, time-series and panel.
344 | Data-definition requirements updated.
345 | 
346 | Function-definition requirements updated.
347 | 
348 | Fitness penalty computation updated.
349 | 
350 | Feature-selection logic in the genetic loop revised
351 | (when the maximum absolute correlation is below a threshold, select directly by fitness).
--------------------------------------------------------------------------------
/functions.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | -------------------------------------------------
4 | # @Project :gplearnplus
5 | # @File :functions
6 | # @Date :2022/12/1 0001 13:46
7 | # @Author :Junzhe Huang
8 | # @Email :acejasonhuang@163.com
9 | # @Software :PyCharm
10 | -------------------------------------------------
11 | """
12 | import numpy as np
13 | from joblib import wrap_non_picklable_objects
14 |
15 | NoneType = type(None)
16 |
17 | __all__ = ['make_function', 'raw_function_list']
18 |
19 |
20 | class _Function(object):
21 | """
22 | Function object; at least one parameter must be a vector.
23 | The default function type is 'all' (usable for both time series and cross sections),
24 | the default return type is numeric,
25 | and the default input type is numeric vector or scalar.
26 |
27 | Parameters
28 | ----------
29 | function : callable
30 | A function with signature function(x1, *args) that returns a Numpy
31 | array of the same shape as its arguments.
32 |
33 | name : str
34 | The name for the function as it should be represented in the program
35 | and its visualizations.
36 |
37 | arity : int
38 | The number of arguments that the ``function`` takes.
39 |
40 | param_type : [{
41 | 'vector': {'category': (None, None), 'number': (None, None)},
42 | 'scalar': {'int': (int, int), 'float': (float, float)}
43 | },]
44 | function_type : 'all', 'section', 'time_series'
45 | return_type: 'number', 'category'
46 |
47 | """
48 |
49 | def __init__(self, function, name, arity, param_type=None, return_type='number', function_type='all'):
50 | self.function = function
51 | self.name = name
52 | self.arity = arity
53 | if param_type is None:
54 | # Categorical input is rejected by default
55 | param_type = arity * [{'vector': {'number': (None, None)},
56 | 'scalar': {'int': (None, None), 'float': (None, None)}}]
57 | else:
58 | # Guard against a length mismatch with arity
59 | if len(param_type) != arity:
60 | raise ValueError(
61 | "length of param_type should be equal to arity, it should be {}, not {}"
62 | .format(arity, len(param_type)))
63 | self.param_type = param_type
64 | if (return_type != 'number') and (return_type != 'category'):
65 | raise ValueError("return_type of function {} should be number or category, NOT {}"
66 | .format(name, return_type))
67 | self.return_type = return_type
68 | self.function_type = function_type
69 |
70 | def __call__(self, *args):
71 | """
72 | Special-cased call: if a parameter only accepts a scalar
73 | but a vector was passed in, its first element is used as the scalar.
74 | """
75 | args = list(args)
76 | for i, (_param, _param_type) in enumerate(zip(args, self.param_type)):
77 | if len(_param_type) == 1 and 'scalar' in _param_type and isinstance(_param, (list, np.ndarray)):
78 | args[i] = _param[0]
79 | return self.function(*args)
80 |
81 | def add_range(self, const_range):
82 | # Replace unconstrained parameter ranges so that every scalar is bounded.
83 | # If const_range is None, no function accepts constants, so all scalar (const) types are removed
84 | if const_range is None:
85 | for i, _dict in enumerate(self.param_type):
86 | if 'vector' not in _dict:
87 | raise ValueError("for None const range, vector type should in all function param")
88 | if 'scalar' in _dict:
89 | self.param_type[i].pop('scalar')
90 | return
91 | if not isinstance(const_range, tuple):
92 | raise ValueError('const_range must be a tuple')
93 | _min, _max = const_range
94 | if not isinstance(_min, (int, float)):
95 | raise ValueError('const_range left must be an int, float')
96 | if not isinstance(_max, (int, float)):
97 | raise ValueError('const_range right must be an int, float')
98 | if _min > _max:
99 | raise ValueError('const_range left bound must be <= right bound')
100 |
101 | for i, _dict in enumerate(self.param_type):
102 | if 'scalar' in _dict:
103 | _scalar_range = _dict['scalar']
104 | if 'int' in _scalar_range:
105 | _l = int(_min) if _scalar_range['int'][0] is None else int(_scalar_range['int'][0])
106 | _r = int(_max) if _scalar_range['int'][1] is None else int(_scalar_range['int'][1])
107 | self.param_type[i]['scalar']['int'] = (_l, _r)
108 | if 'float' in _scalar_range:
109 | _l = float(_min) if _scalar_range['float'][0] is None else float(_scalar_range['float'][0])
110 | _r = float(_max) if _scalar_range['float'][1] is None else float(_scalar_range['float'][1])
111 | self.param_type[i]['scalar']['float'] = (_l, _r)
112 |
113 | return
114 |
115 | def is_point_mutation(self, candidate_func):
116 | # Check whether a candidate function may replace this one
117 | if not isinstance(candidate_func, _Function):
118 | raise ValueError("wrong type, it should be a _Function instance")
119 | # The candidate must have the same number of parameters
120 | if len(candidate_func.param_type) != len(self.param_type):
121 | return False
122 | if self.return_type != candidate_func.return_type:
123 | return False
124 | 
125 | # Every parameter type accepted by this function must also be accepted by the
126 | # candidate (a superset), so the existing child nodes stay legal after replacement
127 | for dict_self, dict_candi in zip(self.param_type, candidate_func.param_type):
128 | if len(dict_candi) < len(dict_self):
129 | return False
130 | for upper_type in dict_self:
131 | if upper_type not in dict_candi:
132 | return False
133 | else:
134 | for lower_type in dict_self[upper_type]:
135 | if lower_type not in dict_candi[upper_type]:
136 | return False
137 | else:
138 | if upper_type == 'scalar':
139 | if (dict_candi['scalar'][lower_type][0] > dict_self['scalar'][lower_type][0]) or (
140 | dict_candi['scalar'][lower_type][1] < dict_self['scalar'][lower_type][1]):
141 | return False
142 | return True
143 |
144 |
145 |
146 | # wrap enables pickling for multiprocessing, at some cost to evolution speed
147 | def make_function(*, function, name, arity, param_type=None, wrap=True, return_type='number', function_type='all'):
148 | """
149 | Parameters
150 | ----------
151 | function : callable
152 |
153 | name : str
154 |
155 | arity : int
156 |
157 | param_type : [{type: (, ), type: (, )}, ........]
158 |
159 | wrap : bool, optional (default=True)
160 | """
161 |
162 | if not isinstance(arity, int):
163 | raise ValueError('arity must be an int, got %s' % type(arity))
164 | if not isinstance(name, str):
165 | raise ValueError('name must be a string, got %s' % type(name))
166 | if not isinstance(wrap, bool):
167 | raise ValueError('wrap must be a bool, got %s' % type(wrap))
168 |
169 | # Validate param_type: 'vector'/'scalar' at the top level, typed sub-dicts below
170 | if param_type is None:
171 | param_type = [None] * arity
172 | if not isinstance(param_type, list):
173 | raise ValueError('param_type must be list')
174 | if len(param_type) != arity:
175 | raise ValueError('len of param_type must be arity')
176 | # Ensure at least one parameter accepts a vector
177 | vector_flag = False
178 | for i, _dict in enumerate(param_type):
179 | # Convert None entries to the fully permissive spec
180 | # Track whether this parameter accepts a vector
181 | non_vector_param = True
182 | if _dict is None:
183 | _dict = param_type[i] = {'vector': {'category': (None, None), 'number': (None, None)},
184 | 'scalar': {'int': (None, None), 'float': (None, None)}}
185 | elif not isinstance(_dict, dict):
186 | raise ValueError('element in param_type {} must be dict'.format(i + 1))
187 | if len(_dict) > 2:
188 | raise ValueError('len of element in param_type {} must be 1, 2'.format(i + 1))
189 | for upper_type in _dict:
190 | if upper_type == 'vector':
191 | if not isinstance(_dict['vector'], dict):
192 | raise ValueError('type of element in param_type {} must be {upper_type: {lower_type:( , )}}}'
193 | .format(i + 1))
194 | if len(_dict['vector']) == 0:
195 | raise ValueError('length of upper_type dict in param_type {} should not be 0'.format(i + 1))
196 | vector_flag = True
197 | non_vector_param = False
198 | for lower_type in _dict['vector']:
199 | if lower_type not in ['number', 'category']:
200 | raise ValueError('key of vector in param_type {} must be number or category'.format(i + 1))
201 | param_type[i]['vector'][lower_type] = (None, None)
202 |
203 | elif upper_type == 'scalar':
204 | if not isinstance(_dict['scalar'], dict):
205 | raise ValueError('type of element in param_type {} must be {upper_type: {lower_type:( , )}}}'
206 | .format(i + 1))
207 | if len(_dict['scalar']) == 0:
208 | raise ValueError('length of upper_type dict in param_type {} should not be 0'.format(i + 1))
209 | for lower_type in _dict['scalar']:
210 | if lower_type == 'int':
211 | if not isinstance(_dict['scalar']['int'], tuple):
212 | raise ValueError('structure of lower_type in param_type {} must be {type: ( , )}}'
213 | .format(i + 1))
214 | if len(_dict['scalar']['int']) != 2:
215 | raise ValueError("len of lower_type's structure in param_type {} must be 2".format(i + 1))
216 | if not isinstance(_dict['scalar']['int'][0], (int, NoneType)):
217 | raise ValueError("the first element in lower_type's structure in param_type {} "
218 | "must be None, int or float".format(i + 1))
219 | if not isinstance(_dict['scalar']['int'][1], (int, NoneType)):
220 | raise ValueError("the second element in lower_type's structure in param_type {} "
221 | "must be None, int or float".format(i + 1))
222 | if isinstance(_dict['scalar']['int'][0], int) and isinstance(_dict['scalar']['int'][1], int) \
223 | and _dict['scalar']['int'][1] < _dict['scalar']['int'][0]:
224 | raise ValueError('the second element should ge the first element in param_type {}'
225 | .format(i + 1))
226 |
227 | elif lower_type == 'float':
228 | if not isinstance(_dict['scalar']['float'], tuple):
229 | raise ValueError('structure of lower_type in param_type {} must be {type: ( , )}}'
230 | .format(i + 1))
231 | if len(_dict['scalar']['float']) != 2:
232 | raise ValueError("len of lower_type's structure in param_type {} must be 2".format(i + 1))
233 | if not isinstance(_dict['scalar']['float'][0], (float, int, NoneType)):
234 | raise ValueError("the first element in lower_type's structure in param_type {} "
235 | "must be None, int or float".format(i + 1))
236 | if not isinstance(_dict['scalar']['float'][1], (float, int, NoneType)):
237 | raise ValueError("the second element in lower_type's structure in param_type {} "
238 | "must be None, int or float".format(i + 1))
239 | if isinstance(_dict['scalar']['float'][0], (int, float)) and \
240 | isinstance(_dict['scalar']['float'][1], (int, float)) \
241 | and _dict['scalar']['float'][1] < _dict['scalar']['float'][0]:
242 | raise ValueError('the second element should ge the first element in param_type {}'
243 | .format(i + 1))
244 | else:
245 | raise ValueError('key of scalar in param_type {} must be int or float'.format(i + 1))
246 | else:
247 | raise ValueError('key of element in param_type {} must be vector or scalar'.format(i + 1))
248 |
249 | if not vector_flag:
250 | raise ValueError('param_type must contain at least one vector parameter')
251 |
252 | # Check output shape
253 | # Generate test data
254 | args = []
255 | for _dict in param_type:
256 | if 'vector' in _dict:
257 | if 'number' in _dict['vector']:
258 | args.append(np.ones(10))
259 | else:
260 | args.append(np.array([1] * 10))
261 | elif 'scalar' in _dict:
262 | if 'int' in _dict['scalar']:
263 | args.append(((0 if _dict['scalar']['int'][1] is None else _dict['scalar']['int'][1]) +
264 | (0 if _dict['scalar']['int'][0] is None else _dict['scalar']['int'][0])) // 2)
265 | else:
266 | args.append(((0 if _dict['scalar']['float'][1] is None else _dict['scalar']['float'][1]) +
267 | (0 if _dict['scalar']['float'][0] is None else _dict['scalar']['float'][0])) // 2)
268 |
269 | try:
270 | function(*args)
271 | except (ValueError, TypeError):
272 | print(args)
273 | raise ValueError('supplied function %s does not support arity of %d.'
274 | % (name, arity))
275 | if not hasattr(function(*args), 'shape'):
276 | raise ValueError('supplied function %s does not return a numpy array.'
277 | % name)
278 | if function(*args).shape != (10,):
279 | raise ValueError('supplied function %s does not return same shape as '
280 | 'input vectors.' % name)
281 | if function(*args).dtype.type is np.float64 and return_type == 'category':
282 | raise ValueError('the return type should be category not {}'.format(function(*args).dtype.type))
283 | elif function(*args).dtype not in [np.float64, np.int32, np.int64] and return_type == 'number':
284 | raise ValueError('the return type should be number not {}'.format(function(*args).dtype.type))
285 |
286 | # Check closure for zero & negative input arguments
287 | args2 = []
288 | args3 = []
289 | for _dict in param_type:
290 | if 'vector' in _dict:
291 | # Also compatible with category vectors
292 | args2.append(np.zeros(10))
293 | args3.append(-1 * np.ones(10))
294 | elif 'scalar' in _dict:
295 | if 'int' in _dict['scalar']:
296 |
297 | _temp = (((0 if _dict['scalar']['int'][1] is None else _dict['scalar']['int'][1]) +
298 | (0 if _dict['scalar']['int'][0] is None else _dict['scalar']['int'][0])) // 2)
299 | args2.append(_temp)
300 | args3.append(_temp)
301 | else:
302 | _temp = (((0 if _dict['scalar']['float'][1] is None else _dict['scalar']['float'][1]) +
303 | (0 if _dict['scalar']['float'][0] is None else _dict['scalar']['float'][0])) // 2)
304 | args2.append(_temp)
305 | args3.append(_temp)
306 |
307 |
308 | if not np.all(np.isnan(function(*args2)) | np.isfinite(function(*args2))):
309 | raise ValueError('supplied function %s does not have closure against '
310 | 'zeros in argument vectors.' % name)
311 |
312 | if not np.all(np.isnan(function(*args3)) | np.isfinite(function(*args3))):
313 | raise ValueError('supplied function %s does not have closure against '
314 | 'negatives in argument vectors.' % name)
315 | if wrap:
316 | return _Function(function=wrap_non_picklable_objects(function),
317 | name=name,
318 | arity=arity,
319 | param_type=param_type,
320 | return_type=return_type,
321 | function_type=function_type)
322 | return _Function(function=function,
323 | name=name,
324 | arity=arity,
325 | param_type=param_type,
326 | return_type=return_type,
327 | function_type=function_type)
328 |
329 |
330 | def _protected_division(x1, x2):
331 | """Closure of division (x1/x2) for zero denominator."""
332 | with np.errstate(divide='ignore', invalid='ignore'):
333 | return np.where(np.abs(x2) > 0.001, np.divide(x1, x2), 1.)
334 |
335 |
336 | def _protected_sqrt(x1):
337 | """Closure of square root for negative arguments."""
338 | return np.sqrt(np.abs(x1))
339 |
340 |
341 | def _protected_log(x1):
342 | """Closure of log for zero and negative arguments."""
343 | with np.errstate(divide='ignore', invalid='ignore'):
344 | return np.where(np.abs(x1) > 0.001, np.log(np.abs(x1)), 0.)
345 |
346 |
347 | def _protected_inverse(x1):
348 | """Closure of inverse for zero arguments."""
349 | with np.errstate(divide='ignore', invalid='ignore'):
350 | return np.where(np.abs(x1) > 0.001, 1. / x1, 0.)
351 |
352 |
353 | def _sigmoid(x1):
354 | """Special case of logistic function to transform to probabilities."""
355 | with np.errstate(over='ignore', under='ignore'):
356 | return 1 / (1 + np.exp(-x1))
357 |
358 | def _groupby(gbx, func, *args, **kwargs):
359 | indices = np.argsort(gbx)
360 | gbx_sorted = gbx[indices]
361 | X = np.column_stack([np.arange(len(gbx)), gbx_sorted] + [arg[indices] for arg in args])
362 | splits = np.split(X, np.unique(gbx_sorted, return_index=True)[1][1:])
363 | result_list = [func(*(split[:, 2:].T), **kwargs) for split in splits]
364 | result = np.hstack(result_list)
365 | return result[indices.argsort()]
366 |
367 |
368 | add2 = _Function(function=np.add, name='add', arity=2)
369 | sub2 = _Function(function=np.subtract, name='sub', arity=2)
370 | mul2 = _Function(function=np.multiply, name='mul', arity=2)
371 | div2 = _Function(function=_protected_division, name='div', arity=2)
372 | sqrt1 = _Function(function=_protected_sqrt, name='sqrt', arity=1)
373 | log1 = _Function(function=_protected_log, name='log', arity=1)
374 | neg1 = _Function(function=np.negative, name='neg', arity=1)
375 | inv1 = _Function(function=_protected_inverse, name='inv', arity=1)
376 | abs1 = _Function(function=np.abs, name='abs', arity=1)
377 | max2 = _Function(function=np.maximum, name='max', arity=2)
378 | min2 = _Function(function=np.minimum, name='min', arity=2)
379 | sin1 = _Function(function=np.sin, name='sin', arity=1)
380 | cos1 = _Function(function=np.cos, name='cos', arity=1)
381 | tan1 = _Function(function=np.tan, name='tan', arity=1)
382 | sig1 = _Function(function=_sigmoid, name='sig', arity=1)
383 |
384 | _function_map = {'add': add2,
385 | 'sub': sub2,
386 | 'mul': mul2,
387 | 'div': div2,
388 | 'sqrt': sqrt1,
389 | 'log': log1,
390 | 'abs': abs1,
391 | 'neg': neg1,
392 | 'inv': inv1,
393 | 'max': max2,
394 | 'min': min2,
395 | 'sin': sin1,
396 | 'cos': cos1,
397 | 'tan': tan1}
398 |
399 | raw_function_list = ['add', 'sub', 'mul', 'div', 'sqrt',
400 | 'log', 'abs', 'neg', 'inv',
401 | 'max', 'min', 'sin', 'cos', 'tan']
402 |
403 | all_function = raw_function_list.copy()
404 |
405 | section_function = []
406 |
407 | time_series_function = []
408 |
409 | if __name__ == '__main__':
410 | # def ff(a, b, c):
411 | # return a * b + c
412 | #
413 | # param_type = [{'vector':{'number': (None, None)}}, {'scalar': {'int':(None, 1)}}, {'scalar': {'float': (-1, None)}}]
414 | # f_m = make_function(function=ff, name='ff', arity=3, param_type=param_type, wrap=True, return_type='number')
415 | # f_m.add_range((-1, 1))
416 | # print(f_m.param_type)
417 | a = np.array([1, 2, 2, 1, 3])
418 | b = np.array([1, 2, 3, 4, 5])
419 | print(_groupby(a, lambda x: np.full_like(x, x.max()), b))  # fill each group with its max: [4. 3. 3. 4. 5.]
420 |
421 |
--------------------------------------------------------------------------------
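Restoring the commented-out demo from `__main__` above as a runnable sketch of `add_range`, which fills the unconstrained scalar bounds from `const_range` (assuming the package imports as `gplearnplus`):

```python
import numpy as np
from gplearnplus import functions

def ff(a, b, c):
    return a * b + c

param_type = [{'vector': {'number': (None, None)}},
              {'scalar': {'int': (None, 1)}},
              {'scalar': {'float': (-1, None)}}]
f_m = functions.make_function(function=ff, name='ff', arity=3,
                              param_type=param_type, wrap=False)
f_m.add_range((-1, 1))
print(f_m.param_type)
# [{'vector': {'number': (None, None)}},
#  {'scalar': {'int': (-1, 1)}},
#  {'scalar': {'float': (-1.0, 1.0)}}]
```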
/example.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | -------------------------------------------------
4 | # @Project :gplearn
5 | # @File :example.py
6 | # @Date :2023/3/31 0013 17:37
7 | # @Author :Junzhe Huang
8 | # @Email :acejasonhuang@163.com
9 | # @Software :PyCharm
10 | -------------------------------------------------
11 | """
12 | #####
13 | # Contents
14 | # 1. ALL FUNCTION              global functions
15 | # 2. TIME SERIES FUNCTION      general time-series functions
16 | # 3. TA FUNCTION               technical-indicator functions
17 | # 4. SECTION FUNCTION          cross-section functions
18 | # 5. SECTION GROUPBY FUNCTION  cross-section grouped aggregation functions
19 | #
20 | #
21 | ###
22 | import numpy as np
23 | from typing import Any
24 | import numba as nb
25 | from copy import copy
26 | from numba import jit
27 | from gplearnplus import functions
28 | from functools import wraps
29 | from functions import _groupby
30 |
31 |
32 | def no_numpy_warning(func):
33 | @wraps(func)
34 | def warp(*args, **kwargs):
35 | with np.errstate(all='ignore'):
36 | _res = func(*args, **kwargs)
37 | return _res
38 | return warp
39 |
40 | @nb.jit(nopython=True)
41 | def handle_nan(X):
42 | # Forward-fill NaNs; na_len counts the NaNs (callers assume they sit at the head of the series)
43 | X = np.copy(X)
44 | _temp = np.nan
45 | na_len = 0
46 | for i in range(len(X)):
47 | if np.isnan(X[i]):
48 | X[i] = _temp
49 | na_len += 1
50 | else:
51 | _temp = X[i]
52 | return X, na_len
53 |
54 | #### ALL FUNCTION #####
55 |
56 | @jit(nopython=True)
57 | def _combine(X, Y):
58 | p1 = 15485863
59 | p2 = 32416190071
60 | p3 = 100000007
61 | return np.mod(X * p1 + Y * p2, p3)
62 |
63 | combine = functions.make_function(function=_combine, name='combine', arity=2, return_type='category',
64 | param_type=[{'vector': {'category': (None, None)}},
65 | {'vector': {'category': (None, None)}}])
66 |
67 | #### TIME SERIES FUNCTION #####
68 |
69 | @jit(nopython=True)
70 | def _delay(X, d):
71 | res = np.empty_like(X)
72 | res.fill(np.nan)
73 | end = len(X) - d
74 | for i in range(d, len(X)):
75 | res[i] = X[i - d]
76 | return res
77 |
78 | delay = functions.make_function(function=_delay, name='delay', arity=2, function_type='time_series',
79 | param_type=[{'vector': {'number': (None, None)}},
80 | {'scalar': {'int':(3, 30)}}])
81 |
82 | @jit(nopython=True)
83 | def _delta(X, d):
84 | res = np.empty_like(X)
85 | res.fill(np.nan)
86 | end = len(X) - d
87 | for i in range(d, len(X)):
88 | res[i] = X[i] - X[i - d]
89 | return res
90 |
91 | delta = functions.make_function(function=_delta, name='delta', arity=2, function_type='time_series',
92 | param_type=[{'vector': {'number': (None, None)}},
93 | {'scalar': {'int':(3, 30)}}])
94 | @jit(nopython=True)
95 | def _ts_min(X, d):
96 | d = len(X) - 1 if d >= len(X) else d
97 | shape = (X.size - d + 1, d)
98 | res = np.empty(X.size, dtype=X.dtype)
99 | res.fill(np.nan)
100 | for i in range(len(X) - d + 1):
101 | res[i + d - 1] = np.nanmin(X[i:i + d])
102 | return res
103 |
104 | ts_min = functions.make_function(function=_ts_min, name='ts_min', arity=2, function_type='time_series',
105 | param_type=[{'vector': {'number': (None, None)}}, {'scalar': {'int':(3, 30)}}])
106 |
107 | @jit(nopython=True)
108 | def _ts_max(X, d):
109 | d = len(X) - 1 if d >= len(X) else d
110 | shape = (X.size - d + 1, d)
111 | res = np.empty(X.size, dtype=X.dtype)
112 | res.fill(np.nan)
113 | for i in range(len(X) - d + 1):
114 | res[i + d - 1] = np.nanmax(X[i:i + d])
115 | return res
116 |
117 | ts_max = functions.make_function(function=_ts_max, name='ts_max', arity=2, function_type='time_series',
118 | param_type=[{'vector': {'number': (None, None)}}, {'scalar': {'int':(3, 30)}}])
119 |
120 | @jit(nopython=True)
121 | def _ts_argmax(X, d):
122 | d = len(X) - 1 if d >= len(X) else d
123 | res = np.empty(len(X), dtype=np.float64)
124 | res[:d - 1] = np.nan
125 | for i in range(len(X) - d + 1):
126 | res[i + d - 1] = np.argmax(X[i:i + d])
127 | return res
128 |
129 | ts_argmax = functions.make_function(function=_ts_argmax, name='ts_argmax', arity=2, function_type='time_series',
130 | param_type=[{'vector': {'number': (None, None)}}, {'scalar': {'int':(3, 30)}}])
131 |
132 | @jit(nopython=True)
133 | def _ts_argmin(X, d):
134 | n = len(X)
135 | d = n - 1 if d >= n else d
136 | res = np.full(n, np.nan)
137 | for i in range(n - d + 1):
138 | res[i + d - 1] = np.argmin(X[i:i + d])
139 | return res
140 | ts_argmin = functions.make_function(function=_ts_argmin, name='ts_argmin', arity=2, function_type='time_series',
141 | param_type=[{'vector': {'number': (None, None)}},
142 | {'scalar': {'int':(3, 30)}}])
143 |
144 | @jit(nopython=True)
145 | def _ts_rank(X, d):
146 | n = len(X)
147 | d = n - 1 if d >= n else d
148 | res = np.full(n, np.nan)
149 | for i in range(n - d + 1):
150 | rank = np.argsort(X[i:i + d]).argsort()[-1] + 1
151 | res[i + d - 1] = rank / d
152 | return res
153 |
154 | ts_rank = functions.make_function(function=_ts_rank, name='ts_rank', arity=2, function_type='time_series',
155 | param_type=[{'vector': {'number': (None, None)}}, {'scalar': {'int':(3, 30)}}])
156 |
157 | @jit(nopython=True)
158 | def _ts_sum(X, d):
159 | n = len(X)
160 | d = n - 1 if d >= n else d
161 | res = np.full(n, np.nan)
162 | cumsum = np.nancumsum(X)
163 | res[d - 1:n] = cumsum[d - 1:] - cumsum[:-d]
164 | return res
165 |
166 | ts_sum = functions.make_function(function=_ts_sum, name='ts_sum', arity=2, function_type='time_series',
167 | param_type=[{'vector': {'number': (None, None)}}, {'scalar': {'int':(3, 30)}}])
168 |
169 | @jit(nopython=True)
170 | def _ts_stddev(X, d):
171 | d = len(X) - 1 if d >= len(X) else d
172 | res = np.empty(len(X))
173 | res[:] = np.nan
174 | for i in range(d - 1, len(X)):
175 | res[i] = np.nanstd(X[i - d + 1:i + 1])
176 | return res
177 |
178 | ts_stddev = functions.make_function(function=_ts_stddev, name='ts_stddev', arity=2, function_type='time_series',
179 | param_type=[{'vector': {'number': (None, None)}}, {'scalar': {'int':(3, 30)}}])
180 |
181 | @jit(nopython=True)
182 | def _ts_corr(X, Y, d):
183 | d = len(X) - 1 if d >= len(X) else d
184 | res = np.empty(len(X))
185 | res[:d-1] = np.nan
186 | for i in range(len(X) - d + 1):
187 | X_ = X[i:i+d]
188 | Y_ = Y[i:i+d]
189 | mask = ~(np.isnan(X_) | np.isnan(Y_))
190 | X_, Y_ = X_[mask], Y_[mask]
191 | if len(X_) <= 2:
192 | res[i+d-1] = np.nan
193 | else:
194 | res[i+d-1] = np.corrcoef(X_, Y_)[0][1]
195 | return res
196 |
197 | ts_corr = functions.make_function(function=_ts_corr, name='ts_corr', arity=3, function_type='time_series',
198 | param_type=[{'vector': {'number': (None, None)}},
199 | {'vector': {'number': (None, None)}},
200 | {'scalar': {'int':(3, 30)}}])
201 |
202 | @jit(nopython=True)
203 | def _ts_mean(X, d):
204 | d = len(X) - 1 if d >= len(X) else d
205 | res = np.full(len(X), np.nan)
206 | s = np.sum(X[:d])
207 | for i in range(d - 1, len(X)):
208 | res[i] = s / d
209 | s += (X[i + 1] - X[i - d + 1]) if i + 1 < len(X) else 0
210 | return res
211 |
212 | ts_mean = functions.make_function(function=_ts_mean, name='ts_mean', arity=2,
213 | function_type='time_series',
214 | param_type=[{'vector': {'number': (None, None)}},
215 | {'scalar': {'int':(3, 30)}}])
216 |
217 | @jit(nopython=True)
218 | def _ts_neutralize(X, d):
219 | N = len(X)
220 | d = len(X) - 1 if d >= len(X) else d
221 | mov_mean = np.empty(N - d + 1)
222 | mov_std = np.empty(N - d + 1)
223 | res = np.empty(N)
224 |
225 | for i in nb.prange(N - d + 1):
226 | mov_mean[i] = np.mean(X[i:i + d])
227 | mov_std[i] = np.sqrt(np.mean((X[i:i + d] - mov_mean[i]) ** 2))
228 | mov_std[i] = mov_std[i] if mov_std[i] > 0.001 else 0.001
229 |
230 | for i in nb.prange(N):
231 | if i < d - 1:
232 | res[i] = np.nan
233 | else:
234 | res[i] = (X[i] - mov_mean[i - d + 1]) / mov_std[i - d + 1]
235 |
236 | return res
237 |
238 | ts_neutralize = functions.make_function(function=_ts_neutralize, name='ts_neutralize', arity=2,
239 | function_type='time_series',
240 | param_type=[{'vector': {'number': (None, None)}},
241 | {'scalar': {'int':(3, 30)}}])
242 |
243 | @nb.jit(nopython=True)
244 | def _ts_freq(X, d):
245 | d = len(X) - 1 if d >= len(X) else d
246 | res = np.empty(len(X), dtype=np.float64)
247 | res[:d - 1] = np.nan
248 | for i in range(d - 1, len(X)):
249 | subarr = X[i - d + 1:i + 1]
250 | res[i] = sum(subarr == X[i])
251 | return res
252 |
253 | ts_freq = functions.make_function(function=_ts_freq, name='ts_freq', arity=2,
254 | function_type='time_series',
255 | param_type=[{'vector': {'category': (None, None)}},
256 | {'scalar': {'int':(3, 30)}}])
257 |
258 | #### TIME SERIES TA FUNCTION ####
259 |
260 | @nb.jit(nopython=True)
261 | def _EMA(X, d):
262 | d = len(X) - 1 if d >= len(X) else d
263 | X, _l = handle_nan(X)
264 | X = X[_l:]
265 | if len(X) < d:
266 |         return np.full(len(X) + _l, np.nan)
267 | kt = 2 / (d + 1)
268 | pre_ma = np.mean(X[:d])
269 |     __res = np.full(len(X) + _l, np.nan)
270 | __res[_l + d - 1] = pre_ma
271 | for i in range(d, len(X)):
272 | pre_ma += (X[i] - pre_ma) * kt
273 | __res[_l + i] = pre_ma
274 | return __res
275 |
276 | EMA = functions.make_function(function=_EMA, name='EMA', arity=2, function_type='time_series',
277 | param_type=[{'vector': {'number': (None, None)}}, {'scalar': {'int':(3, 30)}}])
278 |
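# --- Hedged sketch (illustration only) ---
# _EMA seeds with the simple mean of the first d points, then applies the
# recursion ema <- ema + (x - ema) * k with k = 2 / (d + 1).
def _demo_ema_recursion():
    out = _EMA(np.array([1.0, 2.0, 3.0, 4.0]), 3)
    # seed mean([1, 2, 3]) = 2.0; k = 0.5; next = 2.0 + (4 - 2.0) * 0.5 = 3.0
    assert np.allclose(out[2:], [2.0, 3.0])
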
279 | @jit(nopython=True)
280 | def _DEMA(X, d):
281 | d = d if len(X) > 2 * d - 2 else len(X) // 2 - 1
282 | _ema = _EMA(X, d)
283 | _eema = _EMA(_ema, d)
284 | __res = 2 * _ema - _eema
285 | return __res
286 |
287 | DEMA = functions.make_function(function=_DEMA, name='DEMA', arity=2, function_type='time_series',
288 | param_type=[{'vector': {'number': (None, None)}}, {'scalar': {'int':(3, 30)}}])
289 |
290 | @jit(nopython=True)
291 | def _MA(X, d):
292 | d = len(X) - 1 if d >= len(X) else d
293 | X, _l = handle_nan(X)
294 | X = X[_l:]
295 | if len(X) < d:
296 |         return np.full(len(X) + _l, np.nan)
297 |     __res = np.full(len(X) + _l, np.nan)
298 |     for i in range(len(X) - d + 1): __res[_l + d - 1 + i] = np.mean(X[i:i + d])
299 |     return __res
300 | MA = functions.make_function(function=_MA, name='MA', arity=2, function_type='time_series',
301 | param_type=[{'vector': {'number': (None, None)}}, {'scalar': {'int':(3, 30)}}])
302 |
303 | @jit(nopython=True)
304 | def _KAMA(X, d):
305 | d = len(X) - 1 if d >= len(X) else d
306 | X, _l = handle_nan(X)
307 | X = X[_l:]
308 | if len(X) < d:
309 |         return np.full(len(X) + _l, np.nan)
310 | _af = 2 / (2 + 1)
311 | _as = 2 / (30 + 1)
312 |     __res = np.full(len(X) + _l, np.nan)
313 | for i in range(d, len(X)):
314 | period_roc = X[i] - X[i - d]
315 | sum_roc = np.sum(np.abs(np.diff(X[i - d: i + 1])))
316 | _er = 1.0 if ((period_roc >= sum_roc) or (sum_roc == 0)) else abs(period_roc / sum_roc)
317 | _at = (_er * (_af - _as) + _as) ** 2
318 | __res[_l + i] = _at * X[i] + (1 - _at) * (__res[_l + i - 1] if i != d else X[i - 1])
319 | return __res
320 |
321 | KAMA = functions.make_function(function=_KAMA, name='KAMA', arity=2, function_type='time_series',
322 | param_type=[{'vector': {'number': (None, None)}}, {'scalar': {'int':(3, 30)}}])
323 |
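# --- Hedged note (illustration only) ---
# _KAMA follows Kaufman's adaptive scheme: efficiency ratio
# ER = |X[i] - X[i-d]| / sum(|diff|) over the window, and smoothing constant
# sc = (ER * (fast - slow) + slow) ** 2 with fast = 2/(2+1), slow = 2/(30+1);
# a cleanly trending window (ER near 1) weights the newest price most.
def _demo_kama_smoothing_constant():
    er, fast, slow = 1.0, 2 / 3, 2 / 31
    sc = (er * (fast - slow) + slow) ** 2
    assert abs(sc - fast ** 2) < 1e-12  # ER = 1 collapses sc to fast ** 2
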
324 | @nb.jit(nopython=True)
325 | def _MIDPOINT(X, d):
326 | d = len(X) - 1 if d >= len(X) else d
327 | res = np.empty(len(X))
328 | res[:] = np.nan
329 | for i in range(d - 1, len(X)):
330 | res[i] = (np.nanmax(X[i-d+1:i+1]) + np.nanmin(X[i-d+1:i+1])) / 2
331 | return res
332 |
333 | MIDPOINT = functions.make_function(function=_MIDPOINT, name='MIDPOINT', arity=2, function_type='time_series',
334 | param_type=[{'vector': {'number': (None, None)}}, {'scalar': {'int':(3, 30)}}])
335 |
336 | @nb.jit(nopython=True)
337 | def _BETA(X, Y, d):
338 | d = len(X) - 1 if d >= len(X) else d
339 | res = np.full(len(X), np.nan)
340 | for i in range(d - 1, len(X)):
341 | X_slice = X[i - d + 1: i + 1]
342 | Y_slice = Y[i - d + 1: i + 1]
343 | X_mean = np.mean(X_slice)
344 | Y_mean = np.mean(Y_slice)
345 | numerator = np.sum((X_slice - X_mean) * (Y_slice - Y_mean))
346 | denominator = np.sum((X_slice - X_mean) ** 2)
347 | denominator = denominator if denominator > 0.001 else 0.001
348 | res[i] = numerator / denominator
349 | return res
350 |
351 | BETA = functions.make_function(function=_BETA, name='BETA', arity=3, function_type='time_series',
352 | param_type=[{'vector': {'number': (None, None)}},
353 | {'vector': {'number': (None, None)}},
354 | {'scalar': {'int':(3, 30)}}])
355 |
356 | @nb.jit(nopython=True)
357 | def _LINEARREG_SLOPE(X, d):
358 | d = len(X) - 1 if d >= len(X) else d
359 | Y = np.arange(d)
360 | res = np.full(len(X), np.nan)
361 | for i in range(d - 1, len(X)):
362 | X_slice = X[i - d + 1: i + 1]
363 | Y_slice = Y[:len(X_slice)]
364 | X_mean = np.mean(X_slice)
365 | Y_mean = np.mean(Y_slice)
366 | numerator = np.sum((X_slice - X_mean) * (Y_slice - Y_mean))
367 |         denominator = np.sum((Y_slice - Y_mean) ** 2)  # variance of the time index: slope of price regressed on time
368 | denominator = denominator if denominator > 0.001 else 0.001
369 | res[i] = numerator / denominator
370 | return res
371 |
372 | LINEARREG_SLOPE = functions.make_function(function=_LINEARREG_SLOPE, name='LINEARREG_SLOPE', arity=2,
373 | function_type='time_series',
374 | param_type=[{'vector': {'number': (None, None)}},
375 | {'scalar': {'int':(3, 30)}}])
376 |
377 | @nb.jit(nopython=True)
378 | def _LINEARREG_ANGLE(X, d):
379 | d = len(X) - 1 if d >= len(X) else d
380 | Y = np.arange(d)
381 | res = np.full(len(X), np.nan)
382 | for i in range(d - 1, len(X)):
383 | X_slice = X[i - d + 1: i + 1]
384 | Y_slice = Y[:len(X_slice)]
385 | X_mean = np.mean(X_slice)
386 | Y_mean = np.mean(Y_slice)
387 | numerator = np.sum((X_slice - X_mean) * (Y_slice - Y_mean))
388 |         denominator = np.sum((Y_slice - Y_mean) ** 2)  # variance of the time index
389 | denominator = denominator if denominator > 0.001 else 0.001
390 | res[i] = np.arctan(numerator / denominator) * (180.0 / np.pi)
391 | return res
392 |
393 | LINEARREG_ANGLE = functions.make_function(function=_LINEARREG_ANGLE, name='LINEARREG_ANGLE', arity=2,
394 | function_type='time_series',
395 | param_type=[{'vector': {'number': (None, None)}},
396 | {'scalar': {'int':(3, 30)}}])
397 |
398 | @nb.jit(nopython=True)
399 | def _LINEARREG_INTERCEPT(X, d):
400 | d = len(X) - 1 if d >= len(X) else d
401 | Y = np.arange(d)
402 | res = np.full(len(X), np.nan)
403 | for i in range(d - 1, len(X)):
404 | X_slice = X[i - d + 1: i + 1]
405 | Y_slice = Y[:len(X_slice)]
406 | X_mean = np.mean(X_slice)
407 | Y_mean = np.mean(Y_slice)
408 | numerator = np.sum((X_slice - X_mean) * (Y_slice - Y_mean))
409 |         denominator = np.sum((Y_slice - Y_mean) ** 2)  # variance of the time index
410 |         denominator = denominator if denominator > 0.001 else 0.001
411 |         slope = numerator / denominator
412 |         res[i] = X_mean - slope * Y_mean  # OLS intercept: a = mean(x) - b * mean(t)
413 | return res
414 |
415 | LINEARREG_INTERCEPT = functions.make_function(function=_LINEARREG_INTERCEPT, name='LINEARREG_INTERCEPT',
416 | arity=2, function_type='time_series',
417 | param_type=[{'vector': {'number': (None, None)}},
418 | {'scalar': {'int':(3, 30)}}])
419 |
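# --- Hedged cross-check (illustration only) ---
# The LINEARREG_* helpers use the textbook OLS closed form for price x
# regressed on the time index t = 0..d-1: slope b = cov(t, x) / var(t) and
# intercept a = mean(x) - b * mean(t). A minimal check against np.polyfit:
def _demo_linearreg_closed_form():
    x = np.array([2.0, 4.0, 7.0, 8.0])
    t = np.arange(4.0)
    b, a = np.polyfit(t, x, 1)
    b2 = np.sum((t - t.mean()) * (x - x.mean())) / np.sum((t - t.mean()) ** 2)
    assert abs(b - b2) < 1e-9 and abs(a - (x.mean() - b2 * t.mean())) < 1e-9
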
420 | #### SECTION FUNCTION ####
421 |
422 | @nb.jit(nopython=True)
423 | def _MAX_SECTION(X: np.ndarray) -> np.ndarray:
424 | return np.full_like(X, np.max(X))
425 |
426 | sec_max = functions.make_function(function=_MAX_SECTION, name='sec_max', arity=1, function_type='section',
427 | param_type=[{'vector': {'number': (None, None)}}])
428 |
429 | @nb.jit(nopython=True)
430 | def _MIN_SECTION(X):
431 | return np.full_like(X, np.min(X))
432 |
433 | sec_min = functions.make_function(function=_MIN_SECTION, name='sec_min', arity=1, function_type='section',
434 | param_type=[{'vector': {'number': (None, None)}}])
435 |
436 | @nb.jit(nopython=True)
437 | def _MEAN_SECTION(X):
438 | return np.full_like(X, np.mean(X))
439 |
440 | sec_mean = functions.make_function(function=_MEAN_SECTION, name='sec_mean', arity=1, function_type='section',
441 | param_type=[{'vector': {'number': (None, None)}}])
442 |
443 | @nb.jit(nopython=True)
444 | def _MEDIAN_SECTION(X):
445 | return np.full_like(X, np.median(X))
446 |
447 | sec_median = functions.make_function(function=_MEDIAN_SECTION, name='sec_median', arity=1, function_type='section',
448 | param_type=[{'vector': {'number': (None, None)}}])
449 |
450 | @nb.jit(nopython=True)
451 | def _STD_SECTION(X):
452 | return np.full_like(X, np.std(X))
453 |
454 | sec_std = functions.make_function(function=_STD_SECTION, name='sec_std', arity=1, function_type='section',
455 | param_type=[{'vector': {'number': (None, None)}}])
456 |
457 | @nb.jit(nopython=True)
458 | def _RANK_SECTION(X):
459 | idx = np.argsort(X)
460 | rank = np.empty_like(idx)
461 | for i in range(len(X)):
462 | rank[idx[i]] = i
463 | return rank
464 |
465 | sec_rank = functions.make_function(function=_RANK_SECTION, name='sec_rank', arity=1, function_type='section',
466 | param_type=[{'vector': {'number': (None, None)}}])
467 |
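# --- Hedged sketch (illustration only) ---
# _RANK_SECTION assigns each element its 0-based position in sorted order
# (an "argsort of argsort"); ties break by original position, unlike the
# average ranks of scipy.stats.rankdata.
def _demo_rank_section():
    assert list(_RANK_SECTION(np.array([0.3, 0.1, 0.2]))) == [2, 0, 1]
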
468 | @nb.jit(nopython=True)
469 | def _NEUTRALIZE_SECTION(X):
470 | mean = np.mean(X)
471 | std = np.std(X)
472 | if std <= 0.001:
473 | std = 0.001
474 |     return (X - mean) / std
475 |
476 | sec_neutralize = functions.make_function(function=_NEUTRALIZE_SECTION, name='sec_neutralize', arity=1,
477 | function_type='section', param_type=[{'vector': {'number': (None, None)}}])
478 |
479 | @no_numpy_warning
480 | def _FREQ_SECTION(X):
481 | unique_values, counts = np.unique(X, return_counts=True)
482 | count_dict = dict(zip(unique_values, counts))
483 | vectorized_func = np.vectorize(lambda x: count_dict[x])
484 | return vectorized_func(X)
485 |
486 | freq = functions.make_function(function=_FREQ_SECTION, name='freq', arity=1,
487 | function_type='section', param_type=[{'vector': {'category': (None, None)}}])
488 |
489 | @no_numpy_warning
490 | def _CUT_EQUAL_DISTANCE(X, d):
491 |     '''
492 |     Equal-width binning: split the range of X into d equal-width bins.
493 |     Parameters
494 |     ----------
495 |     X : numeric vector
496 |     d : number of bins
497 |
498 |     Returns
499 |     -------
500 |     np.ndarray of bin labels in 1..d (np.digitize output)
501 |     '''
502 | d = len(X) - 1 if d >= len(X) - 1 else d
503 | bins = [np.min(X) + i * (np.max(X) - np.min(X)) * 1.000001 / d for i in range(d + 1)]
504 | return np.digitize(X, bins)
505 |
506 | cut_equal_distance = functions.make_function(function=_CUT_EQUAL_DISTANCE, name='cut_eq_dist', arity=2,
507 | function_type='section', return_type='category',
508 | param_type=[{'vector': {'number': (None, None)}},
509 | {'scalar': {'int': (2, 30)}}])
510 |
511 | @no_numpy_warning
512 | def _CUT_EQUAL_AMOUNT(X, d):
513 | X_ = _RANK_SECTION(X)
514 | return _CUT_EQUAL_DISTANCE(X_, d)
515 |
516 | cut_equal_amount = functions.make_function(function=_CUT_EQUAL_AMOUNT, name='cut_eq_amt', arity=2,
517 | function_type='section', return_type='category',
518 | param_type=[{'vector': {'number': (None, None)}},
519 | {'scalar': {'int': (2, 30)}}])
520 |
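# --- Hedged sketch (illustration only) ---
# cut_eq_dist splits the value range into d equal-width bins (the 1.000001
# factor nudges the top edge so the maximum falls in bin d, not d + 1);
# cut_eq_amt ranks first, so every bin receives roughly the same count.
def _demo_cut_equal_distance():
    X = np.array([0.0, 1.0, 2.0, 9.0])
    assert list(_CUT_EQUAL_DISTANCE(X, 3)) == [1, 1, 1, 3]
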
521 | @no_numpy_warning
522 | def _GROUPBYTHENMAX(gbx, X):
523 | return _groupby(gbx, _MAX_SECTION, X)
524 |
525 | groupby_max = functions.make_function(function=_GROUPBYTHENMAX, name='gb_max', arity=2, function_type='section',
526 | param_type=[{'vector': {'category': (None, None)}},
527 | {'vector': {'number': (None, None)}}])
528 |
529 | @no_numpy_warning
530 | def _GROUPBYTHENMIN(gbx, X):
531 | return _groupby(gbx, _MIN_SECTION, X)
532 |
533 | groupby_min = functions.make_function(function=_GROUPBYTHENMIN, name='gb_min', arity=2, function_type='section',
534 | param_type=[{'vector': {'category': (None, None)}},
535 | {'vector': {'number': (None, None)}}])
536 |
537 | @no_numpy_warning
538 | def _GROUPBYTHENMEAN(gbx, X):
539 | return _groupby(gbx, _MEAN_SECTION, X)
540 | groupby_mean = functions.make_function(function=_GROUPBYTHENMEAN, name='gb_mean', arity=2, function_type='section',
541 | param_type=[{'vector': {'category': (None, None)}},
542 | {'vector': {'number': (None, None)}}])
543 |
544 | @no_numpy_warning
545 | def _GROUPBYTHENMEDIAN(gbx, X):
546 | return _groupby(gbx, _MEDIAN_SECTION, X)
547 | groupby_median = functions.make_function(function=_GROUPBYTHENMEDIAN, name='gb_median',
548 | arity=2, function_type='section',
549 | param_type=[{'vector': {'category': (None, None)}},
550 | {'vector': {'number': (None, None)}}])
551 |
552 | @no_numpy_warning
553 | def _GROUPBYTHENSTD(gbx, X):
554 | return _groupby(gbx, _STD_SECTION, X)
555 | groupby_std = functions.make_function(function=_GROUPBYTHENSTD, name='gb_std', arity=2, function_type='section',
556 | param_type=[{'vector': {'category': (None, None)}},
557 | {'vector': {'number': (None, None)}}])
558 |
559 | @no_numpy_warning
560 | def _GROUPBYTHENRANK(gbx, X):
561 | return _groupby(gbx, _RANK_SECTION, X)
562 | groupby_rank = functions.make_function(function=_GROUPBYTHENRANK, name='gb_rank', arity=2, function_type='section',
563 | param_type=[{'vector': {'category': (None, None)}},
564 | {'vector': {'number': (None, None)}}])
565 |
566 | @no_numpy_warning
567 | def _GROUPBYTHENNEUTRALIZE(gbx, X):
568 | return _groupby(gbx, _NEUTRALIZE_SECTION, X)
569 | groupby_neutralize = functions.make_function(function=_GROUPBYTHENNEUTRALIZE, name='gb_neu', arity=2,
570 | function_type='section',
571 | param_type=[{'vector': {'category': (None, None)}},
572 | {'vector': {'number': (None, None)}}])
573 |
574 | @no_numpy_warning
575 | def _GROUPBYTHEN_CUT_EQ_DIST(gbx, X, d):
576 | return _groupby(gbx, _CUT_EQUAL_DISTANCE, X, d=d)
577 | groupby_cut_equal_distance = functions.make_function(function=_GROUPBYTHEN_CUT_EQ_DIST, name='gb_cut_eq_dist', arity=3,
578 | function_type='section', return_type='category',
579 | param_type=[{'vector': {'category': (None, None)}},
580 | {'vector': {'number': (None, None)}},
581 | {'scalar': {'int': (2, 30)}}])
582 |
583 | @no_numpy_warning
584 | def _GROUPBYTHEN_CUT_EQ_AMT(gbx, X, d):
585 | return _groupby(gbx, _CUT_EQUAL_AMOUNT, X, d=d)
586 | groupby_cut_equal_amount = functions.make_function(function=_GROUPBYTHEN_CUT_EQ_AMT, name='gb_cut_eq_amt', arity=3,
587 | function_type='section', return_type='category',
588 | param_type=[{'vector': {'category': (None, None)}},
589 | {'vector': {'number': (None, None)}},
590 | {'scalar': {'int': (2, 30)}}])
591 |
592 | @no_numpy_warning
593 | def _GROUPBYTHENFREQ(gbx, X):
594 | return _groupby(gbx, _FREQ_SECTION, X)
595 | groupby_freq = functions.make_function(function=_GROUPBYTHENFREQ, name='gb_freq', arity=2,
596 | function_type='section',
597 | param_type=[{'vector': {'category': (None, None)}},
598 | {'vector': {'category': (None, None)}}])
599 |
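# --- Hedged sketch (illustration only; assumes the _groupby helper applies
# the wrapped section function within each category and writes results back
# in the original row order) ---
def _demo_groupby_mean():
    g = np.array([0, 0, 1, 1])
    x = np.array([1.0, 3.0, 10.0, 20.0])
    assert list(_GROUPBYTHENMEAN(g, x)) == [2.0, 2.0, 15.0, 15.0]
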
600 | __all__ = ['delay', 'delta', 'sec_max', 'sec_min', 'sec_median', 'ts_min', 'ts_max', 'ts_sum', 'ts_corr', 'ts_rank',
601 | 'ts_stddev', 'ts_argmax', 'ts_argmin', 'ts_mean', 'EMA', 'DEMA', 'KAMA', 'MA', 'MIDPOINT',
602 | 'BETA', 'LINEARREG_ANGLE', 'LINEARREG_SLOPE', 'LINEARREG_INTERCEPT', 'sec_std', 'sec_rank', 'sec_mean',
603 | 'groupby_std', 'groupby_max', 'groupby_median', 'groupby_mean', 'groupby_rank', 'groupby_min',
604 | 'ts_neutralize', 'sec_neutralize', 'groupby_neutralize', 'cut_equal_amount', 'cut_equal_distance',
605 | 'groupby_cut_equal_amount', 'groupby_freq', 'groupby_cut_equal_distance', 'freq', 'ts_freq']
606 |
607 | def test():
608 | a = np.random.uniform(0.9, 1.1, 30)
609 | b = np.random.uniform(0.9, 1.1, 30)
610 | c = np.random.randint(0, 2, size=30)
611 |     print(groupby_cut_equal_distance(c, a, 3))
612 |
613 |
614 | if __name__ == "__main__":
615 | test()
616 |
--------------------------------------------------------------------------------
/_program.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | -------------------------------------------------
4 | # @Project :gplearnplus
5 | # @File :_program
6 | # @Date :2022/12/1 0001 13:37
7 | # @Author :Junzhe Huang
8 | # @Email :acejasonhuang@163.com
9 | # @Software :PyCharm
10 | -------------------------------------------------
11 | """
12 | from copy import copy, deepcopy
13 | import numpy as np
14 | from sklearn.utils.random import sample_without_replacement
15 |
16 | from .functions import _Function, _groupby
17 | from .utils import check_random_state
18 |
19 |
20 | class _Program(object):
21 | '''
22 |
23 | '''
24 | def __init__(self,
25 | function_dict,
26 | arities,
27 | init_depth,
28 | init_method,
29 | n_features,
30 | const_range,
31 | metric,
32 | p_point_replace,
33 | parsimony_coefficient,
34 | random_state,
35 | data_type,
36 | n_cat_features,
37 | transformer=None,
38 | feature_names=None,
39 | program=None):
40 |         '''
41 |
42 |         Parameters
43 |         ----------
44 |         function_dict: store of base functions (formerly function_set), {'number': [], 'category': []}
45 |         arities: number of arguments of each function
46 |         init_depth: initial depth, a tuple (min_depth, max_depth)
47 |         init_method: generation method
48 |         n_features: number of features
49 |         const_range: range of constants, e.g. (-1, 1)
50 |         metric: objective function, e.g. 'MAE', 'MSE'
51 |         p_point_replace: point-mutation probability
52 |         parsimony_coefficient: penalty coefficient, 'auto' or a float, default 0.01
53 |         random_state: random number generator
54 |         data_type: new parameter; cross-section, time series or panel: 'section', 'time_series', 'panel'
55 |         n_cat_features: new parameter; number of categorical features
56 |         transformer
57 |         feature_names
58 |         program
59 |         '''
60 | self.function_dict = function_dict
61 | self.arities = arities
62 | self.init_depth = (init_depth[0], init_depth[1] + 1)
63 | self.init_method = init_method
64 | self.n_features = n_features
65 | self.const_range = const_range
66 | self.metric = metric
67 | self.p_point_replace = p_point_replace
68 | self.parsimony_coefficient = parsimony_coefficient
69 | self.data_type = data_type
70 | self.transformer = transformer
71 | self.feature_names = feature_names
72 | self.program = program
73 | self.n_cat_features = n_cat_features
74 |
75 | self.num_func_number = len(self.function_dict['number'])
76 | self.cat_func_number = len(self.function_dict['category'])
77 |
78 | if self.program is not None:
79 |             # Verify that the supplied tree is complete
80 | if not self.validate_program():
81 | raise ValueError('The supplied program is incomplete.')
82 | else:
83 | # Create a naive random program
84 | self.program = self.build_program(random_state)
85 |
86 | self.raw_fitness_ = None
87 | self.fitness_ = None
88 | self.parents = None
89 | self._n_samples = None
90 | self._max_samples = None
91 | self._indices_state = None
92 |
93 | def build_program(self, random_state, type='number'):
94 | """
95 |         Initialization used when no program is supplied.
96 |         # v1.55 revised the tree-generation logic
97 |         :param random_state: RandomState instance, the random number generator
98 |         :param type: whether the generated tree returns numeric or categorical values
99 |         :return: list,
100 | """
101 | if self.init_method == 'half and half':
102 | method = ('full' if random_state.randint(2) else 'grow')
103 | else:
104 | method = self.init_method
105 | max_depth = random_state.randint(*self.init_depth)
106 |
107 | # Start a program with a function to avoid degenerative programs
108 |         # The tree must return a numeric vector, so pick a random numeric-vector function as the root
109 | _root_function_num = random_state.randint(len(self.function_dict['number']))
110 | _root_function = self.function_dict['number'][_root_function_num]
111 |
112 |         # Initialize the tree and the work stack; the stack holds only the root for now and stores the parameter-type lists that drive generation
113 | program = [_root_function]
114 | terminal_stack = [deepcopy(_root_function.param_type)]
115 |
116 | while terminal_stack:
117 | depth = len(terminal_stack)
118 | candidate_num = self.n_features + self.num_func_number + self.cat_func_number
119 | candidate_choice = random_state.randint(candidate_num)
120 | # Determine if we are adding a function or terminal
121 |             # every element of terminal_stack must be a list
122 | if not isinstance(terminal_stack[-1], list):
123 | raise ValueError("element in terminal_stack should be list")
124 |             # and every element inside those lists must be a dict
125 | if not isinstance(terminal_stack[-1][0], dict):
126 | raise ValueError("element in terminal_stack'element should be dict")
127 |
128 |             # Build the tree depth-first, always expanding the first child of the last subtree on the stack
129 |             # This is the main departure from gplearn
130 | if ('vector' in terminal_stack[-1][0]) and (depth < max_depth) \
131 | and (method == 'full' or candidate_choice < (self.num_func_number + self.cat_func_number)):
132 |                 # A function is inserted when: 1. the node accepts a vector, 2. the depth is below the maximum, 3. the draw picked a function or the method is 'full'
133 |
134 |                 # Decide between a numeric and a categorical function
135 |                 # If the node accepts both, choose the inserted type at random
136 |                 # Otherwise insert whichever type the node accepts
137 | _choice = random_state.randint(self.cat_func_number + self.num_func_number)
138 | if 'number' in terminal_stack[-1][0]['vector'] and 'category' in terminal_stack[-1][0]['vector']:
139 | key = 'number' if _choice < self.num_func_number else 'category'
140 | else:
141 | key = 'number' if 'number' in terminal_stack[-1][0]['vector'] else 'category'
142 | function_choice = self.function_dict[key][_choice %
143 | (self.num_func_number if key == 'number' else self.cat_func_number)]
144 | program.append(function_choice)
145 | terminal_stack.append(deepcopy(function_choice.param_type))
146 | else:
147 |                 # Insert a vector or a constant
148 | _choice = random_state.randint(self.n_features + 1)
149 |                 # Adjust _choice for the special cases:
150 |                 # 1. if const_range is None or scalars are not accepted, insert a vector
151 |                 # 2. if vectors are not accepted, insert a scalar
152 |                 # 3. otherwise let the random draw decide
153 | if _choice == self.n_features and \
154 | ((self.const_range is None) or \
155 |                     ('scalar' not in terminal_stack[-1][0])):
156 |                     # Only a vector can be inserted here
157 | if 'vector' not in terminal_stack[-1][0]:
158 | raise ValueError('Error param type {}'.format(terminal_stack[-1][0]))
159 |
160 | _choice = random_state.randint(self.n_features)
161 | elif ('vector' not in terminal_stack[-1][0]):
162 |                     # Only a constant can be inserted here
163 | _choice = self.n_features
164 |
165 | if _choice < self.n_features:
166 |                     # Insert a vector
167 | if 'number' in terminal_stack[-1][0]['vector'] and 'category' in terminal_stack[-1][0][
168 | 'vector']:
169 |                         # Either a numeric or a categorical vector is acceptable
170 | key = 'category' if _choice < self.n_cat_features else 'number'
171 | else:
172 | key = 'number' if 'number' in terminal_stack[-1][0]['vector'] else 'category'
173 | if self.n_cat_features == 0 and key == 'category':
174 |                         # A categorical vector is required but the features contain none; fall back to the constant categorical vector (encoded 0)
175 |                         candidate_var = 0
176 |                     else:
177 |                         candidate_var = (_choice % self.n_cat_features) + 1 if key == 'category' else \
178 |                             ((_choice % (self.n_features - self.n_cat_features) + self.n_cat_features) + 1)
179 |                     program.append(str(candidate_var))
180 | else:
181 |                     # Insert a constant
182 | if 'float' in terminal_stack[-1][0]['scalar']:
183 | _choice = random_state.uniform(*terminal_stack[-1][0]['scalar']['float'])
184 | elif 'int' in terminal_stack[-1][0]['scalar']:
185 | _choice = random_state.randint(*terminal_stack[-1][0]['scalar']['int'])
186 | else:
187 | raise ValueError('Error param type {}'.format(terminal_stack[-1][0]))
188 | program.append(_choice)
189 |
190 | terminal_stack[-1].pop(0)
191 | while len(terminal_stack[-1]) == 0:
192 | terminal_stack.pop()
193 | if not terminal_stack:
194 | return program
195 | terminal_stack[-1].pop(0)
196 | # We should never get here
197 | return None
198 |
199 |     # Check that the program is well-formed (type checking excluded)
200 | def validate_program(self):
201 | """Rough check that the embedded program in the object is valid."""
202 | terminals = [0]
203 | for node in self.program:
204 | if isinstance(node, _Function):
205 | terminals.append(node.arity)
206 | else:
207 | terminals[-1] -= 1
208 | while terminals[-1] == 0:
209 | terminals.pop()
210 | terminals[-1] -= 1
211 | return terminals == [-1]
212 |
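    # Worked trace (illustration only): for the flattened prefix list
    # [add, X1, mul, X2, X3] with both arities 2, `terminals` evolves
    # [0] -> [0, 2] -> [0, 1] -> [0, 1, 2] -> [0, 1, 1] -> [0, 1, 0] -> [0, 0] -> [-1];
    # it ends at exactly [-1] iff every function received all of its arguments.
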
213 |     # Print the tree
214 | def __str__(self):
215 | """Overloads `print` output of the object to resemble a LISP tree."""
216 | terminals = [0]
217 | output = ''
218 | for i, node in enumerate(self.program):
219 | if isinstance(node, _Function):
220 | terminals.append(node.arity)
221 | output += node.name + '('
222 | else:
223 | if isinstance(node, str):
224 | if self.feature_names is None:
225 | output += 'X%s' % node
226 | else:
227 | output += self.feature_names[int(node)]
228 | elif isinstance(node, int):
229 | output += '%d' % node
230 | elif isinstance(node, float):
231 | output += '%.3f' % node
232 | else:
233 | raise ValueError('Error param type {}'.format(node))
234 | terminals[-1] -= 1
235 | while terminals[-1] == 0:
236 | terminals.pop()
237 | terminals[-1] -= 1
238 | output += ')'
239 | if i != len(self.program) - 1:
240 | output += ', '
241 | return output
242 |
243 |     # Visualize the whole tree
244 | def export_graphviz(self, fade_nodes=None):
245 | """Returns a string, Graphviz script for visualizing the program.
246 |
247 | Parameters
248 | ----------
249 | fade_nodes : list, optional
250 | A list of node indices to fade out for showing which were removed
251 | during evolution.
252 |
253 | Returns
254 | -------
255 | output : string
256 | The Graphviz script to plot the tree representation of the program.
257 |
258 | """
259 | terminals = []
260 | if fade_nodes is None:
261 | fade_nodes = []
262 | output = 'digraph program {\nnode [style=filled]\n'
263 | for i, node in enumerate(self.program):
264 | fill = '#cecece'
265 | if isinstance(node, _Function):
266 | if i not in fade_nodes:
267 | fill = '#136ed4'
268 | terminals.append([node.arity, i])
269 | output += ('%d [label="%s", fillcolor="%s"] ;\n'
270 | % (i, node.name, fill))
271 | else:
272 | if i not in fade_nodes:
273 | fill = '#60a6f6'
274 |
275 | if isinstance(node, str):
276 | if self.feature_names is None:
277 | feature_name = 'X%s' % node
278 | else:
279 | feature_name = self.feature_names[int(node)]
280 | output += ('%d [label="%s", fillcolor="%s"] ;\n'
281 | % (i, feature_name, fill))
282 | elif isinstance(node, int):
283 | output += ('%d [label="%d", fillcolor="%s"] ;\n'
284 | % (i, node, fill))
285 |                 elif isinstance(node, float):
286 | output += ('%d [label="%.3f", fillcolor="%s"] ;\n'
287 | % (i, node, fill))
288 | else:
289 | raise ValueError('Error param type {}'.format(node))
290 |
291 | if i == 0:
292 | # A degenerative program of only one node
293 | return output + '}'
294 | terminals[-1][0] -= 1
295 | terminals[-1].append(i)
296 | while terminals[-1][0] == 0:
297 | output += '%d -> %d ;\n' % (terminals[-1][1],
298 | terminals[-1][-1])
299 | terminals[-1].pop()
300 | if len(terminals[-1]) == 2:
301 | parent = terminals[-1][-1]
302 | terminals.pop()
303 | if not terminals:
304 | return output + '}'
305 | terminals[-1].append(parent)
306 | terminals[-1][0] -= 1
307 |
308 | # We should never get here
309 | return None
310 |
311 |     # Compute the depth of the tree
312 | def _depth(self):
313 | """Calculates the maximum depth of the program tree."""
314 | terminals = [0]
315 | depth = 1
316 | for node in self.program:
317 | if isinstance(node, _Function):
318 | terminals.append(node.arity)
319 | depth = max(len(terminals), depth)
320 | else:
321 | terminals[-1] -= 1
322 | while terminals[-1] == 0:
323 | terminals.pop()
324 | terminals[-1] -= 1
325 | return depth - 1
326 |
327 |     # Count the functions and terminals in the program
328 | def _length(self):
329 | """Calculates the number of functions and terminals in the program."""
330 | return len(self.program)
331 |
332 |     # Evaluate the program on input X
333 | def execute(self, X):
334 | """Execute the program according to X.
335 |
336 | Parameters
337 | ----------
338 | X : {array-like}
339 |             [n_samples, n_features + 1] for 'section' or 'time_series' data
340 |             [n_samples, n_features + 3] for 'panel' data
341 |
342 | Returns
343 | -------
344 | y_hats : array-like, shape = [n_samples]
345 | The result of executing the program on X.
346 |
347 | """
348 |         # Validate the number of columns in X
349 | if self.data_type == 'panel' and X.shape[1] != self.n_features + 3:
350 | raise ValueError("For panel Data, the col number of X should be n_features + 3")
351 | elif self.data_type in ['section', 'time_series'] and X.shape[1] != self.n_features + 1:
352 | raise ValueError("For section or time_series Data, the col number of X should be n_features + 1")
353 |
354 | # Check for single-node programs
355 | node = self.program[0]
356 |         # Constant
357 | if isinstance(node, (float, int)):
358 | return np.repeat(node, X.shape[0])
359 |         # Variable
360 | if isinstance(node, str):
361 | return X[:, int(node)]
362 |
363 | apply_stack = []
364 | for node in self.program:
365 |
366 | if isinstance(node, _Function):
367 | apply_stack.append([node])
368 | else:
369 | # Lazily evaluate later
370 | apply_stack[-1].append(node)
371 |
372 | while len(apply_stack[-1]) == apply_stack[-1][0].arity + 1:
373 | # Apply functions that have sufficient arguments
374 | function = apply_stack[-1][0]
375 | terminals = [np.repeat(t, X.shape[0]) if isinstance(t, (float, int))
376 | else (X[:, int(t)] if isinstance(t, str)
377 | else t) for t in apply_stack[-1][1:]]
378 |                 # For panel data, route section and time-series functions through the groupby pipeline
379 | if self.data_type == 'panel' and function.function_type == 'section':
380 | time_series_data = X[:, -1]
381 | intermediate_result = _groupby(time_series_data, function, *terminals)
382 | elif self.data_type == 'panel' and function.function_type == 'time_series':
383 | security_data = X[:, -2]
384 | intermediate_result = _groupby(security_data, function, *terminals)
385 | else:
386 | intermediate_result = function(*terminals)
387 | if len(apply_stack) != 1:
388 | apply_stack.pop()
389 | apply_stack[-1].append(intermediate_result)
390 | else:
391 | return intermediate_result
392 |
393 | # We should never get here
394 | return None
395 |
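    # Worked trace (illustration only): for the program [add, X1, mul, X2, X3]
    # the loop pushes [add] and appends X1, then pushes [mul], which consumes
    # X2 and X3; mul's result is appended back onto add's frame, which now has
    # arity + 1 entries and fires in turn: an iterative post-order evaluation
    # of the prefix list, with no recursion.
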
396 |     # Sample a subset of rows
397 | def get_all_indices(self, n_samples=None, max_samples=None,
398 | random_state=None):
399 | """Get the indices on which to evaluate the fitness of a program.
400 |
401 | Parameters
402 | ----------
403 | n_samples : int
404 | The number of samples.
405 |
406 | max_samples : int
407 | The maximum number of samples to use.
408 |
409 | random_state : RandomState instance
410 | The random number generator.
411 |
412 | Returns
413 | -------
414 | indices : array-like, shape = [n_samples]
415 | The in-sample indices.
416 |             (rows kept in the subsample)
417 |
418 | not_indices : array-like, shape = [n_samples]
419 | The out-of-sample indices.
420 |             (rows left out of the subsample)
421 |
422 | """
423 | if self._indices_state is None and random_state is None:
424 | raise ValueError('The program has not been evaluated for fitness '
425 | 'yet, indices not available.')
426 |
427 | if n_samples is not None and self._n_samples is None:
428 | self._n_samples = n_samples
429 | if max_samples is not None and self._max_samples is None:
430 | self._max_samples = max_samples
431 | if random_state is not None and self._indices_state is None:
432 | self._indices_state = random_state.get_state()
433 |
434 | indices_state = check_random_state(None)
435 | indices_state.set_state(self._indices_state)
436 |
437 | not_indices = sample_without_replacement(
438 | self._n_samples,
439 | self._n_samples - self._max_samples,
440 | random_state=indices_state)
441 | sample_counts = np.bincount(not_indices, minlength=self._n_samples)
442 | indices = np.where(sample_counts == 0)[0]
443 |
444 | return indices, not_indices
445 |
446 |     # Indices used to measure the program's fitness
447 | def _indices(self):
448 | """Get the indices used to measure the program's fitness."""
449 | return self.get_all_indices()[0]
450 |
451 |     # Raw fitness
452 | def raw_fitness(self, X, y, sample_weight):
453 | """Evaluate the raw fitness of the program according to X, y.
454 |
455 | Parameters
456 | ----------
457 | X : {array-like}
458 |             [n_samples, n_features + 1] for 'section' or 'time_series' data
459 |             [n_samples, n_features + 3] for 'panel' data
460 |
461 | y : array-like, shape = [n_samples]
462 | Target values.
463 |
464 | sample_weight : array-like, shape = [n_samples]
465 | Weights applied to individual samples.
466 |
467 | Returns
468 | -------
469 | raw_fitness : float
470 | The raw fitness of the program.
471 |
472 | """
473 | if X.shape[0] != len(y):
474 | raise ValueError("The length of y should be equal to X")
475 | y_pred = self.execute(X)
476 | if self.transformer:
477 | y_pred = self.transformer(y_pred)
478 | raw_fitness = self.metric(y, y_pred, sample_weight)
479 |
480 | return raw_fitness
481 |
482 |     # todo: introduce a non-linear fitness
483 |     # Penalized fitness: penalize the program's length
484 | def fitness(self, parsimony_coefficient=None):
485 | """Evaluate the penalized fitness of the program according to X, y.
486 |
487 | Parameters
488 | ----------
489 | parsimony_coefficient : float, optional
490 | If automatic parsimony is being used, the computed value according
491 | to the population. Otherwise the initialized value is used.
492 |
493 | Returns
494 | -------
495 | fitness : float
496 | The penalized fitness of the program.
497 |
498 | """
499 | if parsimony_coefficient is None:
500 | parsimony_coefficient = self.parsimony_coefficient
501 | penalty = parsimony_coefficient * len(self.program) * self.metric.sign
502 | return self.raw_fitness_ - penalty
503 |
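    # Worked example (illustration only): with parsimony_coefficient = 0.01,
    # a 25-node program and a greater-is-better metric (sign = +1), a raw
    # fitness of 0.80 is penalized to 0.80 - 0.01 * 25 = 0.55, so longer
    # programs must earn their extra nodes.
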
504 |     # Return the subtree rooted at a given position
505 | def get_subtree(self, start, program=None):
506 | """
507 |
508 | Parameters
509 | ----------
510 |         start: position of the subtree's root node
511 | program
512 | Returns
513 | -------
514 |         start
515 |         end: one past the subtree's last position, convenient for slicing
516 | """
517 | if program is None:
518 | program = self.program
519 | stack = 1
520 | end = start
521 | while stack > end - start:
522 | node = program[end]
523 | if isinstance(node, _Function):
524 | stack += node.arity
525 | end += 1
526 |
527 | if isinstance(program[start], _Function):
528 |             return_type = program[start].return_type
529 | elif isinstance(program[start], str):
530 | if int(program[start]) == 0:
531 | raise ValueError("The return of sub_tree's root should not be const_1")
532 | return_type = 'category' if int(program[start]) <= self.n_cat_features else 'number'
533 | else:
534 | raise ValueError("The return type of sub_tree's root should be number or category")
535 | return start, end, return_type
536 |
537 |     # Return a random subtree
538 |     # Modified here: scalars are never selected
539 |     # The requested return type must be respected
540 | def get_random_subtree(self, random_state, program=None, return_type=None):
541 | """Get a random subtree from the program.
542 |
543 | Parameters
544 | ----------
545 | random_state : RandomState instance
546 | The random number generator.
547 |
548 | program : list, optional (default=None)
549 | The flattened tree representation of the program. If None, the
550 | embedded tree in the object will be used.
551 |
552 |         return_type: restriction on the subtree's return type; default None allows both number and category
553 |
554 | Returns
555 | -------
556 | start, end : tuple of two ints
557 | The indices of the start and end of the random subtree.
558 |         return_type: the subtree's return type (numeric or categorical vector), guarding against crossover errors
559 | """
560 | if program is None:
561 | program = self.program
562 | # Choice of crossover points follows Koza's (1992) widely used approach
563 |         # function nodes carry weight 0.9 and vector leaves 0.1; scalar leaves (including the constant categorical vector) carry 0
564 |         # if return_type is 'number', every category-returning node gets weight 0
565 |         # if return_type is 'category', every number-returning node gets weight 0
566 | if return_type not in ['number', 'category', None]:
567 | raise ValueError("Type of sub_tree should be number, category or None")
568 | if return_type == 'number':
569 | probs = np.array([0.9 if isinstance(node, _Function) and node.return_type == 'number'
570 | else (0.1 if isinstance(node, str) and int(node) > self.n_cat_features else 0.0)
571 | for node in program])
572 | elif return_type == 'category':
573 | probs = np.array([0.9 if isinstance(node, _Function) and node.return_type == 'category'
574 | else (0.1 if isinstance(node, str) and int(node) <= self.n_cat_features
575 | and int(node) != 0 else 0.0)
576 | for node in program])
577 | else:
578 | probs = np.array([0.9 if isinstance(node, _Function)
579 | else (0.1 if isinstance(node, str)
580 | and int(node) != 0 else 0.0)
581 | for node in program])
582 | probs = np.cumsum(probs / probs.sum())
583 | start = np.searchsorted(probs, random_state.uniform())
584 | return self.get_subtree(start, program)
585 |
586 | def reproduce(self):
587 | """Return a copy of the embedded program."""
588 | return copy(self.program)
589 |
590 |     def valid_category(self, program=None):
591 |         """Check whether the tree contains a categorical vector or subtree (the constant categorical vector excluded)."""
592 | if program is None:
593 | program = self.program
594 | for node in program:
595 | if isinstance(node, _Function) and node.return_type == 'category':
596 | return True
597 | elif isinstance(node, str) and int(node) != 0 and int(node) <= self.n_cat_features:
598 | return True
599 | return False
600 |
601 |     # Swap subtrees between self and the donor
602 |     # Constants are never exchanged here
603 | def crossover(self, donor, random_state):
604 | """Perform the crossover genetic operation on the program.
605 |
606 | Crossover selects a random subtree from the embedded program to be
607 | replaced. A donor also has a subtree selected at random and this is
608 | inserted into the original parent to form an offspring.
609 |
610 | Parameters
611 | ----------
612 | donor : list
613 | The flattened tree representation of the donor program.
614 |
615 | random_state : RandomState instance
616 | The random number generator.
617 |
618 | Returns
619 | -------
620 | program : list
621 | The flattened tree representation of the program.
622 |
623 | """
624 | # Get a subtree to replace
625 |         # If both trees contain categorical material, any return type may be crossed
626 |         if self.valid_category() and self.valid_category(donor):
627 | start, end, self_return_type = self.get_random_subtree(random_state)
628 | else:
629 | start, end, self_return_type = self.get_random_subtree(random_state, return_type='number')
630 | removed = range(start, end)
631 | # Get a subtree to donate
632 | donor_start, donor_end, donor_return_type = self.get_random_subtree(random_state, donor, self_return_type)
633 | donor_removed = list(set(range(len(donor))) -
634 | set(range(donor_start, donor_end)))
635 | # Insert genetic material from donor
636 | return (self.program[:start] +
637 | donor[donor_start:donor_end] +
638 | self.program[end:]), removed, donor_removed
639 |
640 |     # Constants are never selected here
641 |     # Subtree mutation
642 | def subtree_mutation(self, random_state):
643 | """Perform the subtree mutation operation on the program.
644 |
645 | Subtree mutation selects a random subtree from the embedded program to
646 | be replaced. A donor subtree is generated at random and this is
647 | inserted into the original parent to form an offspring. This
648 | implementation uses the "headless chicken" method where the donor
649 | subtree is grown using the initialization methods and a subtree of it
650 | is selected to be donated to the parent.
651 |
652 | Parameters
653 | ----------
654 | random_state : RandomState instance
655 | The random number generator.
656 |
657 | Returns
658 | -------
659 | program : list
660 | The flattened tree representation of the program.
661 |
662 | """
663 | # Build a new naive program
664 | chicken = self.build_program(random_state)
665 | # Do subtree mutation via the headless chicken method!
666 | return self.crossover(chicken, random_state)
667 |
668 | def get_hoist_list(self, program=None):
669 | """
670 |         Determine which nodes may undergo hoist mutation: the node is not a leaf and has a subtree of its own type (the constant categorical vector does not count as categorical)
671 | Parameters
672 | ----------
673 | program
674 |
675 | Returns
676 | -------
677 | hoist_list
678 | """
679 | if program is None:
680 | program = self.program
681 |
682 | apply_stack = []
683 | hoist_list = [False] * len(program)
684 |         # Depth-first search; each stack element is a list whose first item is the function's position in program, the second is the function object, and the rest are child results
685 |         # A child result is ['number'], ['number', 'category'], ['category'] or [], the set of types contained in that node and its children
686 | for i, node in enumerate(program):
687 | if isinstance(node, _Function):
688 | apply_stack.append([i, node])
689 | else:
690 | # Lazily evaluate later
691 | apply_stack[-1].append(node)
692 | while len(apply_stack[-1]) == apply_stack[-1][1].arity + 2:
693 | father_type = apply_stack[-1][1].return_type
694 | type_list = [t if isinstance(t, list) else
695 | (['number'] if isinstance(t, str) and int(t) > self.n_cat_features else
696 | (['category'] if isinstance(t, str) and int(t) <= self.n_cat_features and int(t) != 0
697 | else []))
698 | for t in apply_stack[-1][2:]
699 | ]
700 |                     # If the subtree contains a node of the same type as this one, hoist mutation is possible here
701 | if father_type in list(set().union(*type_list)):
702 | hoist_list[apply_stack[-1][0]] = True
703 |                     # Add the function's own return type to the list
704 | type_list.append([father_type])
705 |
706 | intermediate_result = list(set().union(*type_list))
707 | if len(apply_stack) != 1:
708 | apply_stack.pop()
709 | apply_stack[-1].append(intermediate_result)
710 | else:
711 | return hoist_list
712 | return None
713 |
714 |     # Hoist a subtree of a subtree upward to simplify the formula
715 |     # Subtree selection never picks constants, so the requirement holds
716 |     # Subtree selection does not pick categorical variables
717 | def hoist_mutation(self, random_state):
718 | """Perform the hoist mutation operation on the program.
719 |
720 | Hoist mutation selects a random subtree from the embedded program to
721 | be replaced. A random subtree of that subtree is then selected and this
722 | is 'hoisted' into the original subtrees location to form an offspring.
723 | This method helps to control bloat.
724 |
725 |         gplearnplus change: with typed variables, we must first determine which nodes are eligible for hoist mutation.
726 |         Requirement:
727 |         1. the node has a subtree of the same type beneath it
728 |
729 | Parameters
730 | ----------
731 | random_state : RandomState instance
732 | The random number generator.
733 |
734 | Returns
735 | -------
736 | program : list
737 | The flattened tree representation of the program.
738 |
739 | """
740 | # Get a subtree to replace
741 | hoist_list = self.get_hoist_list()
742 | if sum(hoist_list) == 0:
743 | return self.program
744 |         # Randomly pick a hoist-eligible node
745 | hoist_root = random_state.choice(np.where(hoist_list)[0])
746 | start, end, return_type = self.get_subtree(hoist_root)
747 | subtree = self.program[start:end]
748 | # Get a subtree of the subtree to hoist
749 | sub_start, sub_end, _ = self.get_random_subtree(random_state, subtree, return_type=return_type)
750 | hoist = subtree[sub_start:sub_end]
751 | # Determine which nodes were removed for plotting
752 | removed = list(set(range(start, end)) -
753 | set(range(start + sub_start, start + sub_end)))
754 | return self.program[:start] + hoist + self.program[end:], removed
755 |
756 |     # Point mutation fully rewritten
757 |     # Replacement functions must satisfy the is_point_mutation condition
758 |     # Constants are not mutated since their valid range is unknown
759 | def point_mutation(self, random_state):
760 | """Perform the point mutation operation on the program.
761 |
762 | Point mutation selects random nodes from the embedded program to be
763 | replaced. Terminals are replaced by other terminals and functions are
764 | replaced by other functions that require the same number of arguments
765 | as the original node. The resulting tree forms an offspring.
766 |
767 | Parameters
768 | ----------
769 | random_state : RandomState instance
770 | The random number generator.
771 |
772 | Returns
773 | -------
774 | program : list
775 | The flattened tree representation of the program.
776 |
777 | """
778 | program = copy(self.program)
779 |
780 | # Get the nodes to modify
781 | mutate = np.where(random_state.uniform(size=len(program)) <
782 | self.p_point_replace)[0]
783 | tag = np.array([True] * len(mutate))
784 | for i, node in enumerate(mutate):
785 | if isinstance(program[node], _Function):
786 | arity = program[node].arity
787 | # Find a valid replacement with same arity
788 | replacement_list = [func_ for func_ in self.arities[arity] if program[node].is_point_mutation(func_)]
789 | if len(replacement_list) == 0:
790 |                     # No eligible replacement
791 | tag[i] = False
792 | continue
793 | replacement = random_state.randint(len(replacement_list))
794 | replacement = replacement_list[replacement]
795 | program[node] = replacement
796 |             elif isinstance(program[node], str):
797 |                 # Terminal: draw a replacement variable of the same kind; '0' (the constant categorical vector) is left untouched
798 |                 is_cat = 0 < int(program[node]) <= self.n_cat_features
799 |                 program[node] = program[node] if int(program[node]) == 0 else (str(random_state.randint(1, self.n_cat_features + 1)) if is_cat else str(random_state.randint(self.n_cat_features + 1, self.n_features + 1)))
800 | else:
801 |                 # Constants are not mutated
802 | tag[i] = False
803 | if len(mutate):
804 | mutate = mutate[tag]
805 | return program, list(mutate)
806 |
807 | depth_ = property(_depth)
808 | length_ = property(_length)
809 | indices_ = property(_indices)
810 |
--------------------------------------------------------------------------------
/genetic.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | -------------------------------------------------
4 | # @Project :gplearnplus
5 | # @File :genetic
6 | # @Date :2022/12/5 0005 4:23
7 | # @Author :Junzhe Huang
8 | # @Email :acejasonhuang@163.com
9 | # @Software :PyCharm
10 | -------------------------------------------------
11 | """
12 | import itertools
13 | from abc import ABCMeta, abstractmethod
14 | from time import time
15 | from warnings import warn
16 | from copy import deepcopy
17 |
18 | import numpy as np
19 | import pandas as pd
20 | from joblib import Parallel, delayed
21 | from scipy.stats import rankdata
22 | from sklearn.base import BaseEstimator
23 | from sklearn.base import RegressorMixin, TransformerMixin, ClassifierMixin
24 | from sklearn.exceptions import NotFittedError
25 | from sklearn.utils import compute_sample_weight
26 | from sklearn.utils.validation import check_array, _check_sample_weight
27 | from sklearn.utils.multiclass import check_classification_targets
28 | from sklearn.preprocessing import LabelEncoder
29 |
30 | from ._program import _Program
31 | from .fitness import _fitness_map, _Fitness
32 | from .functions import _function_map, _Function, sig1 as sigmoid
33 | from .utils import _partition_estimators
34 | from .utils import check_random_state
35 |
36 | __all__ = ['SymbolicRegressor', 'SymbolicClassifier', 'SymbolicTransformer']
37 |
38 | MAX_INT = np.iinfo(np.int32).max
39 |
40 | # Parallel implementation of subtree crossover and mutation
41 | def _parallel_evolve(n_programs, parents, X, y, sample_weight, seeds, params):
42 | """
43 |
44 | Parameters
45 | ----------
46 |     n_programs: number of programs to build in this job
47 |     parents: the parent population
48 |     X: raw features
49 |     y: target labels
50 |     sample_weight: per-sample weights
51 |     seeds: random seeds
52 |     params: parameters
53 |
54 | Returns
55 | -------
56 |
57 | """
58 |
59 | """Private function used to build a batch of programs within a job."""
60 | n_samples, n_features = X.shape
61 |
62 | # Unpack parameters
63 | tournament_size = params['tournament_size']
64 | function_dict = params['function_dict']
65 | arities = params['arities']
66 | init_depth = params['init_depth']
67 | init_method = params['init_method']
68 | const_range = params['const_range']
69 | metric = params['_metric']
70 | transformer = params['_transformer']
71 | parsimony_coefficient = params['parsimony_coefficient']
72 | method_probs = params['method_probs']
73 | data_type = params['data_type']
74 | p_point_replace = params['p_point_replace']
75 |     max_samples = params['max_samples']  # maximum fraction of in-bag samples
76 | feature_names = params['feature_names']
77 | n_cat_features = params['cat_var_number']
78 |
79 |     # Subtract the index columns appended to X for each data type
80 | if data_type == 'panel':
81 | n_features -= 3
82 | else:
83 | n_features -= 1
84 |
85 | max_samples = int(max_samples * n_samples)
86 |
87 | def _tournament():
88 |         # Randomly draw tournament_size parents and keep the fittest of them
89 | """Find the fittest individual from a sub-population."""
90 | contenders = random_state.randint(0, len(parents), tournament_size)
91 | fitness = [parents[p].fitness_ for p in contenders]
92 | if metric.greater_is_better:
93 | parent_index = contenders[np.argmax(fitness)]
94 | else:
95 | parent_index = contenders[np.argmin(fitness)]
96 | return parents[parent_index], parent_index
97 |
98 | # Build programs
99 | programs = []
100 |
101 | for i in range(n_programs):
102 |
103 | random_state = check_random_state(seeds[i])
104 |
105 | if parents is None:
106 |             # First generation
107 | program = None
108 | genome = None
109 | else:
110 | method = random_state.uniform()
111 |             # Tournament-select a strong parent from the population
112 | parent, parent_index = _tournament()
113 |
114 |             # Randomly apply one crossover or mutation operation
115 | if method < method_probs[0]:
116 | # crossover
117 | donor, donor_index = _tournament()
118 | program, removed, remains = parent.crossover(donor.program,
119 | random_state)
120 | genome = {'method': 'Crossover',
121 | 'parent_idx': parent_index,
122 | 'parent_nodes': removed,
123 | 'donor_idx': donor_index,
124 | 'donor_nodes': remains}
125 | elif method < method_probs[1]:
126 | # subtree_mutation
127 | program, removed, _ = parent.subtree_mutation(random_state)
128 | genome = {'method': 'Subtree Mutation',
129 | 'parent_idx': parent_index,
130 | 'parent_nodes': removed}
131 | elif method < method_probs[2]:
132 | # hoist_mutation
133 | program, removed = parent.hoist_mutation(random_state)
134 | genome = {'method': 'Hoist Mutation',
135 | 'parent_idx': parent_index,
136 | 'parent_nodes': removed}
137 | elif method < method_probs[3]:
138 | # point_mutation
139 | program, mutated = parent.point_mutation(random_state)
140 | genome = {'method': 'Point Mutation',
141 | 'parent_idx': parent_index,
142 | 'parent_nodes': mutated}
143 | else:
144 | # reproduction
145 | program = parent.reproduce()
146 | genome = {'method': 'Reproduction',
147 | 'parent_idx': parent_index,
148 | 'parent_nodes': []}
149 |
150 | program = _Program(function_dict=function_dict,
151 | arities=arities,
152 | init_depth=init_depth,
153 | init_method=init_method,
154 | n_features=n_features,
155 | metric=metric,
156 | transformer=transformer,
157 | const_range=const_range,
158 | p_point_replace=p_point_replace,
159 | parsimony_coefficient=parsimony_coefficient,
160 | data_type=data_type,
161 | feature_names=feature_names,
162 | random_state=random_state,
163 | n_cat_features=n_cat_features,
164 | program=program)
165 |
166 | program.parents = genome
167 |
168 | # Draw samples, using sample weights, and then fit
169 | if sample_weight is None:
170 | curr_sample_weight = np.ones((n_samples,))
171 | else:
172 | curr_sample_weight = sample_weight.copy()
173 | oob_sample_weight = curr_sample_weight.copy()
174 |
175 | indices, not_indices = program.get_all_indices(n_samples,
176 | max_samples,
177 | random_state)
178 |
179 | curr_sample_weight[not_indices] = 0
180 | oob_sample_weight[indices] = 0
181 |
182 | program.raw_fitness_ = program.raw_fitness(X, y, curr_sample_weight)
183 | if max_samples < n_samples:
184 | # Calculate OOB fitness
185 | program.oob_fitness_ = program.raw_fitness(X, y, oob_sample_weight)
186 |
187 | programs.append(program)
188 |
189 | return programs
190 |
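# --- Hedged note (illustration only) ---
# method_probs holds the *cumulative* probabilities of [crossover, subtree,
# hoist, point]; a uniform draw below the first entry selects crossover,
# between consecutive entries selects that mutation, and above the last
# entry falls through to reproduction.
def _demo_method_dispatch():
    method_probs = np.cumsum([0.9, 0.01, 0.01, 0.01])  # the documented defaults
    draw = 0.905
    assert method_probs[0] < draw < method_probs[1]  # would pick subtree mutation
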
191 |
192 | class BaseSymbolic(BaseEstimator, metaclass=ABCMeta):
193 |
194 | """Base class for symbolic regression / classification estimators.
195 |
196 | Warning: This class should not be used directly.
197 | Use derived classes instead.
198 |
199 | """
200 |
201 | @abstractmethod
202 | def __init__(self,
203 | *,
204 | population_size=1000,
205 | hall_of_fame=None,
206 | n_components=None,
207 | generations=20,
208 | tournament_size=20,
209 | stopping_criteria=0.0,
210 | const_range=(-1., 1.),
211 | init_depth=(2, 6),
212 | init_method='half and half',
213 | function_set=('add', 'sub', 'mul', 'div'),
214 | transformer=None,
215 | metric='mean absolute error',
216 | parsimony_coefficient=0.001,
217 | p_crossover=0.9,
218 | p_subtree_mutation=0.01,
219 | p_hoist_mutation=0.01,
220 | p_point_mutation=0.01,
221 | p_point_replace=0.05,
222 | max_samples=1.0,
223 | tolerable_corr=0.0,
224 | class_weight=None,
225 | feature_names=None,
226 | time_series_index=None,
227 | security_index=None,
228 | category_features=None,
229 | warm_start=False,
230 | low_memory=False,
231 | n_jobs=1,
232 | verbose=0,
233 | data_type='section',
234 | random_state=None):
235 |
236 | self.population_size = population_size
237 | self.hall_of_fame = hall_of_fame
238 | self.n_components = n_components
239 | self.generations = generations
240 | self.tournament_size = tournament_size
241 | self.stopping_criteria = stopping_criteria
242 | self.const_range = const_range
243 | self.init_depth = init_depth
244 | self.init_method = init_method
245 | self.function_set = function_set
246 | self.transformer = transformer
247 | self.metric = metric
248 | self.parsimony_coefficient = parsimony_coefficient
249 | self.p_crossover = p_crossover
250 | self.p_subtree_mutation = p_subtree_mutation
251 | self.p_hoist_mutation = p_hoist_mutation
252 | self.p_point_mutation = p_point_mutation
253 | self.p_point_replace = p_point_replace
254 | self.max_samples = max_samples
255 | self.class_weight = class_weight
256 | self.feature_names = feature_names
257 | self.category_features = category_features
258 | self.time_series_index = time_series_index
259 | self.security_index = security_index
260 | self.warm_start = warm_start
261 | self.low_memory = low_memory
262 | self.n_jobs = n_jobs
263 | self.verbose = verbose
264 | self.random_state = random_state
265 | self.data_type = data_type
266 | self.tolerable_corr = tolerable_corr
267 |
268 |     # Print the training log
269 | def _verbose_reporter(self, run_details=None):
270 | """A report of the progress of the evolution process.
271 |
272 | Parameters
273 | ----------
274 | run_details : dict
275 | Information about the evolution.
276 |
277 | """
278 | if run_details is None:
279 | print(' |{:^25}|{:^42}|'.format('Population Average',
280 | 'Best Individual'))
281 | print('-' * 4 + ' ' + '-' * 25 + ' ' + '-' * 42 + ' ' + '-' * 10)
282 | line_format = '{:>4} {:>8} {:>16} {:>8} {:>16} {:>16} {:>10}'
283 | print(line_format.format('Gen', 'Length', 'Fitness', 'Length',
284 | 'Fitness', 'OOB Fitness', 'Time Left'))
285 |
286 | else:
287 | # Estimate remaining time for run
288 | gen = run_details['generation'][-1]
289 | generation_time = run_details['generation_time'][-1]
290 | remaining_time = (self.generations - gen - 1) * generation_time
291 | if remaining_time > 60:
292 | remaining_time = '{0:.2f}m'.format(remaining_time / 60.0)
293 | else:
294 | remaining_time = '{0:.2f}s'.format(remaining_time)
295 |
296 | oob_fitness = 'N/A'
297 | line_format = '{:4d} {:8.2f} {:16g} {:8d} {:16g} {:>16} {:>10}'
298 | if self.max_samples < 1.0:
299 | oob_fitness = run_details['best_oob_fitness'][-1]
300 | line_format = '{:4d} {:8.2f} {:16g} {:8d} {:16g} {:16g} {:>10}'
301 |
302 | print(line_format.format(run_details['generation'][-1],
303 | run_details['average_length'][-1],
304 | run_details['average_fitness'][-1],
305 | run_details['best_length'][-1],
306 | run_details['best_fitness'][-1],
307 | oob_fitness,
308 | remaining_time))
309 |
310 |     # fit takes the time-series structure into account
311 |     # data preprocessing should eventually move into its own module
312 | def fit(self, X, y, sample_weight=None):
313 | """Fit the Genetic Program according to X, y.
314 |
315 | Parameters
316 | ----------
317 | X : array-like, shape = [n_samples, n_features]
318 | Training vectors, where n_samples is the number of samples and
319 | n_features is the number of features.
320 |
321 | y : array-like, shape = [n_samples]
322 | Target values.
323 |
324 | sample_weight : array-like, shape = [n_samples], optional
325 | Weights applied to individual samples.
326 |
327 | Returns
328 | -------
329 | self : object
330 | Returns self.
331 |
332 | """
333 | random_state = check_random_state(self.random_state)
334 |
335 |         # Check the data type
336 | if self.data_type not in ('section', 'time_series', 'panel'):
337 | raise ValueError('Valid data_type methods include '
338 | '"section", "time_series" and "panel". Given %s.'
339 | % self.data_type)
340 |
341 |         # Check the data structure
342 |         # With a security or time index, X must be a DataFrame
343 | if self.security_index is not None or self.time_series_index is not None:
344 | if not isinstance(X, pd.DataFrame):
345 |                 raise ValueError('with security or time index, X should be a DataFrame')
346 |
347 |         # Check the time and security indices, separately for section, time-series and panel data
348 | security_data = None
349 | time_series_data = None
350 | if self.data_type == 'section':
351 | if self.time_series_index is not None:
352 | raise ValueError('For Section Data, time_series_index should be None')
353 | if self.security_index is not None:
354 |                 # Look for security_index in both the index and the columns
355 | if self.security_index not in X.columns and \
356 | (X.index.name is None or self.security_index not in X.index.name):
357 |                     raise ValueError('Cannot find security_index {} in either columns or index'
358 | .format(self.security_index))
359 | elif self.security_index in X.columns:
360 | X.set_index(self.security_index, inplace=True)
361 |
362 |                 # Check for duplicate securities (security_index is the index at this point)
363 |                 if len(X.index.unique()) < len(X.index):
364 | raise ValueError('For Section Data, security data should be unique')
365 |
366 | security_data = X.index.values
367 |
368 | elif self.data_type == 'time_series':
369 | if self.time_series_index is None:
370 | raise ValueError('For time_series Data, time_series_index should NOT be None')
371 | if self.security_index is not None:
372 | raise ValueError('For time_series Data, security_index should be None')
373 | if self.time_series_index not in X.columns and \
374 | (X.index.name is None or self.time_series_index not in X.index.name):
375 |                 raise ValueError('Cannot find time_series_index {} in either columns or index'
376 | .format(self.time_series_index))
377 | elif self.time_series_index in X.columns:
378 | X.set_index(self.time_series_index, inplace=True)
379 |
380 |             # Check for duplicate timestamps
381 | if len(X.index.drop_duplicates()) < len(X):
382 | raise ValueError('For time_series Data, time_series data should be unique')
383 |
384 | X_combine = X.copy()
385 | X_combine['_label'] = y.values if isinstance(y, pd.Series) else y
386 | X_combine.sort_index(inplace=True)
387 | X, y = X_combine.loc[:, self.feature_names], X_combine.loc[:, '_label']
388 | time_series_data = X.index.values
389 |
390 | else:
391 | if self.time_series_index is None:
392 | raise ValueError('For panel Data, time_series_index should NOT be None')
393 | if self.security_index is None:
394 | raise ValueError('For panel Data, security_index should NOT be None')
395 |
396 |             # Move security and time_series into the index
397 | if self.time_series_index not in X.columns and \
398 | (X.index.name is None or self.time_series_index not in X.index.name):
399 |                 raise ValueError('Cannot find time_series_index {} in either columns or index'
400 | .format(self.time_series_index))
401 | elif self.security_index not in X.columns and \
402 | (X.index.name is None or self.security_index not in X.index.name):
403 |                 raise ValueError('Cannot find security_index {} in either columns or index'
404 | .format(self.security_index))
405 | elif self.time_series_index in X.columns and self.security_index in X.columns:
406 | X.set_index([self.time_series_index, self.security_index], inplace=True)
407 | elif self.time_series_index in X.columns:
408 |                 X.set_index(self.time_series_index, inplace=True, append=True)
409 | elif self.security_index in X.columns:
410 |                 X.set_index(self.security_index, inplace=True, append=True)
411 |
412 |             # Ensure the (time, security) index has no duplicates
413 |             if len(X.index) != len(X.index.drop_duplicates()):
414 |                 raise ValueError('For panel Data, the (time, security) index should be unique')
415 |
416 |
417 | X_combine = X.copy()
418 | X_combine['_label'] = y.values if isinstance(y, pd.Series) else y
419 | X_combine.sort_index(inplace=True)
420 | X, y = X_combine.loc[:, self.feature_names], X_combine.loc[:, '_label']
421 | time_series_data = X.index.get_level_values(self.time_series_index).values
422 | security_data = X.index.get_level_values(self.security_index).values
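            # Illustrative layout (editorial comment; hypothetical column names):
            # a frame such as
            #     time        code   f1   f2
            #     2020-01-01  A      ..   ..
            #     2020-01-01  B      ..   ..
            #     2020-01-02  A      ..   ..
            # ends up with a (time, security) MultiIndex, sorted by index, while
            # time_series_data and security_data hold the aligned level values.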
423 |
424 |         # Check that every entry of category_features is contained in feature_names.
425 |         # When categorical features are present, X must be a pd.DataFrame.
426 | if self.category_features is not None:
427 | if not isinstance(X, pd.DataFrame):
428 |                 raise ValueError('when there are category_features, X must be a pd.DataFrame')
429 | if not isinstance(self.category_features, list):
430 | raise ValueError('category_features must be list')
431 | for cat_feature in self.category_features:
432 | if cat_feature not in self.feature_names:
433 |                     raise ValueError('Invalid category_feature {}: not in feature_names'.format(cat_feature))
434 |             # Encode categorical columns as integers
435 | label_encoder = LabelEncoder()
436 | X[self.category_features] = X[self.category_features].apply(label_encoder.fit_transform)
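            # LabelEncoder.fit_transform maps each column's labels to integers
            # 0..n_classes-1 (e.g. ['A', 'B', 'A'] -> [0, 1, 0]); DataFrame.apply
            # refits the encoder column by column.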
437 |             # Reorder the features so that categorical columns come first
438 |             self.feature_names = \
439 |                 self.category_features + [_col for _col in self.feature_names if _col not in self.category_features]
440 | X = X[self.feature_names]
441 |
442 | # Check arrays
443 | if sample_weight is not None:
444 | sample_weight = _check_sample_weight(sample_weight, X)
445 |
446 |         # Validate the data contents
447 | if isinstance(self, ClassifierMixin):
448 |             # Ensure y holds valid classification targets; coerce X and y to ndarray
449 |             # TODO: classification handling could be improved; left untouched for now
450 | X, y = self._validate_data(X, y, y_numeric=False)
451 | check_classification_targets(y)
452 |
453 | if self.class_weight:
454 | if sample_weight is None:
455 | sample_weight = 1.
456 | # modify the sample weights with the corresponding class weight
457 | sample_weight = (sample_weight *
458 | compute_sample_weight(self.class_weight, y))
459 |
460 | self.classes_, y = np.unique(y, return_inverse=True)
461 | n_trim_classes = np.count_nonzero(np.bincount(y, sample_weight))
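            # np.bincount(y, sample_weight) totals the sample weight per class,
            # so count_nonzero(...) is the number of classes that survive
            # zero-weight trimming.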
462 | if n_trim_classes != 2:
463 | raise ValueError("y contains %d class after sample_weight "
464 | "trimmed classes with zero weights, while 2 "
465 | "classes are required."
466 | % n_trim_classes)
467 | self.n_classes_ = len(self.classes_)
468 |
469 | else:
470 |             # Ensure y is numeric; coerce X and y to ndarray
471 | X, y = self._validate_data(X, y, y_numeric=True)
472 |
473 |         # Check hall_of_fame and n_components, if present
474 | hall_of_fame = self.hall_of_fame
475 | if hall_of_fame is None:
476 | hall_of_fame = self.population_size
477 | if hall_of_fame > self.population_size or hall_of_fame < 1:
478 | raise ValueError('hall_of_fame (%d) must be less than or equal to '
479 | 'population_size (%d).' % (self.hall_of_fame,
480 | self.population_size))
481 | n_components = self.n_components
482 | if n_components is None:
483 | n_components = hall_of_fame
484 | if n_components > hall_of_fame or n_components < 1:
485 | raise ValueError('n_components (%d) must be less than or equal to '
486 | 'hall_of_fame (%d).' % (self.n_components,
487 | self.hall_of_fame))
488 |
489 |         # Check that feature_names is consistent with n_features_in_
490 | if self.feature_names is not None:
491 | if self.n_features_in_ != len(self.feature_names):
492 | raise ValueError('The supplied `feature_names` has different '
493 | 'length to n_features. Expected %d, got %d.'
494 | % (self.n_features_in_,
495 | len(self.feature_names)))
496 | for feature_name in self.feature_names:
497 | if not isinstance(feature_name, str):
498 | raise ValueError('invalid type %s found in '
499 | '`feature_names`.' % type(feature_name))
500 |
501 |         # Check const_range
502 | if not ((isinstance(self.const_range, tuple) and
503 | len(self.const_range) == 2) or self.const_range is None):
504 | raise ValueError('const_range should be a tuple with length two, '
505 | 'or None.')
506 |
507 |         # Check the function set: fold const_range into each function's constant
508 |         # range and split the functions into numeric and categorical groups
509 | self._function_dict = {'number': [], 'category': []}
510 |         # Flag whether any function accepts only categorical arguments
511 | _cat_func_flag = False
512 | for function in self.function_set:
513 |             # Type check
514 | if isinstance(function, str):
515 | if function not in _function_map:
516 | raise ValueError('invalid function name %s found in '
517 | '`function_set`.' % function)
518 | function = deepcopy(_function_map[function])
519 | function.add_range(self.const_range)
520 | self._function_dict['number'].append(function)
521 | elif isinstance(function, _Function):
522 | function = deepcopy(function)
523 |                 # Attach the constant range
524 | function.add_range(self.const_range)
525 |                 # Detect functions whose arguments are categorical vectors only
526 | if not _cat_func_flag:
527 | for _param in function.param_type:
528 | if len(_param) == 1 and 'vector' in _param and \
529 | len(_param['vector']) == 1 and 'category' in _param['vector']:
530 | _cat_func_flag = True
531 | if function.return_type == 'number':
532 | self._function_dict['number'].append(function)
533 | else:
534 | self._function_dict['category'].append(function)
535 | else:
536 | raise ValueError('invalid type %s found in `function_set`.'
537 | % type(function))
538 |
539 |         # At least one numeric function is required
540 | if len(self._function_dict['number']) == 0:
541 | raise ValueError('No valid functions found in `function_set`.')
542 |
543 |         # Functions that only accept categorical arguments (e.g. groupby-style ones) require at least one categorical feature
544 |         if _cat_func_flag and not self.category_features:
545 |             raise ValueError('No category variables in the input features, but the function_set contains functions that only accept category parameters')
546 |
547 |         # Record function arities for point mutation; argument types are re-checked during the mutation itself
548 | self._arities = {}
549 | for _type in ['number', 'category']:
550 | for function in self._function_dict[_type]:
551 | arity = function.arity
552 | self._arities[arity] = self._arities.get(arity, [])
553 | self._arities[arity].append(function)
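        # For example, with the default ('add', 'sub', 'mul', 'div') set,
        # self._arities == {2: [add, sub, mul, div]}: point mutation may then
        # swap a node only for another function of equal arity.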
554 |
555 |         # Check the fitness metric
556 | if isinstance(self.metric, _Fitness):
557 | self._metric = self.metric
558 | elif isinstance(self, RegressorMixin):
559 | if self.metric not in ('mean absolute error', 'mse', 'rmse',
560 | 'pearson', 'spearman'):
561 | raise ValueError('Unsupported metric: %s' % self.metric)
562 | self._metric = _fitness_map[self.metric]
563 | elif isinstance(self, ClassifierMixin):
564 | if self.metric != 'log loss':
565 | raise ValueError('Unsupported metric: %s' % self.metric)
566 | self._metric = _fitness_map[self.metric]
567 | elif isinstance(self, TransformerMixin):
568 | if self.metric not in ('pearson', 'spearman'):
569 | raise ValueError('Unsupported metric: %s' % self.metric)
570 | self._metric = _fitness_map[self.metric]
571 |
572 |         # Check the genetic-operator probabilities
573 |         # TODO: revise this once additional crossover/mutation methods are added
574 | self._method_probs = np.array([self.p_crossover,
575 | self.p_subtree_mutation,
576 | self.p_hoist_mutation,
577 | self.p_point_mutation])
578 | self._method_probs = np.cumsum(self._method_probs)
579 | if self._method_probs[-1] > 1:
580 | raise ValueError('The sum of p_crossover, p_subtree_mutation, '
581 | 'p_hoist_mutation and p_point_mutation should '
582 | 'total to 1.0 or less.')
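        # These cumulative probabilities are later compared with a uniform draw
        # in [0, 1): a draw below method_probs[0] selects crossover, below
        # method_probs[1] subtree mutation, and so on; any remaining mass above
        # method_probs[-1] falls through to plain reproduction.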
583 |
584 |         # Check the initialization method
585 | if self.init_method not in ('half and half', 'grow', 'full'):
586 |             raise ValueError('Valid program initialization methods include '
587 | '"grow", "full" and "half and half". Given %s.'
588 | % self.init_method)
589 |
590 |         # Check the initialization depth
591 | if (not isinstance(self.init_depth, tuple) or
592 | len(self.init_depth) != 2):
593 | raise ValueError('init_depth should be a tuple with length two.')
594 | if self.init_depth[0] > self.init_depth[1]:
595 | raise ValueError('init_depth should be in increasing numerical '
596 | 'order: (min_depth, max_depth).')
597 |
598 |         # Initialize the transformer function
599 | if self.transformer is not None:
600 | if isinstance(self.transformer, _Function):
601 | self._transformer = self.transformer
602 | elif self.transformer == 'sigmoid':
603 | self._transformer = sigmoid
604 | else:
605 | raise ValueError('Invalid `transformer`. Expected either '
606 | '"sigmoid" or _Function object, got %s' %
607 | type(self.transformer))
608 | if self._transformer.arity != 1:
609 | raise ValueError('Invalid arity for `transformer`. Expected 1, '
610 | 'got %d.' % (self._transformer.arity))
611 |
612 | params = self.get_params()
613 | params['_metric'] = self._metric
614 | if hasattr(self, '_transformer'):
615 | params['_transformer'] = self._transformer
616 | else:
617 | params['_transformer'] = None
618 | params['function_dict'] = self._function_dict
619 | params['arities'] = self._arities
620 | params['method_probs'] = self._method_probs
621 | params['cat_var_number'] = len(self.category_features) if self.category_features is not None else 0
622 |
623 |         # Reset _programs
624 | if not self.warm_start or not hasattr(self, '_programs'):
625 | # Free allocated memory, if any
626 | self._programs = []
627 | self.run_details_ = {'generation': [],
628 | 'average_length': [],
629 | 'average_fitness': [],
630 | 'best_length': [],
631 | 'best_fitness': [],
632 | 'best_oob_fitness': [],
633 | 'generation_time': []}
634 |
635 | prior_generations = len(self._programs)
636 | n_more_generations = self.generations - prior_generations
637 |
638 | if n_more_generations < 0:
639 | raise ValueError('generations=%d must be larger or equal to '
640 | 'len(_programs)=%d when warm_start==True'
641 | % (self.generations, len(self._programs)))
642 | elif n_more_generations == 0:
643 | fitness = [program.raw_fitness_ for program in self._programs[-1]]
644 |             warn('Warm-start fitting without increasing generations does not '
645 |                  'fit new programs.')
646 |
647 | if self.warm_start:
648 | # Generate and discard seeds that would have been produced on the
649 | # initial fit call.
650 | for i in range(len(self._programs)):
651 | _ = random_state.randint(MAX_INT, size=self.population_size)
652 |
653 | if self.verbose:
654 | # Print header fields
655 | self._verbose_reporter()
656 |
657 | for gen in range(prior_generations, self.generations):
658 | start_time = time()
659 |
660 | if gen == 0:
661 | parents = None
662 | else:
663 |                 parents = self._programs[gen - 1]
670 | # Parallel loop
671 |             # Partition population_size across n_jobs processes
672 | n_jobs, n_programs, starts = _partition_estimators(self.population_size, self.n_jobs)
673 | seeds = random_state.randint(MAX_INT, size=self.population_size)
674 |
675 | population = Parallel(n_jobs=n_jobs,
676 | verbose=int(self.verbose > 1))(
677 | delayed(_parallel_evolve)(n_programs[i],
678 | parents,
679 | X,
680 | y,
681 | security_data,
682 | time_series_data,
683 | sample_weight,
684 | seeds[starts[i]:starts[i + 1]],
685 | params)
686 | for i in range(n_jobs))
687 |
688 | # Reduce, maintaining order across different n_jobs
689 | population = list(itertools.chain.from_iterable(population))
690 |
691 | fitness = [program.raw_fitness_ for program in population]
692 | length = [program.length_ for program in population]
693 |
694 |             # Parsimony (bloat-penalty) coefficient
695 | parsimony_coefficient = None
696 | if self.parsimony_coefficient == 'auto':
697 | parsimony_coefficient = (np.cov(length, fitness)[1, 0] /
698 | np.var(length))
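                # Covariant parsimony pressure: c = Cov(length, fitness) / Var(length),
                # i.e. the regression slope of fitness on length, so the penalty
                # scales with how strongly program size currently predicts fitness.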
699 | for program in population:
700 | program.fitness_ = program.fitness(parsimony_coefficient)
701 |
702 | self._programs.append(population)
703 |
704 |             # Drop parent programs that did not contribute to the next generation
705 | if not self.low_memory:
706 | for old_gen in np.arange(gen, 0, -1):
707 | indices = []
708 | for program in self._programs[old_gen]:
709 | if program is not None:
710 | for idx in program.parents:
711 | if 'idx' in idx:
712 | indices.append(program.parents[idx])
713 | indices = set(indices)
714 | for idx in range(self.population_size):
715 | if idx not in indices:
716 | self._programs[old_gen - 1][idx] = None
717 | elif gen > 0:
718 |                 # In low_memory mode, drop the entire previous generation
719 | self._programs[gen - 1] = None
720 |
721 |             # Record run details
722 | if self._metric.greater_is_better:
723 | best_program = population[np.argmax(fitness)]
724 | else:
725 | best_program = population[np.argmin(fitness)]
726 |
727 | self.run_details_['generation'].append(gen)
728 | self.run_details_['average_length'].append(np.mean(length))
729 | self.run_details_['average_fitness'].append(np.mean(fitness))
730 | self.run_details_['best_length'].append(best_program.length_)
731 | self.run_details_['best_fitness'].append(best_program.raw_fitness_)
732 | oob_fitness = np.nan
733 | if self.max_samples < 1.0:
734 | oob_fitness = best_program.oob_fitness_
735 | self.run_details_['best_oob_fitness'].append(oob_fitness)
736 | generation_time = time() - start_time
737 | self.run_details_['generation_time'].append(generation_time)
738 |
739 | if self.verbose:
740 | self._verbose_reporter(self.run_details_)
741 |
742 |             # Check the stopping criteria
743 | if self._metric.greater_is_better:
744 | best_fitness = fitness[np.argmax(fitness)]
745 | if best_fitness >= self.stopping_criteria:
746 | break
747 | else:
748 | best_fitness = fitness[np.argmin(fitness)]
749 | if best_fitness <= self.stopping_criteria:
750 | break
751 |
752 |         # Feature-engineering (Transformer-only) block
753 | if isinstance(self, TransformerMixin):
754 | # Find the best individuals in the final generation
755 | fitness = np.array(fitness)
756 |             # Select the hall_of_fame programs with the best fitness
757 | if self._metric.greater_is_better:
758 | hall_of_fame = fitness.argsort()[::-1][:self.hall_of_fame]
759 | else:
760 | hall_of_fame = fitness.argsort()[:self.hall_of_fame]
761 | evaluation = np.array([gp.execute(X) for gp in
762 | [self._programs[-1][i] for
763 | i in hall_of_fame]])
764 | if self.metric == 'spearman':
765 | evaluation = np.apply_along_axis(rankdata, 1, evaluation)
766 |
767 | with np.errstate(divide='ignore', invalid='ignore'):
768 | correlations = np.abs(np.corrcoef(evaluation))
769 | np.fill_diagonal(correlations, 0.)
770 | components = list(range(self.hall_of_fame))
771 | indices = list(range(self.hall_of_fame))
772 | # Iteratively remove least fit individual of most correlated pair
773 | while len(components) > self.n_components:
774 |                     # Remove hall_of_fame - n_components highly correlated features:
775 |                     # find the most correlated pair and drop its lower-fitness member;
776 |                     # below a correlation threshold, select by fitness alone (gplearnplus addition)
777 | most_correlated = np.unravel_index(np.argmax(correlations),
778 | correlations.shape)
779 | # The correlation matrix is sorted by fitness, so identifying
780 | # the least fit of the pair is simply getting the higher index
781 | worst = max(most_correlated)
782 | components.pop(worst)
783 | indices.remove(worst)
784 | correlations = correlations[:, indices][indices, :]
785 | if np.max(correlations) < self.tolerable_corr:
786 | break
787 | indices = list(range(len(components)))
788 |             # Keep the best self.n_components of the remaining programs
789 | components = components[:self.n_components]
790 | self._best_programs = [self._programs[-1][i] for i in
791 | hall_of_fame[components]]
792 |
793 | else:
794 | # Find the best individual in the final generation
795 | if self._metric.greater_is_better:
796 | self._program = self._programs[-1][np.argmax(fitness)]
797 | else:
798 | self._program = self._programs[-1][np.argmin(fitness)]
799 |
800 | return self
801 |
802 |
803 | class SymbolicRegressor(BaseSymbolic, RegressorMixin):
804 | def __init__(self,
805 | *,
806 | population_size=1000,
807 | generations=20,
808 | tournament_size=20,
809 | stopping_criteria=0.0,
810 | const_range=(-1., 1.),
811 | init_depth=(2, 6),
812 | init_method='half and half',
813 | function_set=('add', 'sub', 'mul', 'div'),
814 | metric='mean absolute error',
815 | parsimony_coefficient=0.001,
816 | p_crossover=0.9,
817 | p_subtree_mutation=0.01,
818 | p_hoist_mutation=0.01,
819 | p_point_mutation=0.01,
820 | p_point_replace=0.05,
821 | max_samples=1.0,
822 | feature_names=None,
823 | time_series_index=None,
824 | security_index=None,
825 | category_features=None,
826 | warm_start=False,
827 | low_memory=False,
828 | n_jobs=1,
829 | verbose=0,
830 | data_type='section',
831 | random_state=None):
832 | super(SymbolicRegressor, self).__init__(
833 | population_size=population_size,
834 | generations=generations,
835 | tournament_size=tournament_size,
836 | stopping_criteria=stopping_criteria,
837 | const_range=const_range,
838 | init_depth=init_depth,
839 | init_method=init_method,
840 | function_set=function_set,
841 | metric=metric,
842 | parsimony_coefficient=parsimony_coefficient,
843 | p_crossover=p_crossover,
844 | p_subtree_mutation=p_subtree_mutation,
845 | p_hoist_mutation=p_hoist_mutation,
846 | p_point_mutation=p_point_mutation,
847 | p_point_replace=p_point_replace,
848 | max_samples=max_samples,
849 | feature_names=feature_names,
850 | time_series_index=time_series_index,
851 | security_index=security_index,
852 | category_features=category_features,
853 | warm_start=warm_start,
854 | low_memory=low_memory,
855 | n_jobs=n_jobs,
856 | verbose=verbose,
857 | random_state=random_state,
858 | data_type=data_type)
859 |
860 | def __str__(self):
861 | """Overloads `print` output of the object to resemble a LISP tree."""
862 | if not hasattr(self, '_program'):
863 | return self.__repr__()
864 | return self._program.__str__()
865 |
866 | def predict(self, X):
867 | """Perform regression on test vectors X.
868 |
869 | Parameters
870 | ----------
871 | X : array-like, shape = [n_samples, n_features]
872 | Input vectors, where n_samples is the number of samples
873 | and n_features is the number of features.
874 |
875 | Returns
876 | -------
877 | y : array, shape = [n_samples]
878 | Predicted values for X.
879 |
880 | """
881 | if not hasattr(self, '_program'):
882 | raise NotFittedError('SymbolicRegressor not fitted.')
883 |
884 | X = check_array(X)
885 | _, n_features = X.shape
886 | if self.n_features_in_ != n_features:
887 | raise ValueError('Number of features of the model must match the '
888 | 'input. Model n_features is %s and input '
889 | 'n_features is %s.'
890 | % (self.n_features_in_, n_features))
891 |
892 | y = self._program.execute(X)
893 |
894 | return y
895 |
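# A minimal usage sketch for SymbolicRegressor (editorial addition; `X` and `y`
# stand for hypothetical numeric training arrays):
#
#     reg = SymbolicRegressor(population_size=500, generations=10, random_state=0)
#     reg.fit(X, y)
#     print(reg)               # best program rendered as a LISP-style tree
#     y_pred = reg.predict(X)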
896 |
897 | class SymbolicClassifier(BaseSymbolic, ClassifierMixin):
898 | def __init__(self,
899 | *,
900 | population_size=1000,
901 | generations=20,
902 | tournament_size=20,
903 | stopping_criteria=0.0,
904 | const_range=(-1., 1.),
905 | init_depth=(2, 6),
906 | init_method='half and half',
907 | function_set=('add', 'sub', 'mul', 'div'),
908 | transformer='sigmoid',
909 | metric='log loss',
910 | parsimony_coefficient=0.001,
911 | p_crossover=0.9,
912 | p_subtree_mutation=0.01,
913 | p_hoist_mutation=0.01,
914 | p_point_mutation=0.01,
915 | p_point_replace=0.05,
916 | max_samples=1.0,
917 | class_weight=None,
918 | feature_names=None,
919 | time_series_index=None,
920 | security_index=None,
921 | category_features=None,
922 | warm_start=False,
923 | low_memory=False,
924 | n_jobs=1,
925 | verbose=0,
926 | data_type='section',
927 | random_state=None):
928 | super(SymbolicClassifier, self).__init__(
929 | population_size=population_size,
930 | generations=generations,
931 | tournament_size=tournament_size,
932 | stopping_criteria=stopping_criteria,
933 | const_range=const_range,
934 | init_depth=init_depth,
935 | init_method=init_method,
936 | function_set=function_set,
937 | transformer=transformer,
938 | metric=metric,
939 | parsimony_coefficient=parsimony_coefficient,
940 | p_crossover=p_crossover,
941 | p_subtree_mutation=p_subtree_mutation,
942 | p_hoist_mutation=p_hoist_mutation,
943 | p_point_mutation=p_point_mutation,
944 | p_point_replace=p_point_replace,
945 | max_samples=max_samples,
946 | class_weight=class_weight,
947 | feature_names=feature_names,
948 | time_series_index=time_series_index,
949 | security_index=security_index,
950 | category_features=category_features,
951 | warm_start=warm_start,
952 | low_memory=low_memory,
953 | n_jobs=n_jobs,
954 | verbose=verbose,
955 | data_type=data_type,
956 | random_state=random_state)
957 |
958 | def __str__(self):
959 | """Overloads `print` output of the object to resemble a LISP tree."""
960 | if not hasattr(self, '_program'):
961 | return self.__repr__()
962 | return self._program.__str__()
963 |
964 | def _more_tags(self):
965 | return {'binary_only': True}
966 |
967 | def predict_proba(self, X):
968 |         # Output class probabilities; binary classification only
969 | if not hasattr(self, '_program'):
970 | raise NotFittedError('SymbolicClassifier not fitted.')
971 |
972 | X = check_array(X)
973 | _, n_features = X.shape
974 | if self.n_features_in_ != n_features:
975 | raise ValueError('Number of features of the model must match the '
976 | 'input. Model n_features is %s and input '
977 | 'n_features is %s.'
978 | % (self.n_features_in_, n_features))
979 |
980 | scores = self._program.execute(X)
981 | proba = self._transformer(scores)
982 | proba = np.vstack([1 - proba, proba]).T
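        # With the default sigmoid transformer, P(class 1) = 1 / (1 + exp(-score));
        # stacking yields one [P(class 0), P(class 1)] row per sample.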
983 | return proba
984 |
985 | def predict(self, X):
986 |         # Output predicted class labels
987 | proba = self.predict_proba(X)
988 | return self.classes_.take(np.argmax(proba, axis=1), axis=0)
989 |
990 |
991 | class SymbolicTransformer(BaseSymbolic, TransformerMixin):
992 | def __init__(self,
993 | *,
994 | population_size=1000,
995 | hall_of_fame=100,
996 | n_components=10,
997 | generations=20,
998 | tournament_size=20,
999 | stopping_criteria=1.0,
1000 | const_range=(-1., 1.),
1001 | init_depth=(2, 6),
1002 | init_method='half and half',
1003 | function_set=('add', 'sub', 'mul', 'div'),
1004 | metric='pearson',
1005 | parsimony_coefficient=0.001,
1006 | p_crossover=0.9,
1007 | p_subtree_mutation=0.01,
1008 | p_hoist_mutation=0.01,
1009 | p_point_mutation=0.01,
1010 | p_point_replace=0.05,
1011 | max_samples=1.0,
1012 | tolerable_corr=0.0,
1013 | feature_names=None,
1014 | time_series_index=None,
1015 | security_index=None,
1016 | category_features=None,
1017 | warm_start=False,
1018 | low_memory=False,
1019 | n_jobs=1,
1020 | verbose=0,
1021 | data_type='section',
1022 | random_state=None):
1023 | super(SymbolicTransformer, self).__init__(
1024 | population_size=population_size,
1025 | hall_of_fame=hall_of_fame,
1026 | n_components=n_components,
1027 | generations=generations,
1028 | tournament_size=tournament_size,
1029 | stopping_criteria=stopping_criteria,
1030 | const_range=const_range,
1031 | init_depth=init_depth,
1032 | init_method=init_method,
1033 | function_set=function_set,
1034 | metric=metric,
1035 | parsimony_coefficient=parsimony_coefficient,
1036 | p_crossover=p_crossover,
1037 | p_subtree_mutation=p_subtree_mutation,
1038 | p_hoist_mutation=p_hoist_mutation,
1039 | p_point_mutation=p_point_mutation,
1040 | p_point_replace=p_point_replace,
1041 | max_samples=max_samples,
1042 | tolerable_corr=tolerable_corr,
1043 | feature_names=feature_names,
1044 | time_series_index=time_series_index,
1045 | security_index=security_index,
1046 | category_features=category_features,
1047 | warm_start=warm_start,
1048 | low_memory=low_memory,
1049 | n_jobs=n_jobs,
1050 | verbose=verbose,
1051 | data_type=data_type,
1052 | random_state=random_state)
1053 |
1054 | def __len__(self):
1055 | """Overloads `len` output to be the number of fitted components."""
1056 | if not hasattr(self, '_best_programs'):
1057 | return 0
1058 | return self.n_components
1059 |
1060 | def __getitem__(self, item):
1061 | """Return the ith item of the fitted components."""
1062 | if item >= len(self):
1063 | raise IndexError
1064 | return self._best_programs[item]
1065 |
1066 | def __str__(self):
1067 | """Overloads `print` output of the object to resemble LISP trees."""
1068 | if not hasattr(self, '_best_programs'):
1069 | return self.__repr__()
1070 | output = str([gp.__str__() for gp in self])
1071 | return output.replace("',", ",\n").replace("'", "")
1072 |
1073 | def _more_tags(self):
1074 | return {
1075 | "_xfail_checks": {
1076 | "check_sample_weights_invariance": (
1077 | "zero sample_weight is not equivalent to removing samples"
1078 | ),
1079 | }
1080 | }
1081 |
1082 | def transform(self, X):
1083 |         # Transform X with the already-trained programs
1084 | if not hasattr(self, '_best_programs'):
1085 | raise NotFittedError('SymbolicTransformer not fitted.')
1086 |
1087 | X = check_array(X)
1088 | _, n_features = X.shape
1089 | if self.n_features_in_ != n_features:
1090 | raise ValueError('Number of features of the model must match the '
1091 | 'input. Model n_features is %s and input '
1092 | 'n_features is %s.'
1093 | % (self.n_features_in_, n_features))
1094 |
1095 | X_new = np.array([gp.execute(X) for gp in self._best_programs]).T
1096 |
1097 | return X_new
1098 |
1099 | def fit_transform(self, X, y, sample_weight=None):
1100 |         # Fit, then transform
1101 | return self.fit(X, y, sample_weight).transform(X)
1102 |
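# A minimal end-to-end sketch (editorial addition, not part of the library);
# the data, shapes and parameter values below are hypothetical and chosen
# purely for illustration.
if __name__ == '__main__':
    import numpy as np

    rng = np.random.RandomState(0)
    X_demo = rng.uniform(-1., 1., (200, 3))
    y_demo = X_demo[:, 0] * X_demo[:, 1] - X_demo[:, 2]

    # Evolve a handful of mutually decorrelated features on sectional data
    # with the default ('add', 'sub', 'mul', 'div') function set.
    trans = SymbolicTransformer(population_size=300, generations=5,
                                hall_of_fame=50, n_components=5,
                                random_state=0)
    X_new = trans.fit_transform(X_demo, y_demo)
    print(X_new.shape)  # expected: (200, 5)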
1103 |
--------------------------------------------------------------------------------