├── readme_pic
    └── tree.png
├── __init__.py
├── utils.py
├── fitness.py
├── data_trans.py
├── README.md
├── .idea
    └── workspace.xml
├── functions.py
├── example.py
├── _program.py
└── genetic.py


/readme_pic/tree.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ACEACEjasonhuang/gplearnplus/HEAD/readme_pic/tree.png


--------------------------------------------------------------------------------
/__init__.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | -------------------------------------------------
 4 | # @Project  :gplearnplus 
 5 | # @File     :__init__.py
 6 | # @Date     :2022/12/1 0001 13:36 
 7 | # @Author   :Junzhe Huang
 8 | # @Email    :acejasonhuang@163.com
 9 | # @Software :PyCharm
10 | -------------------------------------------------
11 | """
# Version string of the package.
__version__ = '1.5.9'

# Names made available by a star-import of the package.
__all__ = ['genetic', 'functions', 'fitness', 'example']
15 | 


--------------------------------------------------------------------------------
/utils.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | -------------------------------------------------
 4 | # @Project  :gplearnplus 
 5 | # @File     :utils
 6 | # @Date     :2022/12/1 0001 13:38 
 7 | # @Author   :Junzhe Huang
 8 | # @Email    :acejasonhuang@163.com
 9 | # @Software :PyCharm
10 | -------------------------------------------------
11 | """
12 | 
13 | import numbers
14 | 
15 | import numpy as np
16 | from joblib import cpu_count
17 | 
18 | 
# Convert ``seed`` into an np.random.RandomState object.
def check_random_state(seed):
    """Turn seed into a np.random.RandomState instance.

    Parameters
    ----------
    seed : None | int | instance of RandomState
        If seed is None (or the ``np.random`` module itself), return the
        RandomState singleton used by np.random.
        If seed is an int, return a new RandomState instance seeded with seed.
        If seed is already a RandomState instance, return it.

    Returns
    -------
    random_state : np.random.RandomState

    Raises
    ------
    ValueError
        If seed is none of the accepted kinds.

    """
    if seed is None or seed is np.random:
        return np.random.mtrand._rand
    if isinstance(seed, (numbers.Integral, np.integer)):
        return np.random.RandomState(seed)
    if isinstance(seed, np.random.RandomState):
        return seed
    # Wrap seed in a 1-tuple: a bare tuple on the right-hand side of ``%`` is
    # unpacked as the argument list, so e.g. seed=(1, 2) used to fail with a
    # TypeError instead of the documented ValueError.
    raise ValueError('%r cannot be used to seed a numpy.random.RandomState'
                     ' instance' % (seed,))
40 | 
41 | 
42 | # n_jobs转换 -1为全部
43 | def _get_n_jobs(n_jobs):
44 |     """Get number of jobs for the computation.
45 | 
46 |     This function reimplements the logic of joblib to determine the actual
47 |     number of jobs depending on the cpu count. If -1 all CPUs are used.
48 |     If 1 is given, no parallel computing code is used at all, which is useful
49 |     for debugging. For n_jobs below -1, (n_cpus + 1 + n_jobs) are used.
50 |     Thus for n_jobs = -2, all CPUs but one are used.
51 | 
52 |     Parameters
53 |     ----------
54 |     n_jobs : int
55 |         Number of jobs stated in joblib convention.
56 | 
57 |     Returns
58 |     -------
59 |     n_jobs : int
60 |         The actual number of jobs as positive integer.
61 | 
62 |     """
63 |     if n_jobs < 0:
64 |         return max(cpu_count() + 1 + n_jobs, 1)
65 |     elif n_jobs == 0:
66 |         raise ValueError('Parameter n_jobs == 0 has no meaning.')
67 |     else:
68 |         return n_jobs
69 | 
70 | 
# Spread the estimators over the jobs.
# Returns: number of jobs, estimators per job, cumulative estimator counts.
def _partition_estimators(n_estimators, n_jobs):
    """Private function used to partition estimators between jobs.

    Returns a 3-tuple: the number of jobs actually used (never more than
    ``n_estimators``), a list with the number of estimators handled by each
    job, and the list of cumulative counts prefixed with 0.
    """
    # No point in starting more jobs than there are estimators.
    n_jobs = min(_get_n_jobs(n_jobs), n_estimators)

    # Every job gets the same base share; the first ``leftover`` jobs take
    # one extra estimator each.
    share, leftover = divmod(n_estimators, n_jobs)
    per_job = np.full(n_jobs, share, dtype=int)
    per_job[:leftover] += 1

    boundaries = [0]
    boundaries.extend(np.cumsum(per_job).tolist())
    return n_jobs, per_job.tolist(), boundaries
86 | 


--------------------------------------------------------------------------------
/fitness.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | """
  3 | -------------------------------------------------
  4 | # @Project  :gplearnplus 
  5 | # @File     :fitness
  6 | # @Date     :2022/12/5 0005 7:25 
  7 | # @Author   :Junzhe Huang
  8 | # @Email    :acejasonhuang@163.com
  9 | # @Software :PyCharm
 10 | -------------------------------------------------
 11 | """
 12 | 
 13 | import numbers
 14 | 
 15 | import numpy as np
 16 | from joblib import wrap_non_picklable_objects
 17 | from scipy.stats import rankdata
 18 | 
 19 | __all__ = ['make_fitness']
 20 | 
 21 | 
 22 | class _Fitness(object):
 23 | 
 24 |     """A metric to measure the fitness of a program.
 25 | 
 26 |     This object is able to be called with NumPy vectorized arguments and return
 27 |     a resulting floating point score quantifying the quality of the program's
 28 |     representation of the true relationship.
 29 | 
 30 |     Parameters
 31 |     ----------
 32 |     function : callable
 33 |         A function with signature function(y, y_pred, sample_weight) that
 34 |         returns a floating point number. Where `y` is the input target y
 35 |         vector, `y_pred` is the predicted values from the genetic program, and
 36 |         sample_weight is the sample_weight vector.
 37 | 
 38 |     greater_is_better : bool
 39 |         Whether a higher value from `function` indicates a better fit. In
 40 |         general this would be False for metrics indicating the magnitude of
 41 |         the error, and True for metrics indicating the quality of fit.
 42 | 
 43 |     """
 44 | 
 45 |     def __init__(self, function, greater_is_better):
 46 |         self.function = function
 47 |         self.greater_is_better = greater_is_better
 48 |         self.sign = 1 if greater_is_better else -1
 49 | 
 50 |     def __call__(self, *args):
 51 |         return self.function(*args)
 52 | 
 53 | 
 54 | def make_fitness(*, function, greater_is_better, wrap=True):
 55 |     """Make a fitness measure, a metric scoring the quality of a program's fit.
 56 | 
 57 |     This factory function creates a fitness measure object which measures the
 58 |     quality of a program's fit and thus its likelihood to undergo genetic
 59 |     operations into the next generation. The resulting object is able to be
 60 |     called with NumPy vectorized arguments and return a resulting floating
 61 |     point score quantifying the quality of the program's representation of the
 62 |     true relationship.
 63 | 
 64 |     Parameters
 65 |     ----------
 66 |     function : callable
 67 |         A function with signature function(y, y_pred, sample_weight) that
 68 |         returns a floating point number. Where `y` is the input target y
 69 |         vector, `y_pred` is the predicted values from the genetic program, and
 70 |         sample_weight is the sample_weight vector.
 71 | 
 72 |     greater_is_better : bool
 73 |         Whether a higher value from `function` indicates a better fit. In
 74 |         general this would be False for metrics indicating the magnitude of
 75 |         the error, and True for metrics indicating the quality of fit.
 76 | 
 77 |     wrap : bool, optional (default=True)
 78 |         When running in parallel, pickling of custom metrics is not supported
 79 |         by Python's default pickler. This option will wrap the function using
 80 |         cloudpickle allowing you to pickle your solution, but the evolution may
 81 |         run slightly more slowly. If you are running single-threaded in an
 82 |         interactive Python session or have no need to save the model, set to
 83 |         `False` for faster runs.
 84 | 
 85 |     """
 86 |     if not isinstance(greater_is_better, bool):
 87 |         raise ValueError('greater_is_better must be bool, got %s'
 88 |                          % type(greater_is_better))
 89 |     if not isinstance(wrap, bool):
 90 |         raise ValueError('wrap must be an bool, got %s' % type(wrap))
 91 |     if function.__code__.co_argcount != 3:
 92 |         raise ValueError('function requires 3 arguments (y, y_pred, w),'
 93 |                          ' got %d.' % function.__code__.co_argcount)
 94 |     if not isinstance(function(np.array([1, 1]),
 95 |                       np.array([2, 2]),
 96 |                       np.array([1, 1])), numbers.Number):
 97 |         raise ValueError('function must return a numeric.')
 98 | 
 99 |     if wrap:
100 |         return _Fitness(function=wrap_non_picklable_objects(function),
101 |                         greater_is_better=greater_is_better)
102 |     return _Fitness(function=function,
103 |                     greater_is_better=greater_is_better)
104 | 
105 | 
106 | def _weighted_pearson(y, y_pred, w):
107 |     """Calculate the weighted Pearson correlation coefficient."""
108 |     with np.errstate(divide='ignore', invalid='ignore'):
109 |         y_pred_demean = y_pred - np.average(y_pred, weights=w)
110 |         y_demean = y - np.average(y, weights=w)
111 |         corr = ((np.sum(w * y_pred_demean * y_demean) / np.sum(w)) /
112 |                 np.sqrt((np.sum(w * y_pred_demean ** 2) *
113 |                          np.sum(w * y_demean ** 2)) /
114 |                         (np.sum(w) ** 2)))
115 |     if np.isfinite(corr):
116 |         return np.abs(corr)
117 |     return 0.
118 | 
119 | 
def _weighted_spearman(y, y_pred, w):
    """Calculate the weighted Spearman correlation coefficient.

    Both inputs are replaced by their ranks along axis 0 and the weighted
    Pearson correlation of the ranks is returned.
    """
    ranked_pred, ranked_true = (
        np.apply_along_axis(rankdata, 0, values) for values in (y_pred, y)
    )
    return _weighted_pearson(ranked_pred, ranked_true, w)
125 | 
126 | 
127 | def _mean_absolute_error(y, y_pred, w):
128 |     """Calculate the mean absolute error."""
129 |     return np.average(np.abs(y_pred - y), weights=w)
130 | 
131 | 
132 | def _mean_square_error(y, y_pred, w):
133 |     """Calculate the mean square error."""
134 |     return np.average(((y_pred - y) ** 2), weights=w)
135 | 
136 | 
137 | def _root_mean_square_error(y, y_pred, w):
138 |     """Calculate the root mean square error."""
139 |     return np.sqrt(np.average(((y_pred - y) ** 2), weights=w))
140 | 
141 | 
142 | def _log_loss(y, y_pred, w):
143 |     """Calculate the log loss."""
144 |     eps = 1e-15
145 |     inv_y_pred = np.clip(1 - y_pred, eps, 1 - eps)
146 |     y_pred = np.clip(y_pred, eps, 1 - eps)
147 |     score = y * np.log(y_pred) + (1 - y) * np.log(inv_y_pred)
148 |     return np.average(-score, weights=w)
149 | 
150 | 
# Ready-made fitness measures. The boolean is greater_is_better: True for the
# correlation metrics, False for the error metrics.
weighted_pearson = _Fitness(_weighted_pearson, True)
weighted_spearman = _Fitness(_weighted_spearman, True)
mean_absolute_error = _Fitness(_mean_absolute_error, False)
mean_square_error = _Fitness(_mean_square_error, False)
root_mean_square_error = _Fitness(_root_mean_square_error, False)
log_loss = _Fitness(_log_loss, False)

# Lookup table from metric name to the corresponding fitness measure.
_fitness_map = {
    'pearson': weighted_pearson,
    'spearman': weighted_spearman,
    'mean absolute error': mean_absolute_error,
    'mse': mean_square_error,
    'rmse': root_mean_square_error,
    'log loss': log_loss,
}


--------------------------------------------------------------------------------
/data_trans.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | # @Time     : 2024/2/7
  3 | # @Author   : Junzhe Huang
  4 | # @Email    : huangjz01@igoldenbeta.com
  5 | # @File     : data_trans
  6 | # @Software : gplearnplus
  7 | import pandas as pd
  8 | 
  9 | 
 10 | # todo 移植部分
 11 | def data_transform(X, y, data_type, number_feature_list, category_feature_list=None,
 12 |                    security_index=None, time_series_index=None):
 13 |     # 检查数据类型
 14 |     if data_type not in ('section', 'time_series', 'panel'):
 15 |         raise ValueError('Valid data_type methods include '
 16 |                          '"section", "time_series" and "panel". Given %s.'
 17 |                          % data_type)
 18 | 
 19 |     # X必须为pd.DataFrame
 20 |     if not isinstance(X, pd.DataFrame):
 21 |         raise ValueError('Data structure must be DataFrame')
 22 | 
 23 |     # 验证y的长度是否与X相同
 24 |     if len(X) != len(y):
 25 |         raise ValueError('X and y must have same length')
 26 | 
 27 |     # 检查column 是否包含category_feature_list 和 number_feature_list
 28 |     # 将category_feature_list 调整至前 number_feature_list 调整至后
 29 |     # 找出X的columns与category_feature_list的交集列表
 30 |     if category_feature_list is not None:
 31 |         if not isinstance(category_feature_list, list):
 32 |             raise ValueError('category_feature_list must be list')
 33 |         category_feature_list_inX = [col for col in X.columns if col not in category_feature_list]
 34 |     else:
 35 |         category_feature_list_inX = []
 36 |     # 找出X的columns与number_feature_list的交集列表
 37 |     if not isinstance(number_feature_list, list):
 38 |         raise ValueError('number_feature_list must be list')
 39 |     number_feature_list_inX = [col for col in X.columns if col not in number_feature_list]
 40 |     # 重构顺序，将分类类型放在前面, 并把第一列设为常数1，column为 const_1
 41 |     X['const_1'] = 1
 42 |     feature_names = category_feature_list_inX + number_feature_list_inX
 43 |     X_trans = X[['const_1'] + feature_names].copy()
 44 | 
 45 |     # 若存在security_index和time_series_index，插入X_trans最后，默认先插入security_index再插入time_series_index
 46 |     if security_index is not None:
 47 |         # 若security_index在X的columns中，或者为X.index，将其插入到X_trans最后
 48 |         if security_index in X.columns:
 49 |             X_trans[security_index] = X[security_index]
 50 |         elif X.index.name == security_index:
 51 |             X_trans[security_index] = X.index.get_level_values(security_index)
 52 |         else:
 53 |             # 若security_index不在X_trans的columns中，也不再index中，报错
 54 |             raise ValueError('Can not fund security_index {} in both columns and index'
 55 |                              .format(security_index))
 56 |    if time_series_index is not None:
 57 |         # 若time_series_index在X的columns
 58 |         if time_series_index in X.columns:
 59 |             X_trans[time_series_index] = X[time_series_index]
 60 |         elif X.index.name == time_series_index:
 61 | 
 62 | 
 63 | 
 64 |     # 检查时间index和个股index， 对于截面，时序和面板数据分别检查
 65 |     if data_type == 'section':
 66 |         if time_series_index is not None:
 67 |             raise ValueError('For Section Data, time_series_index should be None')
 68 |         if security_index is not None:
 69 |             # 在index和columns中寻找security_index
 70 |             # 判断是否有重复个股
 71 |             if len(X[security_index].unique()) < len(X[security_index]):
 72 |                 raise ValueError('For Section Data, security data should be unique')
 73 |     elif data_type == 'time_series':
 74 |         if security_index is not None:
 75 |             raise ValueError('For time_series Data, security_index should be None')
 76 |         if time_series_index is not None:
 77 |             # 在index和columns中寻找time_series_index
 78 |             if time_series_index not in X.columns and \
 79 |                     (X.index.name is None or time_series_index not in X.index.name):
 80 |                 raise ValueError('Can not fund time_series_index {} in both columns and index'
 81 |                                  .format(time_series_index))
 82 |             elif time_series_index in X.columns:
 83 |                 X.set_index(time_series_index, inplace=True)
 84 |             # 判断是否有重复时间
 85 |             if len(X.index.drop_duplicates()) < len(X):
 86 |                 raise ValueError('For time_series Data, time_series data should be unique')
 87 |             X_combine = X.copy()
 88 |             X_combine['_label'] = y.values if isinstance(y, pd.Series) else y
 89 |             X_combine.sort_index(inplace=True)
 90 |             X, y = X_combine.loc[:, self.feature_names], X_combine.loc[:, '_label']
 91 |             # debug
 92 | 
 93 |             time_series_data = X.index.values
 94 | 
 95 |     else:
 96 |         if self.time_series_index is None:
 97 |             raise ValueError('For panel Data, time_series_index should NOT be None')
 98 |         if self.security_index is None:
 99 |             raise ValueError('For panel Data, security_index should NOT be None')
100 | 
101 |         # security time_series 进入index
102 |         if self.time_series_index not in X.columns and \
103 |                 (X.index.name is None or self.time_series_index not in X.index.name):
104 |             raise ValueError('Can not fund time_series_index {} in both columns and index'
105 |                              .format(self.time_series_index))
106 |         elif self.security_index not in X.columns and \
107 |                 (X.index.name is None or self.security_index not in X.index.name):
108 |             raise ValueError('Can not fund security_index {} in both columns and index'
109 |                              .format(self.security_index))
110 |         elif self.time_series_index in X.columns and self.security_index in X.columns:
111 |             X.set_index([self.time_series_index, self.security_index], inplace=True)
112 |         elif self.time_series_index in X.columns:
113 |             X.set_index(self.security_index, inplace=True, append=True)
114 |         elif self.security_index in X.columns:
115 |             X.set_index(self.time_series_index, inplace=True, append=True)
116 | 
117 |         # 判断没有重复
118 |         if len(X.index) != len(X.index.drop_duplicates()):
119 |             raise ValueError('For time_series Data, time_series data should be unique')
120 | 
121 | 
122 |         X_combine = X.copy()
123 |         X_combine['_label'] = y.values if isinstance(y, pd.Series) else y
124 |         X_combine.sort_index(inplace=True)
125 |         X, y = X_combine.loc[:, self.feature_names], X_combine.loc[:, '_label']
126 |         time_series_data = X.index.get_level_values(self.time_series_index).values
127 |         security_data = X.index.get_level_values(self.security_index).values
128 | 
129 |     # 检查category_features是否与全包含在feature_names中
130 |     # 当存在分类数据时，输入数据类型必须为pd。DataFrame
131 |     if self.category_features is not None:
132 |         if not isinstance(X, pd.DataFrame):
133 |             raise ValueError('while there are category_features in X, X must be pd.DataFrame')
134 |         if not isinstance(self.category_features, list):
135 |             raise ValueError('category_features must be list')
136 |         for cat_feature in self.category_features:
137 |             if cat_feature not in self.feature_names:
138 |                 raise ValueError('Valid category_feature {} , not in feature_names'.format(cat_feature))
139 |         # 处理分类数据，转换为整型
140 |         label_encoder = LabelEncoder()
141 |         X[self.category_features] = X[self.category_features].apply(label_encoder.fit_transform)
142 |         # 重构顺序，将分类类型放在前面
143 |         self.feature_names = \
144 |             [self.category_features + [_col for _col in self.feature_names if _col not in self.category_features]]
145 |         X = X[self.feature_names]
146 | 
147 |     # Check arrays
148 |     if sample_weight is not None:
149 |         sample_weight = _check_sample_weight(sample_weight, X)
150 | 
151 |     # 检查数据内容
152 |     if isinstance(self, ClassifierMixin):
153 |         # 验证y是否为分类数据， X， y强转ndarray
154 |         # todo 分类场景的处理有待优化，暂时不处理
155 |         X, y = self._validate_data(X, y, y_numeric=False)
156 |         check_classification_targets(y)
157 | 
158 |         if self.class_weight:
159 |             if sample_weight is None:
160 |                 sample_weight = 1.
161 |             # modify the sample weights with the corresponding class weight
162 |             sample_weight = (sample_weight *
163 |                              compute_sample_weight(self.class_weight, y))
164 | 
165 |         self.classes_, y = np.unique(y, return_inverse=True)
166 |         n_trim_classes = np.count_nonzero(np.bincount(y, sample_weight))
167 |         if n_trim_classes != 2:
168 |             raise ValueError("y contains %d class after sample_weight "
169 |                              "trimmed classes with zero weights, while 2 "
170 |                              "classes are required."
171 |                              % n_trim_classes)
172 |         self.n_classes_ = len(self.classes_)
173 | 
174 |     else:
175 |         # 验证y是否为数值数据， X， y强转ndarray
176 |         X, y = self._validate_data(X, y, y_numeric=True)


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # gplearnplus
  2 | 对gplearn进行升级，适应时序数据和面板数据，适用于更多的场景
  3 | 且在函数参数中区分分类数据和数值型数据，可兼容类似于groupby等操作
  4 | 
  5 | # 文件描述
  6 | 
  7 | ## `_program.py`
  8 | 构建，调用公式树模块，
  9 | 对象为`_Program`
 10 | 属性`program`为栈形式的公式树
 11 | ### 公式树形式
 12 | ![alt text](https://raw.githubusercontent.com/ACEACEjasonhuang/gplearnplus/HEAD/readme_pic/tree.png)
 13 | 
 14 | 该公式表达式为
 15 | 
 16 | $$
 17 | ((X_0 \times X_0) - (3.0 \times X_1)) + 0.5
 18 | $$
 19 | 
 20 | program结果栈为：
 21 | `['add', 'sub', 'mul', '0', '0', 'mul', 3.0, '1', 0.5]`
 22 | 
 23 | ### 初始化方法
 24 | `build_program`
 25 | 通过stack对树进行深度优先搜索构建
 26 | ```mermaid
 27 | graph TB
 28 |     Z[选择根函数,必<br>须返回数值向量] --> A[(工作栈stack中插入<br>根函数参数列表)]
 29 |     A-->B([结果栈program中<br>插入根函数])
 30 |     B-->C{工作栈stack非空}
 31 |     C-->|Y|D{判断工作栈中最后一个<br>函数的第一个参数<br>1.该节点必须接受向量<br>2.当前深度比最大深度低<br>3.随机种子选中了函数<br>或者模式为'full'}
 32 |      C-->|N|L[异常,工作栈不得为空]
 33 |     D-->|Y|E[插入函数<br>成为子树节点]
 34 |     D-->|N|F[插入向量或标量<br>成为叶子节点]
 35 |     E-->G{该节点是否可接受<br>分类函数和数值函数}
 36 |     G-->|Y|H[所有函数中随机挑选]
 37 |     G-->|N|I[相应的分类或<br>数值函数中随机挑选]
 38 |     H-->J[(工作栈stack中插入<br>相应函数参数列表)]
 39 |     I-->J
 40 |     J-->K[(结果栈program中<br>插入相应函数)]
 41 |     
 42 |     F-->M{若参数不接受标量或<br>const_range为空或<br>随机数选中向量}
 43 |     M-->|Y|N{存在分类向量且<br>参数接受分类向量且<br>随机数选中分类向量}
 44 |     N-->|Y|O([结果栈program中<br>插入该分类向量,<br>类型为字符串数字])
 45 |     N-->|N|P([结果栈program中<br>插入该数值向量,<br>类型为字符串数字])
 46 |     M-->|N|R{若该节点接受<br>浮点类型标量}
 47 |     R-->|Y|S([结果栈program中<br>插入范围内随机浮点<br>标量,类型为浮点])
 48 |     R-->|N|T([结果栈program中<br>插入范围内随机整型<br>标量,类型为整型])
 49 |     O-->U[(工作栈stack中弹出<br>最后一个函数的<br>最后一个参数节点)]
 50 |     P-->U
 51 |     R-->U
 52 |     S-->U
 53 |     T-->U
 54 |     U-->Q{工作栈stack<br>最后一个函数<br>参数列表为空}
 55 |     Q-->|Y|V[(工作栈stack中弹出<br>最后一个函数)]
 56 |     V-->W{工作栈为空}
 57 |     W-->|Y|X([返回结果栈program<br>公式树初始化完成])
 58 |     W-->|N|Y[(工作栈stack中弹出<br>最后一个函数的<br>最后一个参数节点)]
 59 |     Y-->Q
 60 |     Q-->|N|C
 61 |     K-->C
 62 | ```
 63 | ### 树的检验
 64 | `validate_program`
 65 | 对树一次深度优先搜索，保证所有节点完备，即每一个函数参数量足够
 66 | 
 67 | `_depth`
 68 | 深度优先搜索的同时记录最大深度
 69 | 
 70 | `_length`
 71 | 返回program长度，即树的节点数量
 72 | 
 73 | ### 树的打印
 74 | `__str__`：打印树
 75 | `export_graphviz`：可视化整个树
 76 | 
 77 | 
 78 | ### 公式树的计算
 79 | `execute`：接受pandas或者二维ndarray，shape = [n_samples, n_features]
 80 | 执行过程中，将program中的字符串和常数处理成可接受参数
 81 | - 常数需要广播成常向量
 82 | - 字符串转换为输入X中对应的列
 83 | - 若数据类型为面板数据`panel`，X中需要额外输入证券列和时间列，
 84 | 
 85 | `raw_fitness`：原始适应度
 86 | 1. 由公式树计算出$\hat{y}$
 87 | 2. 对$\hat{y}$进行调整
 88 | 3. 计算$y$与$\hat{y}$的适应度`metric`
 89 | 
 90 | `fitness`：带惩罚项适应度
 91 | $$
 92 | penalty=p\_coef \times program\_len \times sign(metric)
 93 | $$
 94 | ### 样本选择(防止过拟合)
 95 | 为了防止过拟合，仅选择部分样本
 96 | 
 97 | `get_all_indices` 输入总样本量和抽样样本量
 98 | 
 99 | 返回抽样内样本index和抽样外样本index
100 | 
101 | ### 公式树的截取
102 | 
103 | `get_subtree(random_state, start, program=None)`：获取指定子树
104 | 获取根节点为start的指定子树
105 | 
106 | 
107 | `get_random_subtree(random_state, program=None, return_type=None):`获取随机子树
108 | 根据需要设定获取**数值型子树**还是**分类型子树**
109 | 返回子树和子树类型
110 | 
111 | ### 公式树的交叉变异
112 | 
113 | `crossover(donor, random_state)`
114 | 与公式树`donor`交叉，要求同返回类型
115 | 
116 | `subtree_mutation(random_state)`
117 | 随机生成一颗公式树，与父树交叉
118 | 
119 | `hoist_mutation(random_state)`
120 | 首先寻找可以hoist的节点，要求该节点下存在子节点与自己类型相同
121 | 把一颗子树的同类型子树上提
122 | 
123 | `point_mutation(random_state)`
124 | 点变异
125 | 对随机选中的点进行点变异
126 | 点变异保证函数合法
127 | 
128 | 
129 | ## `fitness.py`
130 | 
131 | 定义适应度函数，和自定义适应函数的方法
132 | 
133 | 定义函数对象`_Fitness`
134 | 
135 | 包含的属性：
136 | 
137 | `function`
138 | 
139 | 必须接受三个变量`(y, y_pred, w)`
140 | 
141 | `greater_is_better`
142 | 
143 | 
144 | ## `functions.py`
145 | 
146 | 自定义函数和构建方法
147 | 定义函数对象`_Function`
148 | 包含的属性：
149 | 
150 | `function`：可调用函数
151 | 
152 | `name`：函数名
153 | 
154 | `arity`：参数个数
155 | 
156 | `param_type`：
157 | 参数类型列表，长度与arity一致，**默认不接受分类类型**
158 | 该设计是本项目最重要的升级，影响公式树的构建
159 | ```python
160 | [{
161 | 'vector': {'category': (None, None), 'number': (None, None)},
162 | 'scalar': {'int': (None, None), 'float': (None, None)}
163 | },]
164 | ```
165 | 
166 | `function_type`：函数类型 默认 'all'
167 | 'all', 'section', 'time_series'
168 | 
169 | `return_type`：返回类型 默认'number'
170 | 'number', 'category'
171 | 
172 | 包含的方法：
173 | 
174 | `__call__`
175 | 调用函数特殊处理，
176 | 参数仅接受标量，却传入向量
177 | 则取向量第一个值为标量
178 | 
179 | `add_range`:
180 | 
181 | 替换掉参数中没有约束的范围，给所有标量限制范围
182 | 
183 | 若没有const_range, 则表明所有函数不接收常数， 去掉所有的const type
184 | 
185 | `is_point_mutation(candidate_func)`
186 | 
187 | 检验某个待替换函数是否可以替换
188 | 
189 | 外部函数：
190 | `make_function(*, function, name, arity, param_type=None, wrap=True, return_type='number', function_type='all')`
191 | 将函数处理为_Function对象
192 | 主要进行合法性检验和测试
193 | 
194 | ## `genetic.py`
195 | 
196 | 模型接口，包括由工厂类派生出，回归，分类器和特征工程工具类，应用于不同场景
197 | 
198 | ### `_parallel_evolve(n_programs, parents, X, y, sample_weight, seeds, params)`
199 | 
200 | 进行一次种群进化
201 | n_programs为种群数量
202 | 
203 | 
204 | ### `BaseSymbolic`
205 | 
206 | 
207 | 
208 | 
209 | ## `utils.py`
210 | 
211 | 支持函数
212 | 
213 | 
214 | 
215 | `test.py`
216 | 
217 | 
218 | `data_trans.py`
219 | 
220 | 
221 | 
222 | 
223 | 自定义函数样例
224 | 
225 | 
226 | 
227 | # 函数文档
228 | 
229 | ## 全局函数
230 | 
231 | ### gplearn自带全局函数
232 | | 函数名       | 实现逻辑            | 参数要求          | 输出类型     |
233 | |-----------|-----------------|---------------| ------------ |
234 | | add（X, Y） | 向量相加            | 【数值向量】，【数值向量】 | 【数值向量】 |
235 | | sub（X, Y） | 向量相减            | 【数值向量】，【数值向量】 | 【数值向量】 |
236 | | mul（X, Y） | 向量相乘            | 【数值向量】，【数值向量】 | 【数值向量】 |
237 | | div（X, Y） | 向量相除（极小值替换分母0）  | 【数值向量】，【数值向量】 | 【数值向量】 |
238 | | sqrt（X）   | 开平方（负数处理为绝对值）   | 【数值向量】 | 【数值向量】 |
239 | | log（X）    | 取自然对数（同理处理0和负数） | 【数值向量】 | 【数值向量】 |
240 | | neg（X）    | 取反数             | 【数值向量】 | 【数值向量】 |
241 | | inv（X）    | 取倒数  （极小值处理0）   | 【数值向量】 | 【数值向量】 |
242 | | abs（X）    | 取绝对值            | 【数值向量】 | 【数值向量】 |
243 | | max（X, Y） | 向量取孰大值          | 【数值向量】，【数值向量】 | 【数值向量】 |
244 | | min（X, Y） | 向量取孰小值          | 【数值向量】，【数值向量】 | 【数值向量】 |
245 | | sin（X）    | 取向量正弦           | 【数值向量】 | 【数值向量】 |
246 | | cos（X）    | 取向量余弦           | 【数值向量】 | 【数值向量】 |
247 | | tan（X）    | 取向量正切           | 【数值向量】 | 【数值向量】 |
248 | | sig（X）    | 逻辑斯蒂函数          | 【数值向量】 | 【数值向量】 |
249 | ### 自定义全局函数
250 | | 函数名          | 实现逻辑       | 参数要求                   | 输出类型     |
251 | | --------------- |------------| -------------------------- | ------------ |
252 | | combine（X, Y） | 两个分类变量笛卡尔积 | 【分类向量】，【分类向量】 | 【分类向量】 |
253 | 
254 | 
255 | ## 时间序列函数
256 | 
257 | ### 基本时间序列函数
258 | | 函数名                | 实现逻辑      | 参数要求                 | 输出类型     |
259 | |--------------------|-----------|----------------------| ------------ |
260 | | delay（X,d）         | 时序变量滞后    | 【数值向量】，【整形常量】        | 【数值向量】 |
261 | | delta（X,d）         | 与滞后项作差    | 【数值向量】，【整形常量】        | 【数值向量】 |
262 | | ts_min（X,d）        | 时间窗口最小值   | 【数值向量】，【整形常量】        | 【数值向量】 |
263 | | ts_max（X,d）        | 时间窗口最大值   | 【数值向量】，【整形常量】        | 【数值向量】 |
264 | | ts_argmax（X,d）     | 时间窗口最大值位置 | 【数值向量】，【整形常量】        | 【数值向量】 |
265 | | ts_argmin（X,d）     | 时间窗口最小值位置 | 【数值向量】，【整形常量】        | 【数值向量】 |
266 | | ts_rank（X, d）      | 时间窗口排序值   | 【数值向量】，【整形常量】        | 【数值向量】 |
267 | | ts_sum（X, d）       | 时间窗口求和    | 【数值向量】，【整形常量】        | 【数值向量】 |
268 | | ts_stddev（X, d）    | 时间窗口标准差   | 【数值向量】，【整形常量】        |  【数值向量】  |
269 | | ts_corr（X,Y,d）     | 时间窗口相关系数  | 【数值向量】，【数值向量】，【整形常量】 |  【数值向量】  |
270 | | ts_mean（X, d）      | 时间窗口均值    | 【数值向量】，【整形常量】        |  【数值向量】  |
271 | | ts_neutralize（X, d） | 时间窗口Z分数   | 【数值向量】，【整形常量】        |  【数值向量】  |
272 | | ts_freq（X, d）      | 时间窗口频数    | 【分类向量】，【整形常量】        |  【数值向量】  |
273 | 
274 | ### 技术指标函数
275 | 
276 | | 函数名                 | 实现逻辑       | 参数要求            | 输出类型     |
277 | |---------------------|------------| ------------------- | ------------ |
278 | | EMA（X,d）            | 指数平滑均线     | 【数值向量】，【整形常量】 | 【数值向量】 |
279 | | DEMA（X,d）           | 二重指数平滑线    | 【数值向量】，【整形常量】 | 【数值向量】 |
280 | | MA（X,d）             | 均线         | 【数值向量】，【整形常量】 | 【数值向量】 |
281 | | KAMA（X,d）           | 库夫曼自适应移动均线 | 【数值向量】，【整形常量】 | 【数值向量】 |
282 | | MIDPOINT（X,d）       | 中间点        | 【数值向量】，【整形常量】 | 【数值向量】 |
283 | | BETA（X,Y,d）         | 回归系数       | 【数值向量】，【数值向量】，【整形常量】 | 【数值向量】 |
284 | | LINEARREG_SLOPE（X, d）       | 回归斜率       | 【数值向量】，【整形常量】 | 【数值向量】 |
285 | | LINEARREG_ANGLE（X, d）       | 回归角度       | 【数值向量】，【整形常量】 | 【数值向量】 |
286 | | LINEARREG_INTERCEPT（X, d）     | 回归截距       |  【数值向量】，【整形常量】 |  【数值向量】  |
287 | 
288 | ## 截面函数
289 | ### 基本截面函数
290 | | 函数名                                | 实现逻辑     | 参数要求             | 输出类型   |
291 | |------------------------------------|----------|------------------|--------|
292 | | MAX_SECTION（X）                     | 最大值填充    | 【数值向量】           | 【数值向量】 |
293 | | MIN_SECTION（X）                     | 最小值填充    | 【数值向量】           | 【数值向量】 |
294 | | MEAN_SECTION（X）                    | 均值填充     | 【数值向量】           | 【数值向量】 |
295 | | MEDIAN_SECTION（X）                  | 中位数填充    | 【数值向量】           | 【数值向量】 |
296 | | STD_SECTION（X）                     | 标准差填充    | 【数值向量】           | 【数值向量】 |
297 | | RANK_SECTION（X）                    | 序数填充     | 【数值向量】           | 【数值向量】 |
298 | | NEUTRALIZE_SECTION（X）              | Z分数填充    | 【数值向量】           | 【数值向量】 |
299 | | FREQ_SECTION（X）                    | 频数填充     | 【分类向量】           | 【数值向量】 |
300 | | CUT_EQUAL_DISTANCE（X, d）           | 等距分组     | 【数值向量】，【整形标量】    | 【分类向量】 |
301 | | CUT_EQUAL_AMOUNT（X, d）             | 等量分组     | 【数值向量】，【整形标量】    | 【分类向量】 |
302 | 
303 | ### 截面分类聚合函数
304 | 
305 | | 函数名                                | 实现逻辑     | 参数要求             | 输出类型   |
306 | |------------------------------------|----------|------------------|--------|
307 | | GROUPBYTHENMAX（gbx, X）             | 分组后取最大值  | 【分类向量】，【数值向量】    | 【数值向量】 |
308 | | GROUPBYTHENMIN（gbx, X）             | 分组后取最小值  | 【分类向量】，【数值向量】    | 【数值向量】 |
309 | | GROUPBYTHENMEAN（gbx, X）            | 分组后取均值   | 【分类向量】，【数值向量】    | 【数值向量】 |
310 | | GROUPBYTHENMEDIAN（gbx, X）          | 分组后取中位数  | 【分类向量】，【数值向量】    | 【数值向量】 |
311 | | GROUPBYTHENSTD（gbx, X）             | 分组后取标准差  | 【分类向量】，【数值向量】    | 【数值向量】 |
312 | | GROUPBYTHENRANK（gbx, X）            | 分组后取序数   | 【分类向量】，【数值向量】    | 【数值向量】 |
313 | | GROUPBYTHENNEUTRALIZE（gbx, X）      | 分组后取Z分数  | 【分类向量】，【数值向量】    | 【数值向量】 |
314 | | GROUPBYTHEN_CUT_EQ_DIST（gbx, X, d） | 分组后取等距分组 | 【分类向量】，【数值向量】，【整形常量】 | 【分类向量】 |
315 | | GROUPBYTHEN_CUT_EQ_AMT（gbx, X, d）  | 分组后取等量分组 | 【分类向量】，【数值向量】，【整形常量】 | 【分类向量】 |
316 | | GROUPBYTHENFREQ（gbx, X）            | 分组后取频数   | 【分类向量】，【分类向量】    | 【数值向量】 |
317 | # 更新记录
318 | 
319 | ## v1.0
320 | 
321 | 未调试完全， 有bug
322 | 
323 | ## v1.1
324 | 
325 | 处理完functions模块的问题
326 | 调试成功，对于时序自定义函数中的常数参数，需要在函数中做去广播判定
327 | 
328 | ## v1.2
329 | 
330 | test中加入了自定义函数的定义方法，需要忽略运行时的RuntimeWarning
331 | 
332 | ## v1.3
333 | 
334 | functions中去掉了对于`function.__code__.co_argcount`的限制
335 | 增强对函数修饰器的兼容
336 | 
337 | ## v1.4
338 | test.py debug
339 | 函数定义考虑特殊参数情况
340 | 
341 | ## v1.5
342 | 新增面板数据支持功能
343 | 将场景分为截面、时序和面板
344 | 数据定义要求更新
345 | 
346 | 函数定义要求更新
347 | 
348 | 更新适应度惩罚计算
349 | 
350 | 修改遗传规划中的特征筛选逻辑
351 | （当最大相关系数绝对值低于某一阈值时，直接按fitness筛选）


--------------------------------------------------------------------------------
/.idea/workspace.xml:
--------------------------------------------------------------------------------
  1 | <?xml version="1.0" encoding="UTF-8"?>
  2 | <project version="4">
  3 |   <component name="AutoImportSettings">
  4 |     <option name="autoReloadType" value="SELECTIVE" />
  5 |   </component>
  6 |   <component name="ChangeListManager">
  7 |     <list default="true" id="da5529ee-c57f-4dbd-9e78-20f9ddf31530" name="Default Changelist" comment="v1.510 （增加注释，增加readme函数解释，交叉变异过程中区分数值和分类）">
  8 |       <change beforePath="$PROJECT_DIR$/.idea/workspace.xml" beforeDir="false" afterPath="$PROJECT_DIR$/.idea/workspace.xml" afterDir="false" />
  9 |       <change beforePath="$PROJECT_DIR$/README.md" beforeDir="false" afterPath="$PROJECT_DIR$/README.md" afterDir="false" />
 10 |       <change beforePath="$PROJECT_DIR$/_program.py" beforeDir="false" afterPath="$PROJECT_DIR$/_program.py" afterDir="false" />
 11 |       <change beforePath="$PROJECT_DIR$/functions.py" beforeDir="false" afterPath="$PROJECT_DIR$/functions.py" afterDir="false" />
 12 |     </list>
 13 |     <option name="SHOW_DIALOG" value="false" />
 14 |     <option name="HIGHLIGHT_CONFLICTS" value="true" />
 15 |     <option name="HIGHLIGHT_NON_ACTIVE_CHANGELIST" value="false" />
 16 |     <option name="LAST_RESOLUTION" value="IGNORE" />
 17 |   </component>
 18 |   <component name="FileTemplateManagerImpl">
 19 |     <option name="RECENT_TEMPLATES">
 20 |       <list>
 21 |         <option value="Python Script" />
 22 |       </list>
 23 |     </option>
 24 |   </component>
 25 |   <component name="Git.Settings">
 26 |     <option name="RECENT_GIT_ROOT_PATH" value="$PROJECT_DIR$" />
 27 |   </component>
 28 |   <component name="GitSEFilterConfiguration">
 29 |     <file-type-list>
 30 |       <filtered-out-file-type name="LOCAL_BRANCH" />
 31 |       <filtered-out-file-type name="REMOTE_BRANCH" />
 32 |       <filtered-out-file-type name="TAG" />
 33 |       <filtered-out-file-type name="COMMIT_BY_MESSAGE" />
 34 |     </file-type-list>
 35 |   </component>
 36 |   <component name="HighlightingSettingsPerFile">
 37 |     <setting file="file://$PROJECT_DIR$/README.md" root0="FORCE_HIGHLIGHTING" root1="FORCE_HIGHLIGHTING" />
 38 |     <setting file="file://$PROJECT_DIR$/__init__.py" root0="FORCE_HIGHLIGHTING" />
 39 |     <setting file="file://$PROJECT_DIR$/_program.py" root0="FORCE_HIGHLIGHTING" />
 40 |     <setting file="file://$PROJECT_DIR$/example.py" root0="FORCE_HIGHLIGHTING" />
 41 |     <setting file="file://$PROJECT_DIR$/fitness.py" root0="FORCE_HIGHLIGHTING" />
 42 |     <setting file="file://$PROJECT_DIR$/functions.py" root0="FORCE_HIGHLIGHTING" />
 43 |     <setting file="file://$PROJECT_DIR$/genetic.py" root0="FORCE_HIGHLIGHTING" />
 44 |     <setting file="file://$PROJECT_DIR$/utils.py" root0="FORCE_HIGHLIGHTING" />
 45 |   </component>
 46 |   <component name="MarkdownSettingsMigration">
 47 |     <option name="stateVersion" value="1" />
 48 |   </component>
 49 |   <component name="ProjectId" id="2IIdKWNnXBeA526eceOvkzf0neU" />
 50 |   <component name="ProjectLevelVcsManager" settingsEditedManually="true" />
 51 |   <component name="ProjectViewState">
 52 |     <option name="hideEmptyMiddlePackages" value="true" />
 53 |     <option name="showLibraryContents" value="true" />
 54 |   </component>
 55 |   <component name="PropertiesComponent">{
 56 |   &quot;keyToString&quot;: {
 57 |     &quot;RunOnceActivity.OpenProjectViewOnStart&quot;: &quot;true&quot;,
 58 |     &quot;RunOnceActivity.ShowReadmeOnStart&quot;: &quot;true&quot;,
 59 |     &quot;WebServerToolWindowFactoryState&quot;: &quot;false&quot;,
 60 |     &quot;last_opened_file_path&quot;: &quot;D:/software/python38/Lib/site-packages/gplearn&quot;,
 61 |     &quot;node.js.detected.package.eslint&quot;: &quot;true&quot;,
 62 |     &quot;node.js.detected.package.tslint&quot;: &quot;true&quot;,
 63 |     &quot;node.js.selected.package.eslint&quot;: &quot;(autodetect)&quot;,
 64 |     &quot;node.js.selected.package.tslint&quot;: &quot;(autodetect)&quot;,
 65 |     &quot;nodejs_package_manager_path&quot;: &quot;npm&quot;,
 66 |     &quot;settings.editor.selected.configurable&quot;: &quot;com.jetbrains.python.configuration.PyActiveSdkModuleConfigurable&quot;,
 67 |     &quot;vue.rearranger.settings.migration&quot;: &quot;true&quot;
 68 |   }
 69 | }</component>
 70 |   <component name="RunManager" selected="Python.example (1)">
 71 |     <configuration name="example (1)" type="PythonConfigurationType" factoryName="Python" temporary="true" nameIsGenerated="true">
 72 |       <module name="gplearnplus" />
 73 |       <option name="INTERPRETER_OPTIONS" value="" />
 74 |       <option name="PARENT_ENVS" value="true" />
 75 |       <envs>
 76 |         <env name="PYTHONUNBUFFERED" value="1" />
 77 |       </envs>
 78 |       <option name="SDK_HOME" value="" />
 79 |       <option name="WORKING_DIRECTORY" value="$PROJECT_DIR$" />
 80 |       <option name="IS_MODULE_SDK" value="true" />
 81 |       <option name="ADD_CONTENT_ROOTS" value="true" />
 82 |       <option name="ADD_SOURCE_ROOTS" value="true" />
 83 |       <EXTENSION ID="PythonCoverageRunConfigurationExtension" runner="coverage.py" />
 84 |       <option name="SCRIPT_NAME" value="$PROJECT_DIR$/example.py" />
 85 |       <option name="PARAMETERS" value="" />
 86 |       <option name="SHOW_COMMAND_LINE" value="false" />
 87 |       <option name="EMULATE_TERMINAL" value="false" />
 88 |       <option name="MODULE_MODE" value="false" />
 89 |       <option name="REDIRECT_INPUT" value="false" />
 90 |       <option name="INPUT_FILE" value="" />
 91 |       <method v="2" />
 92 |     </configuration>
 93 |     <configuration name="example" type="PythonConfigurationType" factoryName="Python" temporary="true" nameIsGenerated="true">
 94 |       <module name="gplearnplus" />
 95 |       <option name="INTERPRETER_OPTIONS" value="" />
 96 |       <option name="PARENT_ENVS" value="true" />
 97 |       <envs>
 98 |         <env name="PYTHONUNBUFFERED" value="1" />
 99 |       </envs>
100 |       <option name="SDK_HOME" value="" />
101 |       <option name="WORKING_DIRECTORY" value="$PROJECT_DIR$" />
102 |       <option name="IS_MODULE_SDK" value="true" />
103 |       <option name="ADD_CONTENT_ROOTS" value="true" />
104 |       <option name="ADD_SOURCE_ROOTS" value="true" />
105 |       <EXTENSION ID="PythonCoverageRunConfigurationExtension" runner="coverage.py" />
106 |       <option name="SCRIPT_NAME" value="D:\software\python38\Lib\site-packages\gplearnplus\example.py" />
107 |       <option name="PARAMETERS" value="" />
108 |       <option name="SHOW_COMMAND_LINE" value="false" />
109 |       <option name="EMULATE_TERMINAL" value="false" />
110 |       <option name="MODULE_MODE" value="false" />
111 |       <option name="REDIRECT_INPUT" value="false" />
112 |       <option name="INPUT_FILE" value="" />
113 |       <method v="2" />
114 |     </configuration>
115 |     <configuration name="functions (1)" type="PythonConfigurationType" factoryName="Python" temporary="true" nameIsGenerated="true">
116 |       <module name="gplearnplus" />
117 |       <option name="INTERPRETER_OPTIONS" value="" />
118 |       <option name="PARENT_ENVS" value="true" />
119 |       <envs>
120 |         <env name="PYTHONUNBUFFERED" value="1" />
121 |       </envs>
122 |       <option name="SDK_HOME" value="" />
123 |       <option name="WORKING_DIRECTORY" value="$PROJECT_DIR$" />
124 |       <option name="IS_MODULE_SDK" value="true" />
125 |       <option name="ADD_CONTENT_ROOTS" value="true" />
126 |       <option name="ADD_SOURCE_ROOTS" value="true" />
127 |       <EXTENSION ID="PythonCoverageRunConfigurationExtension" runner="coverage.py" />
128 |       <option name="SCRIPT_NAME" value="$PROJECT_DIR$/functions.py" />
129 |       <option name="PARAMETERS" value="" />
130 |       <option name="SHOW_COMMAND_LINE" value="false" />
131 |       <option name="EMULATE_TERMINAL" value="false" />
132 |       <option name="MODULE_MODE" value="false" />
133 |       <option name="REDIRECT_INPUT" value="false" />
134 |       <option name="INPUT_FILE" value="" />
135 |       <method v="2" />
136 |     </configuration>
137 |     <configuration name="functions" type="PythonConfigurationType" factoryName="Python" temporary="true" nameIsGenerated="true">
138 |       <module name="gplearnplus" />
139 |       <option name="INTERPRETER_OPTIONS" value="" />
140 |       <option name="PARENT_ENVS" value="true" />
141 |       <envs>
142 |         <env name="PYTHONUNBUFFERED" value="1" />
143 |       </envs>
144 |       <option name="SDK_HOME" value="" />
145 |       <option name="WORKING_DIRECTORY" value="$PROJECT_DIR$" />
146 |       <option name="IS_MODULE_SDK" value="true" />
147 |       <option name="ADD_CONTENT_ROOTS" value="true" />
148 |       <option name="ADD_SOURCE_ROOTS" value="true" />
149 |       <EXTENSION ID="PythonCoverageRunConfigurationExtension" runner="coverage.py" />
150 |       <option name="SCRIPT_NAME" value="C:\Users\Administrator\Desktop\投资研究\研报\CTA因子\gplearnplus\functions.py" />
151 |       <option name="PARAMETERS" value="" />
152 |       <option name="SHOW_COMMAND_LINE" value="false" />
153 |       <option name="EMULATE_TERMINAL" value="false" />
154 |       <option name="MODULE_MODE" value="false" />
155 |       <option name="REDIRECT_INPUT" value="false" />
156 |       <option name="INPUT_FILE" value="" />
157 |       <method v="2" />
158 |     </configuration>
159 |     <configuration name="genetic" type="PythonConfigurationType" factoryName="Python" temporary="true" nameIsGenerated="true">
160 |       <module name="gplearnplus" />
161 |       <option name="INTERPRETER_OPTIONS" value="" />
162 |       <option name="PARENT_ENVS" value="true" />
163 |       <envs>
164 |         <env name="PYTHONUNBUFFERED" value="1" />
165 |       </envs>
166 |       <option name="SDK_HOME" value="" />
167 |       <option name="WORKING_DIRECTORY" value="$PROJECT_DIR$" />
168 |       <option name="IS_MODULE_SDK" value="true" />
169 |       <option name="ADD_CONTENT_ROOTS" value="true" />
170 |       <option name="ADD_SOURCE_ROOTS" value="true" />
171 |       <EXTENSION ID="PythonCoverageRunConfigurationExtension" runner="coverage.py" />
172 |       <option name="SCRIPT_NAME" value="$PROJECT_DIR$/genetic.py" />
173 |       <option name="PARAMETERS" value="" />
174 |       <option name="SHOW_COMMAND_LINE" value="false" />
175 |       <option name="EMULATE_TERMINAL" value="false" />
176 |       <option name="MODULE_MODE" value="false" />
177 |       <option name="REDIRECT_INPUT" value="false" />
178 |       <option name="INPUT_FILE" value="" />
179 |       <method v="2" />
180 |     </configuration>
181 |     <configuration name="main" type="PythonConfigurationType" factoryName="Python" nameIsGenerated="true">
182 |       <module name="gplearnplus" />
183 |       <option name="INTERPRETER_OPTIONS" value="" />
184 |       <option name="PARENT_ENVS" value="true" />
185 |       <envs>
186 |         <env name="PYTHONUNBUFFERED" value="1" />
187 |       </envs>
188 |       <option name="SDK_HOME" value="" />
189 |       <option name="WORKING_DIRECTORY" value="$PROJECT_DIR$" />
190 |       <option name="IS_MODULE_SDK" value="true" />
191 |       <option name="ADD_CONTENT_ROOTS" value="true" />
192 |       <option name="ADD_SOURCE_ROOTS" value="true" />
193 |       <EXTENSION ID="PythonCoverageRunConfigurationExtension" runner="coverage.py" />
194 |       <option name="SCRIPT_NAME" value="$PROJECT_DIR$/main.py" />
195 |       <option name="PARAMETERS" value="" />
196 |       <option name="SHOW_COMMAND_LINE" value="false" />
197 |       <option name="EMULATE_TERMINAL" value="false" />
198 |       <option name="MODULE_MODE" value="false" />
199 |       <option name="REDIRECT_INPUT" value="false" />
200 |       <option name="INPUT_FILE" value="" />
201 |       <method v="2" />
202 |     </configuration>
203 |     <recent_temporary>
204 |       <list>
205 |         <item itemvalue="Python.example (1)" />
206 |         <item itemvalue="Python.example" />
207 |         <item itemvalue="Python.functions (1)" />
208 |         <item itemvalue="Python.genetic" />
209 |         <item itemvalue="Python.functions" />
210 |       </list>
211 |     </recent_temporary>
212 |   </component>
213 |   <component name="SpellCheckerSettings" RuntimeDictionaries="0" Folders="0" CustomDictionaries="0" DefaultDictionary="application-level" UseSingleDictionary="true" transferred="true" />
214 |   <component name="SshConsoleOptionsProvider">
215 |     <option name="myEncoding" value="UTF-8" />
216 |   </component>
217 |   <component name="TaskManager">
218 |     <task active="true" id="Default" summary="Default task">
219 |       <changelist id="da5529ee-c57f-4dbd-9e78-20f9ddf31530" name="Default Changelist" comment="" />
220 |       <created>1669872968278</created>
221 |       <option name="number" value="Default" />
222 |       <option name="presentableId" value="Default" />
223 |       <updated>1669872968278</updated>
224 |       <workItem from="1669872969384" duration="8316000" />
225 |       <workItem from="1669949713064" duration="23046000" />
226 |       <workItem from="1670221175745" duration="40873000" />
227 |       <workItem from="1680224647115" duration="1363000" />
228 |       <workItem from="1680227579667" duration="10937000" />
229 |       <workItem from="1680506238492" duration="7698000" />
230 |       <workItem from="1680570845017" duration="6818000" />
231 |       <workItem from="1680584852150" duration="17444000" />
232 |       <workItem from="1680748457626" duration="450000" />
233 |       <workItem from="1680748923170" duration="1347000" />
234 |       <workItem from="1680755836291" duration="39000" />
235 |       <workItem from="1680755886556" duration="3236000" />
236 |       <workItem from="1681089314279" duration="1186000" />
237 |       <workItem from="1681363057918" duration="14776000" />
238 |       <workItem from="1681698639950" duration="62763000" />
239 |       <workItem from="1682565189577" duration="4243000" />
240 |       <workItem from="1688622051589" duration="1000" />
241 |       <workItem from="1706583239286" duration="33586000" />
242 |       <workItem from="1707030205052" duration="22636000" />
243 |       <workItem from="1707181251234" duration="20438000" />
244 |     </task>
245 |     <task id="LOCAL-00001" summary="v1.4">
246 |       <created>1680227722018</created>
247 |       <option name="number" value="00001" />
248 |       <option name="presentableId" value="LOCAL-00001" />
249 |       <option name="project" value="LOCAL" />
250 |       <updated>1680227722018</updated>
251 |     </task>
252 |     <task id="LOCAL-00002" summary="v1.53 （未完成，有todo不可使用）">
253 |       <created>1680603735322</created>
254 |       <option name="number" value="00002" />
255 |       <option name="presentableId" value="LOCAL-00002" />
256 |       <option name="project" value="LOCAL" />
257 |       <updated>1680603735322</updated>
258 |     </task>
259 |     <task id="LOCAL-00003" summary="v1.55 （未完成，有todo不可使用）">
260 |       <created>1680775226047</created>
261 |       <option name="number" value="00003" />
262 |       <option name="presentableId" value="LOCAL-00003" />
263 |       <option name="project" value="LOCAL" />
264 |       <updated>1680775226047</updated>
265 |     </task>
266 |     <task id="LOCAL-00004" summary="v1.57 （已完成，未测试）">
267 |       <created>1681880171652</created>
268 |       <option name="number" value="00004" />
269 |       <option name="presentableId" value="LOCAL-00004" />
270 |       <option name="project" value="LOCAL" />
271 |       <updated>1681880171652</updated>
272 |     </task>
273 |     <task id="LOCAL-00005" summary="v1.58 （bug修复，自定义函数新增）">
274 |       <created>1681896821625</created>
275 |       <option name="number" value="00005" />
276 |       <option name="presentableId" value="LOCAL-00005" />
277 |       <option name="project" value="LOCAL" />
278 |       <updated>1681896821625</updated>
279 |     </task>
280 |     <task id="LOCAL-00006" summary="v1.58 （bug修复，自定义函数新增）">
281 |       <created>1681983650112</created>
282 |       <option name="number" value="00006" />
283 |       <option name="presentableId" value="LOCAL-00006" />
284 |       <option name="project" value="LOCAL" />
285 |       <updated>1681983650112</updated>
286 |     </task>
287 |     <task id="LOCAL-00007" summary="v1.59 （未完全测试，自定义函数修改，引入numba加速）">
288 |       <created>1682070356913</created>
289 |       <option name="number" value="00007" />
290 |       <option name="presentableId" value="LOCAL-00007" />
291 |       <option name="project" value="LOCAL" />
292 |       <updated>1682070356913</updated>
293 |     </task>
294 |     <task id="LOCAL-00008" summary="v1.510 （增加注释，增加readme函数解释，交叉变异过程中区分数值和分类）">
295 |       <created>1707125947309</created>
296 |       <option name="number" value="00008" />
297 |       <option name="presentableId" value="LOCAL-00008" />
298 |       <option name="project" value="LOCAL" />
299 |       <updated>1707125947309</updated>
300 |     </task>
301 |     <option name="localTasksCounter" value="9" />
302 |     <servers />
303 |   </component>
304 |   <component name="TypeScriptGeneratedFilesManager">
305 |     <option name="version" value="3" />
306 |   </component>
307 |   <component name="Vcs.Log.History.Properties">
308 |     <option name="COLUMN_ID_ORDER">
309 |       <list>
310 |         <option value="Default.Root" />
311 |         <option value="Default.Author" />
312 |         <option value="Default.Date" />
313 |         <option value="Default.Subject" />
314 |         <option value="Space.CommitStatus" />
315 |       </list>
316 |     </option>
317 |   </component>
318 |   <component name="Vcs.Log.Tabs.Properties">
319 |     <option name="TAB_STATES">
320 |       <map>
321 |         <entry key="MAIN">
322 |           <value>
323 |             <State />
324 |           </value>
325 |         </entry>
326 |       </map>
327 |     </option>
328 |   </component>
329 |   <component name="VcsManagerConfiguration">
330 |     <MESSAGE value="v1.4" />
331 |     <MESSAGE value="v1.53 （未完成，有todo不可使用）" />
332 |     <MESSAGE value="v1.55 （未完成，有todo不可使用）" />
333 |     <MESSAGE value="v1.57 （已完成，未测试）" />
334 |     <MESSAGE value="v1.58 （bug修复，自定义函数新增）" />
335 |     <MESSAGE value="v1.59 （未完全测试，自定义函数修改，引入numba加速）" />
336 |     <MESSAGE value="v1.510 （增加注释，增加readme函数解释，交叉变异过程中区分数值和分类）" />
337 |     <option name="LAST_COMMIT_MESSAGE" value="v1.510 （增加注释，增加readme函数解释，交叉变异过程中区分数值和分类）" />
338 |   </component>
339 |   <component name="XDebuggerManager">
340 |     <breakpoint-manager>
341 |       <breakpoints>
342 |         <line-breakpoint enabled="true" suspend="THREAD" type="python-line">
343 |           <url>file://$PROJECT_DIR$/../../../py39/Lib/site-packages/sklearn/base.py</url>
344 |           <line>5</line>
345 |           <option name="timeStamp" value="2" />
346 |         </line-breakpoint>
347 |       </breakpoints>
348 |     </breakpoint-manager>
349 |   </component>
350 |   <component name="com.intellij.coverage.CoverageDataManagerImpl">
351 |     <SUITE FILE_PATH="coverage/gplearnplus$function.coverage" NAME="function Coverage Results" MODIFIED="1669881466474" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$" />
352 |     <SUITE FILE_PATH="coverage/gplearnplus$functions__1_.coverage" NAME="functions (1) Coverage Results" MODIFIED="1681878685631" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$" />
353 |     <SUITE FILE_PATH="coverage/gplearnplus$example__1_.coverage" NAME="example (1) Coverage Results" MODIFIED="1707013792510" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$" />
354 |     <SUITE FILE_PATH="coverage/gplearnplus$utils.coverage" NAME="utils Coverage Results" MODIFIED="1669873168115" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$" />
355 |     <SUITE FILE_PATH="coverage/gplearnplus$test.coverage" NAME="test Coverage Results" MODIFIED="1681882867536" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$" />
356 |     <SUITE FILE_PATH="coverage/gplearnplus$genetic.coverage" NAME="genetic Coverage Results" MODIFIED="1670868197399" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$" />
357 |   </component>
358 | </project>


--------------------------------------------------------------------------------
/functions.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | """
  3 | -------------------------------------------------
  4 | # @Project  :gplearnplus 
  5 | # @File     :function
  6 | # @Date     :2022/12/1 0001 13:46 
  7 | # @Author   :Junzhe Huang
  8 | # @Email    :acejasonhuang@163.com
  9 | # @Software :PyCharm
 10 | -------------------------------------------------
 11 | """
 12 | import numpy as np
 13 | from joblib import wrap_non_picklable_objects
 14 | 
 15 | NoneType = type(None)
 16 | 
 17 | __all__ = ['make_function', 'raw_function_list']
 18 | 
 19 | 
 20 | class _Function(object):
 21 |     """
 22 |     函数对象，参数至少有一个为向量
 23 |     默认函数类型为，all，既可用于时序也可用于截面
 24 |     默认返回类型为数值，
 25 |     默认输入类型，数值向量或者标量
 26 | 
 27 |     Parameters
 28 |     ----------
 29 |     function : callable
 30 |         A function with signature function(x1, *args) that returns a Numpy
 31 |         array of the same shape as its arguments.
 32 | 
 33 |     name : str
 34 |         The name for the function as it should be represented in the program
 35 |         and its visualizations.
 36 | 
 37 |     arity : int
 38 |         The number of arguments that the ``function`` takes.
 39 | 
 40 |     param_type : [{
 41 |                   'vector': {'category': (None, None), 'number': (None, None)},
 42 |                   'scalar': {'int': (int, int), 'float': (float, float)}
 43 |                   },]
 44 |     function_type : 'all', 'section', 'time_series‘
 45 |     return_type: 'number', 'category'
 46 | 
 47 |     """
 48 | 
 49 |     def __init__(self, function, name, arity, param_type=None, return_type='number', function_type='all'):
 50 |         self.function = function
 51 |         self.name = name
 52 |         self.arity = arity
 53 |         if param_type is None:
 54 |             # 默认不接受分类类型
 55 |             param_type = arity * [{'vector': {'number': (None, None)},
 56 |                                    'scalar': {'int': (None, None), 'float': (None, None)}}]
 57 |         else:
 58 |             # 防止长度不一
 59 |             if len(param_type) != arity:
 60 |                 raise ValueError(
 61 |                     "length of param_type should be equal to arity, it should be {}, not {}"
 62 |                     .format(arity, len(param_type)))
 63 |         self.param_type = param_type
 64 |         if (return_type != 'number') and (return_type != 'category'):
 65 |             raise ValueError("return_type of function {} should be number or category, NOT {}"
 66 |                              .format(name, return_type))
 67 |         self.return_type = return_type
 68 |         self.function_type = function_type
 69 | 
 70 |     def __call__(self, *args):
 71 |         """
 72 |         调用函数特殊处理，
 73 |         参数仅接受标量，却传入向量
 74 |         则取向量第一个值为标量
 75 |         """
 76 |         for _param, _param_type in zip(args, self.param_type):
 77 |             if len(_param_type) == 1 and 'scalar' in _param_type and isinstance(_param, (list, np.ndarray)):
 78 |                 _param = _param[0]
 79 |         return self.function(*args)
 80 | 
 81 |     def add_range(self, const_range):
 82 |         # 作用：替换掉参数中没有约束的范围，给所有标量限制范围
 83 |         # 若没有const_range, 则表明所有函数不接收常数， 去掉所有的const type
 84 |         if const_range is None:
 85 |             for i, _dict in enumerate(self.param_type):
 86 |                 if 'vector' not in _dict:
 87 |                     raise ValueError("for None const range, vector type should in all function param")
 88 |                 if 'scalar' in _dict:
 89 |                     self.param_type[i].pop('scalar')
 90 |             return
 91 |         if not isinstance(const_range, tuple):
 92 |             raise ValueError('const_range must be an tuple')
 93 |         _min, _max = const_range
 94 |         if not isinstance(_min, (int, float)):
 95 |             raise ValueError('const_range left must be an int, float')
 96 |         if not isinstance(_max, (int, float)):
 97 |             raise ValueError('const_range right must be an int, float')
 98 |         if _min > _max:
 99 |             raise ValueError('const_range left should le right')
100 | 
101 |         for i, _dict in enumerate(self.param_type):
102 |             if 'scalar' in _dict:
103 |                 _scalar_range = _dict['scalar']
104 |                 if 'int' in _scalar_range:
105 |                     _l = int(_min) if _scalar_range['int'][0] is None else int(_scalar_range['int'][0])
106 |                     _r = int(_max) if _scalar_range['int'][1] is None else int(_scalar_range['int'][1])
107 |                     self.param_type[i]['scalar']['int'] = (_l, _r)
108 |                 if 'float' in _scalar_range:
109 |                     _l = float(_min) if _scalar_range['float'][0] is None else float(_scalar_range['float'][0])
110 |                     _r = float(_max) if _scalar_range['float'][1] is None else float(_scalar_range['float'][1])
111 |                     self.param_type[i]['scalar']['float'] = (_l, _r)
112 | 
113 |         return
114 | 
115 |     def is_point_mutation(self, candidate_func):
116 |         # 检验某个待替换函数是否可以替换
117 |         if not isinstance(candidate_func, _Function):
118 |             raise ValueError("wrong type, it should be _Function style")
119 |         # 带替换函数是否与该函数参数长度一致
120 |         if len(candidate_func.param_type) != len(self.param_type):
121 |             return False
122 |         if self.return_type != candidate_func.return_type:
123 |             return False
124 | 
125 |         # candidate函数的参数必须为待替换函数参数的子集
126 |         # 要求替换和，函数的所有参数仍然合法
127 |         for dict_self, dict_candi in zip(self.param_type, candidate_func.param_type):
128 |             if len(dict_candi) <= len(dict_self):
129 |                 return False
130 |             for upper_type in dict_self:
131 |                 if upper_type not in dict_candi:
132 |                     return False
133 |                 else:
134 |                     for lower_type in dict_self:
135 |                         if lower_type not in dict_candi[upper_type]:
136 |                             return False
137 |                         else:
138 |                             if upper_type == 'scalar':
139 |                                 if (dict_candi['scalar'][lower_type][0] > dict_self['scalar'][lower_type][0]) or (
140 |                                         dict_candi['scalar'][lower_type][1] > dict_candi['scalar'][lower_type][1]):
141 |                                     return False
142 |         return True
143 | 
144 | 
145 | 
146 | # wrap 用于多进程序列化，会降低进化效率
147 | def make_function(*, function, name, arity, param_type=None, wrap=True, return_type='number', function_type='all'):
148 |     """
149 |        Parameters
150 |        ----------
151 |        function : callable
152 | 
153 |        name : str
154 | 
155 |        arity : int
156 | 
157 |        param_type : [{type: (, ), type: (, )}, ........]
158 | 
159 |        wrap : bool, optional (default=True)
160 |        """
161 | 
162 |     if not isinstance(arity, int):
163 |         raise ValueError('arity must be an int, got %s' % type(arity))
164 |     if not isinstance(name, str):
165 |         raise ValueError('name must be a string, got %s' % type(name))
166 |     if not isinstance(wrap, bool):
167 |         raise ValueError('wrap must be an bool, got %s' % type(wrap))
168 | 
169 |     # check out param_type vector > scalar int > float
170 |     if param_type is None:
171 |         param_type = [None] * arity
172 |     if not isinstance(param_type, list):
173 |         raise ValueError('param_type must be list')
174 |     if len(param_type) != arity:
175 |         raise ValueError('len of param_type must be arity')
176 |     # 保证函数中至少有一个向量
177 |     vector_flag = False
178 |     for i, _dict in enumerate(param_type):
179 |         # 转换None type
180 |         # 标记某一个参数是否可接受向量
181 |         non_vector_param = True
182 |         if _dict is None:
183 |             param_type[i] = {'vector': {'category': (None, None), 'number': (None, None)},
184 |                              'scalar': {'int': (None, None), 'float': (None, None)}}
185 |         elif not isinstance(_dict, dict):
186 |             raise ValueError('element in param_type {} must be dict'.format(i + 1))
187 |         if len(_dict) > 2:
188 |             raise ValueError('len of element in param_type {} must be 1, 2'.format(i + 1))
189 |         for upper_type in _dict:
190 |             if upper_type == 'vector':
191 |                 if not isinstance(_dict['vector'], dict):
192 |                     raise ValueError('type of element in param_type {} must be {upper_type: {lower_type:( , )}}}'
193 |                                      .format(i + 1))
194 |                 if len(_dict['vector']) == 0:
195 |                     raise ValueError('length of upper_type dict in param_type {} should not be 0'.format(i + 1))
196 |                 vector_flag = True
197 |                 non_vector_param = False
198 |                 for lower_type in _dict['vector']:
199 |                     if lower_type not in ['number', 'category']:
200 |                         raise ValueError('key of vector in param_type {} must be number or category'.format(i + 1))
201 |                     param_type[i]['vector'][lower_type] = (None, None)
202 | 
203 |             elif upper_type == 'scalar':
204 |                 if not isinstance(_dict['scalar'], dict):
205 |                     raise ValueError('type of element in param_type {} must be {upper_type: {lower_type:( , )}}}'
206 |                                      .format(i + 1))
207 |                 if len(_dict['scalar']) == 0:
208 |                     raise ValueError('length of upper_type dict in param_type {} should not be 0'.format(i + 1))
209 |                 for lower_type in _dict['scalar']:
210 |                     if lower_type == 'int':
211 |                         if not isinstance(_dict['scalar']['int'], tuple):
212 |                             raise ValueError('structure of lower_type in param_type {} must be {type: ( , )}}'
213 |                                              .format(i + 1))
214 |                         if len(_dict['scalar']['int']) != 2:
215 |                             raise ValueError("len of lower_type's structure in param_type {} must be 2".format(i + 1))
216 |                         if not isinstance(_dict['scalar']['int'][0], (int, NoneType)):
217 |                             raise ValueError("the first element in lower_type's structure in param_type {} "
218 |                                              "must be None, int or float".format(i + 1))
219 |                         if not isinstance(_dict['scalar']['int'][1], (int, NoneType)):
220 |                             raise ValueError("the second element in lower_type's structure in param_type {} "
221 |                                              "must be None, int or float".format(i + 1))
222 |                         if isinstance(_dict['scalar']['int'][0], int) and isinstance(_dict['scalar']['int'][1], int) \
223 |                                 and _dict['scalar']['int'][1] < _dict['scalar']['int'][0]:
224 |                             raise ValueError('the second element should ge the first element in param_type {}'
225 |                                              .format(i + 1))
226 | 
227 |                     elif lower_type == 'float':
228 |                         if not isinstance(_dict['scalar']['float'], tuple):
229 |                             raise ValueError('structure of lower_type in param_type {} must be {type: ( , )}}'
230 |                                              .format(i + 1))
231 |                         if len(_dict['scalar']['float']) != 2:
232 |                             raise ValueError("len of lower_type's structure in param_type {} must be 2".format(i + 1))
233 |                         if not isinstance(_dict['scalar']['float'][0], (float, int, NoneType)):
234 |                             raise ValueError("the first element in lower_type's structure in param_type {} "
235 |                                              "must be None, int or float".format(i + 1))
236 |                         if not isinstance(_dict['scalar']['float'][1], (float, int, NoneType)):
237 |                             raise ValueError("the second element in lower_type's structure in param_type {} "
238 |                                              "must be None, int or float".format(i + 1))
239 |                         if isinstance(_dict['scalar']['float'][0], (int, float)) and \
240 |                                 isinstance(_dict['scalar']['float'][1], (int, float)) \
241 |                                 and _dict['scalar']['float'][1] < _dict['scalar']['float'][0]:
242 |                             raise ValueError('the second element should ge the first element in param_type {}'
243 |                                              .format(i + 1))
244 |                     else:
245 |                         raise ValueError('key of scalar in param_type {} must be int or float'.format(i + 1))
246 |             else:
247 |                 raise ValueError('key of element in param_type {} must be vector or scalar'.format(i + 1))
248 | 
249 |     if not vector_flag:
250 |         raise ValueError('there is at least 1 vector in param_type {}'.format(i + 1))
251 | 
252 |     # Check output shape
253 |     # 生成测试数据
254 |     args = []
255 |     for _dict in param_type:
256 |         if 'vector' in _dict:
257 |             if 'number' in _dict['vector']:
258 |                 args.append(np.ones(10))
259 |             else:
260 |                 args.append(np.array([1] * 10))
261 |         elif 'scalar' in _dict:
262 |             if 'int' in _dict['scalar']:
263 |                 args.append(((0 if _dict['scalar']['int'][1] is None else _dict['scalar']['int'][1]) +
264 |                              (0 if _dict['scalar']['int'][0] is None else _dict['scalar']['int'][0])) // 2)
265 |             else:
266 |                 args.append(((0 if _dict['scalar']['float'][1] is None else _dict['scalar']['float'][1]) +
267 |                              (0 if _dict['scalar']['float'][0] is None else _dict['scalar']['float'][0])) // 2)
268 | 
269 |     try:
270 |         function(*args)
271 |     except (ValueError, TypeError):
272 |         print(args)
273 |         raise ValueError('supplied function %s does not support arity of %d.'
274 |                          % (name, arity))
275 |     if not hasattr(function(*args), 'shape'):
276 |         raise ValueError('supplied function %s does not return a numpy array.'
277 |                          % name)
278 |     if function(*args).shape != (10,):
279 |         raise ValueError('supplied function %s does not return same shape as '
280 |                          'input vectors.' % name)
281 |     if function(*args).dtype.type is np.float_ and return_type == 'category':
282 |         raise ValueError('the return type should be category not {}'.format(function(*args).dtype.type))
283 |     elif function(*args).dtype not in [np.float, np.int, np.int64] and return_type == 'number':
284 |         raise ValueError('the return type should be category not {}'.format(function(*args).dtype.type))
285 | 
286 |     # Check closure for zero & negative input arguments
287 |     args2 = []
288 |     args3 = []
289 |     for _dict in param_type:
290 |         if 'vector' in _dict:
291 |             # 兼容category向量
292 |             args2.append(np.zeros(10))
293 |             args3.append(-1 * np.ones(10))
294 |         elif 'scalar' in _dict:
295 |             if 'int' in _dict['scalar']:
296 | 
297 |                 _temp = (((0 if _dict['scalar']['int'][1] is None else _dict['scalar']['int'][1]) +
298 |                           (0 if _dict['scalar']['int'][0] is None else _dict['scalar']['int'][0])) // 2)
299 |                 args2.append(_temp)
300 |                 args3.append(_temp)
301 |             else:
302 |                 _temp = (((0 if _dict['scalar']['float'][1] is None else _dict['scalar']['float'][1]) +
303 |                           (0 if _dict['scalar']['float'][0] is None else _dict['scalar']['float'][0])) // 2)
304 |                 args2.append(_temp)
305 |                 args3.append(_temp)
306 | 
307 | 
308 |     if not np.all(np.isnan(function(*args2)) | np.isfinite(function(*args2))):
309 |         raise ValueError('supplied function %s does not have closure against '
310 |                          'zeros in argument vectors.' % name)
311 | 
312 |     if not np.all(np.isnan(function(*args3)) | np.isfinite(function(*args3))):
313 |         raise ValueError('supplied function %s does not have closure against '
314 |                          'negatives in argument vectors.' % name)
315 |     if wrap:
316 |         return _Function(function=wrap_non_picklable_objects(function),
317 |                          name=name,
318 |                          arity=arity,
319 |                          param_type=param_type,
320 |                          return_type=return_type,
321 |                          function_type=function_type)
322 |     return _Function(function=function,
323 |                      name=name,
324 |                      arity=arity,
325 |                      param_type=param_type,
326 |                      return_type=return_type,
327 |                      function_type=function_type)
328 | 
329 | 
def _protected_division(x1, x2):
    """Divide x1 by x2, yielding 1 wherever the denominator is close to zero.

    A denominator is treated as usable only when ``abs(x2) > 0.001``; every
    other position (including NaN denominators, for which the comparison is
    False) receives the constant 1.
    """
    with np.errstate(divide='ignore', invalid='ignore'):
        # The quotient is evaluated for every element, so the warnings caused
        # by the entries that are discarded afterwards must be silenced.
        quotient = np.divide(x1, x2)
        usable = np.abs(x2) > 0.001
        return np.where(usable, quotient, 1.)
334 | 
335 | 
def _protected_sqrt(x1):
    """Return the square root of the absolute value of x1.

    Taking the magnitude first keeps the result real for negative inputs.
    """
    magnitude = np.abs(x1)
    return np.sqrt(magnitude)
339 | 
340 | 
def _protected_log(x1):
    """Return log(|x1|), yielding 0 wherever |x1| is not above 0.001."""
    with np.errstate(divide='ignore', invalid='ignore'):
        # log is evaluated everywhere; the guard only selects which results
        # survive, so warnings from zero inputs are suppressed.
        magnitude = np.abs(x1)
        log_magnitude = np.log(magnitude)
        return np.where(magnitude > 0.001, log_magnitude, 0.)
345 | 
346 | 
def _protected_inverse(x1):
    """Return 1 / x1, yielding 0 wherever |x1| is not above 0.001."""
    with np.errstate(divide='ignore', invalid='ignore'):
        # The reciprocal is computed for every element; unusable entries are
        # replaced afterwards, hence the silenced warnings.
        reciprocal = 1. / x1
        usable = np.abs(x1) > 0.001
        return np.where(usable, reciprocal, 0.)
351 | 
352 | 
def _sigmoid(x1):
    """Map x1 through the logistic function 1 / (1 + exp(-x1)).

    Overflow and underflow warnings raised by the exponential are silenced.
    """
    with np.errstate(over='ignore', under='ignore'):
        decay = np.exp(-x1)
        return 1 / (1 + decay)
357 | 
def _groupby(gbx, func, *args, **kwargs):
    """Apply func to the rows of each group of gbx and restore the input order.

    Parameters
    ----------
    gbx : array-like, 1-D
        Group key of every row.
    func : callable
        Called once per group as ``func(*group_args, **kwargs)``; it must
        return one value per row of the group.
    *args : array-like, 1-D
        Data vectors aligned with gbx; each is sliced per group.
    **kwargs
        Forwarded unchanged to func.

    Returns
    -------
    numpy.ndarray
        The per-group results concatenated and put back in the original row
        order of gbx.
    """
    gbx = np.asarray(gbx)
    # A stable sort keeps the rows of a group in their original relative
    # order, which matters for order-sensitive group functions.
    order = np.argsort(gbx, kind='stable')
    keys_sorted = gbx[order]
    # The data vectors must follow the same permutation as the keys,
    # otherwise the positional split below cuts across groups.
    X = np.column_stack((order, keys_sorted,
                         *(np.asarray(arg)[order] for arg in args)))
    group_starts = np.unique(keys_sorted, return_index=True)[1][1:]
    chunks = np.split(X, group_starts)
    # Columns 0 and 1 (row index and key) are bookkeeping; data starts at 2.
    pieces = [func(*(chunk[:, 2:].T), **kwargs) for chunk in chunks]
    result = np.hstack(pieces)
    # argsort of the permutation is its inverse: sorted order -> input order.
    return result[order.argsort()]
366 | 
367 | 
# Built-in primitives wrapped as _Function objects.  The digit at the end of
# each variable name equals the arity passed to _Function.
add2 = _Function(function=np.add, name='add', arity=2)
sub2 = _Function(function=np.subtract, name='sub', arity=2)
mul2 = _Function(function=np.multiply, name='mul', arity=2)
div2 = _Function(function=_protected_division, name='div', arity=2)
sqrt1 = _Function(function=_protected_sqrt, name='sqrt', arity=1)
log1 = _Function(function=_protected_log, name='log', arity=1)
neg1 = _Function(function=np.negative, name='neg', arity=1)
inv1 = _Function(function=_protected_inverse, name='inv', arity=1)
abs1 = _Function(function=np.abs, name='abs', arity=1)
max2 = _Function(function=np.maximum, name='max', arity=2)
min2 = _Function(function=np.minimum, name='min', arity=2)
sin1 = _Function(function=np.sin, name='sin', arity=1)
cos1 = _Function(function=np.cos, name='cos', arity=1)
tan1 = _Function(function=np.tan, name='tan', arity=1)
sig1 = _Function(function=_sigmoid, name='sig', arity=1)
383 | 
# Lookup table from a primitive's string name to its _Function object.
# Note that sig1 ('sig') is defined above but has no entry here.
_function_map = {'add': add2,
                 'sub': sub2,
                 'mul': mul2,
                 'div': div2,
                 'sqrt': sqrt1,
                 'log': log1,
                 'abs': abs1,
                 'neg': neg1,
                 'inv': inv1,
                 'max': max2,
                 'min': min2,
                 'sin': sin1,
                 'cos': cos1,
                 'tan': tan1}
398 | 
# Names of the built-in primitives, one entry per key of _function_map.
# Each name appears exactly once so that no primitive is listed twice.
raw_function_list = ['add', 'sub', 'mul', 'div', 'sqrt',
                     'log', 'abs', 'neg', 'inv', 'max',
                     'min', 'sin', 'cos', 'tan']

# Independent copy, so that changes to one list do not affect the other.
all_function = raw_function_list.copy()

# Initially empty name lists for the section and time-series categories.
section_function = []

time_series_function = []
408 | 
# Ad-hoc manual check, executed only when this module is run as a script.
if __name__ == '__main__':
    # Disabled example of building a custom primitive with make_function:
    # def ff(a, b, c):
    #     return a * b + c
    #
    # param_type = [{'vector':{'number': (None, None)}}, {'scalar': {'int':(None, 1)}}, {'scalar': {'float': (-1, None)}}]
    # f_m = make_function(function=ff, name='ff', arity=3, param_type=param_type, wrap=True, return_type='number')
    # f_m.add_range((-1, 1))
    # print(f_m.param_type)
    # Group b by the keys in a (which contain a NaN) and print the result.
    # NOTE(review): the builtin max returns one scalar per group, whereas
    # _groupby indexes the stacked result with one position per input row -
    # confirm this demo is still expected to run.
    a = np.array([1, 2, 2, 1, np.nan])
    b = np.array([1, 2, 3, 4, 5])
    print(_groupby(a, max, b))
420 | 
421 | 


--------------------------------------------------------------------------------
/example.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | """
  3 | -------------------------------------------------
  4 | # @Project  :gplearn
  5 | # @File     :example.py
  6 | # @Date     :2023/3/31 0013 17:37
  7 | # @Author   :Junzhe Huang
  8 | # @Email    :acejasonhuang@163.com
  9 | # @Software :PyCharm
 10 | -------------------------------------------------
 11 | """
 12 | #####
 13 | # 目录
 14 | # 1. ALL FUNCTION 全局函数
 15 | # 2. TIME SERIES FUNCTION 一般时间序列函数
 16 | # 3. TA FUNCTION 技术指标函数
 17 | # 4. SECTION FUNCTION 截面函数
 18 | # 5. SECTION GROUPBY FUNCTION 截面分类聚合函数
 19 | #
 20 | #
 21 | ###
 22 | import numpy as np
 23 | from typing import Any
 24 | import numba as nb
 25 | from copy import copy
 26 | from numba import jit
 27 | from gplearnplus import functions
 28 | from functools import wraps
 29 | from functions import _groupby
 30 | 
 31 | 
 32 | def no_numpy_warning(func):
 33 |     @wraps(func)
 34 |     def warp(*args, **kwargs):
 35 |         with np.errstate(all='ignore'):
 36 |             _res = func(*args, **kwargs)
 37 |             return _res
 38 |     return warp
 39 | 
 40 | @nb.jit(nopython=True)
 41 | def handle_nan(X):
 42 |     # 前值填充
 43 |     X = np.copy(X)
 44 |     _temp = np.nan
 45 |     na_len = 0
 46 |     for i in range(len(X)):
 47 |         if np.isnan(X[i]):
 48 |             X[i] = _temp
 49 |             na_len += 1
 50 |         else:
 51 |             _temp = X[i]
 52 |     return X, na_len
 53 | 
 54 | #### ALL FUNCTION #####
 55 | 
 56 | @jit(nopython=True)
 57 | def _combine(X, Y):
 58 |     p1 = 15485863
 59 |     p2 = 32416190071
 60 |     p3 = 100000007
 61 |     return np.mod(X * p1 + Y * p2, p3)
 62 | 
# Wrap _combine as the primitive 'combine': two category vectors in, a
# category vector out.
combine = functions.make_function(function=_combine, name='combine', arity=2, return_type='category',
                                  param_type=[{'vector': {'category': (None, None)}},
                                              {'vector': {'category': (None, None)}}])
 66 | 
 67 | #### TIME SERIES FUNCTION #####
 68 | 
 69 | @jit(nopython=True)
 70 | def _delay(X, d):
 71 |     res = np.empty_like(X)
 72 |     res.fill(np.nan)
 73 |     end = len(X) - d
 74 |     for i in range(d, len(X)):
 75 |         res[i] = X[i - d]
 76 |     return res
 77 | 
# Wrap _delay as the time-series primitive 'delay': a numeric vector plus an
# integer scalar bounded by (3, 30).
delay = functions.make_function(function=_delay, name='delay', arity=2, function_type='time_series',
                                param_type=[{'vector': {'number': (None, None)}},
                                            {'scalar': {'int':(3, 30)}}])
 81 | 
 82 | @jit(nopython=True)
 83 | def _delta(X, d):
 84 |     res = np.empty_like(X)
 85 |     res.fill(np.nan)
 86 |     end = len(X) - d
 87 |     for i in range(d, len(X)):
 88 |         res[i] = X[i] - X[i - d]
 89 |     return res
 90 | 
# Wrap _delta as the time-series primitive 'delta': a numeric vector plus an
# integer scalar bounded by (3, 30).
delta = functions.make_function(function=_delta, name='delta', arity=2, function_type='time_series',
                                param_type=[{'vector': {'number': (None, None)}},
                                            {'scalar': {'int':(3, 30)}}])
 94 | @jit(nopython=True)
 95 | def _ts_min(X, d):
 96 |     d = len(X) - 1 if d >= len(X) else d
 97 |     shape = (X.size - d + 1, d)
 98 |     res = np.empty(X.size, dtype=X.dtype)
 99 |     res.fill(np.nan)
100 |     for i in range(len(X) - d + 1):
101 |         res[i + d - 1] = np.nanmin(X[i:i + d])
102 |     return res
103 | 
# Wrap _ts_min as the time-series primitive 'ts_min': a numeric vector plus
# an integer scalar bounded by (3, 30).
ts_min = functions.make_function(function=_ts_min, name='ts_min', arity=2, function_type='time_series',
                                 param_type=[{'vector': {'number': (None, None)}}, {'scalar': {'int':(3, 30)}}])
106 | 
@jit(nopython=True)
def _ts_max(X, d):
    """Rolling maximum of X over windows of d elements, ignoring NaNs.

    The window is clamped to ``len(X) - 1`` when d is not smaller than the
    series.  Each result is stored at the last index of its window; earlier
    positions keep the NaN fill.  The output uses the dtype of X.
    """
    n = len(X)
    window = n - 1 if d >= n else d
    out = np.empty(X.size, dtype=X.dtype)
    out.fill(np.nan)
    for start in range(n - window + 1):
        stop = start + window
        out[stop - 1] = np.nanmax(X[start:stop])
    return out
116 | 
# Wrap _ts_max as the time-series primitive 'ts_max': a numeric vector plus
# an integer scalar bounded by (3, 30).
ts_max = functions.make_function(function=_ts_max, name='ts_max', arity=2, function_type='time_series',
                                 param_type=[{'vector': {'number': (None, None)}}, {'scalar': {'int':(3, 30)}}])
119 | 
@jit(nopython=True)
def _ts_argmax(X, d):
    """Position of the maximum inside each rolling window of d elements.

    The window is clamped to ``len(X) - 1`` when d is not smaller than the
    series.  The value stored at the last index of a window is the offset
    (0-based, from the window start) returned by np.argmax; the positions
    before the first full window are NaN.  The output is float64.
    """
    n = len(X)
    window = n - 1 if d >= n else d
    out = np.empty(n, dtype=np.float64)
    out[:window - 1] = np.nan
    for start in range(n - window + 1):
        stop = start + window
        out[stop - 1] = np.argmax(X[start:stop])
    return out
128 | 
# Wrap _ts_argmax as the time-series primitive 'ts_argmax': a numeric vector
# plus an integer scalar bounded by (3, 30).
ts_argmax = functions.make_function(function=_ts_argmax, name='ts_argmax', arity=2, function_type='time_series',
                                    param_type=[{'vector': {'number': (None, None)}}, {'scalar': {'int':(3, 30)}}])
131 | 
@jit(nopython=True)
def _ts_argmin(X, d):
    """Position of the minimum inside each rolling window of d elements.

    The window is clamped to ``len(X) - 1`` when d is not smaller than the
    series.  The value stored at the last index of a window is the offset
    (0-based, from the window start) of the window's minimum; positions
    before the first full window are NaN.
    """
    n = len(X)
    d = n - 1 if d >= n else d
    res = np.full(n, np.nan)
    for i in range(n - d + 1):
        # argmin, not argmax: this primitive locates the window minimum.
        res[i + d - 1] = np.argmin(X[i:i + d])
    return res
# Wrap _ts_argmin as the time-series primitive 'ts_argmin': a numeric vector
# plus an integer scalar bounded by (3, 30).  The name must differ from
# 'ts_argmax' so the two primitives can be told apart by name.
ts_argmin = functions.make_function(function=_ts_argmin, name='ts_argmin', arity=2, function_type='time_series',
                                    param_type=[{'vector': {'number': (None, None)}},
                                                {'scalar': {'int':(3, 30)}}])
143 | 
@jit(nopython=True)
def _ts_rank(X, d):
    """Rolling rank of the newest value within its window, scaled by d.

    For each window of d elements (clamped to ``len(X) - 1`` when d is not
    smaller than the series) the 1-based rank of the last element is divided
    by the window length and stored at the window's last index.  Positions
    before the first full window are NaN.
    """
    n = len(X)
    window = n - 1 if d >= n else d
    out = np.full(n, np.nan)
    for start in range(n - window + 1):
        order = np.argsort(X[start:start + window])
        # argsort of an argsort turns sort positions into ranks.
        ranks = np.argsort(order)
        newest_rank = ranks[-1] + 1
        out[start + window - 1] = newest_rank / window
    return out
153 | 
# Wrap _ts_rank as the time-series primitive 'ts_rank': a numeric vector plus
# an integer scalar bounded by (3, 30).
ts_rank = functions.make_function(function=_ts_rank, name='ts_rank', arity=2, function_type='time_series',
                                  param_type=[{'vector': {'number': (None, None)}}, {'scalar': {'int':(3, 30)}}])
156 | 
@jit(nopython=True)
def _ts_sum(X, d):
    """Rolling sum of X over windows of d elements, treating NaN as zero.

    The window is clamped to ``len(X) - 1`` when d is not smaller than the
    series.  Each sum is stored at the last index of its window; positions
    before the first full window are NaN.  If the clamped window is shorter
    than one element the result is all NaN.
    """
    n = len(X)
    d = n - 1 if d >= n else d
    res = np.full(n, np.nan)
    if d < 1:
        return res
    cumsum = np.nancumsum(X)
    # The first full window starts at index 0, so its sum is the running
    # total itself; there is no earlier total to subtract.
    res[d - 1] = cumsum[d - 1]
    # Later windows: running total minus the total just before the window.
    # Both slices hold exactly n - d elements.
    res[d:] = cumsum[d:] - cumsum[:n - d]
    return res
165 | 
# Wrap _ts_sum as the time-series primitive 'ts_sum': a numeric vector plus
# an integer scalar bounded by (3, 30).
ts_sum = functions.make_function(function=_ts_sum, name='ts_sum', arity=2, function_type='time_series',
                                 param_type=[{'vector': {'number': (None, None)}}, {'scalar': {'int':(3, 30)}}])
168 | 
@jit(nopython=True)
def _ts_stddev(X, d):
    """Rolling standard deviation (np.nanstd) of X over windows of d elements.

    The window is clamped to ``len(X) - 1`` when d is not smaller than the
    series.  Each value is stored at the last index of its window; positions
    before the first full window are NaN.
    """
    n = len(X)
    window = n - 1 if d >= n else d
    out = np.full(n, np.nan)
    for end in range(window - 1, n):
        begin = end - window + 1
        out[end] = np.nanstd(X[begin:end + 1])
    return out
177 | 
# Wrap _ts_stddev as the time-series primitive 'ts_stddev': a numeric vector
# plus an integer scalar bounded by (3, 30).
ts_stddev = functions.make_function(function=_ts_stddev, name='ts_stddev', arity=2, function_type='time_series',
                                    param_type=[{'vector': {'number': (None, None)}}, {'scalar': {'int':(3, 30)}}])
180 | 
@jit(nopython=True)
def _ts_corr(X, Y, d):
    """Rolling correlation coefficient of X and Y over windows of d elements.

    The window is clamped to ``len(X) - 1`` when d is not smaller than the
    series.  Inside each window, pairs in which either value is NaN are
    dropped; if two or fewer pairs remain the result is NaN.  Each value is
    stored at the last index of its window and positions before the first
    full window are NaN.
    """
    n = len(X)
    d = n - 1 if d >= n else d
    res = np.empty(n)
    res[:d - 1] = np.nan
    for i in range(n - d + 1):
        x_win = X[i:i + d]
        y_win = Y[i:i + d]
        # Build the mask once from the unfiltered windows so that both
        # series drop exactly the same positions.
        keep = ~(np.isnan(x_win) | np.isnan(y_win))
        x_ok = x_win[keep]
        y_ok = y_win[keep]
        if len(x_ok) <= 2:
            res[i + d - 1] = np.nan
        else:
            res[i + d - 1] = np.corrcoef(x_ok, y_ok)[0][1]
    return res
196 | 
# Wrap _ts_corr as the time-series primitive 'ts_corr': two numeric vectors
# plus an integer scalar bounded by (3, 30).
ts_corr = functions.make_function(function=_ts_corr, name='ts_corr', arity=3, function_type='time_series',
                                  param_type=[{'vector': {'number': (None, None)}},
                                              {'vector': {'number': (None, None)}},
                                              {'scalar': {'int':(3, 30)}}])
201 | 
@jit(nopython=True)
def _ts_mean(X, d):
    """Rolling arithmetic mean of X over windows of d elements.

    The window is clamped to ``len(X) - 1`` when d is not smaller than the
    series.  Each mean is stored at the last index of its window; positions
    before the first full window are NaN.  A window containing NaN yields
    NaN for that window only.
    """
    n = len(X)
    d = n - 1 if d >= n else d
    res = np.full(n, np.nan)
    for i in range(d - 1, n):
        # Each window is averaged on its own: a running sum would need
        # X[i + 1] on the last step (one past the end) and would stay NaN
        # forever after the first NaN entered it.
        res[i] = np.mean(X[i - d + 1:i + 1])
    return res
211 | 
# Wrap _ts_mean as the time-series primitive 'ts_mean': a numeric vector plus
# an integer scalar bounded by (3, 30).
ts_mean = functions.make_function(function=_ts_mean, name='ts_mean', arity=2,
                                  function_type='time_series',
                                  param_type=[{'vector': {'number': (None, None)}},
                                              {'scalar': {'int':(3, 30)}}])
216 | 
@jit(nopython=True)
def _ts_neutralize(X, d):
    """Rolling z-score: (X[i] - window mean) / window standard deviation.

    The window of d elements (clamped to ``len(X) - 1`` when d is not smaller
    than the series) ends at index i.  The standard deviation is the
    population form and is floored at 0.001 to avoid dividing by a value
    near zero.  Positions before the first full window are NaN.
    """
    N = len(X)
    window = N - 1 if d >= N else d
    n_windows = N - window + 1
    mov_mean = np.empty(n_windows)
    mov_std = np.empty(n_windows)
    res = np.empty(N)

    # Pass 1: mean and floored deviation of every window, keyed by its start.
    for w in nb.prange(n_windows):
        chunk = X[w:w + window]
        centre = np.mean(chunk)
        spread = np.sqrt(np.mean((chunk - centre) ** 2))
        mov_mean[w] = centre
        mov_std[w] = spread if spread > 0.001 else 0.001

    # Pass 2: standardise each point with the window that ends on it.
    offset = window - 1
    for i in nb.prange(N):
        if i < offset:
            res[i] = np.nan
        else:
            res[i] = (X[i] - mov_mean[i - offset]) / mov_std[i - offset]

    return res
237 | 
# Wrap _ts_neutralize as the time-series primitive 'ts_neutralize': a numeric
# vector plus an integer scalar bounded by (3, 30).
ts_neutralize = functions.make_function(function=_ts_neutralize, name='ts_neutralize', arity=2,
                                        function_type='time_series',
                                        param_type=[{'vector': {'number': (None, None)}},
                                                    {'scalar': {'int':(3, 30)}}])
242 | 
@nb.jit(nopython=True)
def _ts_freq(X, d):
    """Count how often the current value occurs in its trailing window.

    For every index i the window of d elements ending at i (clamped to
    ``len(X) - 1`` when d is not smaller than the series) is compared with
    X[i]; the number of equal entries, including X[i] itself, is stored at
    i.  Positions before the first full window are NaN.  Output is float64.
    """
    n = len(X)
    window = n - 1 if d >= n else d
    out = np.empty(n, dtype=np.float64)
    out[:window - 1] = np.nan
    for i in range(window - 1, n):
        recent = X[i - window + 1:i + 1]
        matches = recent == X[i]
        out[i] = sum(matches)
    return out
252 | 
# Wrap _ts_freq as the time-series primitive 'ts_freq': a category vector
# plus an integer scalar bounded by (3, 30).
ts_freq = functions.make_function(function=_ts_freq, name='ts_freq', arity=2,
                                  function_type='time_series',
                                  param_type=[{'vector': {'category': (None, None)}},
                                              {'scalar': {'int':(3, 30)}}])
257 | 
258 | #### TIME SERIES TA FUNCTION ####
259 | 
@nb.jit(nopython=True)
def _EMA(X, d):
    """Exponential moving average of X with span d.

    The span is clamped to ``len(X) - 1`` when d is not smaller than the
    series.  X is passed through handle_nan and the first ``lead`` entries
    it reports are skipped.  The average is seeded with the plain mean of
    the first d remaining values and then updated with the smoothing factor
    ``2 / (d + 1)``.  The output has the length of X; positions before the
    seed are NaN, and the whole output is NaN when fewer than d values
    remain.
    """
    span = len(X) - 1 if d >= len(X) else d
    filled, lead = handle_nan(X)
    series = filled[lead:]
    total = len(series) + lead
    if len(series) < span:
        return np.full(total, np.nan)
    alpha = 2 / (span + 1)
    average = np.mean(series[:span])
    out = np.full(total, np.nan)
    # Seed: simple mean, placed at the end of the first full window.
    out[lead + span - 1] = average
    for i in range(span, len(series)):
        average += (series[i] - average) * alpha
        out[lead + i] = average
    return out
275 | 
# Wrap _EMA as the time-series primitive 'EMA': a numeric vector plus an
# integer scalar bounded by (3, 30).
EMA = functions.make_function(function=_EMA, name='EMA', arity=2, function_type='time_series',
                              param_type=[{'vector': {'number': (None, None)}}, {'scalar': {'int':(3, 30)}}])
278 | 
@jit(nopython=True)
def _DEMA(X, d):
    """Double exponential moving average: ``2 * EMA(X) - EMA(EMA(X))``.

    When the series is not longer than ``2 * d - 2`` the span is reduced to
    ``len(X) // 2 - 1`` before both EMA passes.
    """
    if len(X) <= 2 * d - 2:
        d = len(X) // 2 - 1
    first_pass = _EMA(X, d)
    second_pass = _EMA(first_pass, d)
    return 2 * first_pass - second_pass
286 | 
# Wrap _DEMA as the time-series primitive 'DEMA': a numeric vector plus an
# integer scalar bounded by (3, 30).
DEMA = functions.make_function(function=_DEMA, name='DEMA', arity=2, function_type='time_series',
                               param_type=[{'vector': {'number': (None, None)}}, {'scalar': {'int':(3, 30)}}])
289 | 
@jit(nopython=True)
def _MA(X, d):
    """Simple moving average of X over windows of d elements.

    The window is clamped to ``len(X) - 1`` when d is not smaller than the
    series.  X is passed through handle_nan and the first ``lead`` entries
    it reports are skipped.  Each mean is stored at the last index of its
    window; the output has the length of X, earlier positions are NaN, and
    the whole output is NaN when fewer than d values remain.
    """
    window = len(X) - 1 if d >= len(X) else d
    filled, lead = handle_nan(X)
    series = filled[lead:]
    out = np.full(len(series) + lead, np.nan)
    if len(series) < window:
        return out
    first_slot = lead + window - 1
    for start in range(len(series) - window + 1):
        out[first_slot + start] = np.mean(series[start:start + window])
    return out
299 | 
# Wrap _MA as the time-series primitive 'MA': a numeric vector plus an
# integer scalar bounded by (3, 30).
MA = functions.make_function(function=_MA, name='MA', arity=2, function_type='time_series',
                             param_type=[{'vector': {'number': (None, None)}}, {'scalar': {'int':(3, 30)}}])
302 | 
@jit(nopython=True)
def _KAMA(X, d):
    """Adaptive moving average whose smoothing follows an efficiency ratio.

    The look-back d is clamped to ``len(X) - 1`` when d is not smaller than
    the series.  X is passed through handle_nan and the first ``lead``
    entries it reports are skipped.  For each index the efficiency ratio is
    the net change over d steps divided by the summed absolute one-step
    changes (taken as 1 when the path length is zero or not larger than the
    net change).  It is blended between the constants 2/3 and 2/31 and
    squared to give the smoothing weight.  The output has the length of X;
    positions before index d of the remaining series are NaN.
    """
    lookback = len(X) - 1 if d >= len(X) else d
    filled, lead = handle_nan(X)
    series = filled[lead:]
    total = len(series) + lead
    if len(series) < lookback:
        return np.full(total, np.nan)
    fast = 2 / (2 + 1)
    slow = 2 / (30 + 1)
    out = np.full(total, np.nan)
    for i in range(lookback, len(series)):
        net_change = series[i] - series[i - lookback]
        path_length = np.sum(np.abs(np.diff(series[i - lookback: i + 1])))
        if (net_change >= path_length) or (path_length == 0):
            efficiency = 1.0
        else:
            efficiency = abs(net_change / path_length)
        weight = (efficiency * (fast - slow) + slow) ** 2
        # The first output has no previous average; the prior raw value
        # stands in for it.
        if i != lookback:
            previous = out[lead + i - 1]
        else:
            previous = series[i - 1]
        out[lead + i] = weight * series[i] + (1 - weight) * previous
    return out
320 | 
# Wrap _KAMA as the time-series primitive 'KAMA': a numeric vector plus an
# integer scalar bounded by (3, 30).
KAMA = functions.make_function(function=_KAMA, name='KAMA', arity=2, function_type='time_series',
                               param_type=[{'vector': {'number': (None, None)}}, {'scalar': {'int':(3, 30)}}])
323 | 
@nb.jit(nopython=True)
def _MIDPOINT(X, d):
    """Rolling midpoint: average of the window's maximum and minimum.

    Extremes ignore NaNs (np.nanmax / np.nanmin).  The window of d elements
    is clamped to ``len(X) - 1`` when d is not smaller than the series.
    Each value is stored at the last index of its window; positions before
    the first full window are NaN.
    """
    n = len(X)
    window = n - 1 if d >= n else d
    out = np.full(n, np.nan)
    for end in range(window - 1, n):
        recent = X[end - window + 1:end + 1]
        out[end] = (np.nanmax(recent) + np.nanmin(recent)) / 2
    return out
332 | 
# Wrap _MIDPOINT as the time-series primitive 'MIDPOINT': a numeric vector
# plus an integer scalar bounded by (3, 30).
MIDPOINT = functions.make_function(function=_MIDPOINT, name='MIDPOINT', arity=2, function_type='time_series',
                                   param_type=[{'vector': {'number': (None, None)}}, {'scalar': {'int':(3, 30)}}])
335 | 
@nb.jit(nopython=True)
def _BETA(X, Y, d):
    """Rolling regression coefficient: sum(dx * dy) / sum(dx ** 2).

    dx and dy are the deviations of X and Y from their window means.  The
    denominator is floored at 0.001.  The window of d elements is clamped to
    ``len(X) - 1`` when d is not smaller than the series.  Each value is
    stored at the last index of its window; positions before the first full
    window are NaN.
    """
    n = len(X)
    window = n - 1 if d >= n else d
    out = np.full(n, np.nan)
    for end in range(window - 1, n):
        x_win = X[end - window + 1: end + 1]
        y_win = Y[end - window + 1: end + 1]
        x_dev = x_win - np.mean(x_win)
        y_dev = y_win - np.mean(y_win)
        cross = np.sum(x_dev * y_dev)
        x_sq = np.sum(x_dev ** 2)
        if x_sq > 0.001:
            scale = x_sq
        else:
            scale = 0.001
        out[end] = cross / scale
    return out
350 | 
# Wrap _BETA as the time-series primitive 'BETA': two numeric vectors plus an
# integer scalar bounded by (3, 30).
BETA = functions.make_function(function=_BETA, name='BETA', arity=3, function_type='time_series',
                               param_type=[{'vector': {'number': (None, None)}},
                                           {'vector': {'number': (None, None)}},
                                           {'scalar': {'int':(3, 30)}}])
355 | 
@nb.jit(nopython=True)
def _LINEARREG_SLOPE(X, d):
    """Rolling least-squares slope of X regressed on the time index.

    Within each window of d elements (clamped to ``len(X) - 1`` when d is
    not smaller than the series) X is fitted against t = 0, 1, ..., d - 1
    and the slope ``sum(dx * dt) / sum(dt ** 2)`` is stored at the window's
    last index.  Positions before the first full window are NaN.
    """
    n = len(X)
    d = n - 1 if d >= n else d
    res = np.full(n, np.nan)
    # The regressor is the time index, so its deviations and their sum of
    # squares are identical for every window and can be computed once.
    t = np.arange(d)
    t_dev = t - np.mean(t)
    t_ss = np.sum(t_dev ** 2)
    # Guard for degenerate windows (fewer than two points).
    t_ss = t_ss if t_ss > 0.001 else 0.001
    for i in range(d - 1, n):
        X_slice = X[i - d + 1: i + 1]
        numerator = np.sum((X_slice - np.mean(X_slice)) * t_dev)
        res[i] = numerator / t_ss
    return res
371 | 
# Wrap _LINEARREG_SLOPE as the time-series primitive 'LINEARREG_SLOPE': a
# numeric vector plus an integer scalar bounded by (3, 30).
LINEARREG_SLOPE = functions.make_function(function=_LINEARREG_SLOPE, name='LINEARREG_SLOPE', arity=2,
                                          function_type='time_series',
                                          param_type=[{'vector': {'number': (None, None)}},
                                                      {'scalar': {'int':(3, 30)}}])
376 | 
@nb.jit(nopython=True)
def _LINEARREG_ANGLE(X, d):
    """Rolling angle, in degrees, of the least-squares line of X over time.

    Within each window of d elements (clamped to ``len(X) - 1`` when d is
    not smaller than the series) X is fitted against t = 0, 1, ..., d - 1;
    the arctangent of the slope, converted to degrees, is stored at the
    window's last index.  Positions before the first full window are NaN.
    """
    n = len(X)
    d = n - 1 if d >= n else d
    res = np.full(n, np.nan)
    # The regressor is the time index, so its deviations and their sum of
    # squares are identical for every window and can be computed once.
    t = np.arange(d)
    t_dev = t - np.mean(t)
    t_ss = np.sum(t_dev ** 2)
    # Guard for degenerate windows (fewer than two points).
    t_ss = t_ss if t_ss > 0.001 else 0.001
    for i in range(d - 1, n):
        X_slice = X[i - d + 1: i + 1]
        numerator = np.sum((X_slice - np.mean(X_slice)) * t_dev)
        res[i] = np.arctan(numerator / t_ss) * (180.0 / np.pi)
    return res
392 | 
# Wrap _LINEARREG_ANGLE as the time-series primitive 'LINEARREG_ANGLE': a
# numeric vector plus an integer scalar bounded by (3, 30).
LINEARREG_ANGLE = functions.make_function(function=_LINEARREG_ANGLE, name='LINEARREG_ANGLE', arity=2,
                                          function_type='time_series',
                                          param_type=[{'vector': {'number': (None, None)}},
                                                      {'scalar': {'int':(3, 30)}}])
397 | 
@nb.jit(nopython=True)
def _LINEARREG_INTERCEPT(X, d):
    """Rolling intercept of the least-squares line of X over time.

    Within each window of d elements (clamped to ``len(X) - 1`` when d is
    not smaller than the series) X is fitted against t = 0, 1, ..., d - 1;
    the intercept ``mean(X) - slope * mean(t)`` (the fitted value at the
    window's first point) is stored at the window's last index.  Positions
    before the first full window are NaN.
    """
    n = len(X)
    d = n - 1 if d >= n else d
    res = np.full(n, np.nan)
    # The regressor is the time index, so its mean, deviations and sum of
    # squares are identical for every window and can be computed once.
    t = np.arange(d)
    t_mean = np.mean(t)
    t_dev = t - t_mean
    t_ss = np.sum(t_dev ** 2)
    # Guard for degenerate windows (fewer than two points).
    t_ss = t_ss if t_ss > 0.001 else 0.001
    for i in range(d - 1, n):
        X_slice = X[i - d + 1: i + 1]
        X_mean = np.mean(X_slice)
        # The intercept uses the raw slope, not the angle in degrees.
        slope = np.sum((X_slice - X_mean) * t_dev) / t_ss
        res[i] = X_mean - slope * t_mean
    return res
414 | 
# Wrap _LINEARREG_INTERCEPT as the time-series primitive
# 'LINEARREG_INTERCEPT': a numeric vector plus an integer scalar bounded by
# (3, 30).
LINEARREG_INTERCEPT = functions.make_function(function=_LINEARREG_INTERCEPT, name='LINEARREG_INTERCEPT',
                                              arity=2, function_type='time_series',
                                              param_type=[{'vector': {'number': (None, None)}},
                                                          {'scalar': {'int':(3, 30)}}])
419 | 
420 | #### SECTION FUNCTION ####
421 | 
@nb.jit(nopython=True)
def _MAX_SECTION(X: np.ndarray) -> np.ndarray:
    """Return an array shaped like X with every entry set to the maximum of X."""
    peak = np.max(X)
    return np.full_like(X, peak)
425 | 
# Wrap _MAX_SECTION as the section primitive 'sec_max' taking one numeric
# vector.
sec_max = functions.make_function(function=_MAX_SECTION, name='sec_max', arity=1, function_type='section',
                                  param_type=[{'vector': {'number': (None, None)}}])
428 | 
@nb.jit(nopython=True)
def _MIN_SECTION(X):
    """Return an array shaped like X with every entry set to the minimum of X."""
    trough = np.min(X)
    return np.full_like(X, trough)
432 | 
# Wrap _MIN_SECTION as the section primitive 'sec_min' taking one numeric
# vector.
sec_min = functions.make_function(function=_MIN_SECTION, name='sec_min', arity=1, function_type='section',
                                  param_type=[{'vector': {'number': (None, None)}}])
435 | 
@nb.jit(nopython=True)
def _MEAN_SECTION(X):
    """Return an array shaped like X with every entry set to the mean of X."""
    average = np.mean(X)
    return np.full_like(X, average)
439 | 
# Wrap _MEAN_SECTION as the section primitive 'sec_mean' taking one numeric
# vector.
sec_mean = functions.make_function(function=_MEAN_SECTION, name='sec_mean', arity=1, function_type='section',
                                   param_type=[{'vector': {'number': (None, None)}}])
442 | 
@nb.jit(nopython=True)
def _MEDIAN_SECTION(X):
    """Return an array shaped like X with every entry set to the median of X."""
    middle = np.median(X)
    return np.full_like(X, middle)
446 | 
# Wrap _MEDIAN_SECTION as the section primitive 'sec_median' taking one
# numeric vector.
sec_median = functions.make_function(function=_MEDIAN_SECTION, name='sec_median', arity=1, function_type='section',
                                     param_type=[{'vector': {'number': (None, None)}}])
449 | 
@nb.jit(nopython=True)
def _STD_SECTION(X):
    """Return an array shaped like X with every entry set to np.std of X."""
    spread = np.std(X)
    return np.full_like(X, spread)
453 | 
# Wrap _STD_SECTION as the section primitive 'sec_std' taking one numeric
# vector.
sec_std = functions.make_function(function=_STD_SECTION, name='sec_std', arity=1, function_type='section',
                                  param_type=[{'vector': {'number': (None, None)}}])
456 | 
@nb.jit(nopython=True)
def _RANK_SECTION(X):
    """Return the 0-based ascending rank of every element of X.

    The smallest element receives 0 and the largest ``len(X) - 1``; the
    output is an integer array (the dtype of np.argsort's result).
    """
    order = np.argsort(X)
    ranks = np.empty_like(order)
    # order[k] is the index of the k-th smallest element, so that element's
    # rank is k.
    for position, source_index in enumerate(order):
        ranks[source_index] = position
    return ranks
464 | 
# Wrap _RANK_SECTION as the section primitive 'sec_rank' taking one numeric
# vector.
sec_rank = functions.make_function(function=_RANK_SECTION, name='sec_rank', arity=1, function_type='section',
                                   param_type=[{'vector': {'number': (None, None)}}])
467 | 
@nb.jit(nopython=True)
def _NEUTRALIZE_SECTION(X):
    """Standardise X: subtract its mean and divide by its standard deviation.

    A standard deviation of 0.001 or less is replaced by 0.001 so that a
    (nearly) constant input does not cause a division by zero.
    """
    scale = np.std(X)
    if scale <= 0.001:
        scale = 0.001
    centred = X - np.mean(X)
    return centred / np.repeat(scale, len(X))
475 | 
# Wrap _NEUTRALIZE_SECTION as the section primitive 'sec_neutralize' taking
# one numeric vector.
sec_neutralize = functions.make_function(function=_NEUTRALIZE_SECTION, name='sec_neutralize', arity=1,
                                         function_type='section', param_type=[{'vector': {'number': (None, None)}}])
478 | 
@no_numpy_warning
def _FREQ_SECTION(X):
    """Return, for every element of X, how many times its value occurs in X.

    Parameters
    ----------
    X : array-like
        Values to count (registered below as a category vector).

    Returns
    -------
    numpy.ndarray
        Integer array with the same shape as X; entry i is the number of
        elements of X equal to X[i].
    """
    values = np.asarray(X)
    # `inverse` maps each element to the index of its unique value, so one
    # fancy-indexing step replaces the per-element dict lookup. Unlike the
    # np.vectorize-based version this also works for empty input and does not
    # depend on hashing/equality of the individual elements.
    _, inverse, counts = np.unique(values, return_inverse=True, return_counts=True)
    return counts[inverse].reshape(values.shape)

# Register _FREQ_SECTION under the name 'freq'; its single argument is a category vector.
freq = functions.make_function(function=_FREQ_SECTION, name='freq', arity=1,
                               function_type='section', param_type=[{'vector': {'category': (None, None)}}])
488 | 
@no_numpy_warning
def _CUT_EQUAL_DISTANCE(X, d):
    """Assign every element of X to one of ``d`` equal-width bins.

    Parameters
    ----------
    X : array-like
        Numeric values to bin.
    d : int
        Requested number of bins; capped at ``len(X) - 1``.

    Returns
    -------
    numpy.ndarray
        The bin index of every element as returned by ``np.digitize``.

    Notes
    -----
    The d + 1 bin edges run from min(X) to slightly above max(X): the
    1.000001 factor pushes the last edge past the maximum, so the maximum is
    placed in the last bin rather than beyond it.
    NOTE(review): for a single-element X the cap makes d == 0 and the edge
    computation divides by zero -- confirm the intended result for that case.
    """
    d = len(X) - 1 if d >= len(X) - 1 else d
    # min/max do not depend on the edge index; compute them once instead of
    # once per edge.
    lowest = np.min(X)
    span = np.max(X) - lowest
    bins = [lowest + i * span * 1.000001 / d for i in range(d + 1)]
    return np.digitize(X, bins)

# Register under the name 'cut_eq_dist': a numeric vector plus an int scalar
# with range spec (2, 30); the result is declared as a category.
cut_equal_distance = functions.make_function(function=_CUT_EQUAL_DISTANCE, name='cut_eq_dist', arity=2,
                                             function_type='section', return_type='category',
                                             param_type=[{'vector': {'number': (None, None)}},
                                                         {'scalar': {'int': (2, 30)}}])
510 | 
@no_numpy_warning
def _CUT_EQUAL_AMOUNT(X, d):
    """Bin X by rank: rank the values with _RANK_SECTION, then cut the ranks
    into equal-width bins with _CUT_EQUAL_DISTANCE."""
    return _CUT_EQUAL_DISTANCE(_RANK_SECTION(X), d)

# Register under the name 'cut_eq_amt': a numeric vector plus an int scalar
# with range spec (2, 30); the result is declared as a category.
cut_equal_amount = functions.make_function(
    function=_CUT_EQUAL_AMOUNT,
    name='cut_eq_amt',
    arity=2,
    function_type='section',
    return_type='category',
    param_type=[
        {'vector': {'number': (None, None)}},
        {'scalar': {'int': (2, 30)}},
    ],
)
520 | 
@no_numpy_warning
def _GROUPBYTHENMAX(gbx, X):
    """Run _MAX_SECTION on X through _groupby, with gbx as the grouping argument.

    NOTE(review): _groupby is defined elsewhere; presumably it applies the
    function separately to the rows sharing each gbx value -- confirm.
    """
    grouped = _groupby(gbx, _MAX_SECTION, X)
    return grouped

# Register under the name 'gb_max': a category vector followed by a numeric vector.
groupby_max = functions.make_function(
    function=_GROUPBYTHENMAX,
    name='gb_max',
    arity=2,
    function_type='section',
    param_type=[
        {'vector': {'category': (None, None)}},
        {'vector': {'number': (None, None)}},
    ],
)
528 | 
@no_numpy_warning
def _GROUPBYTHENMIN(gbx, X):
    """Run _MIN_SECTION on X through _groupby, with gbx as the grouping argument.

    NOTE(review): _groupby is defined elsewhere; presumably it applies the
    function separately to the rows sharing each gbx value -- confirm.
    """
    grouped = _groupby(gbx, _MIN_SECTION, X)
    return grouped

# Register under the name 'gb_min': a category vector followed by a numeric vector.
groupby_min = functions.make_function(
    function=_GROUPBYTHENMIN,
    name='gb_min',
    arity=2,
    function_type='section',
    param_type=[
        {'vector': {'category': (None, None)}},
        {'vector': {'number': (None, None)}},
    ],
)
536 | 
@no_numpy_warning
def _GROUPBYTHENMEAN(gbx, X):
    """Run _MEAN_SECTION on X through _groupby, with gbx as the grouping argument.

    NOTE(review): _groupby is defined elsewhere; presumably it applies the
    function separately to the rows sharing each gbx value -- confirm.
    """
    grouped = _groupby(gbx, _MEAN_SECTION, X)
    return grouped
# Register under the name 'gb_mean': a category vector followed by a numeric vector.
groupby_mean = functions.make_function(
    function=_GROUPBYTHENMEAN,
    name='gb_mean',
    arity=2,
    function_type='section',
    param_type=[
        {'vector': {'category': (None, None)}},
        {'vector': {'number': (None, None)}},
    ],
)
543 | 
@no_numpy_warning
def _GROUPBYTHENMEDIAN(gbx, X):
    """Run _MEDIAN_SECTION on X through _groupby, with gbx as the grouping argument.

    NOTE(review): _groupby is defined elsewhere; presumably it applies the
    function separately to the rows sharing each gbx value -- confirm.
    """
    grouped = _groupby(gbx, _MEDIAN_SECTION, X)
    return grouped
# Register under the name 'gb_median': a category vector followed by a numeric vector.
groupby_median = functions.make_function(
    function=_GROUPBYTHENMEDIAN,
    name='gb_median',
    arity=2,
    function_type='section',
    param_type=[
        {'vector': {'category': (None, None)}},
        {'vector': {'number': (None, None)}},
    ],
)
551 | 
@no_numpy_warning
def _GROUPBYTHENSTD(gbx, X):
    """Run _STD_SECTION on X through _groupby, with gbx as the grouping argument.

    NOTE(review): _groupby is defined elsewhere; presumably it applies the
    function separately to the rows sharing each gbx value -- confirm.
    """
    grouped = _groupby(gbx, _STD_SECTION, X)
    return grouped
# Register under the name 'gb_std': a category vector followed by a numeric vector.
groupby_std = functions.make_function(
    function=_GROUPBYTHENSTD,
    name='gb_std',
    arity=2,
    function_type='section',
    param_type=[
        {'vector': {'category': (None, None)}},
        {'vector': {'number': (None, None)}},
    ],
)
558 | 
@no_numpy_warning
def _GROUPBYTHENRANK(gbx, X):
    """Run _RANK_SECTION on X through _groupby, with gbx as the grouping argument.

    NOTE(review): _groupby is defined elsewhere; presumably it applies the
    function separately to the rows sharing each gbx value -- confirm.
    """
    grouped = _groupby(gbx, _RANK_SECTION, X)
    return grouped
# Register under the name 'gb_rank': a category vector followed by a numeric vector.
groupby_rank = functions.make_function(
    function=_GROUPBYTHENRANK,
    name='gb_rank',
    arity=2,
    function_type='section',
    param_type=[
        {'vector': {'category': (None, None)}},
        {'vector': {'number': (None, None)}},
    ],
)
565 | 
@no_numpy_warning
def _GROUPBYTHENNEUTRALIZE(gbx, X):
    """Run _NEUTRALIZE_SECTION on X through _groupby, with gbx as the grouping argument.

    NOTE(review): _groupby is defined elsewhere; presumably it applies the
    function separately to the rows sharing each gbx value -- confirm.
    """
    grouped = _groupby(gbx, _NEUTRALIZE_SECTION, X)
    return grouped
# Register under the name 'gb_neu': a category vector followed by a numeric vector.
groupby_neutralize = functions.make_function(
    function=_GROUPBYTHENNEUTRALIZE,
    name='gb_neu',
    arity=2,
    function_type='section',
    param_type=[
        {'vector': {'category': (None, None)}},
        {'vector': {'number': (None, None)}},
    ],
)
573 | 
@no_numpy_warning
def _GROUPBYTHEN_CUT_EQ_DIST(gbx, X, d):
    """Run _CUT_EQUAL_DISTANCE on X through _groupby, forwarding d as a keyword.

    NOTE(review): _groupby is defined elsewhere; presumably it applies the
    function separately to the rows sharing each gbx value -- confirm.
    """
    grouped = _groupby(gbx, _CUT_EQUAL_DISTANCE, X, d=d)
    return grouped
# Register under the name 'gb_cut_eq_dist': a category vector, a numeric vector
# and an int scalar with range spec (2, 30); the result is declared as a category.
groupby_cut_equal_distance = functions.make_function(
    function=_GROUPBYTHEN_CUT_EQ_DIST,
    name='gb_cut_eq_dist',
    arity=3,
    function_type='section',
    return_type='category',
    param_type=[
        {'vector': {'category': (None, None)}},
        {'vector': {'number': (None, None)}},
        {'scalar': {'int': (2, 30)}},
    ],
)
582 | 
@no_numpy_warning
def _GROUPBYTHEN_CUT_EQ_AMT(gbx, X, d):
    """Run _CUT_EQUAL_AMOUNT on X through _groupby, forwarding d as a keyword.

    NOTE(review): _groupby is defined elsewhere; presumably it applies the
    function separately to the rows sharing each gbx value -- confirm.
    """
    grouped = _groupby(gbx, _CUT_EQUAL_AMOUNT, X, d=d)
    return grouped
# Register under the name 'gb_cut_eq_amt': a category vector, a numeric vector
# and an int scalar with range spec (2, 30); the result is declared as a category.
groupby_cut_equal_amount = functions.make_function(
    function=_GROUPBYTHEN_CUT_EQ_AMT,
    name='gb_cut_eq_amt',
    arity=3,
    function_type='section',
    return_type='category',
    param_type=[
        {'vector': {'category': (None, None)}},
        {'vector': {'number': (None, None)}},
        {'scalar': {'int': (2, 30)}},
    ],
)
591 | 
@no_numpy_warning
def _GROUPBYTHENFREQ(gbx, X):
    """Run _FREQ_SECTION on X through _groupby, with gbx as the grouping argument.

    NOTE(review): _groupby is defined elsewhere; presumably it applies the
    function separately to the rows sharing each gbx value -- confirm.
    """
    grouped = _groupby(gbx, _FREQ_SECTION, X)
    return grouped
# Register under the name 'gb_freq': both arguments are category vectors.
groupby_freq = functions.make_function(
    function=_GROUPBYTHENFREQ,
    name='gb_freq',
    arity=2,
    function_type='section',
    param_type=[
        {'vector': {'category': (None, None)}},
        {'vector': {'category': (None, None)}},
    ],
)
599 | 
# Names exported by `from <module> import *` (order kept as originally listed).
__all__ = [
    'delay', 'delta', 'sec_max', 'sec_min', 'sec_median',
    'ts_min', 'ts_max', 'ts_sum', 'ts_corr', 'ts_rank',
    'ts_stddev', 'ts_argmax', 'ts_argmin', 'ts_mean',
    'EMA', 'DEMA', 'KAMA', 'MA', 'MIDPOINT',
    'BETA', 'LINEARREG_ANGLE', 'LINEARREG_SLOPE', 'LINEARREG_INTERCEPT',
    'sec_std', 'sec_rank', 'sec_mean',
    'groupby_std', 'groupby_max', 'groupby_median',
    'groupby_mean', 'groupby_rank', 'groupby_min',
    'ts_neutralize', 'sec_neutralize', 'groupby_neutralize',
    'cut_equal_amount', 'cut_equal_distance',
    'groupby_cut_equal_amount', 'groupby_freq', 'groupby_cut_equal_distance',
    'freq', 'ts_freq',
]
606 | 
def test():
    """Smoke test: print groupby_cut_equal_distance for random data.

    Draws 30 uniform values in [0.9, 1.1) and 30 random group labels in
    {0, 1}, then cuts the values into 3 bins per group.
    """
    values = np.random.uniform(0.9, 1.1, 30)
    groups = np.random.randint(0, 2, size=30)
    print(groupby_cut_equal_distance(groups, values, 3))


if __name__ == "__main__":
    test()
616 | 


--------------------------------------------------------------------------------
/_program.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | """
  3 | -------------------------------------------------
  4 | # @Project  :gplearnplus 
  5 | # @File     :_program
  6 | # @Date     :2022/12/1 0001 13:37 
  7 | # @Author   :Junzhe Huang
  8 | # @Email    :acejasonhuang@163.com
  9 | # @Software :PyCharm
 10 | -------------------------------------------------
 11 | """
 12 | from copy import copy, deepcopy
 13 | import numpy as np
 14 | from sklearn.utils.random import sample_without_replacement
 15 | 
 16 | from .functions import _Function, _groupby
 17 | from .utils import check_random_state
 18 | 
 19 | 
 20 | class _Program(object):
 21 |     '''
 22 | 
 23 |     '''
 24 |     def __init__(self,
 25 |                  function_dict,
 26 |                  arities,
 27 |                  init_depth,
 28 |                  init_method,
 29 |                  n_features,
 30 |                  const_range,
 31 |                  metric,
 32 |                  p_point_replace,
 33 |                  parsimony_coefficient,
 34 |                  random_state,
 35 |                  data_type,
 36 |                  n_cat_features,
 37 |                  transformer=None,
 38 |                  feature_names=None,
 39 |                  program=None):
 40 |         '''
 41 | 
 42 |         Parameters
 43 |         ----------
 44 |         function_dict: 储存基础函数，原为function_set {'number': [], 'category': []}
 45 |         arities: 函数参数个数
 46 |         init_depth：初始深度, 接受元组（min_depth, max_depth）
 47 |         init_method：生成方式，
 48 |         n_features：特征个数
 49 |         const_range：常数范围, (-1, 1)
 50 |         metric：目标函数，’MAE‘,'MSE'
 51 |         p_point_replace：点变异概率
 52 |         parsimony_coefficient:惩罚系数，'auto'护着浮点数，默认0.01
 53 |         random_state：随机对象
 54 |         data_type：新增参数 截面，时序or面板， ’section‘， ’time_series', 'panel'
 55 |         n_cat_features：新增参数 分类特征个数
 56 |         transformer
 57 |         feature_names
 58 |         program
 59 |         '''
 60 |         self.function_dict = function_dict
 61 |         self.arities = arities
 62 |         self.init_depth = (init_depth[0], init_depth[1] + 1)
 63 |         self.init_method = init_method
 64 |         self.n_features = n_features
 65 |         self.const_range = const_range
 66 |         self.metric = metric
 67 |         self.p_point_replace = p_point_replace
 68 |         self.parsimony_coefficient = parsimony_coefficient
 69 |         self.data_type = data_type
 70 |         self.transformer = transformer
 71 |         self.feature_names = feature_names
 72 |         self.program = program
 73 |         self.n_cat_features = n_cat_features
 74 | 
 75 |         self.num_func_number = len(self.function_dict['number'])
 76 |         self.cat_func_number = len(self.function_dict['category'])
 77 | 
 78 |         if self.program is not None:
 79 |             # 验证当下树是否完整
 80 |             if not self.validate_program():
 81 |                 raise ValueError('The supplied program is incomplete.')
 82 |         else:
 83 |             # Create a naive random program
 84 |             self.program = self.build_program(random_state)
 85 | 
 86 |         self.raw_fitness_ = None
 87 |         self.fitness_ = None
 88 |         self.parents = None
 89 |         self._n_samples = None
 90 |         self._max_samples = None
 91 |         self._indices_state = None
 92 | 
 93 |     def build_program(self, random_state, type='number'):
 94 |         """
 95 |         参数中无program 初始化方法
 96 |         # v1.55 修改数的生成逻辑
 97 |         :param random_state: RandomState 对象， 随机数生成器
 98 |         :param type: 生成树返回数值还是分类
 99 |         :return: list,
100 |         """
101 |         if self.init_method == 'half and half':
102 |             method = ('full' if random_state.randint(2) else 'grow')
103 |         else:
104 |             method = self.init_method
105 |         max_depth = random_state.randint(*self.init_depth)
106 | 
107 |         # Start a program with a function to avoid degenerative programs
108 |         # 公式树返回类型必须为数值类型，随机挑选一个返回数值向量的函数作为公式树的根节点
109 |         _root_function_num = random_state.randint(len(self.function_dict['number']))
110 |         _root_function = self.function_dict['number'][_root_function_num]
111 | 
112 |         # 初始化公式树和工作栈，当前工作栈中仅有根节点,工作栈中存储参数类型列表，用于树的生成
113 |         program = [_root_function]
114 |         terminal_stack = [deepcopy(_root_function.param_type)]
115 | 
116 |         while terminal_stack:
117 |             depth = len(terminal_stack)
118 |             candidate_num = self.n_features + self.num_func_number + self.cat_func_number
119 |             candidate_choice = random_state.randint(candidate_num)
120 |             # Determine if we are adding a function or terminal
121 |             # terminal_stack的元素必须是list
122 |             if not isinstance(terminal_stack[-1], list):
123 |                 raise ValueError("element in terminal_stack should be list")
124 |             # terminal_stack的元素的list内，元素须为dict
125 |             if not isinstance(terminal_stack[-1][0], dict):
126 |                 raise ValueError("element in terminal_stack'element should be dict")
127 | 
128 |             # 深度优先的方式构建公式树，迭代处理工作栈中最后一个子树第一个子节点
129 |             # 与gplearn主要不同点
130 |             if ('vector' in terminal_stack[-1][0]) and (depth < max_depth) \
131 |                     and (method == 'full' or candidate_choice < (self.num_func_number + self.cat_func_number)):
132 |                 # 插入函数的要求，1 该节点必须接受向量，2.当前深度比最大深度低， 3.随机种子选中了函数或者模式为‘full’
133 |                 
134 |                 # 决定选择数值型函数 还是 分类型函数
135 |                 # 若该节点都可以接受，则随机决定插入的函数类型
136 |                 # 否则根据可接受类型插入相应函数
137 |                 _choice = random_state.randint(self.cat_func_number + self.num_func_number)
138 |                 if 'number' in terminal_stack[-1][0]['vector'] and 'category' in terminal_stack[-1][0]['vector']:
139 |                     key = 'number' if _choice < self.num_func_number else 'category'
140 |                 else:
141 |                     key = 'number' if 'number' in terminal_stack[-1][0]['vector'] else 'category'
142 |                 function_choice = self.function_dict[key][_choice %
143 |                                                    (self.num_func_number if key == 'number' else self.cat_func_number)]
144 |                 program.append(function_choice)
145 |                 terminal_stack.append(deepcopy(function_choice.param_type))
146 |             else:
147 |                 # 插入向量或者常量
148 |                 _choice = random_state.randint(self.n_features + 1)
149 |                 # 根据特殊情况调整_choice
150 |                 # 1.若const_range为None 或者 不接受标量类型，则默认插入向量
151 |                 # 2.若不接受向量类型，则默认插入标量
152 |                 # 3.其他情况按照随机数决定
153 |                 if _choice == self.n_features and \
154 |                         ((self.const_range is None) or \
155 |                         (('scalar') not in terminal_stack[-1][0])):
156 |                     # 只能插入向量的情况
157 |                     if 'vector' not in terminal_stack[-1][0]:
158 |                         raise ValueError('Error param type {}'.format(terminal_stack[-1][0]))
159 | 
160 |                     _choice = random_state.randint(self.n_features)
161 |                 elif ('vector' not in terminal_stack[-1][0]):
162 |                     # 只能插入常量的情况
163 |                     _choice = self.n_features
164 | 
165 |                 if _choice < self.n_features:
166 |                     # 插入向量
167 |                     if 'number' in terminal_stack[-1][0]['vector'] and 'category' in terminal_stack[-1][0][
168 |                         'vector']:
169 |                         # 可插入数值向量也可插入分类向量
170 |                         key = 'category' if _choice < self.n_cat_features else 'number'
171 |                     else:
172 |                         key = 'number' if 'number' in terminal_stack[-1][0]['vector'] else 'category'
173 |                     if self.n_cat_features == 0 and key == 'category':
174 |                         # 需要插入分类向量，特征中却没有分类向量的情况，插入常数分类向量1, 默认0
175 |                         candicate_var = 0
176 |                     else:
177 |                         candicate_var = (_choice % self.n_cat_features) + 1 if key == 'category' else \
178 |                                 ((_choice % (self.n_features - self.n_cat_features) + self.n_cat_features) + 1)
179 |                     program.append(str(candicate_var))
180 |                 else:
181 |                     # 插入常量
182 |                     if 'float' in terminal_stack[-1][0]['scalar']:
183 |                         _choice = random_state.uniform(*terminal_stack[-1][0]['scalar']['float'])
184 |                     elif 'int' in terminal_stack[-1][0]['scalar']:
185 |                         _choice = random_state.randint(*terminal_stack[-1][0]['scalar']['int'])
186 |                     else:
187 |                         raise ValueError('Error param type {}'.format(terminal_stack[-1][0]))
188 |                     program.append(_choice)
189 | 
190 |                 terminal_stack[-1].pop(0)
191 |                 while len(terminal_stack[-1]) == 0:
192 |                     terminal_stack.pop()
193 |                     if not terminal_stack:
194 |                         return program
195 |                     terminal_stack[-1].pop(0)
196 |         # We should never get here
197 |         return None
198 | 
199 |     # 检查函数是否可用，不包括类型检查
200 |     def validate_program(self):
201 |         """Rough check that the embedded program in the object is valid."""
202 |         terminals = [0]
203 |         for node in self.program:
204 |             if isinstance(node, _Function):
205 |                 terminals.append(node.arity)
206 |             else:
207 |                 terminals[-1] -= 1
208 |                 while terminals[-1] == 0:
209 |                     terminals.pop()
210 |                     terminals[-1] -= 1
211 |         return terminals == [-1]
212 | 
213 |     # 打印树
214 |     def __str__(self):
215 |         """Overloads `print` output of the object to resemble a LISP tree."""
216 |         terminals = [0]
217 |         output = ''
218 |         for i, node in enumerate(self.program):
219 |             if isinstance(node, _Function):
220 |                 terminals.append(node.arity)
221 |                 output += node.name + '('
222 |             else:
223 |                 if isinstance(node, str):
224 |                     if self.feature_names is None:
225 |                         output += 'X%s' % node
226 |                     else:
227 |                         output += self.feature_names[int(node)]
228 |                 elif isinstance(node, int):
229 |                     output += '%d' % node
230 |                 elif isinstance(node, float):
231 |                     output += '%.3f' % node
232 |                 else:
233 |                     raise ValueError('Error param type {}'.format(node))
234 |                 terminals[-1] -= 1
235 |                 while terminals[-1] == 0:
236 |                     terminals.pop()
237 |                     terminals[-1] -= 1
238 |                     output += ')'
239 |                 if i != len(self.program) - 1:
240 |                     output += ', '
241 |         return output
242 | 
243 |     # 可视化整个树
244 |     def export_graphviz(self, fade_nodes=None):
245 |         """Returns a string, Graphviz script for visualizing the program.
246 | 
247 |         Parameters
248 |         ----------
249 |         fade_nodes : list, optional
250 |             A list of node indices to fade out for showing which were removed
251 |             during evolution.
252 | 
253 |         Returns
254 |         -------
255 |         output : string
256 |             The Graphviz script to plot the tree representation of the program.
257 | 
258 |         """
259 |         terminals = []
260 |         if fade_nodes is None:
261 |             fade_nodes = []
262 |         output = 'digraph program {\nnode [style=filled]\n'
263 |         for i, node in enumerate(self.program):
264 |             fill = '#cecece'
265 |             if isinstance(node, _Function):
266 |                 if i not in fade_nodes:
267 |                     fill = '#136ed4'
268 |                 terminals.append([node.arity, i])
269 |                 output += ('%d [label="%s", fillcolor="%s"] ;\n'
270 |                            % (i, node.name, fill))
271 |             else:
272 |                 if i not in fade_nodes:
273 |                     fill = '#60a6f6'
274 | 
275 |                 if isinstance(node, str):
276 |                     if self.feature_names is None:
277 |                         feature_name = 'X%s' % node
278 |                     else:
279 |                         feature_name = self.feature_names[int(node)]
280 |                     output += ('%d [label="%s", fillcolor="%s"] ;\n'
281 |                                % (i, feature_name, fill))
282 |                 elif isinstance(node, int):
283 |                     output += ('%d [label="%d", fillcolor="%s"] ;\n'
284 |                                % (i, node, fill))
285 |                 elif isinstance(node, int):
286 |                     output += ('%d [label="%.3f", fillcolor="%s"] ;\n'
287 |                                % (i, node, fill))
288 |                 else:
289 |                     raise ValueError('Error param type {}'.format(node))
290 | 
291 |                 if i == 0:
292 |                     # A degenerative program of only one node
293 |                     return output + '}'
294 |                 terminals[-1][0] -= 1
295 |                 terminals[-1].append(i)
296 |                 while terminals[-1][0] == 0:
297 |                     output += '%d -> %d ;\n' % (terminals[-1][1],
298 |                                                 terminals[-1][-1])
299 |                     terminals[-1].pop()
300 |                     if len(terminals[-1]) == 2:
301 |                         parent = terminals[-1][-1]
302 |                         terminals.pop()
303 |                         if not terminals:
304 |                             return output + '}'
305 |                         terminals[-1].append(parent)
306 |                         terminals[-1][0] -= 1
307 | 
308 |         # We should never get here
309 |         return None
310 | 
311 |     # 计算树的深度
312 |     def _depth(self):
313 |         """Calculates the maximum depth of the program tree."""
314 |         terminals = [0]
315 |         depth = 1
316 |         for node in self.program:
317 |             if isinstance(node, _Function):
318 |                 terminals.append(node.arity)
319 |                 depth = max(len(terminals), depth)
320 |             else:
321 |                 terminals[-1] -= 1
322 |                 while terminals[-1] == 0:
323 |                     terminals.pop()
324 |                     terminals[-1] -= 1
325 |         return depth - 1
326 | 
327 |     # 计算公式中函数和变量的数量
328 |     def _length(self):
329 |         """Calculates the number of functions and terminals in the program."""
330 |         return len(self.program)
331 | 
332 |     # 计算参数X的函数结果
333 |     def execute(self, X):
334 |         """Execute the program according to X.
335 | 
336 |         Parameters
337 |         ----------
338 |         X : {array-like}
339 |             若数据类型为'section'，'time_series'则为[n_samples, n_features + 1]
340 |             若数据类型为'panel', 则为[n_samples, n_features + 3]
341 | 
342 |         Returns
343 |         -------
344 |         y_hats : array-like, shape = [n_samples]
345 |             The result of executing the program on X.
346 | 
347 |         """
348 |         # 检验X列数是否正确
349 |         if self.data_type == 'panel' and X.shape[1] != self.n_features + 3:
350 |             raise ValueError("For panel Data, the col number of X should be n_features + 3")
351 |         elif self.data_type in ['section', 'time_series'] and X.shape[1] != self.n_features + 1:
352 |             raise ValueError("For section or time_series Data, the col number of X should be n_features + 1")
353 | 
354 |         # Check for single-node programs
355 |         node = self.program[0]
356 |         # 常数
357 |         if isinstance(node, (float, int)):
358 |             return np.repeat(node, X.shape[0])
359 |         # 变量
360 |         if isinstance(node, str):
361 |             return X[:, int(node)]
362 | 
363 |         apply_stack = []
364 |         for node in self.program:
365 | 
366 |             if isinstance(node, _Function):
367 |                 apply_stack.append([node])
368 |             else:
369 |                 # Lazily evaluate later
370 |                 apply_stack[-1].append(node)
371 | 
372 |             while len(apply_stack[-1]) == apply_stack[-1][0].arity + 1:
373 |                 # Apply functions that have sufficient arguments
374 |                 function = apply_stack[-1][0]
375 |                 terminals = [np.repeat(t, X.shape[0]) if isinstance(t, (float, int))
376 |                              else (X[:, int(t)] if isinstance(t, str)
377 |                              else t) for t in apply_stack[-1][1:]]
378 |                 # 对于时序和截面函数加入管道
379 |                 if self.data_type == 'panel' and function.function_type == 'section':
380 |                     time_series_data = X[:, -1]
381 |                     intermediate_result = _groupby(time_series_data, function, *terminals)
382 |                 elif self.data_type == 'panel' and function.function_type == 'time_series':
383 |                     security_data = X[:, -2]
384 |                     intermediate_result = _groupby(security_data, function, *terminals)
385 |                 else:
386 |                     intermediate_result = function(*terminals)
387 |                 if len(apply_stack) != 1:
388 |                     apply_stack.pop()
389 |                     apply_stack[-1].append(intermediate_result)
390 |                 else:
391 |                     return intermediate_result
392 | 
393 |         # We should never get here
394 |         return None
395 | 
396 |     # 选择部分样本
397 |     def get_all_indices(self, n_samples=None, max_samples=None,
398 |                         random_state=None):
399 |         """Get the indices on which to evaluate the fitness of a program.
400 | 
401 |         Parameters
402 |         ----------
403 |         n_samples : int
404 |             The number of samples.
405 | 
406 |         max_samples : int
407 |             The maximum number of samples to use.
408 | 
409 |         random_state : RandomState instance
410 |             The random number generator.
411 | 
412 |         Returns
413 |         -------
414 |         indices : array-like, shape = [n_samples]
415 |             The in-sample indices.
416 |             抽样内index
417 | 
418 |         not_indices : array-like, shape = [n_samples]
419 |             The out-of-sample indices.
420 |             抽样外index
421 | 
422 |         """
423 |         if self._indices_state is None and random_state is None:
424 |             raise ValueError('The program has not been evaluated for fitness '
425 |                              'yet, indices not available.')
426 | 
427 |         if n_samples is not None and self._n_samples is None:
428 |             self._n_samples = n_samples
429 |         if max_samples is not None and self._max_samples is None:
430 |             self._max_samples = max_samples
431 |         if random_state is not None and self._indices_state is None:
432 |             self._indices_state = random_state.get_state()
433 | 
434 |         indices_state = check_random_state(None)
435 |         indices_state.set_state(self._indices_state)
436 | 
437 |         not_indices = sample_without_replacement(
438 |             self._n_samples,
439 |             self._n_samples - self._max_samples,
440 |             random_state=indices_state)
441 |         sample_counts = np.bincount(not_indices, minlength=self._n_samples)
442 |         indices = np.where(sample_counts == 0)[0]
443 | 
444 |         return indices, not_indices
445 | 
446 |     # 获取衡量模型适应度的指标
447 |     def _indices(self):
448 |         """Get the indices used to measure the program's fitness."""
449 |         return self.get_all_indices()[0]
450 | 
451 |     # 原始适应度
452 |     def raw_fitness(self, X, y, sample_weight):
453 |         """Evaluate the raw fitness of the program according to X, y.
454 | 
455 |         Parameters
456 |         ----------
457 |         X : {array-like}
458 |             若数据类型为'section'，'time_series'则为[n_samples, n_features + 1]
459 |             若数据类型为'panel', 则为[n_samples, n_features + 3]
460 | 
461 |         y : array-like, shape = [n_samples]
462 |             Target values.
463 | 
464 |         sample_weight : array-like, shape = [n_samples]
465 |             Weights applied to individual samples.
466 | 
467 |         Returns
468 |         -------
469 |         raw_fitness : float
470 |             The raw fitness of the program.
471 | 
472 |         """
473 |         if X.shape[0] != len(y):
474 |             raise ValueError("The length of y should be equal to X")
475 |         y_pred = self.execute(X)
476 |         if self.transformer:
477 |             y_pred = self.transformer(y_pred)
478 |         raw_fitness = self.metric(y, y_pred, sample_weight)
479 | 
480 |         return raw_fitness
481 | 
482 |     # todo 引入非线性适应度
483 |     # 惩罚后适应度 对函数长度进行惩罚
484 |     def fitness(self, parsimony_coefficient=None):
485 |         """Evaluate the penalized fitness of the program according to X, y.
486 | 
487 |         Parameters
488 |         ----------
489 |         parsimony_coefficient : float, optional
490 |             If automatic parsimony is being used, the computed value according
491 |             to the population. Otherwise the initialized value is used.
492 | 
493 |         Returns
494 |         -------
495 |         fitness : float
496 |             The penalized fitness of the program.
497 | 
498 |         """
499 |         if parsimony_coefficient is None:
500 |             parsimony_coefficient = self.parsimony_coefficient
501 |         penalty = parsimony_coefficient * len(self.program) * self.metric.sign
502 |         return self.raw_fitness_ - penalty
503 | 
504 |     # 此函数为获得指定子树
505 |     def get_subtree(self, start, program=None):
506 |         """
507 | 
508 |         Parameters
509 |         ----------
510 |         start: 子树的根节点位置
511 |         program
512 |         Returns
513 |         -------
514 |         start
515 |         end 子树截止位置 + 1 便于索引
516 |         """
517 |         if program is None:
518 |             program = self.program
519 |         stack = 1
520 |         end = start
521 |         while stack > end - start:
522 |             node = program[end]
523 |             if isinstance(node, _Function):
524 |                 stack += node.arity
525 |             end += 1
526 | 
527 |         if isinstance(program[start], _Function):
528 |             return_type = _Function.return_type
529 |         elif isinstance(program[start], str):
530 |             if int(program[start]) == 0:
531 |                 raise ValueError("The return of sub_tree's root should not be const_1")
532 |             return_type = 'category' if int(program[start]) <= self.n_cat_features else 'number'
533 |         else:
534 |             raise ValueError("The return type of sub_tree's root should be number or category")
535 |         return start, end, return_type
536 | 
537 |     # 此函数为获得随机子树
538 |     # 此处做了修改，不会选到标量
539 |     # 需要考虑返回类型
540 |     def get_random_subtree(self, random_state, program=None, return_type=None):
541 |         """Get a random subtree from the program.
542 | 
543 |         Parameters
544 |         ----------
545 |         random_state : RandomState instance
546 |             The random number generator.
547 | 
548 |         program : list, optional (default=None)
549 |             The flattened tree representation of the program. If None, the
550 |             embedded tree in the object will be used.
551 | 
552 |         return_type: 子数的返回类型限定 默认 None, number 和 category都可以选择
553 | 
554 |         Returns
555 |         -------
556 |         start, end : tuple of two ints
557 |             The indices of the start and end of the random subtree.
558 |         return_type: 子数返回类型，数值向量 还是 分类向量， 防止交叉时出现错误
559 |         """
560 |         if program is None:
561 |             program = self.program
562 |         # Choice of crossover points follows Koza's (1992) widely used approach
563 |         # 子数节点概率权重90%，向量叶子节点概率权重10%，标量叶包括常分类向量子节点概率权重0
564 |         # 若type为number， 所有返回category的节点概率权重为0
565 |         # 若type为category， 所有返回number的节点概率权重为0
566 |         if return_type not in ['number', 'category', None]:
567 |             raise ValueError("Type of sub_tree should be number, category or None")
568 |         if return_type == 'number':
569 |             probs = np.array([0.9 if isinstance(node, _Function) and node.return_type == 'number'
570 |                               else (0.1 if isinstance(node, str) and int(node) > self.n_cat_features else 0.0)
571 |                               for node in program])
572 |         elif return_type == 'category':
573 |             probs = np.array([0.9 if isinstance(node, _Function) and node.return_type == 'category'
574 |                               else (0.1 if isinstance(node, str) and int(node) <= self.n_cat_features
575 |                                     and int(node) != 0 else 0.0)
576 |                               for node in program])
577 |         else:
578 |             probs = np.array([0.9 if isinstance(node, _Function)
579 |                               else (0.1 if isinstance(node, str)
580 |                                     and int(node) != 0 else 0.0)
581 |                               for node in program])
582 |         probs = np.cumsum(probs / probs.sum())
583 |         start = np.searchsorted(probs, random_state.uniform())
584 |         return self.get_subtree(start, program)
585 | 
586 |     def reproduce(self):
587 |         """Return a copy of the embedded program."""
588 |         return copy(self.program)
589 | 
590 |     def vaild_category(self, program=None):
591 |         """验证公式树中是否包含分类向量或子树， 不包括常数分类向量"""
592 |         if program is None:
593 |             program = self.program
594 |         for node in program:
595 |             if isinstance(node, _Function) and node.return_type == 'category':
596 |                 return True
597 |             elif isinstance(node, str) and int(node) != 0 and int(node) <= self.n_cat_features:
598 |                 return True
599 |         return False
600 | 
601 |     # 交换self 和 donor 的子树
602 |     # 此处不会交换常数
603 |     def crossover(self, donor, random_state):
604 |         """Perform the crossover genetic operation on the program.
605 | 
606 |         Crossover selects a random subtree from the embedded program to be
607 |         replaced. A donor also has a subtree selected at random and this is
608 |         inserted into the original parent to form an offspring.
609 | 
610 |         Parameters
611 |         ----------
612 |         donor : list
613 |             The flattened tree representation of the donor program.
614 | 
615 |         random_state : RandomState instance
616 |             The random number generator.
617 | 
618 |         Returns
619 |         -------
620 |         program : list
621 |             The flattened tree representation of the program.
622 | 
623 |         """
624 |         # Get a subtree to replace
625 |         # 若都包含
626 |         if self.vaild_category() and self.vaild_category(donor):
627 |             start, end, self_return_type = self.get_random_subtree(random_state)
628 |         else:
629 |             start, end, self_return_type = self.get_random_subtree(random_state, return_type='number')
630 |         removed = range(start, end)
631 |         # Get a subtree to donate
632 |         donor_start, donor_end, donor_return_type = self.get_random_subtree(random_state, donor, self_return_type)
633 |         donor_removed = list(set(range(len(donor))) -
634 |                              set(range(donor_start, donor_end)))
635 |         # Insert genetic material from donor
636 |         return (self.program[:start] +
637 |                 donor[donor_start:donor_end] +
638 |                 self.program[end:]), removed, donor_removed
639 | 
640 |     # 此处不会选择常数
641 |     # 子数变异
642 |     def subtree_mutation(self, random_state):
643 |         """Perform the subtree mutation operation on the program.
644 | 
645 |         Subtree mutation selects a random subtree from the embedded program to
646 |         be replaced. A donor subtree is generated at random and this is
647 |         inserted into the original parent to form an offspring. This
648 |         implementation uses the "headless chicken" method where the donor
649 |         subtree is grown using the initialization methods and a subtree of it
650 |         is selected to be donated to the parent.
651 | 
652 |         Parameters
653 |         ----------
654 |         random_state : RandomState instance
655 |             The random number generator.
656 | 
657 |         Returns
658 |         -------
659 |         program : list
660 |             The flattened tree representation of the program.
661 | 
662 |         """
663 |         # Build a new naive program
664 |         chicken = self.build_program(random_state)
665 |         # Do subtree mutation via the headless chicken method!
666 |         return self.crossover(chicken, random_state)
667 | 
668 |     def get_hoist_list(self, program=None):
669 |         """
670 |         判断公式树哪些节点可以做hoist变异, 该节点非叶子节点 且 存在与自身同类型的子树， 常分类向量不算分类向量的同类型
671 |         Parameters
672 |         ----------
673 |         program
674 | 
675 |         Returns
676 |         -------
677 |         hoist_list
678 |         """
679 |         if program is None:
680 |             program = self.program
681 | 
682 |         apply_stack = []
683 |         hoist_list = [False] * len(program)
684 |         # 深度优先搜索，压入栈中的元素是一个list，list第一个元素表示函数再program列表中的位置，第二个元素是函数对象，后面的元素是返回结果
685 |         # 深搜结果为['number'], ['number','category'], ['category'], [], 表示该节点及其子节点所包含的类型集合
686 |         for i, node in enumerate(program):
687 |             if isinstance(node, _Function):
688 |                 apply_stack.append([i, node])
689 |             else:
690 |                 # Lazily evaluate later
691 |                 apply_stack[-1].append(node)
692 |             while len(apply_stack[-1]) == apply_stack[-1][1].arity + 2:
693 |                 father_type = apply_stack[-1][1].return_type
694 |                 type_list = [t if isinstance(t, list) else
695 |                              (['number'] if isinstance(t, str) and int(t) > self.n_cat_features else
696 |                              (['category'] if isinstance(t, str) and int(t) <= self.n_cat_features and int(t) != 0
697 |                               else []))
698 |                              for t in apply_stack[-1][2:]
699 |                              ]
700 |                 # 判断子树中是否存在与本节点同类型的节点，若存在表示可以hoist变异
701 |                 if father_type in list(set().union(*type_list)):
702 |                     hoist_list[apply_stack[-1][0]] = True
703 |                 # 函数返回类型加入列表
704 |                 type_list.append([father_type])
705 | 
706 |                 intermediate_result = list(set().union(*type_list))
707 |                 if len(apply_stack) != 1:
708 |                     apply_stack.pop()
709 |                     apply_stack[-1].append(intermediate_result)
710 |                 else:
711 |                     return hoist_list
712 |         return None
713 | 
714 |     # 将子树的子树变上提，简化公式
715 |     # 由于子树不会选到常数，故符合条件
716 |     # 子数不会选到分类变量
717 |     def hoist_mutation(self, random_state):
718 |         """Perform the hoist mutation operation on the program.
719 | 
720 |         Hoist mutation selects a random subtree from the embedded program to
721 |         be replaced. A random subtree of that subtree is then selected and this
722 |         is 'hoisted' into the original subtrees location to form an offspring.
723 |         This method helps to control bloat.
724 | 
725 |         gplearnplus修改，由于引入了变量类型，需要先考哪些节点可以hosit变异的节点
726 |         要求
727 |         1. 该节点下存在于节点同类型的子树
728 | 
729 |         Parameters
730 |         ----------
731 |         random_state : RandomState instance
732 |             The random number generator.
733 | 
734 |         Returns
735 |         -------
736 |         program : list
737 |             The flattened tree representation of the program.
738 | 
739 |         """
740 |         # Get a subtree to replace
741 |         hoist_list = self.get_hoist_list()
742 |         if sum(hoist_list) == 0:
743 |             return self.program
744 |         # 随机选取可以hoist的节点
745 |         hoist_root = random_state.choice(np.where(hoist_list)[0])
746 |         start, end, return_type = self.get_subtree(hoist_root)
747 |         subtree = self.program[start:end]
748 |         # Get a subtree of the subtree to hoist
749 |         sub_start, sub_end, _ = self.get_random_subtree(random_state, subtree, return_type=return_type)
750 |         hoist = subtree[sub_start:sub_end]
751 |         # Determine which nodes were removed for plotting
752 |         removed = list(set(range(start, end)) -
753 |                        set(range(start + sub_start, start + sub_end)))
754 |         return self.program[:start] + hoist + self.program[end:], removed
755 | 
756 |     # 点变异完全修改
757 |     # 要求函数满足is_point_mutation条件
758 |     # 由于无法得知范围，常数不变异
759 |     def point_mutation(self, random_state):
760 |         """Perform the point mutation operation on the program.
761 | 
762 |         Point mutation selects random nodes from the embedded program to be
763 |         replaced. Terminals are replaced by other terminals and functions are
764 |         replaced by other functions that require the same number of arguments
765 |         as the original node. The resulting tree forms an offspring.
766 | 
767 |         Parameters
768 |         ----------
769 |         random_state : RandomState instance
770 |             The random number generator.
771 | 
772 |         Returns
773 |         -------
774 |         program : list
775 |             The flattened tree representation of the program.
776 | 
777 |         """
778 |         program = copy(self.program)
779 | 
780 |         # Get the nodes to modify
781 |         mutate = np.where(random_state.uniform(size=len(program)) <
782 |                           self.p_point_replace)[0]
783 |         tag = np.array([True] * len(mutate))
784 |         for i, node in enumerate(mutate):
785 |             if isinstance(program[node], _Function):
786 |                 arity = program[node].arity
787 |                 # Find a valid replacement with same arity
788 |                 replacement_list = [func_ for func_ in self.arities[arity] if program[node].is_point_mutation(func_)]
789 |                 if len(replacement_list) == 0:
790 |                     # 没有满足条件的变异
791 |                     tag[i] = False
792 |                     continue
793 |                 replacement = random_state.randint(len(replacement_list))
794 |                 replacement = replacement_list[replacement]
795 |                 program[node] = replacement
796 |             elif isinstance(program[node], str):
797 |                 # We've got a terminal, add a const or variable
798 |                 terminal = random_state.randint(1, self.n_features + 1)
799 |                 program[node] = str(terminal)
800 |             else:
801 |                 # 常数不发生变异
802 |                 tag[i] = False
803 |         if len(mutate):
804 |             mutate = mutate[tag]
805 |         return program, list(mutate)
806 | 
807 |     depth_ = property(_depth)
808 |     length_ = property(_length)
809 |     indices_ = property(_indices)
810 | 


--------------------------------------------------------------------------------
/genetic.py:
--------------------------------------------------------------------------------
   1 | # -*- coding: utf-8 -*-
   2 | """
   3 | -------------------------------------------------
   4 | # @Project  :gplearnplus 
   5 | # @File     :genetic
   6 | # @Date     :2022/12/5 0005 4:23 
   7 | # @Author   :Junzhe Huang
   8 | # @Email    :acejasonhuang@163.com
   9 | # @Software :PyCharm
  10 | -------------------------------------------------
  11 | """
  12 | import itertools
  13 | from abc import ABCMeta, abstractmethod
  14 | from time import time
  15 | from warnings import warn
  16 | from copy import deepcopy
  17 | 
  18 | import numpy as np
  19 | import pandas as pd
  20 | from joblib import Parallel, delayed
  21 | from scipy.stats import rankdata
  22 | from sklearn.base import BaseEstimator
  23 | from sklearn.base import RegressorMixin, TransformerMixin, ClassifierMixin
  24 | from sklearn.exceptions import NotFittedError
  25 | from sklearn.utils import compute_sample_weight
  26 | from sklearn.utils.validation import check_array, _check_sample_weight
  27 | from sklearn.utils.multiclass import check_classification_targets
  28 | from sklearn.preprocessing import LabelEncoder
  29 | 
  30 | from ._program import _Program
  31 | from .fitness import _fitness_map, _Fitness
  32 | from .functions import _function_map, _Function, sig1 as sigmoid
  33 | from .utils import _partition_estimators
  34 | from .utils import check_random_state
  35 | 
  36 | __all__ = ['SymbolicRegressor', 'SymbolicClassifier', 'SymbolicTransformer']
  37 | 
  38 | MAX_INT = np.iinfo(np.int32).max
  39 | 
  40 | # 并行实现子树交叉，变异
def _parallel_evolve(n_programs, parents, X, y, sample_weight, seeds, params):
    """Build and evaluate a batch of programs for one generation.

    Parameters
    ----------
    n_programs : int
        Number of programs to build in this call.
    parents : list or None
        Programs of the previous generation. When None (first generation)
        no genetic operation is applied and ``_Program`` receives
        ``program=None``.
    X : array-like
        Input data; ``X.shape`` is read as ``(n_samples, n_features)``.
    y : array-like
        Target values handed to ``raw_fitness``.
    sample_weight : array-like or None
        Per-sample weights. None means a weight of 1 for every sample.
    seeds : sequence
        ``seeds[i]`` seeds the random state used for the i-th program.
    params : dict
        Evolution settings, unpacked at the top of the function.

    Returns
    -------
    programs : list
        The new ``_Program`` objects, each with ``parents`` and
        ``raw_fitness_`` set, and ``oob_fitness_`` set when some samples
        are held out.
    """

    """Private function used to build a batch of programs within a job."""
    n_samples, n_features = X.shape

    # Unpack parameters
    tournament_size = params['tournament_size']
    function_dict = params['function_dict']
    arities = params['arities']
    init_depth = params['init_depth']
    init_method = params['init_method']
    const_range = params['const_range']
    metric = params['_metric']
    transformer = params['_transformer']
    parsimony_coefficient = params['parsimony_coefficient']
    method_probs = params['method_probs']
    data_type = params['data_type']
    p_point_replace = params['p_point_replace']
    max_samples = params['max_samples']  # fraction of samples; scaled to a count below
    feature_names = params['feature_names']
    n_cat_features = params['cat_var_number']

    # Discount non-feature columns from the count: 3 for panel data,
    # 1 otherwise. NOTE(review): which columns these are is not visible
    # here - confirm against the caller.
    if data_type == 'panel':
        n_features -= 3
    else:
        n_features -= 1

    max_samples = int(max_samples * n_samples)

    def _tournament():
        # Draw tournament_size parents at random (with replacement) and keep
        # the one with the best fitness. Uses the random_state bound in the
        # loop below, so it must only be called from inside that loop.
        """Find the fittest individual from a sub-population."""
        contenders = random_state.randint(0, len(parents), tournament_size)
        fitness = [parents[p].fitness_ for p in contenders]
        if metric.greater_is_better:
            parent_index = contenders[np.argmax(fitness)]
        else:
            parent_index = contenders[np.argmin(fitness)]
        return parents[parent_index], parent_index

    # Build programs
    programs = []

    for i in range(n_programs):

        random_state = check_random_state(seeds[i])

        if parents is None:
            # First generation: nothing to inherit from
            program = None
            genome = None
        else:
            method = random_state.uniform()
            # Select the parent by tournament
            parent, parent_index = _tournament()

            # Apply exactly one operation; method_probs holds the cumulative
            # thresholds that the single uniform draw is compared against
            if method < method_probs[0]:
                # crossover
                donor, donor_index = _tournament()
                program, removed, remains = parent.crossover(donor.program,
                                                             random_state)
                genome = {'method': 'Crossover',
                          'parent_idx': parent_index,
                          'parent_nodes': removed,
                          'donor_idx': donor_index,
                          'donor_nodes': remains}
            elif method < method_probs[1]:
                # subtree_mutation
                program, removed, _ = parent.subtree_mutation(random_state)
                genome = {'method': 'Subtree Mutation',
                          'parent_idx': parent_index,
                          'parent_nodes': removed}
            elif method < method_probs[2]:
                # hoist_mutation
                program, removed = parent.hoist_mutation(random_state)
                genome = {'method': 'Hoist Mutation',
                          'parent_idx': parent_index,
                          'parent_nodes': removed}
            elif method < method_probs[3]:
                # point_mutation
                program, mutated = parent.point_mutation(random_state)
                genome = {'method': 'Point Mutation',
                          'parent_idx': parent_index,
                          'parent_nodes': mutated}
            else:
                # reproduction
                program = parent.reproduce()
                genome = {'method': 'Reproduction',
                          'parent_idx': parent_index,
                          'parent_nodes': []}

        program = _Program(function_dict=function_dict,
                           arities=arities,
                           init_depth=init_depth,
                           init_method=init_method,
                           n_features=n_features,
                           metric=metric,
                           transformer=transformer,
                           const_range=const_range,
                           p_point_replace=p_point_replace,
                           parsimony_coefficient=parsimony_coefficient,
                           data_type=data_type,
                           feature_names=feature_names,
                           random_state=random_state,
                           n_cat_features=n_cat_features,
                           program=program)

        program.parents = genome

        # Draw samples, using sample weights, and then fit
        if sample_weight is None:
            curr_sample_weight = np.ones((n_samples,))
        else:
            curr_sample_weight = sample_weight.copy()
        oob_sample_weight = curr_sample_weight.copy()

        indices, not_indices = program.get_all_indices(n_samples,
                                                       max_samples,
                                                       random_state)

        # In-sample weights are zeroed on the held-out rows and the
        # out-of-bag weights are zeroed on the selected rows
        curr_sample_weight[not_indices] = 0
        oob_sample_weight[indices] = 0

        program.raw_fitness_ = program.raw_fitness(X, y, curr_sample_weight)
        if max_samples < n_samples:
            # Calculate OOB fitness
            program.oob_fitness_ = program.raw_fitness(X, y, oob_sample_weight)

        programs.append(program)

    return programs
 190 | 
 191 | 
 192 | class BaseSymbolic(BaseEstimator, metaclass=ABCMeta):
 193 | 
 194 |     """Base class for symbolic regression / classification estimators.
 195 | 
 196 |     Warning: This class should not be used directly.
 197 |     Use derived classes instead.
 198 | 
 199 |     """
 200 | 
 201 |     @abstractmethod
 202 |     def __init__(self,
 203 |                  *,
 204 |                  population_size=1000,
 205 |                  hall_of_fame=None,
 206 |                  n_components=None,
 207 |                  generations=20,
 208 |                  tournament_size=20,
 209 |                  stopping_criteria=0.0,
 210 |                  const_range=(-1., 1.),
 211 |                  init_depth=(2, 6),
 212 |                  init_method='half and half',
 213 |                  function_set=('add', 'sub', 'mul', 'div'),
 214 |                  transformer=None,
 215 |                  metric='mean absolute error',
 216 |                  parsimony_coefficient=0.001,
 217 |                  p_crossover=0.9,
 218 |                  p_subtree_mutation=0.01,
 219 |                  p_hoist_mutation=0.01,
 220 |                  p_point_mutation=0.01,
 221 |                  p_point_replace=0.05,
 222 |                  max_samples=1.0,
 223 |                  tolerable_corr=0.0,
 224 |                  class_weight=None,
 225 |                  feature_names=None,
 226 |                  time_series_index=None,
 227 |                  security_index=None,
 228 |                  category_features=None,
 229 |                  warm_start=False,
 230 |                  low_memory=False,
 231 |                  n_jobs=1,
 232 |                  verbose=0,
 233 |                  data_type='section',
 234 |                  random_state=None):
 235 | 
 236 |         self.population_size = population_size
 237 |         self.hall_of_fame = hall_of_fame
 238 |         self.n_components = n_components
 239 |         self.generations = generations
 240 |         self.tournament_size = tournament_size
 241 |         self.stopping_criteria = stopping_criteria
 242 |         self.const_range = const_range
 243 |         self.init_depth = init_depth
 244 |         self.init_method = init_method
 245 |         self.function_set = function_set
 246 |         self.transformer = transformer
 247 |         self.metric = metric
 248 |         self.parsimony_coefficient = parsimony_coefficient
 249 |         self.p_crossover = p_crossover
 250 |         self.p_subtree_mutation = p_subtree_mutation
 251 |         self.p_hoist_mutation = p_hoist_mutation
 252 |         self.p_point_mutation = p_point_mutation
 253 |         self.p_point_replace = p_point_replace
 254 |         self.max_samples = max_samples
 255 |         self.class_weight = class_weight
 256 |         self.feature_names = feature_names
 257 |         self.category_features = category_features
 258 |         self.time_series_index = time_series_index
 259 |         self.security_index = security_index
 260 |         self.warm_start = warm_start
 261 |         self.low_memory = low_memory
 262 |         self.n_jobs = n_jobs
 263 |         self.verbose = verbose
 264 |         self.random_state = random_state
 265 |         self.data_type = data_type
 266 |         self.tolerable_corr = tolerable_corr
 267 | 
 268 |     # 打印训练日志
 269 |     def _verbose_reporter(self, run_details=None):
 270 |         """A report of the progress of the evolution process.
 271 | 
 272 |         Parameters
 273 |         ----------
 274 |         run_details : dict
 275 |             Information about the evolution.
 276 | 
 277 |         """
 278 |         if run_details is None:
 279 |             print('    |{:^25}|{:^42}|'.format('Population Average',
 280 |                                                'Best Individual'))
 281 |             print('-' * 4 + ' ' + '-' * 25 + ' ' + '-' * 42 + ' ' + '-' * 10)
 282 |             line_format = '{:>4} {:>8} {:>16} {:>8} {:>16} {:>16} {:>10}'
 283 |             print(line_format.format('Gen', 'Length', 'Fitness', 'Length',
 284 |                                      'Fitness', 'OOB Fitness', 'Time Left'))
 285 | 
 286 |         else:
 287 |             # Estimate remaining time for run
 288 |             gen = run_details['generation'][-1]
 289 |             generation_time = run_details['generation_time'][-1]
 290 |             remaining_time = (self.generations - gen - 1) * generation_time
 291 |             if remaining_time > 60:
 292 |                 remaining_time = '{0:.2f}m'.format(remaining_time / 60.0)
 293 |             else:
 294 |                 remaining_time = '{0:.2f}s'.format(remaining_time)
 295 | 
 296 |             oob_fitness = 'N/A'
 297 |             line_format = '{:4d} {:8.2f} {:16g} {:8d} {:16g} {:>16} {:>10}'
 298 |             if self.max_samples < 1.0:
 299 |                 oob_fitness = run_details['best_oob_fitness'][-1]
 300 |                 line_format = '{:4d} {:8.2f} {:16g} {:8d} {:16g} {:16g} {:>10}'
 301 | 
 302 |             print(line_format.format(run_details['generation'][-1],
 303 |                                      run_details['average_length'][-1],
 304 |                                      run_details['average_fitness'][-1],
 305 |                                      run_details['best_length'][-1],
 306 |                                      run_details['best_fitness'][-1],
 307 |                                      oob_fitness,
 308 |                                      remaining_time))
 309 | 
 310 |     # fit 的时候考虑时序问题
 311 |     # 转移出数据处理模块
 312 |     def fit(self, X, y, sample_weight=None):
 313 |         """Fit the Genetic Program according to X, y.
 314 | 
 315 |         Parameters
 316 |         ----------
 317 |         X : array-like, shape = [n_samples, n_features]
 318 |             Training vectors, where n_samples is the number of samples and
 319 |             n_features is the number of features.
 320 | 
 321 |         y : array-like, shape = [n_samples]
 322 |             Target values.
 323 | 
 324 |         sample_weight : array-like, shape = [n_samples], optional
 325 |             Weights applied to individual samples.
 326 | 
 327 |         Returns
 328 |         -------
 329 |         self : object
 330 |             Returns self.
 331 | 
 332 |         """
 333 |         random_state = check_random_state(self.random_state)
 334 | 
 335 |         # 检查数据类型
 336 |         if self.data_type not in ('section', 'time_series', 'panel'):
 337 |             raise ValueError('Valid data_type methods include '
 338 |                              '"section", "time_series" and "panel". Given %s.'
 339 |                              % self.data_type)
 340 | 
 341 |         # 检查数据结构
 342 |         # 若含有security或者timeindex 必须为DataFrame
 343 |         if self.security_index is not None or self.time_series_index is not None:
 344 |             if not isinstance(X, pd.DataFrame):
 345 |                 raise ValueError('with security ot time index, data structure should be DataFrame')
 346 | 
 347 |         # 检查时间index和个股index， 对于截面，时序和面板数据分别检查
 348 |         security_data = None
 349 |         time_series_data = None
 350 |         if self.data_type == 'section':
 351 |             if self.time_series_index is not None:
 352 |                 raise ValueError('For Section Data, time_series_index should be None')
 353 |             if self.security_index is not None:
 354 |                 # 在index和columns中寻找security_index
 355 |                 if self.security_index not in X.columns and \
 356 |                     (X.index.name is None or self.security_index not in X.index.name):
 357 |                     raise ValueError('Can not fund security_index {} in both columns and index'
 358 |                                      .format(self.security_index))
 359 |                 elif self.security_index in X.columns:
 360 |                     X.set_index(self.security_index, inplace=True)
 361 | 
 362 |                 # 判断是否有重复个股
 363 |                 if len(X[self.security_index].unique()) < len(X[self.security_index]):
 364 |                     raise ValueError('For Section Data, security data should be unique')
 365 | 
 366 |                 security_data = X.index.values
 367 | 
 368 |         elif self.data_type == 'time_series':
 369 |             if self.time_series_index is None:
 370 |                 raise ValueError('For time_series Data, time_series_index should NOT be None')
 371 |             if self.security_index is not None:
 372 |                 raise ValueError('For time_series Data, security_index should be None')
 373 |             if self.time_series_index not in X.columns and \
 374 |                     (X.index.name is None or self.time_series_index not in X.index.name):
 375 |                 raise ValueError('Can not fund time_series_index {} in both columns and index'
 376 |                                  .format(self.time_series_index))
 377 |             elif self.time_series_index in X.columns:
 378 |                 X.set_index(self.time_series_index, inplace=True)
 379 | 
 380 |             # 判断是否有重复时间
 381 |             if len(X.index.drop_duplicates()) < len(X):
 382 |                 raise ValueError('For time_series Data, time_series data should be unique')
 383 | 
 384 |             X_combine = X.copy()
 385 |             X_combine['_label'] = y.values if isinstance(y, pd.Series) else y
 386 |             X_combine.sort_index(inplace=True)
 387 |             X, y = X_combine.loc[:, self.feature_names], X_combine.loc[:, '_label']
 388 |             time_series_data = X.index.values
 389 | 
 390 |         else:
 391 |             if self.time_series_index is None:
 392 |                 raise ValueError('For panel Data, time_series_index should NOT be None')
 393 |             if self.security_index is None:
 394 |                 raise ValueError('For panel Data, security_index should NOT be None')
 395 | 
 396 |             # security time_series 进入index
 397 |             if self.time_series_index not in X.columns and \
 398 |                     (X.index.name is None or self.time_series_index not in X.index.name):
 399 |                 raise ValueError('Can not fund time_series_index {} in both columns and index'
 400 |                                  .format(self.time_series_index))
 401 |             elif self.security_index not in X.columns and \
 402 |                     (X.index.name is None or self.security_index not in X.index.name):
 403 |                 raise ValueError('Can not fund security_index {} in both columns and index'
 404 |                                  .format(self.security_index))
 405 |             elif self.time_series_index in X.columns and self.security_index in X.columns:
 406 |                 X.set_index([self.time_series_index, self.security_index], inplace=True)
 407 |             elif self.time_series_index in X.columns:
 408 |                 X.set_index(self.security_index, inplace=True, append=True)
 409 |             elif self.security_index in X.columns:
 410 |                 X.set_index(self.time_series_index, inplace=True, append=True)
 411 | 
 412 |             # 判断没有重复
 413 |             if len(X.index) != len(X.index.drop_duplicates()):
 414 |                 raise ValueError('For time_series Data, time_series data should be unique')
 415 | 
 416 | 
 417 |             X_combine = X.copy()
 418 |             X_combine['_label'] = y.values if isinstance(y, pd.Series) else y
 419 |             X_combine.sort_index(inplace=True)
 420 |             X, y = X_combine.loc[:, self.feature_names], X_combine.loc[:, '_label']
 421 |             time_series_data = X.index.get_level_values(self.time_series_index).values
 422 |             security_data = X.index.get_level_values(self.security_index).values
 423 | 
 424 |         # 检查category_features是否与全包含在feature_names中
 425 |         # 当存在分类数据时，输入数据类型必须为pd。DataFrame
 426 |         if self.category_features is not None:
 427 |             if not isinstance(X, pd.DataFrame):
 428 |                 raise ValueError('while there are category_features in X, X must be pd.DataFrame')
 429 |             if not isinstance(self.category_features, list):
 430 |                 raise ValueError('category_features must be list')
 431 |             for cat_feature in self.category_features:
 432 |                 if cat_feature not in self.feature_names:
 433 |                     raise ValueError('Valid category_feature {} , not in feature_names'.format(cat_feature))
 434 |             # 处理分类数据，转换为整型
 435 |             label_encoder = LabelEncoder()
 436 |             X[self.category_features] = X[self.category_features].apply(label_encoder.fit_transform)
 437 |             # 重构顺序，将分类类型放在前面
 438 |             self.feature_names = \
 439 |                 [self.category_features + [_col for _col in self.feature_names if _col not in self.category_features]]
 440 |             X = X[self.feature_names]
 441 | 
 442 |         # Check arrays
 443 |         if sample_weight is not None:
 444 |             sample_weight = _check_sample_weight(sample_weight, X)
 445 | 
 446 |         # 检查数据内容
 447 |         if isinstance(self, ClassifierMixin):
 448 |             # 验证y是否为分类数据， X， y强转ndarray
 449 |             # todo 分类场景的处理有待优化，暂时不处理
 450 |             X, y = self._validate_data(X, y, y_numeric=False)
 451 |             check_classification_targets(y)
 452 | 
 453 |             if self.class_weight:
 454 |                 if sample_weight is None:
 455 |                     sample_weight = 1.
 456 |                 # modify the sample weights with the corresponding class weight
 457 |                 sample_weight = (sample_weight *
 458 |                                  compute_sample_weight(self.class_weight, y))
 459 | 
 460 |             self.classes_, y = np.unique(y, return_inverse=True)
 461 |             n_trim_classes = np.count_nonzero(np.bincount(y, sample_weight))
 462 |             if n_trim_classes != 2:
 463 |                 raise ValueError("y contains %d class after sample_weight "
 464 |                                  "trimmed classes with zero weights, while 2 "
 465 |                                  "classes are required."
 466 |                                  % n_trim_classes)
 467 |             self.n_classes_ = len(self.classes_)
 468 | 
 469 |         else:
 470 |             # 验证y是否为数值数据， X， y强转ndarray
 471 |             X, y = self._validate_data(X, y, y_numeric=True)
 472 | 
 473 |         # check hall_of_fame and n_components ,if have
 474 |         hall_of_fame = self.hall_of_fame
 475 |         if hall_of_fame is None:
 476 |             hall_of_fame = self.population_size
 477 |         if hall_of_fame > self.population_size or hall_of_fame < 1:
 478 |             raise ValueError('hall_of_fame (%d) must be less than or equal to '
 479 |                              'population_size (%d).' % (self.hall_of_fame,
 480 |                                                         self.population_size))
 481 |         n_components = self.n_components
 482 |         if n_components is None:
 483 |             n_components = hall_of_fame
 484 |         if n_components > hall_of_fame or n_components < 1:
 485 |             raise ValueError('n_components (%d) must be less than or equal to '
 486 |                              'hall_of_fame (%d).' % (self.n_components,
 487 |                                                      self.hall_of_fame))
 488 | 
 489 |         # 检查feature_names是否与n_features_in_一致
 490 |         if self.feature_names is not None:
 491 |             if self.n_features_in_ != len(self.feature_names):
 492 |                 raise ValueError('The supplied `feature_names` has different '
 493 |                                  'length to n_features. Expected %d, got %d.'
 494 |                                  % (self.n_features_in_,
 495 |                                     len(self.feature_names)))
 496 |             for feature_name in self.feature_names:
 497 |                 if not isinstance(feature_name, str):
 498 |                     raise ValueError('invalid type %s found in '
 499 |                                      '`feature_names`.' % type(feature_name))
 500 | 
 501 |         # 检查const_range
 502 |         if not ((isinstance(self.const_range, tuple) and
 503 |                  len(self.const_range) == 2) or self.const_range is None):
 504 |             raise ValueError('const_range should be a tuple with length two, '
 505 |                              'or None.')
 506 | 
 507 |         # 检查function, 稍作修改， 结合const_range到range里面, 并区分number func 和 cat function
 508 |         # 存放不同类型的函数（分类和数值）
 509 |         self._function_dict = {'number': [], 'category': []}
 510 |         # 检验是否存在接受分类变量参数的函数
 511 |         _cat_func_flag = False
 512 |         for function in self.function_set:
 513 |             # 类型检验
 514 |             if isinstance(function, str):
 515 |                 if function not in _function_map:
 516 |                     raise ValueError('invalid function name %s found in '
 517 |                                      '`function_set`.' % function)
 518 |                 function = deepcopy(_function_map[function])
 519 |                 function.add_range(self.const_range)
 520 |                 self._function_dict['number'].append(function)
 521 |             elif isinstance(function, _Function):
 522 |                 function = deepcopy(function)
 523 |                 # 添加常数范围
 524 |                 function.add_range(self.const_range)
 525 |                 # 检验是否有仅接收分类变量的函数
 526 |                 if not _cat_func_flag:
 527 |                     for _param in function.param_type:
 528 |                         if len(_param) == 1 and 'vector' in _param and \
 529 |                                 len(_param['vector']) == 1 and 'category' in _param['vector']:
 530 |                             _cat_func_flag = True
 531 |                 if function.return_type == 'number':
 532 |                     self._function_dict['number'].append(function)
 533 |                 else:
 534 |                     self._function_dict['category'].append(function)
 535 |             else:
 536 |                 raise ValueError('invalid type %s found in `function_set`.'
 537 |                                  % type(function))
 538 | 
 539 |         # number类型函数必须有
 540 |         if len(self._function_dict['number']) == 0:
 541 |             raise ValueError('No valid functions found in `function_set`.')
 542 | 
 543 |         # 当存在只接受分类变量参数的函数时（如groupby），category变量不能为空
 544 |         if _cat_func_flag and len(self.category_features) == 0:
 545 |             raise ValueError('There no category var in input features, but there are functions only get category param')
 546 | 
 547 |         # 点变异记录函数参数个数， 需要在点变异中再考察参数类型
 548 |         self._arities = {}
 549 |         for _type in ['number', 'category']:
 550 |             for function in self._function_dict[_type]:
 551 |                 arity = function.arity
 552 |                 self._arities[arity] = self._arities.get(arity, [])
 553 |                 self._arities[arity].append(function)
 554 | 
 555 |         # 检查fitness
 556 |         if isinstance(self.metric, _Fitness):
 557 |             self._metric = self.metric
 558 |         elif isinstance(self, RegressorMixin):
 559 |             if self.metric not in ('mean absolute error', 'mse', 'rmse',
 560 |                                    'pearson', 'spearman'):
 561 |                 raise ValueError('Unsupported metric: %s' % self.metric)
 562 |             self._metric = _fitness_map[self.metric]
 563 |         elif isinstance(self, ClassifierMixin):
 564 |             if self.metric != 'log loss':
 565 |                 raise ValueError('Unsupported metric: %s' % self.metric)
 566 |             self._metric = _fitness_map[self.metric]
 567 |         elif isinstance(self, TransformerMixin):
 568 |             if self.metric not in ('pearson', 'spearman'):
 569 |                 raise ValueError('Unsupported metric: %s' % self.metric)
 570 |             self._metric = _fitness_map[self.metric]
 571 | 
 572 |         # 检查概率参数
 573 |         # todo 增加交叉变异方法后需要修改此处
 574 |         self._method_probs = np.array([self.p_crossover,
 575 |                                        self.p_subtree_mutation,
 576 |                                        self.p_hoist_mutation,
 577 |                                        self.p_point_mutation])
 578 |         self._method_probs = np.cumsum(self._method_probs)
 579 |         if self._method_probs[-1] > 1:
 580 |             raise ValueError('The sum of p_crossover, p_subtree_mutation, '
 581 |                              'p_hoist_mutation and p_point_mutation should '
 582 |                              'total to 1.0 or less.')
 583 | 
 584 |         # 检查初始化模式
 585 |         if self.init_method not in ('half and half', 'grow', 'full'):
 586 |             raise ValueError('Valid program initializations methods include '
 587 |                              '"grow", "full" and "half and half". Given %s.'
 588 |                              % self.init_method)
 589 | 
 590 |         # 检查初始化深度
 591 |         if (not isinstance(self.init_depth, tuple) or
 592 |                 len(self.init_depth) != 2):
 593 |             raise ValueError('init_depth should be a tuple with length two.')
 594 |         if self.init_depth[0] > self.init_depth[1]:
 595 |             raise ValueError('init_depth should be in increasing numerical '
 596 |                              'order: (min_depth, max_depth).')
 597 | 
 598 |         # 初始化transformer函数
 599 |         if self.transformer is not None:
 600 |             if isinstance(self.transformer, _Function):
 601 |                 self._transformer = self.transformer
 602 |             elif self.transformer == 'sigmoid':
 603 |                 self._transformer = sigmoid
 604 |             else:
 605 |                 raise ValueError('Invalid `transformer`. Expected either '
 606 |                                  '"sigmoid" or _Function object, got %s' %
 607 |                                  type(self.transformer))
 608 |             if self._transformer.arity != 1:
 609 |                 raise ValueError('Invalid arity for `transformer`. Expected 1, '
 610 |                                  'got %d.' % (self._transformer.arity))
 611 | 
 612 |         params = self.get_params()
 613 |         params['_metric'] = self._metric
 614 |         if hasattr(self, '_transformer'):
 615 |             params['_transformer'] = self._transformer
 616 |         else:
 617 |             params['_transformer'] = None
 618 |         params['function_dict'] = self._function_dict
 619 |         params['arities'] = self._arities
 620 |         params['method_probs'] = self._method_probs
 621 |         params['cat_var_number'] = len(self.category_features) if self.category_features is not None else 0
 622 | 
 623 |         # 清空_program
 624 |         if not self.warm_start or not hasattr(self, '_programs'):
 625 |             # Free allocated memory, if any
 626 |             self._programs = []
 627 |             self.run_details_ = {'generation': [],
 628 |                                  'average_length': [],
 629 |                                  'average_fitness': [],
 630 |                                  'best_length': [],
 631 |                                  'best_fitness': [],
 632 |                                  'best_oob_fitness': [],
 633 |                                  'generation_time': []}
 634 | 
 635 |         prior_generations = len(self._programs)
 636 |         n_more_generations = self.generations - prior_generations
 637 | 
 638 |         if n_more_generations < 0:
 639 |             raise ValueError('generations=%d must be larger or equal to '
 640 |                              'len(_programs)=%d when warm_start==True'
 641 |                              % (self.generations, len(self._programs)))
 642 |         elif n_more_generations == 0:
 643 |             fitness = [program.raw_fitness_ for program in self._programs[-1]]
 644 |             warn('Warm-start fitting without increasing n_estimators does not '
 645 |                  'fit new programs.')
 646 | 
 647 |         if self.warm_start:
 648 |             # Generate and discard seeds that would have been produced on the
 649 |             # initial fit call.
 650 |             for i in range(len(self._programs)):
 651 |                 _ = random_state.randint(MAX_INT, size=self.population_size)
 652 | 
 653 |         if self.verbose:
 654 |             # Print header fields
 655 |             self._verbose_reporter()
 656 | 
 657 |         for gen in range(prior_generations, self.generations):
 658 |             start_time = time()
 659 | 
 660 |             if gen == 0:
 661 |                 parents = None
 662 |             else:
 663 |                 try:
 664 |                     parents = self._programs[gen - 1]
 665 |                 except:
 666 |                     print(len(self._programs))
 667 |                     print(gen)
 668 | 
 669 |                     exit()
 670 |             # Parallel loop
 671 |             # 将population_size分配给n_job个进程
 672 |             n_jobs, n_programs, starts = _partition_estimators(self.population_size, self.n_jobs)
 673 |             seeds = random_state.randint(MAX_INT, size=self.population_size)
 674 | 
 675 |             population = Parallel(n_jobs=n_jobs,
 676 |                                   verbose=int(self.verbose > 1))(
 677 |                 delayed(_parallel_evolve)(n_programs[i],
 678 |                                           parents,
 679 |                                           X,
 680 |                                           y,
 681 |                                           security_data,
 682 |                                           time_series_data,
 683 |                                           sample_weight,
 684 |                                           seeds[starts[i]:starts[i + 1]],
 685 |                                           params)
 686 |                 for i in range(n_jobs))
 687 | 
 688 |             # Reduce, maintaining order across different n_jobs
 689 |             population = list(itertools.chain.from_iterable(population))
 690 | 
 691 |             fitness = [program.raw_fitness_ for program in population]
 692 |             length = [program.length_ for program in population]
 693 | 
 694 |             # 惩罚系数
 695 |             parsimony_coefficient = None
 696 |             if self.parsimony_coefficient == 'auto':
 697 |                 parsimony_coefficient = (np.cov(length, fitness)[1, 0] /
 698 |                                          np.var(length))
 699 |             for program in population:
 700 |                 program.fitness_ = program.fitness(parsimony_coefficient)
 701 | 
 702 |             self._programs.append(population)
 703 | 
 704 |             # 去除没有进入下一代的父辈种群
 705 |             if not self.low_memory:
 706 |                 for old_gen in np.arange(gen, 0, -1):
 707 |                     indices = []
 708 |                     for program in self._programs[old_gen]:
 709 |                         if program is not None:
 710 |                             for idx in program.parents:
 711 |                                 if 'idx' in idx:
 712 |                                     indices.append(program.parents[idx])
 713 |                     indices = set(indices)
 714 |                     for idx in range(self.population_size):
 715 |                         if idx not in indices:
 716 |                             self._programs[old_gen - 1][idx] = None
 717 |             elif gen > 0:
 718 |                 # 在low_memory的情况下，去除所有
 719 |                 self._programs[gen - 1] = None
 720 | 
 721 |             # 记录运行细节
 722 |             if self._metric.greater_is_better:
 723 |                 best_program = population[np.argmax(fitness)]
 724 |             else:
 725 |                 best_program = population[np.argmin(fitness)]
 726 | 
 727 |             self.run_details_['generation'].append(gen)
 728 |             self.run_details_['average_length'].append(np.mean(length))
 729 |             self.run_details_['average_fitness'].append(np.mean(fitness))
 730 |             self.run_details_['best_length'].append(best_program.length_)
 731 |             self.run_details_['best_fitness'].append(best_program.raw_fitness_)
 732 |             oob_fitness = np.nan
 733 |             if self.max_samples < 1.0:
 734 |                 oob_fitness = best_program.oob_fitness_
 735 |             self.run_details_['best_oob_fitness'].append(oob_fitness)
 736 |             generation_time = time() - start_time
 737 |             self.run_details_['generation_time'].append(generation_time)
 738 | 
 739 |             if self.verbose:
 740 |                 self._verbose_reporter(self.run_details_)
 741 | 
 742 |             # 是否进入停止条件
 743 |             if self._metric.greater_is_better:
 744 |                 best_fitness = fitness[np.argmax(fitness)]
 745 |                 if best_fitness >= self.stopping_criteria:
 746 |                     break
 747 |             else:
 748 |                 best_fitness = fitness[np.argmin(fitness)]
 749 |                 if best_fitness <= self.stopping_criteria:
 750 |                     break
 751 | 
 752 |         # 特征工程专属模块
 753 |         if isinstance(self, TransformerMixin):
 754 |             # Find the best individuals in the final generation
 755 |             fitness = np.array(fitness)
 756 |             # 找出适应度最优的hall_of_fame个进入fitness
 757 |             if self._metric.greater_is_better:
 758 |                 hall_of_fame = fitness.argsort()[::-1][:self.hall_of_fame]
 759 |             else:
 760 |                 hall_of_fame = fitness.argsort()[:self.hall_of_fame]
 761 |             evaluation = np.array([gp.execute(X) for gp in
 762 |                                    [self._programs[-1][i] for
 763 |                                     i in hall_of_fame]])
 764 |             if self.metric == 'spearman':
 765 |                 evaluation = np.apply_along_axis(rankdata, 1, evaluation)
 766 | 
 767 |             with np.errstate(divide='ignore', invalid='ignore'):
 768 |                 correlations = np.abs(np.corrcoef(evaluation))
 769 |             np.fill_diagonal(correlations, 0.)
 770 |             components = list(range(self.hall_of_fame))
 771 |             indices = list(range(self.hall_of_fame))
 772 |             # Iteratively remove least fit individual of most correlated pair
 773 |             while len(components) > self.n_components:
 774 |                 # 去除hall_of_fame - n_components个高度相关特征
 775 |                 # 找到相关系数矩阵中相关系数绝对值最大的两个特征，删去其中fitness较低的那个
 776 |                 # 相关性低于某一阈值时按照fitness筛选（gplearnplus新增）
 777 |                 most_correlated = np.unravel_index(np.argmax(correlations),
 778 |                                                    correlations.shape)
 779 |                 # The correlation matrix is sorted by fitness, so identifying
 780 |                 # the least fit of the pair is simply getting the higher index
 781 |                 worst = max(most_correlated)
 782 |                 components.pop(worst)
 783 |                 indices.remove(worst)
 784 |                 correlations = correlations[:, indices][indices, :]
 785 |                 if np.max(correlations) < self.tolerable_corr:
 786 |                     break
 787 |                 indices = list(range(len(components)))
 788 |             # 余下的选出最优的self.n_components个
 789 |             components = components[:self.n_components]
 790 |             self._best_programs = [self._programs[-1][i] for i in
 791 |                                    hall_of_fame[components]]
 792 | 
 793 |         else:
 794 |             # Find the best individual in the final generation
 795 |             if self._metric.greater_is_better:
 796 |                 self._program = self._programs[-1][np.argmax(fitness)]
 797 |             else:
 798 |                 self._program = self._programs[-1][np.argmin(fitness)]
 799 | 
 800 |         return self
 801 | 
 802 | 
 803 | class SymbolicRegressor(BaseSymbolic, RegressorMixin):
 804 |     def __init__(self,
 805 |                  *,
 806 |                  population_size=1000,
 807 |                  generations=20,
 808 |                  tournament_size=20,
 809 |                  stopping_criteria=0.0,
 810 |                  const_range=(-1., 1.),
 811 |                  init_depth=(2, 6),
 812 |                  init_method='half and half',
 813 |                  function_set=('add', 'sub', 'mul', 'div'),
 814 |                  metric='mean absolute error',
 815 |                  parsimony_coefficient=0.001,
 816 |                  p_crossover=0.9,
 817 |                  p_subtree_mutation=0.01,
 818 |                  p_hoist_mutation=0.01,
 819 |                  p_point_mutation=0.01,
 820 |                  p_point_replace=0.05,
 821 |                  max_samples=1.0,
 822 |                  feature_names=None,
 823 |                  time_series_index=None,
 824 |                  security_index=None,
 825 |                  category_features=None,
 826 |                  warm_start=False,
 827 |                  low_memory=False,
 828 |                  n_jobs=1,
 829 |                  verbose=0,
 830 |                  data_type='section',
 831 |                  random_state=None):
 832 |         super(SymbolicRegressor, self).__init__(
 833 |             population_size=population_size,
 834 |             generations=generations,
 835 |             tournament_size=tournament_size,
 836 |             stopping_criteria=stopping_criteria,
 837 |             const_range=const_range,
 838 |             init_depth=init_depth,
 839 |             init_method=init_method,
 840 |             function_set=function_set,
 841 |             metric=metric,
 842 |             parsimony_coefficient=parsimony_coefficient,
 843 |             p_crossover=p_crossover,
 844 |             p_subtree_mutation=p_subtree_mutation,
 845 |             p_hoist_mutation=p_hoist_mutation,
 846 |             p_point_mutation=p_point_mutation,
 847 |             p_point_replace=p_point_replace,
 848 |             max_samples=max_samples,
 849 |             feature_names=feature_names,
 850 |             time_series_index=time_series_index,
 851 |             security_index=security_index,
 852 |             category_features=category_features,
 853 |             warm_start=warm_start,
 854 |             low_memory=low_memory,
 855 |             n_jobs=n_jobs,
 856 |             verbose=verbose,
 857 |             random_state=random_state,
 858 |             data_type=data_type)
 859 | 
 860 |     def __str__(self):
 861 |         """Overloads `print` output of the object to resemble a LISP tree."""
 862 |         if not hasattr(self, '_program'):
 863 |             return self.__repr__()
 864 |         return self._program.__str__()
 865 | 
 866 |     def predict(self, X):
 867 |         """Perform regression on test vectors X.
 868 | 
 869 |         Parameters
 870 |         ----------
 871 |         X : array-like, shape = [n_samples, n_features]
 872 |             Input vectors, where n_samples is the number of samples
 873 |             and n_features is the number of features.
 874 | 
 875 |         Returns
 876 |         -------
 877 |         y : array, shape = [n_samples]
 878 |             Predicted values for X.
 879 | 
 880 |         """
 881 |         if not hasattr(self, '_program'):
 882 |             raise NotFittedError('SymbolicRegressor not fitted.')
 883 | 
 884 |         X = check_array(X)
 885 |         _, n_features = X.shape
 886 |         if self.n_features_in_ != n_features:
 887 |             raise ValueError('Number of features of the model must match the '
 888 |                              'input. Model n_features is %s and input '
 889 |                              'n_features is %s.'
 890 |                              % (self.n_features_in_, n_features))
 891 | 
 892 |         y = self._program.execute(X)
 893 | 
 894 |         return y
 895 | 
 896 | 
 897 | class SymbolicClassifier(BaseSymbolic, ClassifierMixin):
 898 |     def __init__(self,
 899 |                  *,
 900 |                  population_size=1000,
 901 |                  generations=20,
 902 |                  tournament_size=20,
 903 |                  stopping_criteria=0.0,
 904 |                  const_range=(-1., 1.),
 905 |                  init_depth=(2, 6),
 906 |                  init_method='half and half',
 907 |                  function_set=('add', 'sub', 'mul', 'div'),
 908 |                  transformer='sigmoid',
 909 |                  metric='log loss',
 910 |                  parsimony_coefficient=0.001,
 911 |                  p_crossover=0.9,
 912 |                  p_subtree_mutation=0.01,
 913 |                  p_hoist_mutation=0.01,
 914 |                  p_point_mutation=0.01,
 915 |                  p_point_replace=0.05,
 916 |                  max_samples=1.0,
 917 |                  class_weight=None,
 918 |                  feature_names=None,
 919 |                  time_series_index=None,
 920 |                  security_index=None,
 921 |                  category_features=None,
 922 |                  warm_start=False,
 923 |                  low_memory=False,
 924 |                  n_jobs=1,
 925 |                  verbose=0,
 926 |                  data_type='section',
 927 |                  random_state=None):
 928 |         super(SymbolicClassifier, self).__init__(
 929 |             population_size=population_size,
 930 |             generations=generations,
 931 |             tournament_size=tournament_size,
 932 |             stopping_criteria=stopping_criteria,
 933 |             const_range=const_range,
 934 |             init_depth=init_depth,
 935 |             init_method=init_method,
 936 |             function_set=function_set,
 937 |             transformer=transformer,
 938 |             metric=metric,
 939 |             parsimony_coefficient=parsimony_coefficient,
 940 |             p_crossover=p_crossover,
 941 |             p_subtree_mutation=p_subtree_mutation,
 942 |             p_hoist_mutation=p_hoist_mutation,
 943 |             p_point_mutation=p_point_mutation,
 944 |             p_point_replace=p_point_replace,
 945 |             max_samples=max_samples,
 946 |             class_weight=class_weight,
 947 |             feature_names=feature_names,
 948 |             time_series_index=time_series_index,
 949 |             security_index=security_index,
 950 |             category_features=category_features,
 951 |             warm_start=warm_start,
 952 |             low_memory=low_memory,
 953 |             n_jobs=n_jobs,
 954 |             verbose=verbose,
 955 |             data_type=data_type,
 956 |             random_state=random_state)
 957 | 
 958 |     def __str__(self):
 959 |         """Overloads `print` output of the object to resemble a LISP tree."""
 960 |         if not hasattr(self, '_program'):
 961 |             return self.__repr__()
 962 |         return self._program.__str__()
 963 | 
 964 |     def _more_tags(self):
 965 |         return {'binary_only': True}
 966 | 
 967 |     def predict_proba(self, X):
 968 |         # 输出概率 只支持二分类
 969 |         if not hasattr(self, '_program'):
 970 |             raise NotFittedError('SymbolicClassifier not fitted.')
 971 | 
 972 |         X = check_array(X)
 973 |         _, n_features = X.shape
 974 |         if self.n_features_in_ != n_features:
 975 |             raise ValueError('Number of features of the model must match the '
 976 |                              'input. Model n_features is %s and input '
 977 |                              'n_features is %s.'
 978 |                              % (self.n_features_in_, n_features))
 979 | 
 980 |         scores = self._program.execute(X)
 981 |         proba = self._transformer(scores)
 982 |         proba = np.vstack([1 - proba, proba]).T
 983 |         return proba
 984 | 
 985 |     def predict(self, X):
 986 |         # 输出预测结果
 987 |         proba = self.predict_proba(X)
 988 |         return self.classes_.take(np.argmax(proba, axis=1), axis=0)
 989 | 
 990 | 
 991 | class SymbolicTransformer(BaseSymbolic, TransformerMixin):
 992 |     def __init__(self,
 993 |                  *,
 994 |                  population_size=1000,
 995 |                  hall_of_fame=100,
 996 |                  n_components=10,
 997 |                  generations=20,
 998 |                  tournament_size=20,
 999 |                  stopping_criteria=1.0,
1000 |                  const_range=(-1., 1.),
1001 |                  init_depth=(2, 6),
1002 |                  init_method='half and half',
1003 |                  function_set=('add', 'sub', 'mul', 'div'),
1004 |                  metric='pearson',
1005 |                  parsimony_coefficient=0.001,
1006 |                  p_crossover=0.9,
1007 |                  p_subtree_mutation=0.01,
1008 |                  p_hoist_mutation=0.01,
1009 |                  p_point_mutation=0.01,
1010 |                  p_point_replace=0.05,
1011 |                  max_samples=1.0,
1012 |                  tolerable_corr=0.0,
1013 |                  feature_names=None,
1014 |                  time_series_index=None,
1015 |                  security_index=None,
1016 |                  category_features=None,
1017 |                  warm_start=False,
1018 |                  low_memory=False,
1019 |                  n_jobs=1,
1020 |                  verbose=0,
1021 |                  data_type='section',
1022 |                  random_state=None):
1023 |         super(SymbolicTransformer, self).__init__(
1024 |             population_size=population_size,
1025 |             hall_of_fame=hall_of_fame,
1026 |             n_components=n_components,
1027 |             generations=generations,
1028 |             tournament_size=tournament_size,
1029 |             stopping_criteria=stopping_criteria,
1030 |             const_range=const_range,
1031 |             init_depth=init_depth,
1032 |             init_method=init_method,
1033 |             function_set=function_set,
1034 |             metric=metric,
1035 |             parsimony_coefficient=parsimony_coefficient,
1036 |             p_crossover=p_crossover,
1037 |             p_subtree_mutation=p_subtree_mutation,
1038 |             p_hoist_mutation=p_hoist_mutation,
1039 |             p_point_mutation=p_point_mutation,
1040 |             p_point_replace=p_point_replace,
1041 |             max_samples=max_samples,
1042 |             tolerable_corr=tolerable_corr,
1043 |             feature_names=feature_names,
1044 |             time_series_index=time_series_index,
1045 |             security_index=security_index,
1046 |             category_features=category_features,
1047 |             warm_start=warm_start,
1048 |             low_memory=low_memory,
1049 |             n_jobs=n_jobs,
1050 |             verbose=verbose,
1051 |             data_type=data_type,
1052 |             random_state=random_state)
1053 | 
1054 |     def __len__(self):
1055 |         """Overloads `len` output to be the number of fitted components."""
1056 |         if not hasattr(self, '_best_programs'):
1057 |             return 0
1058 |         return self.n_components
1059 | 
1060 |     def __getitem__(self, item):
1061 |         """Return the ith item of the fitted components."""
1062 |         if item >= len(self):
1063 |             raise IndexError
1064 |         return self._best_programs[item]
1065 | 
1066 |     def __str__(self):
1067 |         """Overloads `print` output of the object to resemble LISP trees."""
1068 |         if not hasattr(self, '_best_programs'):
1069 |             return self.__repr__()
1070 |         output = str([gp.__str__() for gp in self])
1071 |         return output.replace("',", ",\n").replace("'", "")
1072 | 
1073 |     def _more_tags(self):
1074 |         return {
1075 |             "_xfail_checks": {
1076 |                 "check_sample_weights_invariance": (
1077 |                     "zero sample_weight is not equivalent to removing samples"
1078 |                 ),
1079 |             }
1080 |         }
1081 | 
1082 |     def transform(self, X):
1083 |         # 将X转换成以及训练好的特征
1084 |         if not hasattr(self, '_best_programs'):
1085 |             raise NotFittedError('SymbolicTransformer not fitted.')
1086 | 
1087 |         X = check_array(X)
1088 |         _, n_features = X.shape
1089 |         if self.n_features_in_ != n_features:
1090 |             raise ValueError('Number of features of the model must match the '
1091 |                              'input. Model n_features is %s and input '
1092 |                              'n_features is %s.'
1093 |                              % (self.n_features_in_, n_features))
1094 | 
1095 |         X_new = np.array([gp.execute(X) for gp in self._best_programs]).T
1096 | 
1097 |         return X_new
1098 | 
1099 |     def fit_transform(self, X, y, sample_weight=None):
1100 |         # 训练之后转换
1101 |         return self.fit(X, y, sample_weight).transform(X)
1102 | 
1103 | 


--------------------------------------------------------------------------------