├── load_data
│   ├── constants.py
│   ├── treasuries.py
│   ├── treasuries_can.py
│   ├── mongodb.py
│   └── loader.py
├── __init__.py
├── README.md
├── utils.py
├── demo.py
├── functions.py
├── fitness.py
├── data_processing
│   └── DataProcessing
├── _program.py
└── genetic.py

/load_data/constants.py:
--------------------------------------------------------------------------------
1 | IP = "127.0.0.1"
2 | PORT = 27017
--------------------------------------------------------------------------------
/__init__.py:
--------------------------------------------------------------------------------
1 | '''
2 | Genetic Programming in Python, with a scikit-learn inspired API.
3 | 
4 | gplearn is a set of algorithms for learning genetic programming models.
5 | 
6 | '''
7 | __version__ = '0.4.1'
8 | __all__ = ['genetic', 'functions', 'fitness']
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # gplearn_stock
2 | ## What this code does
3 | A modified version of gplearn, intended mainly for mining stock data.
4 | ## Overview
5 | This code adjusts gplearn so that it is better suited to strategies for the Chinese A-share market.
6 | 
7 | gplearn is the most mature symbolic-regression implementation in Python. As a supervised-learning method, symbolic regression tries to discover a hidden mathematical formula that predicts the target variable from the feature variables.
8 | 
9 | 
10 | Symbolic regression is implemented with a genetic algorithm: a population of formulas is generated at random, and in each subsequent generation the fittest formulas replace the rest.
11 | 
12 | 
13 | As the generations accumulate, the formulas reproduce, mutate and evolve, steadily approximating the true distribution of the data.
14 | 
15 | 
16 | The key to applying this to the domestic secondary market is the fitness calculation: different fitness definitions produce different results.
17 | 
18 | 
19 | This project uses the expected return of a periodically rebalanced portfolio as the fitness against which all randomly generated formulas evolve.
20 | 
21 | 
22 | ## Usage
23 | Install gplearn, replace the corresponding modules with the files from this repository, and experiment with demo.py.
24 | ## Fitness calculation used by this code
25 | Rank the stocks by formula value and take the top 50 (or 100), open equal-weight positions, and rebalance every 5 trading days. Fitness is the mean return of the held stocks: the larger, the fitter, with no upper bound.
26 | See the stock_excute function in _program.py for the details; to use a different fitness, modify that part.
27 | ## Demo data: contact QQ 94006733 if you need it.
28 | Since QQ is not logged into often, please follow the WeChat official account "小王子的数量分析" and leave a message there.
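The fitness described above is implemented in the stock_excute function of _program.py, which is not included in this dump. Purely as an illustration of the idea, a single-rebalance version of the "top 50 by formula value, equal weight, mean forward return" score could be registered through gplearn's regular make_fitness hook; here y would hold the 5-day forward returns and y_pred the formula values for one rebalancing date (the names below are an illustrative sketch, not the repository's actual implementation):

    import numpy as np
    from gplearn.fitness import make_fitness

    def _top_n_mean_return(y, y_pred, w):
        # Rank stocks by formula value, hold the top 50 equal-weight, and
        # score the formula by the mean forward return of those holdings.
        top = np.argsort(y_pred)[::-1][:50]
        return float(np.mean(y[top]))

    stock_fitness = make_fitness(function=_top_n_mean_return,
                                 greater_is_better=True)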
--------------------------------------------------------------------------------
/utils.py:
--------------------------------------------------------------------------------
1 | """Utilities that are required by gplearn.
2 | 
3 | Most of these functions are slightly modified versions of some key utility
4 | functions from scikit-learn that gplearn depends upon. They reside here in
5 | order to maintain compatibility across different versions of scikit-learn.
6 | """
7 | 
8 | import numbers
9 | 
10 | import numpy as np
11 | from joblib import cpu_count
12 | 
13 | 
14 | def check_random_state(seed):
15 |     """Turn seed into a np.random.RandomState instance.
16 | 
17 |     Parameters
18 |     ----------
19 |     seed : None | int | instance of RandomState
20 |         If seed is None, return the RandomState singleton used by np.random.
21 |         If seed is an int, return a new RandomState instance seeded with seed.
22 |         If seed is already a RandomState instance, return it.
23 |         Otherwise raise ValueError.
24 | 
25 |     """
26 |     if seed is None or seed is np.random:
27 |         return np.random.mtrand._rand
28 |     if isinstance(seed, (numbers.Integral, np.integer)):
29 |         return np.random.RandomState(seed)
30 |     if isinstance(seed, np.random.RandomState):
31 |         return seed
32 |     raise ValueError('%r cannot be used to seed a numpy.random.RandomState'
33 |                      ' instance' % seed)
34 | 
35 | 
36 | def _get_n_jobs(n_jobs):
37 |     """Get number of jobs for the computation.
38 | 
39 |     This function reimplements the logic of joblib to determine the actual
40 |     number of jobs depending on the cpu count. If -1 all CPUs are used.
41 |     If 1 is given, no parallel computing code is used at all, which is useful
42 |     for debugging. For n_jobs below -1, (n_cpus + 1 + n_jobs) are used.
43 |     Thus for n_jobs = -2, all CPUs but one are used.
44 | 
45 |     Parameters
46 |     ----------
47 |     n_jobs : int
48 |         Number of jobs stated in joblib convention.
49 | 
50 |     Returns
51 |     -------
52 |     n_jobs : int
53 |         The actual number of jobs as a positive integer.
54 | 
55 |     """
56 |     if n_jobs < 0:
57 |         return max(cpu_count() + 1 + n_jobs, 1)
58 |     elif n_jobs == 0:
59 |         raise ValueError('Parameter n_jobs == 0 has no meaning.')
60 |     else:
61 |         return n_jobs
62 | 
63 | 
64 | def _partition_estimators(n_estimators, n_jobs):
65 |     """Private function used to partition estimators between jobs."""
66 |     # Compute the number of jobs
67 |     n_jobs = min(_get_n_jobs(n_jobs), n_estimators)
68 | 
69 |     # Partition estimators between jobs
70 |     n_estimators_per_job = (n_estimators // n_jobs) * np.ones(n_jobs,
71 |                                                               dtype=int)  # np.int was removed in NumPy 1.24
72 |     n_estimators_per_job[:n_estimators % n_jobs] += 1
73 |     starts = np.cumsum(n_estimators_per_job)
74 | 
75 |     return n_jobs, n_estimators_per_job.tolist(), [0] + starts.tolist()
--------------------------------------------------------------------------------
/demo.py:
--------------------------------------------------------------------------------
1 | # encoding:utf-8
2 | from load_data.loader import load_day_data, load_minute_data
3 | import numpy as np
4 | import pandas as pd
5 | import matplotlib.pyplot as plt
6 | from gplearn.utils import check_random_state
7 | from gplearn.genetic import SymbolicRegressor
8 | import time
9 | import numba
10 | import copy
11 | import h5py
12 | import gc
13 | from data_processing.DataProcessing import data_chuli, jinghua_data
14 | 
15 | 
16 | # Run the genetic evolution.
17 | if __name__ == '__main__':
18 |     '''
19 |     Since the data may need several rounds of processing, the processed
20 |     data is saved to disk first and reloaded below.
21 |     '''
22 | 
23 |     '''
24 |     stocks = list(pd.read_csv('C:\\Users\\94006\\Desktop\\HS300.csv')['HS300'])
25 |     start_time = '2010-06-30'
26 |     end_time = '2018-12-30'
27 |     stock_list = [str(i).zfill(6) for i in stocks]
28 |     day_or_minute = 'day'
29 |     metric = 'stock_dedicated'
30 | 
31 |     train_ratio = 0.7
32 |     adjust = True
33 | 
34 |     data_all = data_chuli(start_time = start_time,
35 |                           end_time = end_time,
36 |                           stock_list = stock_list,
37 |                           day_or_minute = day_or_minute,
38 |                           train_ratio = train_ratio,
39 |                           adjust = adjust)
40 |     (x_train, y_train, x_test, y_test) = data_all.time_adjust()  # build the training and test data
41 |     x_train_HS300 = 'D:\\HS300_x_train_datas150.npy'
42 |     y_train_HS300 = 'D:\\HS300_y_train_datas150.npy'
43 |     x_test_HS300 = 'D:\\HS300_x_test_datas150.npy'
44 |     y_test_HS300 = 'D:\\HS300_y_test_datas150.npy'
45 |     np.save(x_train_HS300, np.array(x_train))
46 |     np.save(y_train_HS300, np.array(y_train))
47 |     np.save(x_test_HS300, np.array(x_test))
48 |     np.save(y_test_HS300, np.array(y_test))
49 |     '''
50 | 
51 |     x_train = np.array(np.load('D:\\HS300_x_train_datas150.npy'))  # saved-data paths differ per user; adjust them to your own location
52 |     y_train = np.array(np.load('D:\\HS300_y_train_datas150.npy'))
53 |     x_test = np.array(np.load('D:\\HS300_x_test_datas150.npy'))
54 |     y_test = np.array(np.load('D:\\HS300_y_test_datas150.npy'))
55 |     (a, b, c, d) = jinghua_data(x_train, y_train, x_test, y_test)
56 | 
57 |     print(u'Data ready, starting evolution')
58 |     est_gp = SymbolicRegressor(population_size=5000,
59 |                                generations=8, stopping_criteria=10000,
60 |                                p_crossover=0.7, p_subtree_mutation=0.1,
61 |                                p_hoist_mutation=0.05, p_point_mutation=0.1,
62 |                                max_samples=0.9, verbose=1,
63 |                                parsimony_coefficient=0.01, random_state=0,
64 |                                metric='stock_dedicated',  # 'stock_dedicated' selects the stock-specific fitness
65 |                                n_jobs=2)  # build the symbolic regressor
66 |     print(u'Regressor constructed')
67 |     x_trains = a
68 |     y_trains = b
69 |     est_gp.fit(x_trains, y_trains)
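demo.py stops immediately after fit. As a minimal illustrative follow-up, assuming this fork keeps gplearn's standard estimator interface, the evolved formula can be inspected and applied to new data:

    # The best evolved program renders as a formula string,
    # e.g. "sub(div(X3, X7), log(X1))".
    print(est_gp._program)

    # predict() evaluates the formula on a 2-D (n_samples, n_features) array;
    # with the stock metric each row is one stock on one day, and the output
    # is that stock's factor value, to be ranked cross-sectionally.
    factor_values = est_gp.predict(np.array(a[0]))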
--------------------------------------------------------------------------------
/load_data/treasuries.py:
--------------------------------------------------------------------------------
1 | #
2 | # Copyright 2013 Quantopian, Inc.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | #     http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | from operator import itemgetter
16 | import re
17 | 
18 | import numpy as np
19 | import pandas as pd
20 | 
21 | 
22 | get_unit_and_periods = itemgetter('unit', 'periods')
23 | 
24 | 
25 | def parse_treasury_csv_column(column):
26 |     """
27 |     Parse a treasury CSV column into a more human-readable format.
28 | 
29 |     Columns start with 'RIFLGFC', followed by Y or M (year or month), followed
30 |     by a two-digit number signifying number of years/months, followed by _N.B.
31 |     We only care about the middle two entries, which we turn into a string like
32 |     3month or 30year.
33 |     """
34 |     column_re = re.compile(
35 |         r"^(?P<prefix>RIFLGFC)"
36 |         "(?P<unit>[YM])"
37 |         "(?P<periods>[0-9]{2})"
38 |         "(?P<suffix>_N.B)$"
39 |     )
40 | 
41 |     match = column_re.match(column)
42 |     if match is None:
43 |         raise ValueError("Couldn't parse CSV column %r." % column)
44 |     unit, periods = get_unit_and_periods(match.groupdict())
45 | 
46 |     # Roundtrip through int to coerce '06' into '6'.
47 |     return str(int(periods)) + ('year' if unit == 'Y' else 'month')
48 | 
49 | 
50 | def earliest_possible_date():
51 |     """
52 |     The earliest date for which we can load data from this module.
53 |     """
54 |     # The US Treasury actually has data going back further than this, but it's
55 |     # pretty rare to find pricing data going back that far, and there's no
56 |     # reason to make people download benchmarks back to 1950 that they'll never
57 |     # be able to use.
58 |     return pd.Timestamp('1980', tz='UTC')
59 | 
60 | 
61 | def get_treasury_data(start_date, end_date):
62 |     return pd.read_csv(
63 |         "http://www.federalreserve.gov/datadownload/Output.aspx"
64 |         "?rel=H15"
65 |         "&series=bf17364827e38702b42a58cf8eaa3f78"
66 |         "&lastObs="
67 |         "&from="  # An unbounded query is ~2x faster than specifying dates.
68 |         "&to="
69 |         "&filetype=csv"
70 |         "&label=omit"
71 |         "&layout=seriescolumn"
72 |         "&type=package",
73 |         skiprows=1,  # First row is a useless header.
74 |         parse_dates=['Time Period'],
75 |         na_values=['ND'],  # Presumably this stands for "No Data".
76 |         index_col=0,
77 |     ).loc[
78 |         start_date:end_date
79 |     ].dropna(
80 |         how='all'
81 |     ).rename(
82 |         columns=parse_treasury_csv_column
83 |     ).tz_localize('UTC') * 0.01  # Convert from 2.57% to 0.0257.
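A quick sanity check (not part of the original file) of the column mapping that get_treasury_data relies on:

    >>> parse_treasury_csv_column('RIFLGFCY10_N.B')
    '10year'
    >>> parse_treasury_csv_column('RIFLGFCM06_N.B')   # the int() round-trip drops the leading zero
    '6month'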
84 | 85 | 86 | def dataconverter(s): 87 | try: 88 | return float(s) / 100 89 | except: 90 | return np.nan 91 | 92 | 93 | def get_daily_10yr_treasury_data(): 94 | """Download daily 10 year treasury rates from the Federal Reserve and 95 | return a pandas.Series.""" 96 | url = "http://www.federalreserve.gov/datadownload/Output.aspx?rel=H15" \ 97 | "&series=bcb44e57fb57efbe90002369321bfb3f&lastObs=&from=&to=" \ 98 | "&filetype=csv&label=include&layout=seriescolumn" 99 | return pd.read_csv(url, header=5, index_col=0, names=['DATE', 'BC_10YEAR'], 100 | parse_dates=True, converters={1: dataconverter}, 101 | squeeze=True) 102 | -------------------------------------------------------------------------------- /load_data/treasuries_can.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2013 Quantopian, Inc. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | import pandas as pd 17 | import six 18 | from toolz import curry 19 | from toolz.curried.operator import add as prepend 20 | 21 | COLUMN_NAMES = { 22 | "V39063": '1month', 23 | "V39065": '3month', 24 | "V39066": '6month', 25 | "V39067": '1year', 26 | "V39051": '2year', 27 | "V39052": '3year', 28 | "V39053": '5year', 29 | "V39054": '7year', 30 | "V39055": '10year', 31 | # Bank of Canada refers to this as 'Long' Rate, approximately 30 years. 32 | "V39056": '30year', 33 | } 34 | BILL_IDS = ['V39063', 'V39065', 'V39066', 'V39067'] 35 | BOND_IDS = ['V39051', 'V39052', 'V39053', 'V39054', 'V39055', 'V39056'] 36 | 37 | 38 | @curry 39 | def _format_url(instrument_type, 40 | instrument_ids, 41 | start_date, 42 | end_date, 43 | earliest_allowed_date): 44 | """ 45 | Format a URL for loading data from Bank of Canada. 46 | """ 47 | return ( 48 | "http://www.bankofcanada.ca/stats/results/csv" 49 | "?lP=lookup_{instrument_type}_yields.php" 50 | "&sR={restrict}" 51 | "&se={instrument_ids}" 52 | "&dF={start}" 53 | "&dT={end}".format( 54 | instrument_type=instrument_type, 55 | instrument_ids='-'.join(map(prepend("L_"), instrument_ids)), 56 | restrict=earliest_allowed_date.strftime("%Y-%m-%d"), 57 | start=start_date.strftime("%Y-%m-%d"), 58 | end=end_date.strftime("%Y-%m-%d"), 59 | ) 60 | ) 61 | 62 | 63 | format_bill_url = _format_url('tbill', BILL_IDS) 64 | format_bond_url = _format_url('bond', BOND_IDS) 65 | 66 | 67 | def load_frame(url, skiprows): 68 | """ 69 | Load a DataFrame of data from a Bank of Canada site. 70 | """ 71 | return pd.read_csv( 72 | url, 73 | skiprows=skiprows, 74 | skipinitialspace=True, 75 | na_values=["Bank holiday", "Not available"], 76 | parse_dates=["Date"], 77 | index_col="Date", 78 | ).dropna(how='all') \ 79 | .tz_localize('UTC') \ 80 | .rename(columns=COLUMN_NAMES) 81 | 82 | 83 | def check_known_inconsistencies(bill_data, bond_data): 84 | """ 85 | There are a couple quirks in the data provided by Bank of Canada. 86 | Check that no new quirks have been introduced in the latest download. 
87 |     """
88 |     inconsistent_dates = bill_data.index.symmetric_difference(bond_data.index)  # pandas renamed Index.sym_diff to symmetric_difference
89 |     known_inconsistencies = [
90 |         # bill_data has an entry for 2010-02-15, which bond_data doesn't.
91 |         # bond_data has an entry for 2006-09-04, which bill_data doesn't.
92 |         # Both of these dates are bank holidays (Family Day and Labour Day,
93 |         # respectively).
94 |         pd.Timestamp('2006-09-04', tz='UTC'),
95 |         pd.Timestamp('2010-02-15', tz='UTC'),
96 |         # 2013-07-25 comes back as "Not available" from the bills endpoint.
97 |         # This date doesn't seem to be a bank holiday, but the previous
98 |         # calendar implementation dropped this entry, so we drop it as well.
99 |         # If someone cares deeply about the integrity of the Canadian trading
100 |         # calendar, they may want to consider forward-filling here rather than
101 |         # dropping the row.
102 |         pd.Timestamp('2013-07-25', tz='UTC'),
103 |     ]
104 |     unexpected_inconsistencies = inconsistent_dates.drop(known_inconsistencies)
105 |     if len(unexpected_inconsistencies):
106 |         in_bills = bill_data.index.difference(bond_data.index).difference(
107 |             known_inconsistencies
108 |         )
109 |         in_bonds = bond_data.index.difference(bill_data.index).difference(
110 |             known_inconsistencies
111 |         )
112 |         raise ValueError(
113 |             "Inconsistent dates for Canadian treasury bills vs bonds. \n"
114 |             "Dates with bills but not bonds: {in_bills}.\n"
115 |             "Dates with bonds but not bills: {in_bonds}.".format(
116 |                 in_bills=in_bills,
117 |                 in_bonds=in_bonds,
118 |             )
119 |         )
120 | 
121 | 
122 | def earliest_possible_date():
123 |     """
124 |     The earliest date for which we can load data from this module.
125 |     """
126 |     today = pd.Timestamp('now', tz='UTC').normalize()
127 |     # Bank of Canada only has the last 10 years of data at any given time.
128 |     return today.replace(year=today.year - 10)
129 | 
130 | 
131 | def get_treasury_data(start_date, end_date):
132 |     bill_data = load_frame(
133 |         format_bill_url(start_date, end_date, start_date),
134 |         # We skip fewer rows here because we query for fewer bill fields,
135 |         # which makes the header smaller.
136 |         skiprows=18,
137 |     )
138 | 
139 |     bond_data = load_frame(
140 |         format_bond_url(start_date, end_date, start_date),
141 |         skiprows=22,
142 |     )
143 |     check_known_inconsistencies(bill_data, bond_data)
144 | 
145 |     # dropna('any') removes the rows for which we only had data for one of
146 |     # bills/bonds.
147 |     out = pd.concat([bond_data, bill_data], axis=1).dropna(how='any')
148 |     assert set(out.columns) == set(six.itervalues(COLUMN_NAMES))
149 | 
150 |     # Multiply by 0.01 to convert from percentages to expected output format.
151 |     return out * 0.01
--------------------------------------------------------------------------------
/functions.py:
--------------------------------------------------------------------------------
1 | """The functions used to create programs.
2 | 
3 | The :mod:`gplearn.functions` module contains all of the functions used by
4 | gplearn programs. It also contains helper methods for a user to define their
5 | own custom functions.
6 | """
7 | 
8 | # Author: Trevor Stephens
9 | #
10 | # License: BSD 3 clause
11 | 
12 | import numpy as np
13 | from joblib import wrap_non_picklable_objects
14 | 
15 | __all__ = ['make_function']
16 | 
17 | 
18 | class _Function(object):
19 | 
20 |     """A representation of a mathematical relationship, a node in a program.
21 | 
22 |     This object is able to be called with NumPy vectorized arguments and return
23 |     a resulting vector based on a mathematical relationship.
24 | 25 | Parameters 26 | ---------- 27 | function : callable 28 | A function with signature function(x1, *args) that returns a Numpy 29 | array of the same shape as its arguments. 30 | 31 | name : str 32 | The name for the function as it should be represented in the program 33 | and its visualizations. 34 | 35 | arity : int 36 | The number of arguments that the ``function`` takes. 37 | 38 | """ 39 | 40 | def __init__(self, function, name, arity): 41 | self.function = function 42 | self.name = name 43 | self.arity = arity 44 | 45 | def __call__(self, *args): 46 | return self.function(*args) 47 | 48 | 49 | def make_function(function, name, arity, wrap=True): 50 | """Make a function node, a representation of a mathematical relationship. 51 | 52 | This factory function creates a function node, one of the core nodes in any 53 | program. The resulting object is able to be called with NumPy vectorized 54 | arguments and return a resulting vector based on a mathematical 55 | relationship. 56 | 57 | Parameters 58 | ---------- 59 | function : callable 60 | A function with signature `function(x1, *args)` that returns a Numpy 61 | array of the same shape as its arguments. 62 | 63 | name : str 64 | The name for the function as it should be represented in the program 65 | and its visualizations. 66 | 67 | arity : int 68 | The number of arguments that the `function` takes. 69 | 70 | wrap : bool, optional (default=True) 71 | When running in parallel, pickling of custom functions is not supported 72 | by Python's default pickler. This option will wrap the function using 73 | cloudpickle allowing you to pickle your solution, but the evolution may 74 | run slightly more slowly. If you are running single-threaded in an 75 | interactive Python session or have no need to save the model, set to 76 | `False` for faster runs. 77 | 78 | """ 79 | if not isinstance(arity, int): 80 | raise ValueError('arity must be an int, got %s' % type(arity)) 81 | if not isinstance(function, np.ufunc): 82 | if function.__code__.co_argcount != arity: 83 | raise ValueError('arity %d does not match required number of ' 84 | 'function arguments of %d.' 85 | % (arity, function.__code__.co_argcount)) 86 | if not isinstance(name, str): 87 | raise ValueError('name must be a string, got %s' % type(name)) 88 | if not isinstance(wrap, bool): 89 | raise ValueError('wrap must be an bool, got %s' % type(wrap)) 90 | #print (arity,'niaho') 91 | # Check output shape 92 | args = [np.ones(10) for _ in range(arity)] 93 | try: 94 | function(*args) 95 | except ValueError: 96 | raise ValueError('supplied function %s does not support arity of %d.' 97 | % (name, arity)) 98 | if not hasattr(function(*args), 'shape'): 99 | raise ValueError('supplied function %s does not return a numpy array.' 100 | % name) 101 | if function(*args).shape != (10,): 102 | raise ValueError('supplied function %s does not return same shape as ' 103 | 'input vectors.' % name) 104 | 105 | # Check closure for zero & negative input arguments 106 | args = [np.zeros(10) for _ in range(arity)] 107 | if not np.all(np.isfinite(function(*args))): 108 | raise ValueError('supplied function %s does not have closure against ' 109 | 'zeros in argument vectors.' % name) 110 | args = [-1 * np.ones(10) for _ in range(arity)] 111 | if not np.all(np.isfinite(function(*args))): 112 | raise ValueError('supplied function %s does not have closure against ' 113 | 'negatives in argument vectors.' 
% name) 114 | 115 | if wrap: 116 | return _Function(function=wrap_non_picklable_objects(function), 117 | name=name, 118 | arity=arity) 119 | return _Function(function=function, 120 | name=name, 121 | arity=arity) 122 | 123 | 124 | def _protected_division(x1, x2): 125 | """Closure of division (x1/x2) for zero denominator.""" 126 | with np.errstate(divide='ignore', invalid='ignore'): 127 | return np.where(np.abs(x2) > 0.001, np.divide(x1, x2), 1.) 128 | 129 | 130 | def _protected_sqrt(x1): 131 | """Closure of square root for negative arguments.""" 132 | return np.sqrt(np.abs(x1)) 133 | 134 | 135 | def _protected_log(x1): 136 | """Closure of log for zero arguments.""" 137 | with np.errstate(divide='ignore', invalid='ignore'): 138 | return np.where(np.abs(x1) > 0.001, np.log(np.abs(x1)), 0.) 139 | 140 | 141 | def _protected_inverse(x1): 142 | """Closure of log for zero arguments.""" 143 | with np.errstate(divide='ignore', invalid='ignore'): 144 | return np.where(np.abs(x1) > 0.001, 1. / x1, 0.) 145 | 146 | 147 | def _sigmoid(x1): 148 | """Special case of logistic function to transform to probabilities.""" 149 | with np.errstate(over='ignore', under='ignore'): 150 | return 1 / (1 + np.exp(-x1)) 151 | 152 | 153 | add2 = _Function(function=np.add, name='add', arity=2) 154 | sub2 = _Function(function=np.subtract, name='sub', arity=2) 155 | mul2 = _Function(function=np.multiply, name='mul', arity=2) 156 | div2 = _Function(function=_protected_division, name='div', arity=2) 157 | sqrt1 = _Function(function=_protected_sqrt, name='sqrt', arity=1) 158 | log1 = _Function(function=_protected_log, name='log', arity=1) 159 | neg1 = _Function(function=np.negative, name='neg', arity=1) 160 | inv1 = _Function(function=_protected_inverse, name='inv', arity=1) 161 | abs1 = _Function(function=np.abs, name='abs', arity=1) 162 | max2 = _Function(function=np.maximum, name='max', arity=2) 163 | min2 = _Function(function=np.minimum, name='min', arity=2) 164 | sin1 = _Function(function=np.sin, name='sin', arity=1) 165 | cos1 = _Function(function=np.cos, name='cos', arity=1) 166 | tan1 = _Function(function=np.tan, name='tan', arity=1) 167 | sig1 = _Function(function=_sigmoid, name='sig', arity=1) 168 | 169 | _function_map = {'add': add2, 170 | 'sub': sub2, 171 | 'mul': mul2, 172 | 'div': div2, 173 | 'sqrt': sqrt1, 174 | 'log': log1, 175 | 'abs': abs1, 176 | 'neg': neg1, 177 | 'inv': inv1, 178 | 'max': max2, 179 | 'min': min2, 180 | 'sin': sin1, 181 | 'cos': cos1, 182 | 'tan': tan1} 183 | -------------------------------------------------------------------------------- /fitness.py: -------------------------------------------------------------------------------- 1 | # encoding:utf-8 2 | """Metrics to evaluate the fitness of a program. 3 | 4 | The :mod:`gplearn.fitness` module contains some metric with which to evaluate 5 | the computer programs created by the :mod:`gplearn.genetic` module. 6 | """ 7 | 8 | # Author: Trevor Stephens 9 | # 10 | # License: BSD 3 clause 11 | 12 | import numbers 13 | import numpy as np 14 | from joblib import wrap_non_picklable_objects 15 | from scipy.stats import rankdata 16 | 17 | __all__ = ['make_fitness'] 18 | 19 | 20 | class _Fitness(object): 21 | 22 | """A metric to measure the fitness of a program. 23 | 24 | This object is able to be called with NumPy vectorized arguments and return 25 | a resulting floating point score quantifying the quality of the program's 26 | representation of the true relationship. 
27 | 28 | Parameters 29 | ---------- 30 | function : callable 31 | A function with signature function(y, y_pred, sample_weight) that 32 | returns a floating point number. Where `y` is the input target y 33 | vector, `y_pred` is the predicted values from the genetic program, and 34 | sample_weight is the sample_weight vector. 35 | 36 | greater_is_better : bool 37 | Whether a higher value from `function` indicates a better fit. In 38 | general this would be False for metrics indicating the magnitude of 39 | the error, and True for metrics indicating the quality of fit. 40 | 41 | """ 42 | 43 | def __init__(self, function, greater_is_better,stock_is = None): 44 | self.function = function 45 | self.stock_is = stock_is 46 | self.greater_is_better = greater_is_better 47 | self.sign = 1 if greater_is_better else -1 48 | 49 | def __call__(self, *args): 50 | return self.function(*args) 51 | 52 | 53 | def make_fitness(function, greater_is_better, wrap=True): 54 | """Make a fitness measure, a metric scoring the quality of a program's fit. 55 | 56 | This factory function creates a fitness measure object which measures the 57 | quality of a program's fit and thus its likelihood to undergo genetic 58 | operations into the next generation. The resulting object is able to be 59 | called with NumPy vectorized arguments and return a resulting floating 60 | point score quantifying the quality of the program's representation of the 61 | true relationship. 62 | 63 | Parameters 64 | ---------- 65 | function : callable 66 | A function with signature function(y, y_pred, sample_weight) that 67 | returns a floating point number. Where `y` is the input target y 68 | vector, `y_pred` is the predicted values from the genetic program, and 69 | sample_weight is the sample_weight vector. 70 | 71 | greater_is_better : bool 72 | Whether a higher value from `function` indicates a better fit. In 73 | general this would be False for metrics indicating the magnitude of 74 | the error, and True for metrics indicating the quality of fit. 75 | 76 | wrap : bool, optional (default=True) 77 | When running in parallel, pickling of custom metrics is not supported 78 | by Python's default pickler. This option will wrap the function using 79 | cloudpickle allowing you to pickle your solution, but the evolution may 80 | run slightly more slowly. If you are running single-threaded in an 81 | interactive Python session or have no need to save the model, set to 82 | `False` for faster runs. 83 | 84 | """ 85 | if not isinstance(greater_is_better, bool): 86 | raise ValueError('greater_is_better must be bool, got %s' 87 | % type(greater_is_better)) 88 | if not isinstance(wrap, bool): 89 | raise ValueError('wrap must be an bool, got %s' % type(wrap)) 90 | if function.__code__.co_argcount != 3: 91 | raise ValueError('function requires 3 arguments (y, y_pred, w),' 92 | ' got %d.' 
% function.__code__.co_argcount)
93 |     if not isinstance(function(np.array([1, 1]),
94 |                        np.array([2, 2]),
95 |                        np.array([1, 1])), numbers.Number):
96 |         raise ValueError('function must return a numeric.')
97 | 
98 |     if wrap:
99 |         return _Fitness(function=wrap_non_picklable_objects(function),
100 |                         greater_is_better=greater_is_better)
101 |     return _Fitness(function=function,
102 |                     greater_is_better=greater_is_better)
103 | 
104 | 
105 | def _weighted_pearson(y, y_pred, w):
106 |     """Calculate the weighted Pearson correlation coefficient."""
107 |     with np.errstate(divide='ignore', invalid='ignore'):
108 |         y_pred_demean = y_pred - np.average(y_pred, weights=w)
109 |         y_demean = y - np.average(y, weights=w)
110 |         corr = ((np.sum(w * y_pred_demean * y_demean) / np.sum(w)) /
111 |                 np.sqrt((np.sum(w * y_pred_demean ** 2) *
112 |                          np.sum(w * y_demean ** 2)) /
113 |                         (np.sum(w) ** 2)))
114 |     if np.isfinite(corr):
115 |         return np.abs(corr)
116 |     return 0.
117 | 
118 | 
119 | def _weighted_spearman(y, y_pred, w):
120 |     """Calculate the weighted Spearman correlation coefficient."""
121 |     y_pred_ranked = np.apply_along_axis(rankdata, 0, y_pred)
122 |     y_ranked = np.apply_along_axis(rankdata, 0, y)
123 |     return _weighted_pearson(y_pred_ranked, y_ranked, w)
124 | 
125 | 
126 | def _mean_absolute_error(y, y_pred, w):
127 |     """Calculate the mean absolute error."""
128 |     return np.average(np.abs(y_pred - y), weights=w)
129 | 
130 | 
131 | def _mean_square_error(y, y_pred, w):
132 |     """Calculate the mean square error."""
133 |     return np.average(((y_pred - y) ** 2), weights=w)
134 | 
135 | # Stock-dedicated metric: keep the original (y, y_pred, w) signature so the overall framework structure is untouched.
136 | def _stock_dedicated(y, y_pred, w):
137 |     return np.average(y_pred, weights=w)
138 | 
139 | def _root_mean_square_error(y, y_pred, w):
140 |     """Calculate the root mean square error."""
141 |     return np.sqrt(np.average(((y_pred - y) ** 2), weights=w))
142 | 
143 | 
144 | def _log_loss(y, y_pred, w):
145 |     """Calculate the log loss."""
146 |     eps = 1e-15
147 |     inv_y_pred = np.clip(1 - y_pred, eps, 1 - eps)
148 |     y_pred = np.clip(y_pred, eps, 1 - eps)
149 |     score = y * np.log(y_pred) + (1 - y) * np.log(inv_y_pred)
150 |     return np.average(-score, weights=w)
151 | 
152 | 
153 | weighted_pearson = _Fitness(function=_weighted_pearson,
154 |                             greater_is_better=True)
155 | weighted_spearman = _Fitness(function=_weighted_spearman,
156 |                              greater_is_better=True)
157 | mean_absolute_error = _Fitness(function=_mean_absolute_error,
158 |                                greater_is_better=False)
159 | mean_square_error = _Fitness(function=_mean_square_error,
160 |                              greater_is_better=False)
161 | root_mean_square_error = _Fitness(function=_root_mean_square_error,
162 |                                   greater_is_better=False)
163 | log_loss = _Fitness(function=_log_loss,
164 |                     greater_is_better=False)
165 | stock_dedicated = _Fitness(function=_stock_dedicated,
166 |                            greater_is_better=True,
167 |                            stock_is=True)
168 | 
169 | 
170 | 
171 | _fitness_map = {'pearson': weighted_pearson,
172 |                 'spearman': weighted_spearman,
173 |                 'mean absolute error': mean_absolute_error,
174 |                 'mse': mean_square_error,
175 |                 'rmse': root_mean_square_error,
176 |                 'log loss': log_loss,
177 |                 'stock_dedicated': stock_dedicated}
--------------------------------------------------------------------------------
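A quick numeric check of the metrics above (illustrative, not part of the original file): with unit weights, _weighted_pearson reduces to the absolute Pearson correlation, and _stock_dedicated is simply the weighted mean of the predictions:

    import numpy as np
    y = np.array([1., 2., 3., 4.])
    w = np.ones(4)
    print(_weighted_pearson(y, 2. * y + 1., w))  # ~1.0: a perfect linear relationship
    print(_stock_dedicated(y, 2. * y + 1., w))   # 6.0: mean of y_pred = (3+5+7+9)/4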
/data_processing/DataProcessing:
--------------------------------------------------------------------------------
1 | # encoding:utf-8
2 | from load_data.loader import load_day_data, load_minute_data
3 | import numpy as np
4 | import matplotlib.pyplot as plt
5 | from gplearn.utils import check_random_state
6 | from gplearn.genetic import SymbolicRegressor
7 | import time
8 | import numba
9 | import copy
10 | import h5py
11 | import gc
12 | 
13 | # ['open', 'high', 'low', 'close', 'volume', 'price', 'change', 'mean_five', 'highest_five', 'lowest_five',
14 | #  'mean_20', 'highest_20', 'lowest_20', '20_profit', '20_mean', '5_mean', '20_mean_vol', '5_mean_vol']
15 | from collections import OrderedDict
16 | time_now = time.time()
17 | import pandas as pd
18 | # Compute the maximum drawdown of a price path.
19 | # (the input is a list or an array)
20 | @numba.jit
21 | def MaxDrawdown(strategy):
22 | 
23 |     length = (strategy.shape)[0]
24 |     drawdown = []
25 |     py = strategy[0]
26 |     for i in range(1, length):
27 |         px = strategy[i]
28 |         py = max(strategy[:i])          # running peak up to (but excluding) i
29 |         drawdown.append(1 - (px / py))  # drop from that peak
30 |     if max(drawdown) > 0:
31 |         return max(drawdown)
32 |     else:
33 |         return 0
34 | 
35 | # Check whether a sequence contains np.nan.
36 | @numba.jit
37 | def kong_pan(xulie):
38 |     for i in xulie:
39 |         if abs(i) >= 0:   # NaN fails this comparison
40 |             pass
41 |         else:
42 |             return True
43 |     return False
44 | 
45 | # A class for preparing the stock data.
46 | class data_chuli(object):
47 |     def __init__(self, start_time, end_time, stock_list, day_or_minute='day', train_ratio=0.3, adjust=True):
48 |         self.start_time = start_time
49 |         self.end_time = end_time
50 |         self.stock_list = stock_list          # stock list
51 |         self.day_or_minute = day_or_minute    # daily or minute bars
52 |         self.train_ratio = train_ratio        # fraction of the data used for training
53 |         self.adjust = adjust                  # whether to use adjusted prices
54 | 
55 | 
56 |     def data_get(self):
57 |         '''
58 |         Fetch the raw stock data.
59 |         '''
60 |         print(self.day_or_minute)
61 |         if self.day_or_minute == 'day':
62 |             print(u'loading daily data')
63 |             [data, startdate, enddate] = load_day_data(stockList=self.stock_list,
64 |                                                        start=self.start_time,
65 |                                                        end=self.end_time)
66 |         else:
67 |             print(u'loading minute data')
68 |             [data, startdate, enddate] = load_minute_data(stockList=self.stock_list,
69 |                                                           start=self.start_time,
70 |                                                           end=self.end_time)
71 |         return data
72 | 
73 | 
74 |     def index_get(self, indexes='000001'):
75 |         data = load_day_data(indexes=indexes, start=self.start_time, end=self.end_time)
76 | 
77 |         #print (data[indexes].head())
78 |         return data[indexes]
79 | 
80 |     def adaptability_compute(self, cycle=5):
81 | 
82 |         '''
83 |         Compute the fitness inputs: the forward return over the holding
84 |         cycle and the maximum drawdown of the price path over that cycle.
85 |         '''
86 |         data = self.data_get()
87 |         new_data = OrderedDict()
88 |         for stocks in self.stock_list:
89 |             stock = data[stocks]
90 | 
91 |             closes = list(stock['close'])
92 |             close_list = []
93 |             try:
94 |                 for i in range(cycle):
95 |                     close_list.append(closes[i+1:] + [closes[-1] for t in range(i+1)])
96 |                 close_list = np.array(close_list).T
97 |                 t = stock.shift(-1*cycle)['close'] / stock['close']
98 |                 maxdowntown = [MaxDrawdown(close_list[i, :]) for i in range(len(close_list[:, 0]))]
99 |                 stock['after_5_profit'] = t
100 |                 stock['after_maxdowntown'] = maxdowntown
101 |                 new_data[stocks] = stock
102 |             except:
103 |                 pass
104 |         del data
105 |         gc.collect()
106 |         return new_data
107 |     def index_chuli(self, index_data):
108 | 
109 |         pindex_data = pd.DataFrame()
110 |         pindex_data['close'] = list(index_data['close'])
111 |         pindex_data['volume'] = list(index_data['volume'])
112 | 
113 |         # index returns over the previous 20 and 5 trading days
114 |         pindex_data['20_profit'] = pindex_data['close']/(pindex_data.shift(20)['close']) - 1
115 |         pindex_data['5_profit'] = pindex_data['close']/(pindex_data.shift(5)['close']) - 1
116 |         # today's price relative to the 5-day and 20-day moving averages
117 |         pindex_data_mean5 = pindex_data.rolling(5).mean()
118 |         pindex_data_mean20 = pindex_data.rolling(20).mean()
119 |         pindex_data['20_mean'] = (pindex_data['close'])/(pindex_data_mean20['close']) - 1
120 |         pindex_data['5_mean'] = (pindex_data['close'])/(pindex_data_mean5['close']) - 1
121 |         # today's volume relative to the 5-day and 20-day average volume
122 |         pindex_data['20_mean_vol'] = (pindex_data['volume'])/(pindex_data_mean20['volume']) - 1
123 |         pindex_data['5_mean_vol'] = (pindex_data['volume'])/(pindex_data_mean5['volume']) - 1
124 |         pindex_data['date'] = list(index_data['date'])
125 |         #print (pindex_data.head(10))
126 |         #print (list(pindex_data.columns))
127 |         return pindex_data
128 |     def factor_get(self):
129 |         '''
130 |         Build the base factor set. In this experiment the base factors are the
131 |         open/close/high/low prices, volume, and the 5-day/20-day mean, high and low.
132 |         '''
133 |         stock_data = self.adaptability_compute()
134 |         index_data = self.index_get()
135 |         index_data = self.index_chuli(index_data=index_data)
136 |         # Process the index to obtain: the return over the previous 20 and 5
137 |         # trading days and today's move,
138 |         # today's price vs its 20-day and 5-day means,
139 |         # and today's volume vs its 20-day and 5-day mean volume.
140 |         index_data = index_data[['date','20_profit','20_mean','5_mean','20_mean_vol','5_mean_vol']]
141 |         for stock in list(stock_data.keys()):
142 |             stock_data[stock]['mean_five'] = (stock_data[stock][['close','low']].rolling(5).mean())['close']
143 | 
144 |             stock_data[stock]['highest_five'] = (stock_data[stock][['high','low']].rolling(5).max())['high']
145 |             stock_data[stock]['lowest_five'] = (stock_data[stock][['close','low']].rolling(5).min())['low']
146 |             stock_data[stock]['mean_20'] = (stock_data[stock][['close','low']].rolling(20).mean())['close']
147 |             stock_data[stock]['highest_20'] = (stock_data[stock][['high','low']].rolling(20).max())['high']
148 |             stock_data[stock]['lowest_20'] = (stock_data[stock][['close','low']].rolling(20).min())['low']
149 |             stock_data[stock]['date'] = list(stock_data[stock].index)
150 |             stock_data[stock] = pd.merge(stock_data[stock], index_data, on='date', how='inner')
151 |         return stock_data
152 | 
153 |     # Split features and forward returns by date, dropping rows that contain NaN.
154 | 
155 |     def time_adjust(self):
156 |         all_data = self.factor_get()
157 |         index_data = self.index_get()
158 |         columns = list(all_data[self.stock_list[0]].columns)
159 |         adjusted_factor = ['after_5_profit','after_maxdowntown']
160 |         for i in adjusted_factor:
161 |             columns.remove(i)
162 |         columns.remove('code')
163 |         #print (columns)
164 |         time_list = list(index_data['date'])
165 |         changdu = len(time_list)   # number of trading days
166 |         t_columns = copy.deepcopy(columns)
167 |         t_columns.remove('date')
168 |         #print (list((all_data['000333'])['date']))
169 |         x_datas = [[] for i in range(len(time_list))]
170 |         y_datas = [[] for i in range(len(time_list))]
171 |         f = 0
172 |         nan_value = [np.nan for i in t_columns]
173 |         print(len(nan_value), t_columns)
174 |         for stock in list(all_data.keys()):
175 |             stockdata = all_data[stock]
176 |             x_data = stockdata[columns]
177 |             print(stock, f)
178 |             f += 1
179 |             y_data = stockdata[adjusted_factor + ['date']]
180 |             del stockdata
181 |             for i in range(len(time_list)):
182 |                 now_time = time_list[i]
183 |                 x_stkdata = x_data[x_data['date'] == now_time]
184 |                 y_stkdata = y_data[y_data['date'] == now_time]
185 |                 new_x = np.array(x_stkdata[t_columns])
186 | 
187 | 
188 |                 new_y = list(np.array(y_stkdata['after_5_profit']))
189 | 
190 |                 if len(new_y) == 0 or len(new_x) == 0:
191 |                     x_datas[i].append(nan_value)
192 |                     y_datas[i].append(np.nan)
193 | 
194 |                 else:
195 | 
196 |                     x_datas[i].append(list(new_x[0]))
197 |                     y_datas[i].append(new_y[0])
198 | 
199 |             del x_data, y_data
200 |             gc.collect()
201 |         x_train_data = x_datas[20:int(changdu*self.train_ratio)]   # drop the first 20 rows: the rolling features need a 20-day warm-up
202 |         y_train_data = y_datas[20:int(changdu*self.train_ratio)]
203 |         x_test_data = x_datas[int(changdu*self.train_ratio):-5]   # drop the last 5 rows: the 5-day forward return is undefined there
204 |         y_test_data = y_datas[int(changdu*self.train_ratio):-5]
205 |         return (x_train_data, y_train_data, x_test_data, y_test_data)
206 | 
207 | 
208 | 
209 |     # Select the train/test features and the fitness targets.
210 |     def train_test(self):
211 |         '''
212 |         Split the data into train/test x and y.
213 |         '''
214 |         all_data = self.factor_get()
215 |         columns = list(all_data[self.stock_list[0]].columns)
216 | 
217 |         adjusted_factor = ['after_5_profit','after_maxdowntown']
218 |         for i in adjusted_factor:
219 |             columns.remove(i)
220 |         columns.remove('date')
221 |         columns.remove('code')
222 |         new_stock_data_train = OrderedDict()
223 |         new_adjust_data_train = OrderedDict()
224 |         new_stock_data_test = OrderedDict()
225 |         new_adjust_data_test = OrderedDict()
226 | 
227 |         for stock in self.stock_list:
228 |             changdu = len(all_data[stock]['open'])
229 |             x_data = np.array(all_data[stock][columns])
230 |             y_data = np.array(all_data[stock][adjusted_factor])
231 |             new_stock_data_train[stock] = x_data[21:int(changdu*self.train_ratio), :]
232 |             new_adjust_data_train[stock] = y_data[21:int(changdu*self.train_ratio), :]
233 |             new_stock_data_test[stock] = x_data[int(changdu*self.train_ratio):, :-20]
234 |             new_adjust_data_test[stock] = y_data[int(changdu*self.train_ratio):, :-20]
235 | 
236 |         return (new_stock_data_train, new_adjust_data_train, new_stock_data_test, new_adjust_data_test)
237 | # Clean data that has been reloaded from disk.
238 | #@numba.jit
239 | def jinghua_data(x_train, y_train, x_test, y_test):
240 |     x_trains = [[] for i in range(len(x_train))]
241 |     y_trains = [[] for i in range(len(x_train))]
242 |     x_tests = [[] for i in range(len(x_test))]
243 |     y_tests = [[] for i in range(len(x_test))]
244 |     (trains_time, trains_stock, f) = x_train.shape
245 |     (test_time, test_stock, f) = x_test.shape
246 |     for i in range(trains_time):
247 |         for j in range(trains_stock):
248 | 
249 |             if not kong_pan(x_train[i, j]) and abs(y_train[i][j]) >= 0:
250 | 
251 |                 x_trains[i].append(list(x_train[i, j]))
252 |                 y_trains[i].append(y_train[i][j])
253 |     '''
254 |     for t in range(test_time):
255 |         for k in range(test_stock):
256 |             if not kong_pan(x_test[t,k]) or abs(y_test[t][k])>=0:
257 |                 x_tests[t].append(list(x_test[t,k]))
258 |                 y_tests[t].append(y_test[t][k])
259 |     '''
260 |     return (x_trains, y_trains, x_test, y_test)   # test-set cleaning above is commented out, so the test data passes through unchanged
261 | 
262 | # Run the genetic evolution.
263 | if __name__ == '__main__':
264 |     '''
265 |     stocks = list(pd.read_csv('C:\\Users\\94006\\Desktop\\HS300.csv')['HS300'])[0:150]
266 |     start_time = '2010-06-30'
267 |     end_time = '2018-12-30'
268 |     stock_list = [str(i).zfill(6) for i in stocks]
269 |     day_or_minute = 'day'
270 |     metric = 'stock_dedicated'
271 | 
272 |     train_ratio = 0.7
273 |     adjust = True
274 | 
275 |     data_all = data_chuli(start_time = start_time,
276 |                           end_time = end_time,
277 |                           stock_list = stock_list,
278 |                           day_or_minute = day_or_minute,
279 |                           train_ratio = train_ratio,
280 |                           adjust = adjust)
281 |     (x_train, y_train, x_test, y_test) = data_all.time_adjust()  # build the training and test data
282 |     print (u'ppppppppppppppppp')
283 |     x_train_HS300 = 'D:\\HS300_x_train_datas150.npy'
284 |     y_train_HS300 = 'D:\\HS300_y_train_datas150.npy'
285 |     x_test_HS300 = 'D:\\HS300_x_test_datas150.npy'
286 |     y_test_HS300 = 'D:\\HS300_y_test_datas150.npy'
287 |     np.save(x_train_HS300, np.array(x_train))
288 |     np.save(y_train_HS300, np.array(y_train))
289 |     np.save(x_test_HS300, np.array(x_test))
290 |     np.save(y_test_HS300, np.array(y_test))
291 |     '''
292 | 
293 |     x_train = 
np.array(np.load('D:\\HS300_x_train_datas150.npy')) 294 | y_train = np.array(np.load('D:\\HS300_y_train_datas150.npy')) 295 | x_test = np.array(np.load('D:\\HS300_x_test_datas150.npy')) 296 | y_test =np.array( np.load('D:\\HS300_y_test_datas150.npy')) 297 | print (x_train[:,0:2]) 298 | input() 299 | 300 | (a,b,c,d) = jinghua_data(x_train,y_train,x_test,y_test) 301 | 302 | print (u'数据准备完成,进入进化') 303 | est_gp = SymbolicRegressor(population_size=200, 304 | generations=8, stopping_criteria=10000, 305 | p_crossover=0.7, p_subtree_mutation=0.1, 306 | p_hoist_mutation=0.05, p_point_mutation=0.1, 307 | max_samples=0.9, verbose=1, 308 | parsimony_coefficient=0.01, random_state=0, 309 | metric= 'stock_dedicated', 310 | n_jobs=2)# 构建一个遗传进化的类 311 | print (u'类构件完成') 312 | input() 313 | x_trains = a 314 | #print (a) 315 | y_trains = b 316 | 317 | est_gp.fit(x_trains, y_trains) 318 | -------------------------------------------------------------------------------- /load_data/mongodb.py: -------------------------------------------------------------------------------- 1 | #encoding:utf-8 2 | import sys 3 | 4 | #verion1: get all companies data from tushare and store them in Mongodb 5 | import pymongo 6 | import datetime 7 | import tushare as ts 8 | import time 9 | import json 10 | import pandas as pd 11 | from collections import OrderedDict 12 | import pytz 13 | import types 14 | import requests 15 | from io import BytesIO, StringIO 16 | import os 17 | import click 18 | import re 19 | from os import listdir 20 | from os.path import isfile, join 21 | from os import walk 22 | import gc 23 | 24 | from pandas import DataFrame 25 | 26 | 27 | 28 | class LoadDataCVS: 29 | 30 | basedir="E:/data_new" 31 | stockdata=basedir+"/stock_data" 32 | indexdata=basedir+"/index_data" 33 | 34 | #treasurvity 35 | in_package_data = range(2002, 2018) 36 | DONWLOAD_URL = "http://yield.chinabond.com.cn/cbweb-mn/yc/downYearBzqx?year=%s&&wrjxCBFlag=0&&zblx=txy&ycDefId=%s" 37 | YIELD_MAIN_URL = 'http://yield.chinabond.com.cn/cbweb-mn/yield_main' 38 | # 39 | #'http://yield.chinabond.com.cn/cbweb-mn/yield_main' 40 | 41 | #'#http://yield.chinabond.com.cn/cbweb-mn/yield_main?locale=zh_CN','http://yield.chinabond.com.cn/cbweb-mn/yield_main?locale=zh_CN' 42 | 43 | 44 | 45 | def __init__(self,Ip,port): 46 | self.ip=Ip 47 | self.port=port 48 | 49 | ## connect to the data base 50 | def Conn(self): 51 | self.client = pymongo.MongoClient(self.ip,self.port) 52 | self.connection=self.client.stock #storage stock information 53 | self.index=self.client.index #storage index 54 | self.pool=self.client.pool #storate pool 55 | self.treasure=self.client.treasure 56 | self.minute_stock = self.client.minute_stock 57 | self.minute_index = self.client.minute_index 58 | #print self.connection.collection_names() 59 | #print self.index.collection_names() 60 | #print self.pool.collection_names() 61 | def Close(self): 62 | self.client.close() 63 | 64 | 65 | #store data information into database, do not always call this 66 | def storagedaily(self): 67 | #get the filelist 68 | onlyfiles = [ f for f in listdir(self.stockdata) if isfile(join(self.stockdata,f)) ] 69 | #read from using pandas 70 | for f in onlyfiles: 71 | df = pd.read_csv(self.stockdata+"/"+f) 72 | #print df.head() 73 | s=f.split('.') 74 | name = s[0][2:8] 75 | #print name 76 | records = json.loads(df.T.to_json()).values() 77 | for row in records: 78 | row['date'] = datetime.datetime.strptime(row['date'], "%Y-%m-%d") 79 | #print row 80 | #raw_input() 81 | print (name) 82 | 
self.connection[name].insert_many(records) 83 | 84 | #store index information into database,do not always call this 85 | 86 | def storageindex(self): 87 | #get the filelist 88 | onlyfiles = [ f for f in listdir(self.indexdata) if isfile(join(self.indexdata,f)) ] 89 | #read from using pandas 90 | for f in onlyfiles: 91 | df = pd.read_csv(self.indexdata+"/"+f) 92 | s=f.split('.') 93 | name = s[0][2:8] 94 | records = json.loads(df.T.to_json()).values() 95 | for row in records: 96 | row['date'] = datetime.datetime.strptime(row['date'], "%Y-%m-%d") 97 | print (name) 98 | self.index[name].insert_many(records) 99 | 100 | 101 | 102 | #storage stock pool into database 103 | def storagepool(self): 104 | #storage zz500 105 | df=ts.get_zz500s() 106 | self.pool['zz500'].insert_many(json.loads(df.to_json(orient='records'))) 107 | #hs300 108 | df=ts.get_hs300s() 109 | self.pool['hz300'].insert_many(json.loads(df.to_json(orient='records'))) 110 | #zh50 111 | df=ts.get_sz50s() 112 | self.pool['sz'].insert_many(json.loads(df.to_json(orient='records'))) 113 | #st 114 | df=ts.get_st_classified() 115 | self.pool['st'].insert_many(json.loads(df.to_json(orient='records'))) 116 | 117 | 118 | 119 | 120 | #get the particular stock list from data base 121 | def getstocklist(self,kind): 122 | ret=[] 123 | if kind=="hs300": 124 | for t in self.pool['hz300'].find(): 125 | ret.append(t['code']) 126 | if kind =="zz500": 127 | for t in self.pool['zz500'].find(): 128 | ret.append(t['code']) 129 | if kind=='sz50': 130 | for t in self.pool['sz'].find(): 131 | ret.append(t['code']) 132 | if kind =='st': 133 | for t in self.pool['st'].find(): 134 | ret.append(t['code']) 135 | if kind == 'all': 136 | for t in self.pool['all'].find(): 137 | ret.append(t['codes']) 138 | 139 | return ret 140 | 141 | #get daily stock information from database 142 | #return dataframe which contains the information we set in the parameters 143 | 144 | def getstockdaily(self,code,start='2000-01-01',end='2099-01-01'): 145 | total=[] 146 | startdate = datetime.datetime.strptime(start, "%Y-%m-%d") 147 | enddate=datetime.datetime.strptime(end, "%Y-%m-%d") 148 | series={"date":[],"open":[],"close":[],"high":[],"low":[],"volume":[],"prices":[],"change":[],"code":[]} 149 | #now_time=time.time() 150 | #print self.connection[code].find({"date": {"$gte": startdate,"$lt":enddate}}).sort("date") 151 | #new_time = time.time() 152 | #print new_time - now_time 153 | #tt = self.connection[code].find({},{'_id':0,'date':1}).sort('date',-1) 154 | #print tt[0]['date'] 155 | #raw_input() 156 | tt = self.connection[code].find({"date": {"$gte": startdate,"$lte":enddate}}).sort("date") 157 | for stockdaily in tt: 158 | series["date"].append(stockdaily["date"]) 159 | series["open"].append(stockdaily["open"]) 160 | series["close"].append(stockdaily["close"]) 161 | series["high"].append(stockdaily["high"]) 162 | series["low"].append(stockdaily["low"]) 163 | series["volume"].append(stockdaily["volume"]) 164 | series["prices"].append(stockdaily["adj_factor"]) 165 | series["change"].append(stockdaily["change"]) 166 | series["code"].append(stockdaily["code"]) 167 | #pp=time.time() 168 | del tt 169 | gc.collect() 170 | totaldata=zip(series['open'],series['high'],series['low'],series['close'],series['volume'],series["prices"],series["change"],series["code"]) 171 | df = pd.DataFrame(data=list(totaldata),index=series["date"],columns = ['open','high','low','close','volume','prices','change',"code"]) 172 | try: 173 | df['price'] = (df['close']*df['prices'])/(list(df['prices'])[-1]) 174 | 
df = df[['open','high','low','close','volume','price','change',"code"]] 175 | #print df.drop_duplicates() 176 | #raw_input() 177 | return df.drop_duplicates() 178 | except: 179 | df.columns = ['open','high','low','close','volume','price','change',"code"] 180 | return df.drop_duplicates() 181 | 182 | 183 | def getstockminute(self,code,start,end): 184 | startdate = datetime.datetime.strptime(start, "%Y-%m-%d") 185 | enddate=datetime.datetime.strptime(end, "%Y-%m-%d") 186 | series={"date":[],"open":[],"close":[],"high":[],"low":[],"volume":[],"prices":[],"change":[],"code":[]} 187 | tt_date = '1991-01-01' 188 | tt = self.minute_stock[code].find({"date": {"$gte": startdate,"$lte":enddate}}).sort("date") 189 | for stockdaily in tt: 190 | if tt_date != str(stockdaily["date"])[0:10]: 191 | time_day = datetime.datetime.strptime(str(stockdaily['date'])[0:10], "%Y-%m-%d") 192 | tt =self.connection[code].find({"date": {"$gte":time_day ,"$lte":time_day}})[0] 193 | tt_date = str(stockdaily["date"])[0:10] 194 | else: 195 | pass 196 | series["date"].append(stockdaily["date"]) 197 | series["open"].append(stockdaily["open"]) 198 | series["close"].append(stockdaily["close"]) 199 | series["high"].append(stockdaily["high"]) 200 | series["low"].append(stockdaily["low"]) 201 | series["volume"].append(stockdaily["vol"]) 202 | series["prices"].append(tt["adj_factor"]) 203 | series["change"].append(stockdaily["p_change"]) 204 | series["code"].append(stockdaily["code"]) 205 | #pp=time.time 206 | del tt 207 | gc.collect() 208 | totaldata=zip(series['open'],series['high'],series['low'],series['close'],series['volume'],series["prices"],series["change"],series["code"],series['date']) 209 | 210 | df = pd.DataFrame(data=list(totaldata),index=series["date"],columns = ['open','high','low','close','volume','prices','change',"code",'date']) 211 | df['change']= df['change']/100 212 | df['volume'] = df['volume']*100 213 | for factor in ['open','close','high','low','prices']: 214 | df[factor] = [float("%.2f"%i) for i in list(df[factor])] 215 | df = df.drop_duplicates(subset=['date']) 216 | #df.to_csv('E:\\stock_%sdatashujuqingkaung.csv'%list(df['code'])[0]) 217 | try: 218 | df['price'] = (df['close']*df['prices'])/(list(df['prices'])[-1]) 219 | df = df[['open','high','low','close','volume','price','change',"code"]] 220 | return df 221 | except: 222 | #df.drop_duplicates().fillna(method='pad').to_csv('E:\\stock_datashujuqingkaung.csv') 223 | df.columns = ['open','high','low','close','volume','price','change',"code"] 224 | return df 225 | 226 | 227 | 228 | def getBenchamark(self,code,start,end): 229 | #if it is timestamp type 230 | startdate=start 231 | enddate=end 232 | #print u'这里',start,end 233 | if type(start) is types.StringType: 234 | startdate = datetime.datetime.strptime(start, "%Y-%m-%d") 235 | if type(end) is types.StringType: 236 | enddate=datetime.datetime.strptime(end, "%Y-%m-%d") 237 | series={"date":[],"change":[]} 238 | for stockdaily in self.index[code].find({"date": {"$gte": startdate,"$lte":enddate}}).sort("date"): 239 | series["date"].append(stockdaily["date"]) 240 | series["change"].append(stockdaily["change"]) 241 | df=pd.Series(data=series["change"],index=series["date"]) 242 | return df.sort_index().tz_localize('UTC') 243 | 244 | def getindexdaily(self,code,start,end): 245 | total=[] 246 | startdate = datetime.datetime.strptime(start, "%Y-%m-%d") 247 | enddate=datetime.datetime.strptime(end, "%Y-%m-%d") 248 | series={"date":[],"open":[],"close":[],"high":[],"low":[],"volume":[]} 249 | 250 | for stockdaily 
in self.index[code].find({"date": {"$gte": startdate,"$lt":enddate}}).sort("date"): 251 | series["date"].append(stockdaily["date"]) 252 | series["open"].append(stockdaily["open"]) 253 | series["close"].append(stockdaily["close"]) 254 | series["high"].append(stockdaily["high"]) 255 | series["low"].append(stockdaily["low"]) 256 | series["volume"].append(stockdaily["volume"]) 257 | 258 | totaldata=zip(series['date'],series['open'],series['close'],series['high'],series['low'],series['volume']) 259 | df = pd.DataFrame(list(totaldata)) 260 | df.columns = ['date','open','close','high','low','volume'] 261 | #print (df.head()) 262 | #df.index=df.date 263 | return df 264 | 265 | def getindexminute(self,code,start,end): 266 | total=[] 267 | startdate = datetime.datetime.strptime(start, "%Y-%m-%d") 268 | enddate=datetime.datetime.strptime(end, "%Y-%m-%d") 269 | series={"date":[],"open":[],"close":[],"high":[],"low":[],"volume":[]} 270 | 271 | for stockdaily in self.minute_index[code].find({"date": {"$gte": startdate,"$lt":enddate}}).sort("date"): 272 | series["date"].append(stockdaily["date"]) 273 | series["open"].append(stockdaily["open"]) 274 | series["close"].append(stockdaily["close"]) 275 | series["high"].append(stockdaily["high"]) 276 | series["low"].append(stockdaily["low"]) 277 | series["volume"].append(stockdaily["vol"]) 278 | 279 | totaldata=zip(series['date'],series['open'],series['close'],series['high'],series['low'],series['volume']) 280 | df = pd.DataFrame(list(totaldata)) 281 | df.index=df.date 282 | return df.drop_duplicates() 283 | 284 | 285 | 286 | 287 | def get_data(self): 288 | 289 | in_package_data = range(2002, 2019) 290 | print (in_package_data) 291 | cur_year = datetime.datetime.now().year 292 | last_in_package_data = max(in_package_data) 293 | 294 | 295 | # download new data 296 | ''' 297 | to_downloads = range(last_in_package_data + 1, cur_year + 1) 298 | print to_downloads 299 | raw_input() 300 | 301 | # frist, get ycDefIds params 302 | response = requests.get(self.YIELD_MAIN_URL) 303 | 304 | matchs = re.search(r'\?ycDefIds=(.*?)\&', response.text) 305 | ycdefids = matchs.group(1) 306 | assert (ycdefids is not None) 307 | 308 | fetched_data = [] 309 | for year in to_downloads: 310 | print('Downloading from ' + self.DONWLOAD_URL % (year, ycdefids)) 311 | response = requests.get(self.DONWLOAD_URL % (year, ycdefids)) 312 | fetched_data.append(BytesIO(response.content)) 313 | 314 | # combine all data''' 315 | 316 | dfs = [] 317 | 318 | basedir = os.path.join(os.path.dirname(__file__), "xlsx") 319 | 320 | for i in in_package_data: 321 | dfs.append(pd.read_excel(os.path.join(basedir, "%d.xlsx" % i))) 322 | ''' 323 | for memfile in fetched_data: 324 | dfs.append(pd.read_excel(memfile)) 325 | ''' 326 | df = pd.concat(dfs) 327 | 328 | return df 329 | 330 | def get_pivot_data(self): 331 | 332 | df = self.get_data() 333 | return df.pivot(index=u'日期', columns=u'标准期限(年)', values=u'收益率(%)') 334 | 335 | 336 | 337 | def insert_zipline_treasure_format(self): 338 | self.treasure['treasure'].drop() 339 | pivot_data = self.get_pivot_data() 340 | #print pivot_data.tail() 341 | #raw_input() 342 | 343 | frame=pivot_data[[0.08,0.25,0.5,1,2,3,5,7,10,20,30]] 344 | frame['Time Period']=frame.index 345 | #print frame.head() 346 | frame['Time Period']= frame['Time Period'].astype('str') # [str(i) for i in list(frame['Time Period'])]# 347 | frame.columns=['1month', '3month','6month', '1year', '2year', '3year', '5year', '7year', '10year', '20year', '30year','Time Period'] 348 | records = 
json.loads(frame.T.to_json()).values() 349 | for row in records: 350 | temp=row['Time Period'] 351 | temp=temp.split('T')[0] 352 | row['Time Period'] = datetime.datetime.strptime(temp, "%Y-%m-%d") 353 | 354 | self.treasure['treasure'].insert_many(records) 355 | 356 | 357 | def read_treasure_from_mongodb(self,start,end): 358 | 359 | startdate=start 360 | enddate=end 361 | series={"Time Period":[],"1month":[],"3month":[],"6month":[],"1year":[],"2year":[],"3year":[],"5year":[],"7year":[],"10year":[],"20year":[],"30year":[]} 362 | if type(start) is types.StringType: 363 | startdate = datetime.datetime.strptime(start, "%Y-%m-%d") 364 | if type(end) is types.StringType: 365 | enddate=datetime.datetime.strptime(end, "%Y-%m-%d") 366 | for treasuredaily in self.treasure['treasure'].find({"Time Period": {"$gte": startdate,"$lt":enddate}}).sort("date"): 367 | series["Time Period"].append(treasuredaily["Time Period"]) 368 | series["1month"].append(treasuredaily["1month"]) 369 | series["3month"].append(treasuredaily["3month"]) 370 | series["6month"].append(treasuredaily["6month"]) 371 | series["1year"].append(treasuredaily["1year"]) 372 | series["2year"].append(treasuredaily["2year"]) 373 | series["3year"].append(treasuredaily["3year"]) 374 | series["5year"].append(treasuredaily["5year"]) 375 | series["7year"].append(treasuredaily["7year"]) 376 | series["10year"].append(treasuredaily["10year"]) 377 | series["20year"].append(treasuredaily["20year"]) 378 | series["30year"].append(treasuredaily["30year"]) 379 | totaldata=zip(series["1month"],series["3month"],series["6month"],series["1year"],series["2year"],series["3year"],series["5year"],series["7year"],series["10year"],series["20year"],series["30year"]) 380 | df = pd.DataFrame(data=list(totaldata),index=series["Time Period"],columns = ['1month', '3month','6month', '1year', '2year', '3year', '5year', '7year', '10year', '20year', '30year']) 381 | return df.sort_index().tz_localize('UTC') 382 | 383 | def storageStockName(self): 384 | totalstock=[] 385 | onlyfiles = [ f for f in listdir(self.stockdata) if isfile(join(self.stockdata,f)) ] 386 | for f in onlyfiles: 387 | s=f.split('.') 388 | name=s[0][2:8] 389 | totalstock.append(name) 390 | 391 | data = {'codes': totalstock} 392 | frame = DataFrame(data) 393 | 394 | self.pool['all'].insert_many(json.loads(frame.to_json(orient='records'))) 395 | print (frame) 396 | 397 | 398 | 399 | if __name__ == '__main__': 400 | l=LoadDataCVS('127.0.0.1',27017) 401 | l.Conn() 402 | #l.storagedaily() 403 | #l.storageindex() 404 | # l.storagepool() 405 | # l.storageStockName() 406 | #l.insert_zipline_treasure_format() 407 | #l.Close() 408 | 409 | #l.storageStockName() 410 | #print l.getstocklist('all') 411 | 412 | -------------------------------------------------------------------------------- /load_data/loader.py: -------------------------------------------------------------------------------- 1 | # encoding:utf-8 2 | # Copyright 2016 Quantopian, Inc. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | import os 16 | import datetime 17 | from collections import OrderedDict 18 | 19 | import logbook 20 | 21 | import constants 22 | 23 | import pandas as pd 24 | #from pandas_datareader import DataReader 25 | import pytz 26 | 27 | from six import iteritems 28 | from six.moves.urllib_error import HTTPError 29 | 30 | #from . benchmarks import get_benchmark_returns 31 | from mongodb import LoadDataCVS 32 | import treasuries, treasuries_can 33 | 34 | 35 | logger = logbook.Logger('Loader') 36 | 37 | # Mapping from index symbol to appropriate bond data 38 | INDEX_MAPPING = { 39 | '^GSPC': 40 | (treasuries, 'treasury_curves.csv', 'www.federalreserve.gov'), 41 | '^GSPTSE': 42 | (treasuries_can, 'treasury_curves_can.csv', 'bankofcanada.ca'), 43 | '^FTSE': # use US treasuries until UK bonds implemented 44 | (treasuries, 'treasury_curves.csv', 'www.federalreserve.gov'), 45 | } 46 | 47 | ONE_HOUR = pd.Timedelta(hours=1) 48 | 49 | 50 | def last_modified_time(path): 51 | """ 52 | Get the last modified time of path as a Timestamp. 53 | """ 54 | return pd.Timestamp(os.path.getmtime(path), unit='s', tz='UTC') 55 | 56 | 57 | def get_data_filepath(name): 58 | """ 59 | Returns a handle to data file. 60 | 61 | Creates containing directory, if needed. 62 | """ 63 | dr = data_root() 64 | 65 | if not os.path.exists(dr): 66 | os.makedirs(dr) 67 | 68 | return os.path.join(dr, name) 69 | 70 | 71 | def get_cache_filepath(name): 72 | cr = cache_root() 73 | if not os.path.exists(cr): 74 | os.makedirs(cr) 75 | 76 | return os.path.join(cr, name) 77 | 78 | 79 | def get_benchmark_filename(symbol): 80 | return "%s_benchmark.csv" % symbol 81 | 82 | 83 | def has_data_for_dates(series_or_df, first_date, last_date): 84 | """ 85 | Does `series_or_df` have data on or before first_date and on or after 86 | last_date? 87 | """ 88 | dts = series_or_df.index 89 | if not isinstance(dts, pd.DatetimeIndex): 90 | raise TypeError("Expected a DatetimeIndex, but got %s." % type(dts)) 91 | first, last = dts[[0, -1]] 92 | return (first <= first_date) and (last >= last_date) 93 | 94 | 95 | def load_market_data(trading_day, 96 | trading_days, 97 | bm_symbol='000001'): 98 | """ 99 | Load benchmark returns and treasury yield curves for the given calendar and 100 | benchmark symbol. 101 | 102 | Benchmarks are downloaded as a Series from Yahoo Finance. Treasury curves 103 | are US Treasury Bond rates and are downloaded from 'www.federalreserve.gov' 104 | by default. For Canadian exchanges, a loader for Canadian bonds from the 105 | Bank of Canada is also available. 106 | 107 | Results downloaded from the internet are cached in 108 | ~/.zipline/data. Subsequent loads will attempt to read from the cached 109 | files before falling back to redownload. 110 | 111 | Parameters 112 | ---------- 113 | trading_day : pandas.CustomBusinessDay, optional 114 | A trading_day used to determine the latest day for which we 115 | expect to have data. Defaults to an NYSE trading day. 116 | trading_days : pd.DatetimeIndex, optional 117 | A calendar of trading days. Also used for determining what cached 118 | dates we should expect to have cached. Defaults to the NYSE calendar. 119 | bm_symbol : str, optional 120 | Symbol for the benchmark index to load. Defaults to '^GSPC', the Yahoo 121 | ticker for the S&P 500. 
122 | 
123 |     Returns
124 |     -------
125 |     (benchmark_returns, treasury_curves) : (pd.Series, pd.DataFrame)
126 | 
127 |     Notes
128 |     -----
129 | 
130 |     Both return values are DatetimeIndexed with values dated to midnight in UTC
131 |     of each stored date. The columns of `treasury_curves` are:
132 | 
133 |     '1month', '3month', '6month',
134 |     '1year','2year','3year','5year','7year','10year','20year','30year'
160 |     """
161 |     first_date = trading_days[0]
162 |     now = pd.Timestamp.utcnow()
163 | 
164 |     # We expect to have benchmark and treasury data that's current up until
165 |     # **two** full trading days prior to the most recently completed trading
166 |     # day.
167 |     # Example:
168 |     # On Thu Oct 22 2015, the previous completed trading day is Wed Oct 21.
169 |     # However, data for Oct 21 doesn't become available until the early morning
170 |     # hours of Oct 22. This means that there are times on the 22nd at which we
171 |     # cannot reasonably expect to have data for the 21st available. To be
172 |     # conservative, we instead expect that at any time on the 22nd, we can
173 |     # download data for Tuesday the 20th, which is two full trading days prior
174 |     # to the date on which we're running a test.
175 |     # We'll attempt to download new data if the latest entry in our cache is
176 |     # before this date.
185 |     #print trading_days.get_loc(now, method='ffill')
186 |     #print trading_days[-1]
187 |     #last_date = trading_days[trading_days.get_loc(now, method='ffill') - 2]
188 |     #print last_date
189 |     last_date = trading_days[-1]
190 |     #raw_input()
191 | 
192 |     br = ensure_benchmark_data(
193 |         bm_symbol,
194 |         first_date,
195 |         last_date,
196 |         now,
197 |         # We need the trading_day to figure out the close prior to the first
198 |         # date so that we can compute returns for the first date.
199 |         trading_day,
200 |     )
201 | 
202 |     tc = ensure_treasury_data(
203 |         bm_symbol,
204 |         first_date,
205 |         last_date,
206 |         now,
207 |     )
208 | 
209 |     benchmark_returns = br[br.index.slice_indexer(first_date, last_date)]
210 |     treasury_curves = tc[tc.index.slice_indexer(first_date, last_date)]
211 |     return benchmark_returns, treasury_curves
212 | 
213 | 
214 | def ensure_benchmark_data(symbol, first_date, last_date, now, trading_day):
215 |     """
216 |     Ensure we have benchmark data for `symbol` from `first_date` to `last_date`
217 | 
218 |     Parameters
219 |     ----------
220 |     symbol : str
221 |         The symbol for the benchmark to load.
222 |     first_date : pd.Timestamp
223 |         First required date for the cache.
224 |     last_date : pd.Timestamp
225 |         Last required date for the cache.
226 |     now : pd.Timestamp
227 |         The current time. 
This is used to prevent repeated attempts to 228 | re-download data that isn't available due to scheduling quirks or other 229 | failures. 230 | trading_day : pd.CustomBusinessDay 231 | A trading day delta. Used to find the day before first_date so we can 232 | get the close of the day prior to first_date. 233 | 234 | We attempt to download data unless we already have data stored at the data 235 | cache for `symbol` whose first entry is before or on `first_date` and whose 236 | last entry is on or after `last_date`. 237 | 238 | If we perform a download and the cache criteria are not satisfied, we wait 239 | at least one hour before attempting a redownload. This is determined by 240 | comparing the current time to the result of os.path.getmtime on the cache 241 | path. 242 | """ 243 | 244 | # If the path does not exist, it means the first download has not happened 245 | # yet, so don't try to read from 'path'. 246 | 247 | try: 248 | data = get_benchmark_returns( 249 | symbol, 250 | first_date - trading_day, 251 | last_date, 252 | ) 253 | except (OSError, IOError, HTTPError): 254 | logger.exception('failed to cache the new benchmark returns') 255 | if not has_data_for_dates(data, first_date, last_date): 256 | logger.warn("Still don't have expected data after redownload!") 257 | return data 258 | 259 | 260 | def ensure_treasury_data(bm_symbol, first_date, last_date, now): 261 | """ 262 | Ensure we have treasury data from treasury module associated with 263 | `bm_symbol`. 264 | 265 | Parameters 266 | ---------- 267 | bm_symbol : str 268 | Benchmark symbol for which we're loading associated treasury curves. 269 | first_date : pd.Timestamp 270 | First date required to be in the cache. 271 | last_date : pd.Timestamp 272 | Last date required to be in the cache. 273 | now : pd.Timestamp 274 | The current time. This is used to prevent repeated attempts to 275 | re-download data that isn't available due to scheduling quirks or other 276 | failures. 277 | 278 | We attempt to download data unless we already have data stored in the cache 279 | for `module_name` whose first entry is before or on `first_date` and whose 280 | last entry is on or after `last_date`. 281 | 282 | If we perform a download and the cache criteria are not satisfied, we wait 283 | at least one hour before attempting a redownload. This is determined by 284 | comparing the current time to the result of os.path.getmtime on the cache 285 | path. 286 | """ 287 | # loader_module, filename, source = INDEX_MAPPING.get( 288 | # bm_symbol, INDEX_MAPPING['^GSPC'] 289 | # ) 290 | # first_date = max(first_date, loader_module.earliest_possible_date()) 291 | # path = get_data_filepath(filename) 292 | 293 | # # If the path does not exist, it means the first download has not happened 294 | # # yet, so don't try to read from 'path'. 295 | # if os.path.exists(path): 296 | # try: 297 | # data = pd.DataFrame.from_csv(path).tz_localize('UTC') 298 | # if has_data_for_dates(data, first_date, last_date): 299 | # return data 300 | 301 | # # Don't re-download if we've successfully downloaded and written a 302 | # # file in the last hour. 303 | # last_download_time = last_modified_time(path) 304 | # if (now - last_download_time) <= ONE_HOUR: 305 | # logger.warn( 306 | # "Refusing to download new treasury data because a " 307 | # "download succeeded at %s." 
% last_download_time 308 | # ) 309 | # return data 310 | 311 | # except (OSError, IOError, ValueError) as e: 312 | # # These can all be raised by various versions of pandas on various 313 | # # classes of malformed input. Treat them all as cache misses. 314 | # logger.info( 315 | # "Loading data for {path} failed with error [{error}].".format( 316 | # path=path, error=e, 317 | # ) 318 | # ) 319 | 320 | # try: 321 | # data = loader_module.get_treasury_data(first_date, last_date) 322 | # data.to_csv(path) 323 | # except (OSError, IOError, HTTPError): 324 | # logger.exception('failed to cache treasury data') 325 | # if not has_data_for_dates(data, first_date, last_date): 326 | # logger.warn("Still don't have expected data after redownload!") 327 | l=LoadDataCVS(constants.IP,constants.PORT) 328 | l.Conn() 329 | data=l.read_treasure_from_mongodb(first_date, last_date) 330 | l.Close() 331 | return data 332 | 333 | 334 | #提取分钟数据,这里我们提取 335 | def load_day_data(indexes=None,stockList=None,start=None,end=None,adjusted=True,rolling_count= 10): 336 | # 337 | """ 338 | load stocks from Mongo 339 | """ 340 | assert indexes is not None or stockList is not None, """ 341 | must specify stockList or indexes""" 342 | #对日期进行改造,提取的数据日期应该高于多于开始日期一个月,这样对于原数据有缓冲作用 343 | start_time = pd.Timestamp(start,tz='UTC') 344 | end_time = pd.Timestamp(end,tz='UTC') 345 | 346 | if start is None: 347 | start = "1990-01-01" 348 | 349 | if start is not None and end is not None: 350 | startdate = datetime.datetime.strptime(start, "%Y-%m-%d") 351 | enddate=datetime.datetime.strptime(end, "%Y-%m-%d") 352 | assert startdate < enddate, "start date is later than end date." 353 | 354 | data = OrderedDict() 355 | start = (datetime.datetime.strptime(start,'%Y-%m-%d')-datetime.timedelta(days=rolling_count+1)).strftime('%Y-%m-%d') 356 | 357 | l=LoadDataCVS(constants.IP,constants.PORT) 358 | l.Conn() 359 | 360 | if stockList=="hs300" or stockList=="zz500" or stockList=="sz50" or stockList=="all": 361 | stocks=l.getstocklist(stockList) 362 | else: 363 | stocks=stockList 364 | 365 | #print stocks 366 | 367 | if stocks is not None: 368 | for stock in stocks: 369 | stkd= l.getstockdaily(stock,start,end) 370 | if not adjusted: 371 | data[stock] = stkd 372 | else: 373 | adj_cols = ['open', 'high', 'low', 'close'] 374 | ratio = stkd['price']/stkd['close'] 375 | ratio_filtered = ratio.fillna(0).values 376 | for col in adj_cols: 377 | stkd[col] *= ratio_filtered 378 | data[stock] = stkd 379 | return [data,start_time,end_time] 380 | 381 | 382 | if indexes is not None: 383 | stkd= l.getindexdaily(indexes,start,end) 384 | data[indexes] = stkd 385 | return data 386 | ''' 387 | for name, ticker in items(indexes): 388 | print (name,ticker) 389 | logger.info('Loading index: {} ({})'.format(name, ticker)) 390 | stkd= l.getindexdaily(indexes,start,end) 391 | data[name] = stkd 392 | return data 393 | ''' 394 | 395 | 396 | 397 | ''' 398 | #['open','high','low','close','volume','price','change',"code"] 399 | print (data) 400 | panel = pd.Panel(data) 401 | panel.minor_axis = ['open', 'high', 'low', 'close', 'volume', 'price','change','code'] 402 | panel.major_axis = panel.major_axis.tz_localize(pytz.utc) 403 | #print panel[stocks[0]].head(5) 404 | 405 | #close the connection 406 | l.Close() 407 | 408 | # Adjust data 409 | if adjusted: 410 | adj_cols = ['open', 'high', 'low', 'close'] 411 | for ticker in panel.items: 412 | ratio = (panel[ticker]['price'] / panel[ticker]['close']) 413 | ratio_filtered = ratio.fillna(0).values 414 | for col in adj_cols: 415 | 
panel[ticker][col] *= ratio_filtered
416 | 
417 | 
418 |     return [panel,start_time,end_time]
419 | '''
420 | 
421 | 
422 | # Define a function to fetch minute-level data; minute bars also need to be price-adjusted
423 | def load_minute_data(indexes=None,stockList=None,start=None,end=None,adjusted=False,rolling_count= 10):
424 | 
425 |     """
426 |     load stocks from Mongo
427 |     """
428 |     assert indexes is not None or stockList is not None, (
429 |         "must specify stockList or indexes"
430 |     )
431 |     # Pad the date range: fetch data starting earlier than the requested start so the raw data has a buffer
432 | 
433 |     starts = start
434 |     ends =end
435 |     #start_time = pd.Timestamp(start,tz='UTC')
436 |     #end_time = pd.Timestamp(end,tz='UTC')
437 | 
438 |     if start is None:
439 |         start = "1990-01-01"
440 | 
441 |     if start is not None and end is not None:
442 |         startdate = datetime.datetime.strptime(start, "%Y-%m-%d")
443 |         enddate=datetime.datetime.strptime(end, "%Y-%m-%d")
444 |         assert startdate < enddate, "start date is later than end date."
445 | 
446 |     data = OrderedDict()
447 |     start = (datetime.datetime.strptime(start,'%Y-%m-%d')-datetime.timedelta(days=rolling_count+1)).strftime('%Y-%m-%d')
448 |     end = (datetime.datetime.strptime(end,'%Y-%m-%d')+datetime.timedelta(days=1)).strftime('%Y-%m-%d')
449 |     l=LoadDataCVS(constants.IP,constants.PORT)
450 |     l.Conn()
451 | 
452 |     if stockList=="hs300" or stockList=="zz500" or stockList=="sz50" or stockList=="all":
453 |         stocks=l.getstocklist(stockList)
454 |     else:
455 |         stocks=stockList
456 | 
457 |     #print stocks
458 | 
459 |     if stocks is not None:
460 |         for stock in stocks:
461 |             stkd= l.getstockminute(stock,start,end)
462 |             data[stock] = stkd
463 |             #print data[stock].head(5)
464 |             #print data[stock].tail(5)
465 | 
466 |     if indexes is not None:
467 |         for name, ticker in iteritems(indexes):
468 |             logger.info('Loading index: {} ({})'.format(name, ticker))
469 |             stkd= l.getindexminute(indexes,start,end)
470 |             data[name] = stkd
471 | 
472 |     #['open','high','low','close','volume','price','change',"code"]
473 |     panel = pd.Panel(data)
474 |     panel.minor_axis = ['open', 'high', 'low', 'close', 'volume', 'price','change','code']
475 |     panel.major_axis = panel.major_axis.tz_localize(pytz.utc)
476 |     #print panel[stocks[0]].head(5)
477 | 
478 |     #close the connection
479 |     l.Close()
480 | 
481 |     # Adjust data
482 |     if adjusted:
483 |         adj_cols = ['open', 'high', 'low', 'close']
484 |         for ticker in panel.items:
485 |             ratio = (panel[ticker]['price'] / panel[ticker]['close'])
486 |             ratio_filtered = ratio.fillna(0).values
487 |             for col in adj_cols:
488 |                 panel[ticker][col] *= ratio_filtered
489 | 
490 |     starts = str(starts) + ' 09:25:00'
491 |     start_time = pd.Timestamp(starts,tz='UTC')
492 |     ends = str(ends) + ' 09:25:00'
493 |     end_time = pd.Timestamp(ends,tz='UTC')
494 |     return [panel,start_time,end_time]
495 | 
496 | 
497 | 
498 | def _load_raw_yahoo_data(indexes=None, stocks=None, start=None, end=None):
499 |     """Load closing prices from yahoo finance.
500 | 
501 |     :Optional:
502 |     indexes : dict (Default: {'SPX': '^GSPC'})
503 |         Financial indexes to load.
504 |     stocks : list (Default: ['AAPL', 'GE', 'IBM', 'MSFT',
505 |                              'XOM', 'AA', 'JNJ', 'PEP', 'KO'])
506 |         Stock closing prices to load.
507 |     start : datetime (Default: datetime(1993, 1, 1, 0, 0, 0, 0, pytz.utc))
508 |         Retrieve prices from start date on.
509 |     end : datetime (Default: datetime(2002, 1, 1, 0, 0, 0, 0, pytz.utc))
510 |         Retrieve prices until end date. 
511 | 512 | :Note: 513 | This is based on code presented in a talk by Wes McKinney: 514 | http://wesmckinney.com/files/20111017/notebook_output.pdf 515 | """ 516 | assert indexes is not None or stocks is not None, """ 517 | must specify stocks or indexes""" 518 | 519 | if start is None: 520 | start = pd.datetime(1990, 1, 1, 0, 0, 0, 0, pytz.utc) 521 | 522 | if start is not None and end is not None: 523 | assert start < end, "start date is later than end date." 524 | 525 | data = OrderedDict() 526 | if stocks is not None: 527 | for stock in stocks: 528 | logger.info('Loading stock: {}'.format(stock)) 529 | stock_pathsafe = stock.replace(os.path.sep, '--') 530 | cache_filename = "{stock}-{start}-{end}.csv".format( 531 | stock=stock_pathsafe, 532 | start=start, 533 | end=end).replace(':', '-') 534 | cache_filepath = get_cache_filepath(cache_filename) 535 | if os.path.exists(cache_filepath): 536 | stkd = pd.DataFrame.from_csv(cache_filepath) 537 | else: 538 | stkd = DataReader(stock, 'yahoo', start, end).sort_index() 539 | stkd.to_csv(cache_filepath) 540 | data[stock] = stkd 541 | 542 | if indexes is not None: 543 | for name, ticker in iteritems(indexes): 544 | logger.info('Loading index: {} ({})'.format(name, ticker)) 545 | stkd = DataReader(ticker, 'yahoo', start, end).sort_index() 546 | data[name] = stkd 547 | 548 | return data 549 | 550 | 551 | def load_from_yahoo(indexes=None, 552 | stocks=None, 553 | start=None, 554 | end=None, 555 | adjusted=True): 556 | """ 557 | Loads price data from Yahoo into a dataframe for each of the indicated 558 | assets. By default, 'price' is taken from Yahoo's 'Adjusted Close', 559 | which removes the impact of splits and dividends. If the argument 560 | 'adjusted' is False, then the non-adjusted 'close' field is used instead. 561 | 562 | :param indexes: Financial indexes to load. 563 | :type indexes: dict 564 | :param stocks: Stock closing prices to load. 565 | :type stocks: list 566 | :param start: Retrieve prices from start date on. 567 | :type start: datetime 568 | :param end: Retrieve prices until end date. 569 | :type end: datetime 570 | :param adjusted: Adjust the price for splits and dividends. 571 | :type adjusted: bool 572 | 573 | """ 574 | data = _load_raw_yahoo_data(indexes, stocks, start, end) 575 | if adjusted: 576 | close_key = 'Adj Close' 577 | else: 578 | close_key = 'Close' 579 | df = pd.DataFrame({key: d[close_key] for key, d in iteritems(data)}) 580 | df.index = df.index.tz_localize(pytz.utc) 581 | return df 582 | 583 | 584 | def load_bars_from_yahoo(indexes=None, 585 | stocks=None, 586 | start=None, 587 | end=None, 588 | adjusted=True): 589 | """ 590 | Loads data from Yahoo into a panel with the following 591 | column names for each indicated security: 592 | 593 | - open 594 | - high 595 | - low 596 | - close 597 | - volume 598 | - price 599 | 600 | Note that 'price' is Yahoo's 'Adjusted Close', which removes the 601 | impact of splits and dividends. If the argument 'adjusted' is True, then 602 | the open, high, low, and close values are adjusted as well. 603 | 604 | :param indexes: Financial indexes to load. 605 | :type indexes: dict 606 | :param stocks: Stock closing prices to load. 607 | :type stocks: list 608 | :param start: Retrieve prices from start date on. 609 | :type start: datetime 610 | :param end: Retrieve prices until end date. 611 | :type end: datetime 612 | :param adjusted: Adjust open/high/low/close for splits and dividends. 613 | The 'price' field is always adjusted. 
614 | :type adjusted: bool 615 | 616 | """ 617 | data = _load_raw_yahoo_data(indexes, stocks, start, end) 618 | panel = pd.Panel(data) 619 | # Rename columns 620 | panel.minor_axis = ['open', 'high', 'low', 'close', 'volume', 'price'] 621 | panel.major_axis = panel.major_axis.tz_localize(pytz.utc) 622 | # Adjust data 623 | if adjusted: 624 | adj_cols = ['open', 'high', 'low', 'close'] 625 | for ticker in panel.items: 626 | ratio = (panel[ticker]['price'] / panel[ticker]['close']) 627 | ratio_filtered = ratio.fillna(0).values 628 | for col in adj_cols: 629 | panel[ticker][col] *= ratio_filtered 630 | return panel 631 | 632 | 633 | def load_prices_from_csv(filepath, identifier_col, tz='UTC'): 634 | data = pd.read_csv(filepath, index_col=identifier_col) 635 | data.index = pd.DatetimeIndex(data.index, tz=tz) 636 | data.sort_index(inplace=True) 637 | return data 638 | 639 | 640 | def load_prices_from_csv_folder(folderpath, identifier_col, tz='UTC'): 641 | data = None 642 | for file in os.listdir(folderpath): 643 | if '.csv' not in file: 644 | continue 645 | raw = load_prices_from_csv(os.path.join(folderpath, file), 646 | identifier_col, tz) 647 | if data is None: 648 | data = raw 649 | else: 650 | data = pd.concat([data, raw], axis=1) 651 | return data 652 | -------------------------------------------------------------------------------- /_program.py: -------------------------------------------------------------------------------- 1 | # encoding:utf-8 2 | """The underlying data structure used in gplearn. 3 | 4 | The :mod:`gplearn._program` module contains the underlying representation of a 5 | computer program. It is used for creating and evolving programs used in the 6 | :mod:`gplearn.genetic` module.print 7 | """ 8 | # input make_function X 9 | # Author: Trevor Stephens 10 | # 11 | # License: BSD 3 clause 12 | import line_profiler 13 | import os 14 | import sys 15 | from copy import copy 16 | import gc 17 | import numpy as np 18 | from sklearn.utils.random import sample_without_replacement 19 | from time import time 20 | from .functions import _Function 21 | from .utils import check_random_state 22 | 23 | 24 | class _Program(object): 25 | 26 | """A program-like representation of the evolved program. 27 | 28 | This is the underlying data-structure used by the public classes in the 29 | :mod:`gplearn.genetic` module. It should not be used directly by the user. 30 | 31 | Parameters 32 | ---------- 33 | function_set : list 34 | A list of valid functions to use in the program. 35 | 36 | arities : dict 37 | A dictionary of the form `{arity: [functions]}`. The arity is the 38 | number of arguments that the function takes, the functions must match 39 | those in the `function_set` parameter. 40 | 41 | init_depth : tuple of two ints 42 | The range of tree depths for the initial population of naive formulas. 43 | Individual trees will randomly choose a maximum depth from this range. 44 | When combined with `init_method='half and half'` this yields the well- 45 | known 'ramped half and half' initialization method. 46 | 47 | init_method : str 48 | - 'grow' : Nodes are chosen at random from both functions and 49 | terminals, allowing for smaller trees than `init_depth` allows. Tends 50 | to grow asymmetrical trees. 51 | - 'full' : Functions are chosen until the `init_depth` is reached, and 52 | then terminals are selected. Tends to grow 'bushy' trees. 53 | - 'half and half' : Trees are grown through a 50/50 mix of 'full' and 54 | 'grow', making for a mix of tree shapes in the initial population. 
55 | 56 | n_features : int 57 | The number of features in `X`. 58 | 59 | const_range : tuple of two floats 60 | The range of constants to include in the formulas. 61 | 62 | metric : _Fitness object 63 | The raw fitness metric. 64 | 65 | p_point_replace : float 66 | The probability that any given node will be mutated during point 67 | mutation. 68 | 69 | parsimony_coefficient : float 70 | This constant penalizes large programs by adjusting their fitness to 71 | be less favorable for selection. Larger values penalize the program 72 | more which can control the phenomenon known as 'bloat'. Bloat is when 73 | evolution is increasing the size of programs without a significant 74 | increase in fitness, which is costly for computation time and makes for 75 | a less understandable final result. This parameter may need to be tuned 76 | over successive runs. 77 | 78 | random_state : RandomState instance 79 | The random number generator. Note that ints, or None are not allowed. 80 | The reason for this being passed is that during parallel evolution the 81 | same program object may be accessed by multiple parallel processes. 82 | 83 | transformer : _Function object, optional (default=None) 84 | The function to transform the output of the program to probabilities, 85 | only used for the SymbolicClassifier. 86 | 87 | feature_names : list, optional (default=None) 88 | Optional list of feature names, used purely for representations in 89 | the `print` operation or `export_graphviz`. If None, then X0, X1, etc 90 | will be used for representations. 91 | 92 | program : list, optional (default=None) 93 | The flattened tree representation of the program. If None, a new naive 94 | random tree will be grown. If provided, it will be validated. 95 | 96 | Attributes 97 | ---------- 98 | program : list 99 | The flattened tree representation of the program. 100 | 101 | raw_fitness_ : float 102 | The raw fitness of the individual program. 103 | 104 | fitness_ : float 105 | The penalized fitness of the individual program. 106 | 107 | oob_fitness_ : float 108 | The out-of-bag raw fitness of the individual program for the held-out 109 | samples. Only present when sub-sampling was used in the estimator by 110 | specifying `max_samples` < 1.0. 111 | 112 | parents : dict, or None 113 | If None, this is a naive random program from the initial population. 114 | Otherwise it includes meta-data about the program's parent(s) as well 115 | as the genetic operations performed to yield the current program. This 116 | is set outside this class by the controlling evolution loops. 117 | 118 | depth_ : int 119 | The maximum depth of the program tree. 120 | 121 | length_ : int 122 | The number of functions and terminals in the program. 
123 | 
124 |     """
125 | 
126 |     def __init__(self,
127 |                  function_set,
128 |                  arities,
129 |                  init_depth,
130 |                  init_method,
131 |                  n_features,
132 |                  const_range,
133 |                  metric,
134 |                  p_point_replace,
135 |                  parsimony_coefficient,
136 |                  random_state,
137 |                  transformer=None,
138 |                  feature_names=None,
139 |                  program=None):
140 | 
141 |         self.function_set = function_set
142 |         self.arities = arities
143 |         self.init_depth = (init_depth[0], init_depth[1] + 1)
144 |         self.init_method = init_method
145 |         self.n_features = n_features
146 |         self.const_range = const_range
147 |         self.metric = metric
148 |         self.p_point_replace = p_point_replace
149 |         self.parsimony_coefficient = parsimony_coefficient
150 |         self.transformer = transformer
151 |         self.feature_names = feature_names
152 |         self.program = program
153 | 
154 |         if self.program is not None:
155 |             if not self.validate_program():
156 |                 raise ValueError('The supplied program is incomplete.')
157 |         else:
158 |             # Create a naive random program: if program is None, build one
159 |             self.program = self.build_program(random_state)
160 | 
161 |         self.raw_fitness_ = None
162 |         self.fitness_ = None
163 |         self.parents = None
164 |         self._n_samples = None
165 |         self._max_samples = None
166 |         self._indices_state = None
167 | 
168 |     def build_program(self, random_state):
169 |         """Build a naive random program.
170 | 
171 |         Parameters
172 |         ----------
173 |         random_state : RandomState instance
174 |             The random number generator.
175 | 
176 |         Returns
177 |         -------
178 |         program : list
179 |             The flattened tree representation of the program.
180 | 
181 |         """
182 |         if self.init_method == 'half and half':
183 |             method = ('full' if random_state.randint(2) else 'grow')
184 |         else:
185 |             method = self.init_method
186 |         max_depth = random_state.randint(*self.init_depth)
187 | 
188 |         # Start a program with a function to avoid degenerative programs
189 |         #print (len(self.function_set))
190 | 
191 |         function = random_state.randint(len(self.function_set))
192 |         # Randomly pick one function (add/sub/mul/div, ...) from the function set
193 |         function = self.function_set[function]
194 |         #print (function)
195 | 
196 |         program = [function]
197 |         terminal_stack = [function.arity] # function.arity is the number of arguments the function takes
198 |         while terminal_stack:
199 |             depth = len(terminal_stack)
200 |             choice = self.n_features + len(self.function_set)
201 |             choice = random_state.randint(choice)
202 |             #input()
203 |             # Determine if we are adding a function or terminal, i.e. keep growing the tree or close it off
204 |             if (depth < max_depth) and (method == 'full' or
205 |                     choice <= len(self.function_set)):
206 |                 #print ('here1')
207 |                 function = random_state.randint(len(self.function_set))
208 |                 function = self.function_set[function]
209 |                 program.append(function)
210 |                 terminal_stack.append(function.arity)
211 |             else:
212 |                 #print ('here2')
213 |                 # We need a terminal, add a variable or constant
214 |                 if self.const_range is not None:
215 |                     terminal = random_state.randint(self.n_features + 1)
216 |                 else:
217 |                     terminal = random_state.randint(self.n_features)
218 |                 if terminal == self.n_features:
219 |                     terminal = random_state.uniform(*self.const_range)
220 |                     if self.const_range is None:
221 |                         # We should never get here
222 |                         raise ValueError('A constant was produced with '
223 |                                          'const_range=None.')
224 |                 program.append(terminal)
225 |                 terminal_stack[-1] -= 1
226 |                 while terminal_stack[-1] == 0:
227 |                     terminal_stack.pop()
228 |                     if not terminal_stack:
229 |                         #print ('breaking out of the loop')
230 |                         return program
231 |                     terminal_stack[-1] -= 1
232 |         #
233 |         # We should never get here
234 |         return None
235 | 
236 |     def 
validate_program(self): 237 | """Rough check that the embedded program in the object is valid.""" 238 | terminals = [0] 239 | for node in self.program: 240 | if isinstance(node, _Function): 241 | terminals.append(node.arity) 242 | else: 243 | terminals[-1] -= 1 244 | while terminals[-1] == 0: 245 | terminals.pop() 246 | terminals[-1] -= 1 247 | return terminals == [-1] 248 | 249 | def __str__(self): 250 | """Overloads `print` output of the object to resemble a LISP tree.""" 251 | terminals = [0] 252 | output = '' 253 | for i, node in enumerate(self.program): 254 | #print (u'i',i,u'node','node') 255 | if isinstance(node, _Function): 256 | terminals.append(node.arity) 257 | output += node.name + '(' 258 | else: 259 | if isinstance(node, int): 260 | if self.feature_names is None: 261 | output += 'X%s' % node 262 | else: 263 | output += self.feature_names[node] 264 | else: 265 | output += '%.3f' % node 266 | terminals[-1] -= 1 267 | while terminals[-1] == 0: 268 | terminals.pop() 269 | terminals[-1] -= 1 270 | output += ')' 271 | if i != len(self.program) - 1: 272 | output += ', ' 273 | return output 274 | 275 | def export_graphviz(self, fade_nodes=None): 276 | """Returns a string, Graphviz script for visualizing the program. 277 | 278 | Parameters 279 | ---------- 280 | fade_nodes : list, optional 281 | A list of node indices to fade out for showing which were removed 282 | during evolution. 283 | 284 | Returns 285 | ------- 286 | output : string 287 | The Graphviz script to plot the tree representation of the program. 288 | 289 | """ 290 | terminals = [] 291 | if fade_nodes is None: 292 | fade_nodes = [] 293 | output = 'digraph program {\nnode [style=filled]\n' 294 | for i, node in enumerate(self.program): 295 | fill = '#cecece' 296 | if isinstance(node, _Function): 297 | if i not in fade_nodes: 298 | fill = '#136ed4' 299 | terminals.append([node.arity, i]) 300 | output += ('%d [label="%s", fillcolor="%s"] ;\n' 301 | % (i, node.name, fill)) 302 | else: 303 | if i not in fade_nodes: 304 | fill = '#60a6f6' 305 | if isinstance(node, int): 306 | if self.feature_names is None: 307 | feature_name = 'X%s' % node 308 | else: 309 | feature_name = self.feature_names[node] 310 | output += ('%d [label="%s", fillcolor="%s"] ;\n' 311 | % (i, feature_name, fill)) 312 | else: 313 | output += ('%d [label="%.3f", fillcolor="%s"] ;\n' 314 | % (i, node, fill)) 315 | if i == 0: 316 | # A degenerative program of only one node 317 | return output + '}' 318 | terminals[-1][0] -= 1 319 | terminals[-1].append(i) 320 | while terminals[-1][0] == 0: 321 | output += '%d -> %d ;\n' % (terminals[-1][1], 322 | terminals[-1][-1]) 323 | terminals[-1].pop() 324 | if len(terminals[-1]) == 2: 325 | parent = terminals[-1][-1] 326 | terminals.pop() 327 | if not terminals: 328 | return output + '}' 329 | terminals[-1].append(parent) 330 | terminals[-1][0] -= 1 331 | 332 | # We should never get here 333 | return None 334 | 335 | def _depth(self): 336 | """Calculates the maximum depth of the program tree.""" 337 | terminals = [0] 338 | depth = 1 339 | for node in self.program: 340 | if isinstance(node, _Function): 341 | terminals.append(node.arity) 342 | depth = max(len(terminals), depth) 343 | else: 344 | terminals[-1] -= 1 345 | while terminals[-1] == 0: 346 | terminals.pop() 347 | terminals[-1] -= 1 348 | return depth - 1 349 | 350 | def _length(self): 351 | """Calculates the number of functions and terminals in the program.""" 352 | return len(self.program) 353 | 354 | def execute(self, X): 355 | """Execute the program according 
to X.
356 | 
357 |         Parameters
358 |         ----------
359 |         X : {array-like}, shape = [n_samples, n_features]
360 |             Training vectors, where n_samples is the number of samples and
361 |             n_features is the number of features.
362 | 
363 |         Returns
364 |         -------
365 |         y_hats : array-like, shape = [n_samples]
366 |             The result of executing the program on X.
367 | 
368 |         """
369 |         # Check for single-node programs
370 |         #ts = time()
371 |         node = self.program[0]
372 |         if isinstance(node, float):
373 |             #print(time() -t,u'no1')
374 |             return np.repeat(node, X.shape[0])
375 |         if isinstance(node, int):
376 |             #print (time()-t,u'no2')
377 |             return X[:, node]
378 | 
379 |         apply_stack = []
380 | 
381 |         for node in self.program:
382 |             if isinstance(node, _Function):
383 |                 apply_stack.append([node])
384 |             else:
385 |                 # Lazily evaluate later
386 |                 apply_stack[-1].append(node)
387 | 
388 |             while len(apply_stack[-1]) == apply_stack[-1][0].arity + 1:
389 |                 function = apply_stack[-1][0]
390 |                 terminals = [np.repeat(t, X.shape[0]) if isinstance(t, float)
391 |                              else X[:, t] if isinstance(t, int)
392 |                              else t for t in apply_stack[-1][1:]]
393 | 
394 |                 intermediate_result = function(*terminals)
395 |                 if len(apply_stack) != 1:
396 |                     apply_stack.pop()
397 |                     apply_stack[-1].append(intermediate_result)
398 |                 else:
399 |                     return intermediate_result
400 |         # We should never get here
401 |         return None
402 | 
403 |     def jiasu(self,y_pred,y):
404 |         list_t = np.argsort(y_pred)  # indices sorted by ascending predicted score
405 |         tt =sum([y[i] for i in list_t[-50:]])  # sum the realised returns of the top-50 ranked stocks
406 |         return tt
407 |     def stock_excute(self,x,y):
408 |         '''
409 |         Evaluate the program on stock data and collect the return-based fitness inputs.
410 |         x: the per-day factor matrices of the stock universe
411 |         y: the per-day stock returns used as the fitness target
412 |         '''
413 |         shouyi = []  # "shouyi" (return): one entry per rebalancing date
414 |         for i in range(len(x)):
415 |             if i%5==0:  # rebalance every 5 trading days (see README)
416 |                 y_pred = self.execute(np.array(x[i]))
417 |                 shouyi.append(self.jiasu(y_pred,y[i]))
418 |                 del y_pred
419 |                 gc.collect()
420 |         return shouyi
421 | 
422 |     def get_all_indices(self, n_samples=None, max_samples=None,
423 |                         random_state=None):
424 |         """Get the indices on which to evaluate the fitness of a program.
425 | 
426 |         Parameters
427 |         ----------
428 |         n_samples : int
429 |             The number of samples.
430 | 
431 |         max_samples : int
432 |             The maximum number of samples to use.
433 | 
434 |         random_state : RandomState instance
435 |             The random number generator.
436 | 
437 |         Returns
438 |         -------
439 |         indices : array-like, shape = [n_samples]
440 |             The in-sample indices.
441 | 
442 |         not_indices : array-like, shape = [n_samples]
443 |             The out-of-sample indices.
444 | 445 | """ 446 | if self._indices_state is None and random_state is None: 447 | raise ValueError('The program has not been evaluated for fitness ' 448 | 'yet, indices not available.') 449 | 450 | if n_samples is not None and self._n_samples is None: 451 | self._n_samples = n_samples 452 | if max_samples is not None and self._max_samples is None: 453 | self._max_samples = max_samples 454 | if random_state is not None and self._indices_state is None: 455 | self._indices_state = random_state.get_state() 456 | 457 | indices_state = check_random_state(None) 458 | indices_state.set_state(self._indices_state) 459 | 460 | not_indices = sample_without_replacement( 461 | self._n_samples, 462 | self._n_samples - self._max_samples, 463 | random_state=indices_state) 464 | sample_counts = np.bincount(not_indices, minlength=self._n_samples) 465 | indices = np.where(sample_counts == 0)[0] 466 | 467 | return indices, not_indices 468 | 469 | def _indices(self): 470 | """Get the indices used to measure the program's fitness.""" 471 | return self.get_all_indices()[0] 472 | 473 | def raw_fitness(self, X, y, sample_weight): 474 | """Evaluate the raw fitness of the program according to X, y. 475 | 476 | Parameters 477 | ---------- 478 | X : {array-like}, shape = [n_samples, n_features] 479 | Training vectors, where n_samples is the number of samples and 480 | n_features is the number of features. 481 | 482 | y : array-like, shape = [n_samples] 483 | Target values. 484 | 485 | sample_weight : array-like, shape = [n_samples] 486 | Weights applied to individual samples. 487 | 488 | Returns 489 | ------- 490 | raw_fitness : float 491 | The raw fitness of the program. 492 | 493 | """ 494 | #print (self.metric.stock_is) 495 | if not self.metric.stock_is: 496 | y_pred = self.execute(X) 497 | else: 498 | y_pred = self.stock_excute(X,y) 499 | if self.transformer: 500 | y_pred = self.transformer(y_pred) 501 | sample_weight = [1 for i in range(len(y_pred))] 502 | raw_fitness = self.metric(y, y_pred, sample_weight) 503 | del X,y,y_pred 504 | gc.collect() 505 | return raw_fitness 506 | 507 | def fitness(self, parsimony_coefficient=None): 508 | """Evaluate the penalized fitness of the program according to X, y. 509 | 510 | Parameters 511 | ---------- 512 | parsimony_coefficient : float, optional 513 | If automatic parsimony is being used, the computed value according 514 | to the population. Otherwise the initialized value is used. 515 | 516 | Returns 517 | ------- 518 | fitness : float 519 | The penalized fitness of the program. 520 | 521 | """ 522 | if parsimony_coefficient is None: 523 | parsimony_coefficient = self.parsimony_coefficient 524 | 525 | penalty = parsimony_coefficient * len(self.program) * self.metric.sign 526 | return self.raw_fitness_ - penalty 527 | def get_subtree(self, random_state, program=None): 528 | """Get a random subtree from the program. 529 | 530 | Parameters 531 | ---------- 532 | random_state : RandomState instance 533 | The random number generator. 534 | 535 | program : list, optional (default=None) 536 | The flattened tree representation of the program. If None, the 537 | embedded tree in the object will be used. 538 | 539 | Returns 540 | ------- 541 | start, end : tuple of two ints 542 | The indices of the start and end of the random subtree. 543 | 544 | """ 545 | if program is None: 546 | program = self.program 547 | # Choice of crossover points follows Koza's (1992) widely used approach 548 | # of choosing functions 90% of the time and leaves 10% of the time. 
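        # The two lines below encode that 90/10 rule: each node gets weight
        # 0.9 (function) or 0.1 (terminal), the weights are normalised into a
        # cumulative distribution, and np.searchsorted maps a single uniform
        # draw onto a start node. The while loop that follows then walks the
        # flat program, extending `end` until the chosen subtree's arity
        # budget is exhausted.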
549 | probs = np.array([0.9 if isinstance(node, _Function) else 0.1 550 | for node in program]) 551 | probs = np.cumsum(probs / probs.sum()) 552 | start = np.searchsorted(probs, random_state.uniform()) 553 | 554 | stack = 1 555 | end = start 556 | while stack > end - start: 557 | node = program[end] 558 | if isinstance(node, _Function): 559 | stack += node.arity 560 | end += 1 561 | 562 | return start, end 563 | 564 | def reproduce(self): 565 | """Return a copy of the embedded program.""" 566 | return copy(self.program) 567 | 568 | def crossover(self, donor, random_state): 569 | """Perform the crossover genetic operation on the program. 570 | 571 | Crossover selects a random subtree from the embedded program to be 572 | replaced. A donor also has a subtree selected at random and this is 573 | inserted into the original parent to form an offspring. 574 | 575 | Parameters 576 | ---------- 577 | donor : list 578 | The flattened tree representation of the donor program. 579 | 580 | random_state : RandomState instance 581 | The random number generator. 582 | 583 | Returns 584 | ------- 585 | program : list 586 | The flattened tree representation of the program. 587 | 588 | """ 589 | # Get a subtree to replace 590 | start, end = self.get_subtree(random_state) 591 | removed = range(start, end) 592 | # Get a subtree to donate 593 | donor_start, donor_end = self.get_subtree(random_state, donor) 594 | donor_removed = list(set(range(len(donor))) - 595 | set(range(donor_start, donor_end))) 596 | # Insert genetic material from donor 597 | return (self.program[:start] + 598 | donor[donor_start:donor_end] + 599 | self.program[end:]), removed, donor_removed 600 | 601 | def subtree_mutation(self, random_state): 602 | """Perform the subtree mutation operation on the program. 603 | 604 | Subtree mutation selects a random subtree from the embedded program to 605 | be replaced. A donor subtree is generated at random and this is 606 | inserted into the original parent to form an offspring. This 607 | implementation uses the "headless chicken" method where the donor 608 | subtree is grown using the initialization methods and a subtree of it 609 | is selected to be donated to the parent. 610 | 611 | Parameters 612 | ---------- 613 | random_state : RandomState instance 614 | The random number generator. 615 | 616 | Returns 617 | ------- 618 | program : list 619 | The flattened tree representation of the program. 620 | 621 | """ 622 | # Build a new naive program 623 | chicken = self.build_program(random_state) 624 | # Do subtree mutation via the headless chicken method! 625 | return self.crossover(chicken, random_state) 626 | 627 | def hoist_mutation(self, random_state): 628 | """Perform the hoist mutation operation on the program. 629 | 630 | Hoist mutation selects a random subtree from the embedded program to 631 | be replaced. A random subtree of that subtree is then selected and this 632 | is 'hoisted' into the original subtrees location to form an offspring. 633 | This method helps to control bloat. 634 | 635 | Parameters 636 | ---------- 637 | random_state : RandomState instance 638 | The random number generator. 639 | 640 | Returns 641 | ------- 642 | program : list 643 | The flattened tree representation of the program. 
644 | 645 | """ 646 | # Get a subtree to replace 647 | start, end = self.get_subtree(random_state) 648 | subtree = self.program[start:end] 649 | # Get a subtree of the subtree to hoist 650 | sub_start, sub_end = self.get_subtree(random_state, subtree) 651 | hoist = subtree[sub_start:sub_end] 652 | # Determine which nodes were removed for plotting 653 | removed = list(set(range(start, end)) - 654 | set(range(start + sub_start, start + sub_end))) 655 | return self.program[:start] + hoist + self.program[end:], removed 656 | 657 | def point_mutation(self, random_state): 658 | """Perform the point mutation operation on the program. 659 | 660 | Point mutation selects random nodes from the embedded program to be 661 | replaced. Terminals are replaced by other terminals and functions are 662 | replaced by other functions that require the same number of arguments 663 | as the original node. The resulting tree forms an offspring. 664 | 665 | Parameters 666 | ---------- 667 | random_state : RandomState instance 668 | The random number generator. 669 | 670 | Returns 671 | ------- 672 | program : list 673 | The flattened tree representation of the program. 674 | 675 | """ 676 | program = copy(self.program) 677 | 678 | # Get the nodes to modify 679 | mutate = np.where(random_state.uniform(size=len(program)) < 680 | self.p_point_replace)[0] 681 | 682 | for node in mutate: 683 | if isinstance(program[node], _Function): 684 | arity = program[node].arity 685 | # Find a valid replacement with same arity 686 | replacement = len(self.arities[arity]) 687 | replacement = random_state.randint(replacement) 688 | replacement = self.arities[arity][replacement] 689 | program[node] = replacement 690 | else: 691 | # We've got a terminal, add a const or variable 692 | if self.const_range is not None: 693 | terminal = random_state.randint(self.n_features + 1) 694 | else: 695 | terminal = random_state.randint(self.n_features) 696 | if terminal == self.n_features: 697 | terminal = random_state.uniform(*self.const_range) 698 | if self.const_range is None: 699 | # We should never get here 700 | raise ValueError('A constant was produced with ' 701 | 'const_range=None.') 702 | program[node] = terminal 703 | 704 | return program, list(mutate) 705 | 706 | depth_ = property(_depth) 707 | length_ = property(_length) 708 | indices_ = property(_indices) 709 | -------------------------------------------------------------------------------- /genetic.py: -------------------------------------------------------------------------------- 1 | # encoding:utf-8 2 | """Genetic Programming in Python, with a scikit-learn inspired API 3 | 4 | The :mod:`gplearn.genetic` module implements Genetic Programming. These 5 | are supervised learning methods based on applying evolutionary operations on 6 | computer programs. 
7 | """
8 | # print,input make_function make_fitness
9 | # Author: Trevor Stephens
10 | # input print
11 | # License: BSD 3 clause
12 | 
13 | import itertools
14 | from abc import ABCMeta, abstractmethod
15 | from time import time
16 | from warnings import warn
17 | import gc
18 | 
19 | import numpy as np
20 | from joblib import Parallel, delayed
21 | from scipy.stats import rankdata
22 | from sklearn.base import BaseEstimator
23 | from sklearn.base import RegressorMixin, TransformerMixin, ClassifierMixin
24 | from sklearn.exceptions import NotFittedError
25 | from sklearn.utils.validation import check_X_y, check_array
26 | from sklearn.utils.multiclass import check_classification_targets
27 | 
28 | from ._program import _Program
29 | from .fitness import _fitness_map, _Fitness
30 | from .functions import _function_map, _Function, sig1 as sigmoid
31 | from .utils import _partition_estimators
32 | from .utils import check_random_state
33 | 
34 | __all__ = ['SymbolicRegressor', 'SymbolicClassifier', 'SymbolicTransformer']
35 | 
36 | MAX_INT = np.iinfo(np.int32).max
37 | 
38 | 
39 | def _parallel_evolve(n_programs, parents, X, y, sample_weight, seeds, params):
40 |     """Private function used to build a batch of programs within a job."""
41 |     ''' A single evolution routine '''
42 |     #n_samples, n_features = X.shape # original version; restore if needed
43 |     n_samples, n_features= len(X),18  # modified: X is a list of daily factor matrices with a fixed 18 features
44 |     # Unpack parameters
45 |     tournament_size = params['tournament_size']
46 |     function_set = params['function_set']
47 |     arities = params['arities']
48 |     init_depth = params['init_depth']
49 |     init_method = params['init_method']
50 |     const_range = params['const_range']
51 |     metric = params['_metric']
52 |     transformer = params['_transformer']
53 |     parsimony_coefficient = params['parsimony_coefficient']
54 |     method_probs = params['method_probs'] # cumulative operator probabilities
55 |     p_point_replace = params['p_point_replace']
56 |     max_samples = params['max_samples']
57 |     feature_names = params['feature_names']
58 | 
59 |     max_samples = int(max_samples * n_samples)
60 |     def _tournament():
61 |         """Find the fittest individual from a sub-population."""
62 |         contenders = random_state.randint(0, len(parents), tournament_size)
63 |         fitness = [parents[p].fitness_ for p in contenders]
64 |         if metric.greater_is_better:
65 |             parent_index = contenders[np.argmax(fitness)]
66 |         else:
67 |             parent_index = contenders[np.argmin(fitness)]
68 |         return parents[parent_index], parent_index
69 | 
70 |     # Build programs
71 |     programs = []
72 |     for i in range(n_programs):
73 |         print (i)
74 |         tt = time()
75 |         # Seed a fresh RandomState for this program; it decides which way evolution goes
76 |         random_state = check_random_state(seeds[i])
77 |         # If there is no parent generation, grow a new program; otherwise pick the next operation
78 |         if parents is None:
79 |             program = None
80 |             genome = None
81 |         else:
82 |             method = random_state.uniform()
83 |             parent, parent_index = _tournament()
84 | 
85 |             if method < method_probs[0]:
86 |                 # crossover
87 |                 # `parent` is the parent tree, `donor` is the donor tree
88 |                 donor, donor_index = _tournament()
89 |                 program, removed, remains = parent.crossover(donor.program,
90 |                                                              random_state)
91 |                 genome = {'method': 'Crossover',
92 |                           'parent_idx': parent_index,
93 |                           'parent_nodes': removed,
94 |                           'donor_idx': donor_index,
95 |                           'donor_nodes': remains}
96 |             elif method < method_probs[1]:
97 |                 # subtree mutation
98 |                 program, removed, _ = parent.subtree_mutation(random_state)
99 |                 genome = {'method': 'Subtree Mutation',
100 |                           'parent_idx': parent_index,
101 |                           'parent_nodes': removed}
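            # NOTE: method_probs holds the cumulative sums of p_crossover,
            # p_subtree_mutation, p_hoist_mutation and p_point_mutation
            # (np.cumsum in fit()), so the single uniform draw above performs
            # categorical sampling over the operators; any probability mass
            # left over falls through to plain reproduction in the final
            # else branch below.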
102 |             elif method < method_probs[2]:
103 |                 # hoist mutation
104 |                 program, removed = parent.hoist_mutation(random_state)
105 |                 genome = {'method': 'Hoist Mutation',
106 |                           'parent_idx': parent_index,
107 |                           'parent_nodes': removed}
108 |             elif method < method_probs[3]:
109 |                 # point mutation
110 |                 program, mutated = parent.point_mutation(random_state)
111 |                 genome = {'method': 'Point Mutation',
112 |                           'parent_idx': parent_index,
113 |                           'parent_nodes': mutated}
114 |             else:
115 |                 # reproduction: copy the parent unchanged
116 |                 program = parent.reproduce()
117 |                 genome = {'method': 'Reproduction',
118 |                           'parent_idx': parent_index,
119 |                           'parent_nodes': []}
120 |         #print (function_set)
121 |         #input()
122 |         program = _Program(function_set=function_set,
123 |                            arities=arities,
124 |                            init_depth=init_depth,
125 |                            init_method=init_method,
126 |                            n_features=n_features,
127 |                            metric=metric,
128 |                            transformer=transformer,
129 |                            const_range=const_range,
130 |                            p_point_replace=p_point_replace,
131 |                            parsimony_coefficient=parsimony_coefficient,
132 |                            feature_names=feature_names,
133 |                            random_state=random_state,
134 |                            program=program)
135 | 
136 |         program.parents = genome
137 | 
138 |         # Draw samples, using sample weights, and then fit
139 |         if sample_weight is None:
140 |             curr_sample_weight = np.ones((n_samples,))
141 |         else:
142 |             curr_sample_weight = sample_weight.copy()
143 |         oob_sample_weight = curr_sample_weight.copy()
144 | 
145 |         indices, not_indices = program.get_all_indices(n_samples,
146 |                                                        max_samples,
147 |                                                        random_state)
148 | 
149 |         curr_sample_weight[not_indices] = 0
150 |         oob_sample_weight[indices] = 0
151 |         bb = time()
152 |         #print ('time spent', time() - tt)
153 |         program.raw_fitness_ = program.raw_fitness(X, y, curr_sample_weight)
154 |         #print ('time here', time()-bb)
155 |         #a = time()
156 |         if max_samples < n_samples:
157 |             # Calculate OOB fitness
158 |             program.oob_fitness_ = program.raw_fitness(X, y, oob_sample_weight)
159 |             # print (time() - a, 'second raw_fitness call')
160 |         #input()
161 |         programs.append(program)
162 | 
163 |     return programs
164 | 
165 | 
166 | class BaseSymbolic(BaseEstimator, metaclass=ABCMeta):
167 | 
168 |     """Base class for symbolic regression / classification estimators.
169 | 
170 |     Warning: This class should not be used directly.
171 |     Use derived classes instead.
172 | 173 | """ 174 | 175 | @abstractmethod 176 | def __init__(self, 177 | population_size=1000, 178 | hall_of_fame=None, 179 | n_components=None, 180 | generations=20, 181 | tournament_size=20, 182 | stopping_criteria=0.0, 183 | const_range=(-1., 1.), 184 | init_depth=(2, 6), 185 | init_method='half and half', 186 | function_set=('add', 'sub', 'mul', 'div'), 187 | transformer=None, 188 | metric='mean absolute error', 189 | parsimony_coefficient=0.001, 190 | p_crossover=0.9, 191 | p_subtree_mutation=0.01, 192 | p_hoist_mutation=0.01, 193 | p_point_mutation=0.01, 194 | p_point_replace=0.05, 195 | max_samples=1.0, 196 | feature_names=None, 197 | warm_start=False, 198 | low_memory=False, 199 | n_jobs=1, 200 | verbose=0, 201 | random_state=None): 202 | 203 | self.population_size = population_size 204 | self.hall_of_fame = hall_of_fame 205 | self.n_components = n_components 206 | self.generations = generations 207 | self.tournament_size = tournament_size 208 | self.stopping_criteria = stopping_criteria 209 | self.const_range = const_range 210 | self.init_depth = init_depth 211 | self.init_method = init_method 212 | self.function_set = function_set 213 | self.transformer = transformer 214 | self.metric = metric 215 | self.parsimony_coefficient = parsimony_coefficient 216 | self.p_crossover = p_crossover 217 | self.p_subtree_mutation = p_subtree_mutation 218 | self.p_hoist_mutation = p_hoist_mutation 219 | self.p_point_mutation = p_point_mutation 220 | self.p_point_replace = p_point_replace 221 | self.max_samples = max_samples 222 | self.feature_names = feature_names 223 | self.warm_start = warm_start 224 | self.low_memory = low_memory 225 | self.n_jobs = n_jobs 226 | self.verbose = verbose 227 | self.random_state = random_state 228 | 229 | def _verbose_reporter(self, run_details=None): 230 | """A report of the progress of the evolution process. 231 | 232 | Parameters 233 | ---------- 234 | run_details : dict 235 | Information about the evolution. 236 | 237 | """ 238 | if run_details is None: 239 | print(' |{:^25}|{:^42}|'.format('Population Average', 240 | 'Best Individual')) 241 | print('-' * 4 + ' ' + '-' * 25 + ' ' + '-' * 42 + ' ' + '-' * 10) 242 | line_format = '{:>4} {:>8} {:>16} {:>8} {:>16} {:>16} {:>10}' 243 | print(line_format.format('Gen', 'Length', 'Fitness', 'Length', 244 | 'Fitness', 'OOB Fitness', 'Time Left')) 245 | 246 | else: 247 | # Estimate remaining time for run 248 | gen = run_details['generation'][-1] 249 | generation_time = run_details['generation_time'][-1] 250 | remaining_time = (self.generations - gen - 1) * generation_time 251 | if remaining_time > 60: 252 | remaining_time = '{0:.2f}m'.format(remaining_time / 60.0) 253 | else: 254 | remaining_time = '{0:.2f}s'.format(remaining_time) 255 | 256 | oob_fitness = 'N/A' 257 | line_format = '{:4d} {:8.2f} {:16g} {:8d} {:16g} {:>16} {:>10}' 258 | if self.max_samples < 1.0: 259 | oob_fitness = run_details['best_oob_fitness'][-1] 260 | line_format = '{:4d} {:8.2f} {:16g} {:8d} {:16g} {:16g} {:>10}' 261 | 262 | print(line_format.format(run_details['generation'][-1], 263 | run_details['average_length'][-1], 264 | run_details['average_fitness'][-1], 265 | run_details['best_length'][-1], 266 | run_details['best_fitness'][-1], 267 | oob_fitness, 268 | remaining_time)) 269 | 270 | def fit(self, X, y, sample_weight=None): 271 | """Fit the Genetic Program according to X, y. 
272 | 
273 |         Parameters
274 |         ----------
275 |         X : array-like, shape = [n_samples, n_features]
276 |             Training vectors, where n_samples is the number of samples and
277 |             n_features is the number of features.
278 | 
279 |         y : array-like, shape = [n_samples]
280 |             Target values.
281 | 
282 |         sample_weight : array-like, shape = [n_samples], optional
283 |             Weights applied to individual samples.
284 | 
285 |         Returns
286 |         -------
287 |         self : object
288 |             Returns self.
289 | 
290 |         """
291 |         random_state = check_random_state(self.random_state)
292 | 
293 |         # Check arrays
294 |         if isinstance(self, ClassifierMixin):
295 |             X, y = check_X_y(X, y, y_numeric=False)
296 |             check_classification_targets(y)
297 |             self.classes_, y = np.unique(y, return_inverse=True)
298 |             n_trim_classes = np.count_nonzero(np.bincount(y, sample_weight))
299 |             if n_trim_classes != 2:
300 |                 raise ValueError("y contains %d class after sample_weight "
301 |                                  "trimmed classes with zero weights, while 2 "
302 |                                  "classes are required."
303 |                                  % n_trim_classes)
304 |             self.n_classes_ = len(self.classes_)
305 |         else:
306 |             # regression path: X is a list of per-day factor matrices, so skip array validation
307 |             pass
308 |             #X, y = check_X_y(X, y, y_numeric=True) # original version; restore if needed
309 |         if sample_weight is not None:
310 |             sample_weight = check_array(sample_weight, ensure_2d=False)
311 |         # X has been restructured for the rolling stock test,
312 |         #_, self.n_features_ = X.shape # original version; restore if needed
313 |         self.n_features_ = 18 # modified: matches the fixed 18-factor stock dataset used here
314 | 
315 |         hall_of_fame = self.hall_of_fame
316 |         if hall_of_fame is None:
317 |             hall_of_fame = self.population_size
318 |         if hall_of_fame > self.population_size or hall_of_fame < 1:
319 |             raise ValueError('hall_of_fame (%d) must be less than or equal to '
320 |                              'population_size (%d).' % (self.hall_of_fame,
321 |                                                         self.population_size))
322 |         n_components = self.n_components
323 |         if n_components is None:
324 |             n_components = hall_of_fame
325 |         if n_components > hall_of_fame or n_components < 1:
326 |             raise ValueError('n_components (%d) must be less than or equal to '
327 |                              'hall_of_fame (%d).' % (self.n_components,
328 |                                                      self.hall_of_fame))
329 | 
330 |         self._function_set = []
331 |         for function in self.function_set:
332 |             #print (function)
333 |             if isinstance(function, str):
334 |                 if function not in _function_map:
335 |                     raise ValueError('invalid function name %s found in '
336 |                                      '`function_set`.' % function)
337 |                 self._function_set.append(_function_map[function])
338 |             elif isinstance(function, _Function):
339 |                 self._function_set.append(function)
340 |             else:
341 |                 raise ValueError('invalid type %s found in `function_set`.'
342 |                                  % type(function))
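        # Besides the built-in names above, a custom primitive can be passed
        # directly as a _Function. A minimal sketch, assuming gplearn's
        # make_function helper (the `_protected_exp` name is illustrative,
        # not part of this repo):
        #
        #   from gplearn.functions import make_function
        #   def _protected_exp(x):
        #       with np.errstate(over='ignore'):
        #           return np.where(np.abs(x) < 100., np.exp(x), 0.)
        #   exp = make_function(function=_protected_exp, name='exp', arity=1)
        #   est = SymbolicRegressor(function_set=('add', 'sub', 'mul', exp))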
343 |         if not self._function_set:
344 |             raise ValueError('No valid functions found in `function_set`.')
345 |
346 |         # For point-mutation to find a compatible replacement node
347 |         self._arities = {}
348 |         for function in self._function_set:
349 |             arity = function.arity
350 |             self._arities[arity] = self._arities.get(arity, [])
351 |             self._arities[arity].append(function)
352 |
353 |         if isinstance(self.metric, _Fitness):
354 |             self._metric = self.metric
355 |         elif isinstance(self, RegressorMixin):
356 |             if self.metric not in ('mean absolute error', 'mse', 'rmse',
357 |                                    'pearson', 'spearman', 'stock_dedicated'):
358 |                 raise ValueError('Unsupported metric: %s' % self.metric)
359 |             self._metric = _fitness_map[self.metric]
360 |         elif isinstance(self, ClassifierMixin):
361 |             if self.metric != 'log loss':
362 |                 raise ValueError('Unsupported metric: %s' % self.metric)
363 |             self._metric = _fitness_map[self.metric]
364 |         elif isinstance(self, TransformerMixin):
365 |             if self.metric not in ('pearson', 'spearman'):
366 |                 raise ValueError('Unsupported metric: %s' % self.metric)
367 |             self._metric = _fitness_map[self.metric]
368 |         if self.metric == 'stock_dedicated':
369 |             # Was `if self.metric in ('stock_dedicate'):` -- a misspelled
370 |             # substring test that could never match, so the flag was never set.
371 |             self._metric.stock_is = True
372 |         self._method_probs = np.array([self.p_crossover,
373 |                                        self.p_subtree_mutation,
374 |                                        self.p_hoist_mutation,
375 |                                        self.p_point_mutation])
376 |         self._method_probs = np.cumsum(self._method_probs)
377 |
378 |         if self._method_probs[-1] > 1:
379 |             raise ValueError('The sum of p_crossover, p_subtree_mutation, '
380 |                              'p_hoist_mutation and p_point_mutation should '
381 |                              'total to 1.0 or less.')
382 |
383 |         if self.init_method not in ('half and half', 'grow', 'full'):
384 |             raise ValueError('Valid program initialization methods include '
385 |                              '"grow", "full" and "half and half". Given %s.'
386 |                              % self.init_method)
387 |
388 |         if not((isinstance(self.const_range, tuple) and
389 |                 len(self.const_range) == 2) or self.const_range is None):
390 |             raise ValueError('const_range should be a tuple with length two, '
391 |                              'or None.')
392 |
393 |         if (not isinstance(self.init_depth, tuple) or
394 |                 len(self.init_depth) != 2):
395 |             raise ValueError('init_depth should be a tuple with length two.')
396 |         if self.init_depth[0] > self.init_depth[1]:
397 |             raise ValueError('init_depth should be in increasing numerical '
398 |                              'order: (min_depth, max_depth).')
399 |
400 |         if self.feature_names is not None:
401 |             if self.n_features_ != len(self.feature_names):
402 |                 raise ValueError('The supplied `feature_names` has different '
403 |                                  'length to n_features. Expected %d, got %d.'
404 |                                  % (self.n_features_, len(self.feature_names)))
405 |             for feature_name in self.feature_names:
406 |                 if not isinstance(feature_name, str):
407 |                     raise ValueError('invalid type %s found in '
408 |                                      '`feature_names`.' % type(feature_name))
409 |
410 |         if self.transformer is not None:
411 |             if isinstance(self.transformer, _Function):
412 |                 self._transformer = self.transformer
413 |             elif self.transformer == 'sigmoid':
414 |                 self._transformer = sigmoid
415 |             else:
416 |                 raise ValueError('Invalid `transformer`. Expected either '
417 |                                  '"sigmoid" or _Function object, got %s' %
418 |                                  type(self.transformer))
419 |             if self._transformer.arity != 1:
420 |                 raise ValueError('Invalid arity for `transformer`. Expected 1, '
421 |                                  'got %d.' % self._transformer.arity)
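        # --- Added annotation (not in the original source) -------------------
        # The metric resolution above also accepts a ready-made _Fitness
        # object, which bypasses the string lookup. A hedged sketch with a
        # hypothetical mean-return function (gplearn fitness callables take
        # (y, y_pred, sample_weight)):
        #
        #     from gplearn.fitness import make_fitness
        #
        #     def _mean_return(y, y_pred, w):
        #         ...  # e.g. rank by y_pred, hold the top names, average y
        #
        #     stock_metric = make_fitness(function=_mean_return,
        #                                 greater_is_better=True)
        #     est = SymbolicRegressor(metric=stock_metric)
        # ----------------------------------------------------------------------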
422 |
423 |         params = self.get_params()
424 |         params['_metric'] = self._metric
425 |         if hasattr(self, '_transformer'):
426 |             params['_transformer'] = self._transformer
427 |         else:
428 |             params['_transformer'] = None
429 |         params['function_set'] = self._function_set  # resolved operators (add/sub/mul/div by default)
430 |         params['arities'] = self._arities
431 |         params['method_probs'] = self._method_probs
432 |
433 |         if not self.warm_start or not hasattr(self, '_programs'):
434 |             # Free allocated memory, if any
435 |             self._programs = []
436 |             self.run_details_ = {'generation': [],
437 |                                  'average_length': [],
438 |                                  'average_fitness': [],
439 |                                  'best_length': [],
440 |                                  'best_fitness': [],
441 |                                  'best_oob_fitness': [],
442 |                                  'generation_time': []}
443 |
444 |         prior_generations = len(self._programs)
445 |         n_more_generations = self.generations - prior_generations
446 |
447 |         if n_more_generations < 0:
448 |             raise ValueError('generations=%d must be larger or equal to '
449 |                              'len(_programs)=%d when warm_start==True'
450 |                              % (self.generations, len(self._programs)))
451 |         elif n_more_generations == 0:
452 |             fitness = [program.raw_fitness_ for program in self._programs[-1]]
453 |             warn('Warm-start fitting without increasing generations does not '
454 |                  'fit new programs.')
455 |
456 |         if self.warm_start:
457 |             # Generate and discard seeds that would have been produced on the
458 |             # initial fit call.
459 |             for i in range(len(self._programs)):
460 |                 _ = random_state.randint(MAX_INT, size=self.population_size)
461 |
462 |         if self.verbose:
463 |             # Print header fields
464 |             self._verbose_reporter()
465 |
466 |         for gen in range(prior_generations, self.generations):
467 |             print(gen, u'current generation')
468 |
469 |             start_time = time()
470 |
471 |             if gen == 0:
472 |                 parents = None
473 |             else:
474 |                 parents = self._programs[gen - 1]
475 |
476 |             # Parallel loop
477 |             n_jobs, n_programs, starts = _partition_estimators(
478 |                 self.population_size, self.n_jobs)
479 |             seeds = random_state.randint(MAX_INT, size=self.population_size)
480 |
481 |             population = Parallel(n_jobs=n_jobs,
482 |                                   verbose=int(self.verbose > 1))(
483 |                 delayed(_parallel_evolve)(n_programs[i],
484 |                                           parents,
485 |                                           X,
486 |                                           y,
487 |                                           sample_weight,
488 |                                           seeds[starts[i]:starts[i + 1]],
489 |                                           params)
490 |                 for i in range(n_jobs))
491 |
492 |             # Reduce, maintaining order across different n_jobs
493 |             population = list(itertools.chain.from_iterable(population))
496 |             fitness = [program.raw_fitness_ for program in population]
497 |             length = [program.length_ for program in population]
498 |
499 |             parsimony_coefficient = None
500 |             if self.parsimony_coefficient == 'auto':
501 |                 parsimony_coefficient = (np.cov(length, fitness)[1, 0] /
502 |                                          np.var(length))
503 |
504 |             for program in population:
505 |                 program.fitness_ = program.fitness(parsimony_coefficient)
506 |
507 |             self._programs.append(population)
508 |             # Remove old programs that didn't make it into the new population.
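            # (Added note, not in the original source.) Each program records
            # its parents' population indices in `program.parents` under keys
            # containing 'idx'; walking generations newest-to-oldest, any
            # previous-generation slot never referenced as a parent is set to
            # None so its memory can be reclaimed.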
509 |             if not self.low_memory:
510 |                 for old_gen in np.arange(gen, 0, -1):
512 |                     indices = []
513 |                     for program in self._programs[old_gen]:
514 |                         if program is not None:
515 |                             for idx in program.parents:
516 |                                 if 'idx' in idx:
517 |                                     indices.append(program.parents[idx])
518 |                     indices = set(indices)
519 |                     for idx in range(self.population_size):
520 |                         if idx not in indices:
521 |                             self._programs[old_gen - 1][idx] = None
522 |             elif gen > 0:
523 |                 # Remove old generations
524 |                 self._programs[gen - 1] = None
525 |
526 |
527 |             if self._metric.greater_is_better:
528 |                 best_program = population[np.argmax(fitness)]
529 |             else:
530 |                 best_program = population[np.argmin(fitness)]
531 |
532 |             self.run_details_['generation'].append(gen)
533 |             self.run_details_['average_length'].append(np.mean(length))
534 |             self.run_details_['average_fitness'].append(np.mean(fitness))
535 |             self.run_details_['best_length'].append(best_program.length_)
536 |             self.run_details_['best_fitness'].append(best_program.raw_fitness_)
537 |             oob_fitness = np.nan
538 |             if self.max_samples < 1.0:
539 |                 oob_fitness = best_program.oob_fitness_
540 |             self.run_details_['best_oob_fitness'].append(oob_fitness)
541 |             generation_time = time() - start_time
542 |             self.run_details_['generation_time'].append(generation_time)
543 |
544 |             if self.verbose:
545 |                 self._verbose_reporter(self.run_details_)
546 |
547 |             # Check for early stopping
548 |             if self._metric.greater_is_better:
549 |                 best_fitness = fitness[np.argmax(fitness)]
550 |                 if best_fitness >= self.stopping_criteria:
551 |                     break
552 |             else:
553 |                 best_fitness = fitness[np.argmin(fitness)]
554 |                 if best_fitness <= self.stopping_criteria:
555 |                     break
556 |         if self.verbose > 1:  # was an unconditional print loop plus a blocking input()
557 |             for program in self._programs[-1]:
558 |                 print(program)  # dump the final generation for inspection
559 |         if isinstance(self, TransformerMixin):
560 |             # Find the best individuals in the final generation
561 |             fitness = np.array(fitness)
562 |             if self._metric.greater_is_better:
563 |                 hall_of_fame = fitness.argsort()[::-1][:self.hall_of_fame]
564 |             else:
565 |                 hall_of_fame = fitness.argsort()[:self.hall_of_fame]
566 |             evaluation = np.array([gp.execute(X) for gp in
567 |                                    [self._programs[-1][i] for
568 |                                     i in hall_of_fame]])
569 |             if self.metric == 'spearman':
570 |                 evaluation = np.apply_along_axis(rankdata, 1, evaluation)
571 |
572 |             with np.errstate(divide='ignore', invalid='ignore'):
573 |                 correlations = np.abs(np.corrcoef(evaluation))
574 |                 np.fill_diagonal(correlations, 0.)
575 |             components = list(range(self.hall_of_fame))
576 |             indices = list(range(self.hall_of_fame))
577 |             # Iteratively remove least fit individual of most correlated pair
578 |             while len(components) > self.n_components:
579 |                 most_correlated = np.unravel_index(np.argmax(correlations),
580 |                                                    correlations.shape)
581 |                 # The correlation matrix is sorted by fitness, so identifying
582 |                 # the least fit of the pair is simply getting the higher index
583 |                 worst = max(most_correlated)
584 |                 components.pop(worst)
585 |                 indices.remove(worst)
586 |                 correlations = correlations[:, indices][indices, :]
587 |                 indices = list(range(len(components)))
588 |             self._best_programs = [self._programs[-1][i] for i in
589 |                                    hall_of_fame[components]]
590 |
591 |         else:
592 |             # Find the best individual in the final generation
593 |             if self._metric.greater_is_better:
594 |                 self._program = self._programs[-1][np.argmax(fitness)]
595 |             else:
596 |                 self._program = self._programs[-1][np.argmin(fitness)]
597 |             print(self._program, u'best program')
598 |             # Debug dump: print the 100 highest raw-fitness values and the
599 |             # distinct programs behind them (consecutive duplicates skipped).
600 |             top_fitness = sorted(fitness)[-100:]
601 |             last_program = None
602 |             for value in top_fitness:
603 |                 print(value)
604 |                 program = self._programs[-1][fitness.index(value)]
605 |                 if program is not last_program:
606 |                     last_program = program
607 |                     print(program)
608 |         return self
609 |
610 |
611 | class SymbolicRegressor(BaseSymbolic, RegressorMixin):
612 |
613 |     """A Genetic Programming symbolic regressor.
614 |
615 |     A symbolic regressor is an estimator that begins by building a population
616 |     of naive random formulas to represent a relationship. The formulas are
617 |     represented as tree-like structures with mathematical functions being
618 |     recursively applied to variables and constants. Each successive generation
619 |     of programs is then evolved from the one that came before it by selecting
620 |     the fittest individuals from the population to undergo genetic operations
621 |     such as crossover, mutation or reproduction.
622 |
623 |     Parameters
624 |     ----------
625 |     population_size : integer, optional (default=1000)
626 |         The number of programs in each generation.
628 |
629 |     generations : integer, optional (default=20)
630 |         The number of generations to evolve.
631 |
632 |     tournament_size : integer, optional (default=20)
633 |         The number of programs that will compete to become part of the next
634 |         generation.
636 |
637 |     stopping_criteria : float, optional (default=0.0)
638 |         The metric value required in order to stop evolution early.
639 |
640 |     const_range : tuple of two floats, or None, optional (default=(-1., 1.))
641 |         The range of constants to include in the formulas. If None then no
642 |         constants will be included in the candidate programs.
643 |
644 |     init_depth : tuple of two ints, optional (default=(2, 6))
645 |         The range of tree depths for the initial population of naive formulas.
646 |         Individual trees will randomly choose a maximum depth from this range.
647 |         When combined with `init_method='half and half'` this yields the well-
648 |         known 'ramped half and half' initialization method.
649 |
650 |     init_method : str, optional (default='half and half')
651 |         - 'grow' : Nodes are chosen at random from both functions and
652 |           terminals, allowing for smaller trees than `init_depth` allows. Tends
653 |           to grow asymmetrical trees.
654 |         - 'full' : Functions are chosen until the `init_depth` is reached, and
655 |           then terminals are selected. Tends to grow 'bushy' trees.
656 |         - 'half and half' : Trees are grown through a 50/50 mix of 'full' and
657 |           'grow', making for a mix of tree shapes in the initial population.
658 |
659 |     function_set : iterable, optional (default=('add', 'sub', 'mul', 'div'))
660 |         The functions to use when building and evolving programs. This iterable
661 |         can include strings to indicate either individual functions as outlined
662 |         below, or you can also include your own functions as built using the
663 |         ``make_function`` factory from the ``functions`` module.
664 |
665 |         Available individual functions are:
666 |
667 |         - 'add' : addition, arity=2.
668 |         - 'sub' : subtraction, arity=2.
669 |         - 'mul' : multiplication, arity=2.
670 |         - 'div' : protected division where a denominator near-zero returns 1.,
671 |           arity=2.
672 |         - 'sqrt' : protected square root where the absolute value of the
673 |           argument is used, arity=1.
674 |         - 'log' : protected log where the absolute value of the argument is
675 |           used and a near-zero argument returns 0., arity=1.
676 |         - 'abs' : absolute value, arity=1.
677 |         - 'neg' : negative, arity=1.
678 |         - 'inv' : protected inverse where a near-zero argument returns 0.,
679 |           arity=1.
680 |         - 'max' : maximum, arity=2.
681 |         - 'min' : minimum, arity=2.
682 |         - 'sin' : sine (radians), arity=1.
683 |         - 'cos' : cosine (radians), arity=1.
684 |         - 'tan' : tangent (radians), arity=1.
685 |
686 |     metric : str, optional (default='mean absolute error')
687 |         The name of the raw fitness metric. Available options include:
688 |
689 |         - 'mean absolute error'.
690 |         - 'mse' for mean squared error.
691 |         - 'rmse' for root mean squared error.
692 |         - 'pearson', for Pearson's product-moment correlation coefficient.
693 |         - 'spearman' for Spearman's rank-order correlation coefficient.
694 |         - 'stock_dedicated' for the stock-rebalancing return fitness added by this fork.
695 |
696 |         Note that 'pearson' and 'spearman' will not directly predict the target
697 |         but could be useful as value-added features in a second-step estimator.
698 |         This generates one engineered feature at a time; the
699 |         SymbolicTransformer can create multiple features at once.
700 |
701 |     parsimony_coefficient : float or "auto", optional (default=0.001)
702 |         This constant penalizes large programs by adjusting their fitness to
703 |         be less favorable for selection. Larger values penalize the program
704 |         more which can control the phenomenon known as 'bloat'. Bloat is when
705 |         evolution is increasing the size of programs without a significant
706 |         increase in fitness, which is costly for computation time and makes for
707 |         a less understandable final result. This parameter may need to be tuned
708 |         over successive runs.
709 |
710 |         If "auto" the parsimony coefficient is recalculated for each generation
711 |         using c = Cov(l,f)/Var(l), where Cov(l,f) is the covariance between
712 |         program size l and program fitness f in the population, and Var(l) is
713 |         the variance of program sizes.
714 |
715 |     p_crossover : float, optional (default=0.9)
716 |         The probability of performing crossover on a tournament winner.
717 |         Crossover takes the winner of a tournament and selects a random subtree
718 |         from it to be replaced. A second tournament is performed to find a
719 |         donor. The donor also has a subtree selected at random and this is
720 |         inserted into the original parent to form an offspring in the next
721 |         generation.
722 |
723 |     p_subtree_mutation : float, optional (default=0.01)
724 |         The probability of performing subtree mutation on a tournament winner.
725 |         Subtree mutation takes the winner of a tournament and selects a random
726 |         subtree from it to be replaced. A donor subtree is generated at random
727 |         and this is inserted into the original parent to form an offspring in
728 |         the next generation.
729 |
730 |     p_hoist_mutation : float, optional (default=0.01)
731 |         The probability of performing hoist mutation on a tournament winner.
732 |         Hoist mutation takes the winner of a tournament and selects a random
733 |         subtree from it. A random subtree of that subtree is then selected
734 |         and this is 'hoisted' into the original subtree's location to form an
735 |         offspring in the next generation. This method helps to control bloat.
736 |
737 |     p_point_mutation : float, optional (default=0.01)
738 |         The probability of performing point mutation on a tournament winner.
739 |         Point mutation takes the winner of a tournament and selects random
740 |         nodes from it to be replaced. Terminals are replaced by other terminals
741 |         and functions are replaced by other functions that require the same
742 |         number of arguments as the original node. The resulting tree forms an
743 |         offspring in the next generation.
744 |
745 |     Note : The above genetic operation probabilities must sum to less than
746 |         one. The balance of probability is assigned to 'reproduction', where a
747 |         tournament winner is cloned and enters the next generation unmodified.
748 |
749 |     p_point_replace : float, optional (default=0.05)
750 |         For point mutation only, the probability that any given node will be
751 |         mutated.
752 |
753 |     max_samples : float, optional (default=1.0)
754 |         The fraction of samples to draw from X to evaluate each program on.
755 |
756 |     feature_names : list, optional (default=None)
757 |         Optional list of feature names, used purely for representations in
758 |         the `print` operation or `export_graphviz`. If None, then X0, X1, etc
759 |         will be used for representations.
760 |
761 |     warm_start : bool, optional (default=False)
762 |         When set to ``True``, reuse the solution of the previous call to fit
763 |         and add more generations to the evolution, otherwise, just fit a new
764 |         evolution.
765 |
766 |     low_memory : bool, optional (default=False)
767 |         When set to ``True``, only the current generation is retained. Parent
768 |         information is discarded. For very large populations or runs with many
769 |         generations, this can result in substantial memory use reduction.
770 |
771 |     n_jobs : integer, optional (default=1)
772 |         The number of jobs to run in parallel for `fit`. If -1, then the number
773 |         of jobs is set to the number of cores.
774 |
775 |     verbose : int, optional (default=0)
776 |         Controls the verbosity of the evolution building process.
777 |
778 |     random_state : int, RandomState instance or None, optional (default=None)
779 |         If int, random_state is the seed used by the random number generator;
780 |         If RandomState instance, random_state is the random number generator;
781 |         If None, the random number generator is the RandomState instance used
782 |         by `np.random`.
783 |
784 |     Attributes
785 |     ----------
786 |     run_details_ : dict
787 |         Details of the evolution process. Includes the following elements:
788 |
789 |         - 'generation' : The generation index.
790 |         - 'average_length' : The average program length of the generation.
791 |         - 'average_fitness' : The average program fitness of the generation.
792 |         - 'best_length' : The length of the best program in the generation.
793 |         - 'best_fitness' : The fitness of the best program in the generation.
794 | - 'best_oob_fitness' : The out of bag fitness of the best program in 795 | the generation (requires `max_samples` < 1.0). 796 | - 'generation_time' : The time it took for the generation to evolve. 797 | 798 | See Also 799 | -------- 800 | SymbolicTransformer 801 | 802 | References 803 | ---------- 804 | .. [1] J. Koza, "Genetic Programming", 1992. 805 | 806 | .. [2] R. Poli, et al. "A Field Guide to Genetic Programming", 2008. 807 | 808 | """ 809 | 810 | def __init__(self, 811 | population_size=1000, 812 | generations=20, 813 | tournament_size=20, 814 | stopping_criteria=0.0, 815 | const_range=(-1., 1.), 816 | init_depth=(2, 6), 817 | init_method='half and half', 818 | function_set=('add', 'sub', 'mul', 'div'), 819 | metric='mean absolute error', 820 | parsimony_coefficient=0.001, 821 | p_crossover=0.9, 822 | p_subtree_mutation=0.01, 823 | p_hoist_mutation=0.01, 824 | p_point_mutation=0.01, 825 | p_point_replace=0.05, 826 | max_samples=1.0, 827 | feature_names=None, 828 | warm_start=False, 829 | low_memory=False, 830 | n_jobs=1, 831 | verbose=0, 832 | random_state=None): 833 | super(SymbolicRegressor, self).__init__( 834 | population_size=population_size, 835 | generations=generations, 836 | tournament_size=tournament_size, 837 | stopping_criteria=stopping_criteria, 838 | const_range=const_range, 839 | init_depth=init_depth, 840 | init_method=init_method, 841 | function_set=function_set, 842 | metric=metric, 843 | parsimony_coefficient=parsimony_coefficient, 844 | p_crossover=p_crossover, 845 | p_subtree_mutation=p_subtree_mutation, 846 | p_hoist_mutation=p_hoist_mutation, 847 | p_point_mutation=p_point_mutation, 848 | p_point_replace=p_point_replace, 849 | max_samples=max_samples, 850 | feature_names=feature_names, 851 | warm_start=warm_start, 852 | low_memory=low_memory, 853 | n_jobs=n_jobs, 854 | verbose=verbose, 855 | random_state=random_state) 856 | 857 | def __str__(self): 858 | """Overloads `print` output of the object to resemble a LISP tree.""" 859 | if not hasattr(self, '_program'): 860 | return self.__repr__() 861 | return self._program.__str__() 862 | 863 | def predict(self, X): 864 | """Perform regression on test vectors X. 865 | 866 | Parameters 867 | ---------- 868 | X : array-like, shape = [n_samples, n_features] 869 | Input vectors, where n_samples is the number of samples 870 | and n_features is the number of features. 871 | 872 | Returns 873 | ------- 874 | y : array, shape = [n_samples] 875 | Predicted values for X. 876 | 877 | """ 878 | if not hasattr(self, '_program'): 879 | raise NotFittedError('SymbolicRegressor not fitted.') 880 | 881 | X = check_array(X) 882 | _, n_features = X.shape 883 | if self.n_features_ != n_features: 884 | raise ValueError('Number of features of the model must match the ' 885 | 'input. Model n_features is %s and input ' 886 | 'n_features is %s.' 887 | % (self.n_features_, n_features)) 888 | 889 | y = self._program.execute(X) 890 | 891 | return y 892 | 893 | 894 | class SymbolicClassifier(BaseSymbolic, ClassifierMixin): 895 | 896 | """A Genetic Programming symbolic classifier. 897 | 898 | A symbolic classifier is an estimator that begins by building a population 899 | of naive random formulas to represent a relationship. The formulas are 900 | represented as tree-like structures with mathematical functions being 901 | recursively applied to variables and constants. 
Each successive generation
902 |     of programs is then evolved from the one that came before it by selecting
903 |     the fittest individuals from the population to undergo genetic operations
904 |     such as crossover, mutation or reproduction.
905 |
906 |     Parameters
907 |     ----------
908 |     population_size : integer, optional (default=1000)
909 |         The number of programs in each generation.
910 |
911 |     generations : integer, optional (default=20)
912 |         The number of generations to evolve.
913 |
914 |     tournament_size : integer, optional (default=20)
915 |         The number of programs that will compete to become part of the next
916 |         generation.
917 |
918 |     stopping_criteria : float, optional (default=0.0)
919 |         The metric value required in order to stop evolution early.
920 |
921 |     const_range : tuple of two floats, or None, optional (default=(-1., 1.))
922 |         The range of constants to include in the formulas. If None then no
923 |         constants will be included in the candidate programs.
924 |
925 |     init_depth : tuple of two ints, optional (default=(2, 6))
926 |         The range of tree depths for the initial population of naive formulas.
927 |         Individual trees will randomly choose a maximum depth from this range.
928 |         When combined with `init_method='half and half'` this yields the well-
929 |         known 'ramped half and half' initialization method.
930 |
931 |     init_method : str, optional (default='half and half')
932 |         - 'grow' : Nodes are chosen at random from both functions and
933 |           terminals, allowing for smaller trees than `init_depth` allows. Tends
934 |           to grow asymmetrical trees.
935 |         - 'full' : Functions are chosen until the `init_depth` is reached, and
936 |           then terminals are selected. Tends to grow 'bushy' trees.
937 |         - 'half and half' : Trees are grown through a 50/50 mix of 'full' and
938 |           'grow', making for a mix of tree shapes in the initial population.
939 |
940 |     function_set : iterable, optional (default=('add', 'sub', 'mul', 'div'))
941 |         The functions to use when building and evolving programs. This iterable
942 |         can include strings to indicate either individual functions as outlined
943 |         below, or you can also include your own functions as built using the
944 |         ``make_function`` factory from the ``functions`` module.
945 |
946 |         Available individual functions are:
947 |
948 |         - 'add' : addition, arity=2.
949 |         - 'sub' : subtraction, arity=2.
950 |         - 'mul' : multiplication, arity=2.
951 |         - 'div' : protected division where a denominator near-zero returns 1.,
952 |           arity=2.
953 |         - 'sqrt' : protected square root where the absolute value of the
954 |           argument is used, arity=1.
955 |         - 'log' : protected log where the absolute value of the argument is
956 |           used and a near-zero argument returns 0., arity=1.
957 |         - 'abs' : absolute value, arity=1.
958 |         - 'neg' : negative, arity=1.
959 |         - 'inv' : protected inverse where a near-zero argument returns 0.,
960 |           arity=1.
961 |         - 'max' : maximum, arity=2.
962 |         - 'min' : minimum, arity=2.
963 |         - 'sin' : sine (radians), arity=1.
964 |         - 'cos' : cosine (radians), arity=1.
965 |         - 'tan' : tangent (radians), arity=1.
966 |
967 |     transformer : str, optional (default='sigmoid')
968 |         The name of the function through which the raw decision function is
969 |         passed. This function will transform the raw decision function into
970 |         probabilities of each class.
971 |
972 |         This can also be replaced by your own functions as built using the
973 |         ``make_function`` factory from the ``functions`` module.
974 |
975 |     metric : str, optional (default='log loss')
976 |         The name of the raw fitness metric. Available options include:
977 |
978 |         - 'log loss' aka binary cross-entropy loss.
979 |
980 |     parsimony_coefficient : float or "auto", optional (default=0.001)
981 |         This constant penalizes large programs by adjusting their fitness to
982 |         be less favorable for selection. Larger values penalize the program
983 |         more which can control the phenomenon known as 'bloat'. Bloat is when
984 |         evolution is increasing the size of programs without a significant
985 |         increase in fitness, which is costly for computation time and makes for
986 |         a less understandable final result. This parameter may need to be tuned
987 |         over successive runs.
988 |
989 |         If "auto" the parsimony coefficient is recalculated for each generation
990 |         using c = Cov(l,f)/Var(l), where Cov(l,f) is the covariance between
991 |         program size l and program fitness f in the population, and Var(l) is
992 |         the variance of program sizes.
993 |
994 |     p_crossover : float, optional (default=0.9)
995 |         The probability of performing crossover on a tournament winner.
996 |         Crossover takes the winner of a tournament and selects a random subtree
997 |         from it to be replaced. A second tournament is performed to find a
998 |         donor. The donor also has a subtree selected at random and this is
999 |         inserted into the original parent to form an offspring in the next
1000 |         generation.
1001 |
1002 |     p_subtree_mutation : float, optional (default=0.01)
1003 |         The probability of performing subtree mutation on a tournament winner.
1004 |         Subtree mutation takes the winner of a tournament and selects a random
1005 |         subtree from it to be replaced. A donor subtree is generated at random
1006 |         and this is inserted into the original parent to form an offspring in
1007 |         the next generation.
1008 |
1009 |     p_hoist_mutation : float, optional (default=0.01)
1010 |         The probability of performing hoist mutation on a tournament winner.
1011 |         Hoist mutation takes the winner of a tournament and selects a random
1012 |         subtree from it. A random subtree of that subtree is then selected
1013 |         and this is 'hoisted' into the original subtree's location to form an
1014 |         offspring in the next generation. This method helps to control bloat.
1015 |
1016 |     p_point_mutation : float, optional (default=0.01)
1017 |         The probability of performing point mutation on a tournament winner.
1018 |         Point mutation takes the winner of a tournament and selects random
1019 |         nodes from it to be replaced. Terminals are replaced by other terminals
1020 |         and functions are replaced by other functions that require the same
1021 |         number of arguments as the original node. The resulting tree forms an
1022 |         offspring in the next generation.
1023 |
1024 |     Note : The above genetic operation probabilities must sum to less than
1025 |         one. The balance of probability is assigned to 'reproduction', where a
1026 |         tournament winner is cloned and enters the next generation unmodified.
1027 |
1028 |     p_point_replace : float, optional (default=0.05)
1029 |         For point mutation only, the probability that any given node will be
1030 |         mutated.
1031 |
1032 |     max_samples : float, optional (default=1.0)
1033 |         The fraction of samples to draw from X to evaluate each program on.
1034 |
1035 |     feature_names : list, optional (default=None)
1036 |         Optional list of feature names, used purely for representations in
1037 |         the `print` operation or `export_graphviz`.
If None, then X0, X1, etc 1038 | will be used for representations. 1039 | 1040 | warm_start : bool, optional (default=False) 1041 | When set to ``True``, reuse the solution of the previous call to fit 1042 | and add more generations to the evolution, otherwise, just fit a new 1043 | evolution. 1044 | 1045 | low_memory : bool, optional (default=False) 1046 | When set to ``True``, only the current generation is retained. Parent 1047 | information is discarded. For very large populations or runs with many 1048 | generations, this can result in substantial memory use reduction. 1049 | 1050 | n_jobs : integer, optional (default=1) 1051 | The number of jobs to run in parallel for `fit`. If -1, then the number 1052 | of jobs is set to the number of cores. 1053 | 1054 | verbose : int, optional (default=0) 1055 | Controls the verbosity of the evolution building process. 1056 | 1057 | random_state : int, RandomState instance or None, optional (default=None) 1058 | If int, random_state is the seed used by the random number generator; 1059 | If RandomState instance, random_state is the random number generator; 1060 | If None, the random number generator is the RandomState instance used 1061 | by `np.random`. 1062 | 1063 | Attributes 1064 | ---------- 1065 | run_details_ : dict 1066 | Details of the evolution process. Includes the following elements: 1067 | 1068 | - 'generation' : The generation index. 1069 | - 'average_length' : The average program length of the generation. 1070 | - 'average_fitness' : The average program fitness of the generation. 1071 | - 'best_length' : The length of the best program in the generation. 1072 | - 'best_fitness' : The fitness of the best program in the generation. 1073 | - 'best_oob_fitness' : The out of bag fitness of the best program in 1074 | the generation (requires `max_samples` < 1.0). 1075 | - 'generation_time' : The time it took for the generation to evolve. 1076 | 1077 | See Also 1078 | -------- 1079 | SymbolicTransformer 1080 | 1081 | References 1082 | ---------- 1083 | .. [1] J. Koza, "Genetic Programming", 1992. 1084 | 1085 | .. [2] R. Poli, et al. "A Field Guide to Genetic Programming", 2008. 
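
    Examples
    --------
    An illustrative sketch, not from the original author. Note that this fork
    hard-codes ``n_features_ = 18`` inside ``fit``, so prediction expects
    18-column input unless the original shape check is restored:

    >>> import numpy as np
    >>> rng = np.random.RandomState(0)
    >>> X = rng.uniform(size=(200, 18))
    >>> y = (X[:, 0] + X[:, 1] > 1.0).astype(int)
    >>> clf = SymbolicClassifier(population_size=500, generations=5,
    ...                          random_state=0)
    >>> clf.fit(X, y)                     # doctest: +SKIP
    >>> clf.predict_proba(X[:5]).shape    # doctest: +SKIP
    (5, 2)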
1086 | 1087 | """ 1088 | 1089 | def __init__(self, 1090 | population_size=1000, 1091 | generations=20, 1092 | tournament_size=20, 1093 | stopping_criteria=0.0, 1094 | const_range=(-1., 1.), 1095 | init_depth=(2, 6), 1096 | init_method='half and half', 1097 | function_set=('add', 'sub', 'mul', 'div'), 1098 | transformer='sigmoid', 1099 | metric='log loss', 1100 | parsimony_coefficient=0.001, 1101 | p_crossover=0.9, 1102 | p_subtree_mutation=0.01, 1103 | p_hoist_mutation=0.01, 1104 | p_point_mutation=0.01, 1105 | p_point_replace=0.05, 1106 | max_samples=1.0, 1107 | feature_names=None, 1108 | warm_start=False, 1109 | low_memory=False, 1110 | n_jobs=1, 1111 | verbose=0, 1112 | random_state=None): 1113 | super(SymbolicClassifier, self).__init__( 1114 | population_size=population_size, 1115 | generations=generations, 1116 | tournament_size=tournament_size, 1117 | stopping_criteria=stopping_criteria, 1118 | const_range=const_range, 1119 | init_depth=init_depth, 1120 | init_method=init_method, 1121 | function_set=function_set, 1122 | transformer=transformer, 1123 | metric=metric, 1124 | parsimony_coefficient=parsimony_coefficient, 1125 | p_crossover=p_crossover, 1126 | p_subtree_mutation=p_subtree_mutation, 1127 | p_hoist_mutation=p_hoist_mutation, 1128 | p_point_mutation=p_point_mutation, 1129 | p_point_replace=p_point_replace, 1130 | max_samples=max_samples, 1131 | feature_names=feature_names, 1132 | warm_start=warm_start, 1133 | low_memory=low_memory, 1134 | n_jobs=n_jobs, 1135 | verbose=verbose, 1136 | random_state=random_state) 1137 | 1138 | def __str__(self): 1139 | """Overloads `print` output of the object to resemble a LISP tree.""" 1140 | if not hasattr(self, '_program'): 1141 | return self.__repr__() 1142 | return self._program.__str__() 1143 | 1144 | def predict_proba(self, X): 1145 | """Predict probabilities on test vectors X. 1146 | 1147 | Parameters 1148 | ---------- 1149 | X : array-like, shape = [n_samples, n_features] 1150 | Input vectors, where n_samples is the number of samples 1151 | and n_features is the number of features. 1152 | 1153 | Returns 1154 | ------- 1155 | proba : array, shape = [n_samples, n_classes] 1156 | The class probabilities of the input samples. The order of the 1157 | classes corresponds to that in the attribute `classes_`. 1158 | 1159 | """ 1160 | if not hasattr(self, '_program'): 1161 | raise NotFittedError('SymbolicClassifier not fitted.') 1162 | 1163 | X = check_array(X) 1164 | _, n_features = X.shape 1165 | if self.n_features_ != n_features: 1166 | raise ValueError('Number of features of the model must match the ' 1167 | 'input. Model n_features is %s and input ' 1168 | 'n_features is %s.' 1169 | % (self.n_features_, n_features)) 1170 | 1171 | scores = self._program.execute(X) 1172 | proba = self._transformer(scores) 1173 | proba = np.vstack([1 - proba, proba]).T 1174 | return proba 1175 | 1176 | def predict(self, X): 1177 | """Predict classes on test vectors X. 1178 | 1179 | Parameters 1180 | ---------- 1181 | X : array-like, shape = [n_samples, n_features] 1182 | Input vectors, where n_samples is the number of samples 1183 | and n_features is the number of features. 1184 | 1185 | Returns 1186 | ------- 1187 | y : array, shape = [n_samples,] 1188 | The predicted classes of the input samples. 1189 | 1190 | """ 1191 | proba = self.predict_proba(X) 1192 | return self.classes_.take(np.argmax(proba, axis=1), axis=0) 1193 | 1194 | 1195 | class SymbolicTransformer(BaseSymbolic, TransformerMixin): 1196 | 1197 | """A Genetic Programming symbolic transformer. 
1198 |
1199 |     A symbolic transformer is a supervised transformer that begins by building
1200 |     a population of naive random formulas to represent a relationship. The
1201 |     formulas are represented as tree-like structures with mathematical
1202 |     functions being recursively applied to variables and constants. Each
1203 |     successive generation of programs is then evolved from the one that came
1204 |     before it by selecting the fittest individuals from the population to
1205 |     undergo genetic operations such as crossover, mutation or reproduction.
1206 |     The final population is searched for the fittest individuals with the least
1207 |     correlation to one another.
1208 |
1209 |     Parameters
1210 |     ----------
1211 |     population_size : integer, optional (default=1000)
1212 |         The number of programs in each generation.
1213 |
1214 |     hall_of_fame : integer, or None, optional (default=100)
1215 |         The number of fittest programs to compare from when finding the
1216 |         least-correlated individuals for the n_components. If `None`, the
1217 |         entire final generation will be used.
1218 |
1219 |     n_components : integer, or None, optional (default=10)
1220 |         The number of best programs to return after searching the hall_of_fame
1221 |         for the least-correlated individuals. If `None`, the entire
1222 |         hall_of_fame will be used.
1223 |
1224 |     generations : integer, optional (default=20)
1225 |         The number of generations to evolve.
1226 |
1227 |     tournament_size : integer, optional (default=20)
1228 |         The number of programs that will compete to become part of the next
1229 |         generation.
1230 |
1231 |     stopping_criteria : float, optional (default=1.0)
1232 |         The metric value required in order to stop evolution early.
1233 |
1234 |     const_range : tuple of two floats, or None, optional (default=(-1., 1.))
1235 |         The range of constants to include in the formulas. If None then no
1236 |         constants will be included in the candidate programs.
1237 |
1238 |     init_depth : tuple of two ints, optional (default=(2, 6))
1239 |         The range of tree depths for the initial population of naive formulas.
1240 |         Individual trees will randomly choose a maximum depth from this range.
1241 |         When combined with `init_method='half and half'` this yields the well-
1242 |         known 'ramped half and half' initialization method.
1243 |
1244 |     init_method : str, optional (default='half and half')
1245 |         - 'grow' : Nodes are chosen at random from both functions and
1246 |           terminals, allowing for smaller trees than `init_depth` allows. Tends
1247 |           to grow asymmetrical trees.
1248 |         - 'full' : Functions are chosen until the `init_depth` is reached, and
1249 |           then terminals are selected. Tends to grow 'bushy' trees.
1250 |         - 'half and half' : Trees are grown through a 50/50 mix of 'full' and
1251 |           'grow', making for a mix of tree shapes in the initial population.
1252 |
1253 |     function_set : iterable, optional (default=('add', 'sub', 'mul', 'div'))
1254 |         The functions to use when building and evolving programs. This iterable
1255 |         can include strings to indicate either individual functions as outlined
1256 |         below, or you can also include your own functions as built using the
1257 |         ``make_function`` factory from the ``functions`` module.
1258 |
1259 |         Available individual functions are:
1260 |
1261 |         - 'add' : addition, arity=2.
1262 |         - 'sub' : subtraction, arity=2.
1263 |         - 'mul' : multiplication, arity=2.
1264 |         - 'div' : protected division where a denominator near-zero returns 1.,
1265 |           arity=2.
1266 |         - 'sqrt' : protected square root where the absolute value of the
1267 |           argument is used, arity=1.
1268 |         - 'log' : protected log where the absolute value of the argument is
1269 |           used and a near-zero argument returns 0., arity=1.
1270 |         - 'abs' : absolute value, arity=1.
1271 |         - 'neg' : negative, arity=1.
1272 |         - 'inv' : protected inverse where a near-zero argument returns 0.,
1273 |           arity=1.
1274 |         - 'max' : maximum, arity=2.
1275 |         - 'min' : minimum, arity=2.
1276 |         - 'sin' : sine (radians), arity=1.
1277 |         - 'cos' : cosine (radians), arity=1.
1278 |         - 'tan' : tangent (radians), arity=1.
1279 |
1280 |     metric : str, optional (default='pearson')
1281 |         The name of the raw fitness metric. Available options include:
1282 |
1283 |         - 'pearson', for Pearson's product-moment correlation coefficient.
1284 |         - 'spearman' for Spearman's rank-order correlation coefficient.
1285 |
1286 |     parsimony_coefficient : float or "auto", optional (default=0.001)
1287 |         This constant penalizes large programs by adjusting their fitness to
1288 |         be less favorable for selection. Larger values penalize the program
1289 |         more which can control the phenomenon known as 'bloat'. Bloat is when
1290 |         evolution is increasing the size of programs without a significant
1291 |         increase in fitness, which is costly for computation time and makes for
1292 |         a less understandable final result. This parameter may need to be tuned
1293 |         over successive runs.
1294 |
1295 |         If "auto" the parsimony coefficient is recalculated for each generation
1296 |         using c = Cov(l,f)/Var(l), where Cov(l,f) is the covariance between
1297 |         program size l and program fitness f in the population, and Var(l) is
1298 |         the variance of program sizes.
1299 |
1300 |     p_crossover : float, optional (default=0.9)
1301 |         The probability of performing crossover on a tournament winner.
1302 |         Crossover takes the winner of a tournament and selects a random subtree
1303 |         from it to be replaced. A second tournament is performed to find a
1304 |         donor. The donor also has a subtree selected at random and this is
1305 |         inserted into the original parent to form an offspring in the next
1306 |         generation.
1307 |
1308 |     p_subtree_mutation : float, optional (default=0.01)
1309 |         The probability of performing subtree mutation on a tournament winner.
1310 |         Subtree mutation takes the winner of a tournament and selects a random
1311 |         subtree from it to be replaced. A donor subtree is generated at random
1312 |         and this is inserted into the original parent to form an offspring in
1313 |         the next generation.
1314 |
1315 |     p_hoist_mutation : float, optional (default=0.01)
1316 |         The probability of performing hoist mutation on a tournament winner.
1317 |         Hoist mutation takes the winner of a tournament and selects a random
1318 |         subtree from it. A random subtree of that subtree is then selected
1319 |         and this is 'hoisted' into the original subtree's location to form an
1320 |         offspring in the next generation. This method helps to control bloat.
1321 |
1322 |     p_point_mutation : float, optional (default=0.01)
1323 |         The probability of performing point mutation on a tournament winner.
1324 |         Point mutation takes the winner of a tournament and selects random
1325 |         nodes from it to be replaced. Terminals are replaced by other terminals
1326 |         and functions are replaced by other functions that require the same
1327 |         number of arguments as the original node. The resulting tree forms an
1328 |         offspring in the next generation.
1329 | 1330 | Note : The above genetic operation probabilities must sum to less than 1331 | one. The balance of probability is assigned to 'reproduction', where a 1332 | tournament winner is cloned and enters the next generation unmodified. 1333 | 1334 | p_point_replace : float, optional (default=0.05) 1335 | For point mutation only, the probability that any given node will be 1336 | mutated. 1337 | 1338 | max_samples : float, optional (default=1.0) 1339 | The fraction of samples to draw from X to evaluate each program on. 1340 | 1341 | feature_names : list, optional (default=None) 1342 | Optional list of feature names, used purely for representations in 1343 | the `print` operation or `export_graphviz`. If None, then X0, X1, etc 1344 | will be used for representations. 1345 | 1346 | warm_start : bool, optional (default=False) 1347 | When set to ``True``, reuse the solution of the previous call to fit 1348 | and add more generations to the evolution, otherwise, just fit a new 1349 | evolution. 1350 | 1351 | low_memory : bool, optional (default=False) 1352 | When set to ``True``, only the current generation is retained. Parent 1353 | information is discarded. For very large populations or runs with many 1354 | generations, this can result in substantial memory use reduction. 1355 | 1356 | n_jobs : integer, optional (default=1) 1357 | The number of jobs to run in parallel for `fit`. If -1, then the number 1358 | of jobs is set to the number of cores. 1359 | 1360 | verbose : int, optional (default=0) 1361 | Controls the verbosity of the evolution building process. 1362 | 1363 | random_state : int, RandomState instance or None, optional (default=None) 1364 | If int, random_state is the seed used by the random number generator; 1365 | If RandomState instance, random_state is the random number generator; 1366 | If None, the random number generator is the RandomState instance used 1367 | by `np.random`. 1368 | 1369 | Attributes 1370 | ---------- 1371 | run_details_ : dict 1372 | Details of the evolution process. Includes the following elements: 1373 | 1374 | - 'generation' : The generation index. 1375 | - 'average_length' : The average program length of the generation. 1376 | - 'average_fitness' : The average program fitness of the generation. 1377 | - 'best_length' : The length of the best program in the generation. 1378 | - 'best_fitness' : The fitness of the best program in the generation. 1379 | - 'best_oob_fitness' : The out of bag fitness of the best program in 1380 | the generation (requires `max_samples` < 1.0). 1381 | - 'generation_time' : The time it took for the generation to evolve. 1382 | 1383 | See Also 1384 | -------- 1385 | SymbolicRegressor 1386 | 1387 | References 1388 | ---------- 1389 | .. [1] J. Koza, "Genetic Programming", 1992. 1390 | 1391 | .. [2] R. Poli, et al. "A Field Guide to Genetic Programming", 2008. 
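
    Examples
    --------
    An illustrative sketch, not from the original author; the fork's
    hard-coded ``n_features_ = 18`` in ``fit`` applies here as well:

    >>> import numpy as np
    >>> rng = np.random.RandomState(0)
    >>> X = rng.uniform(size=(200, 18))
    >>> y = X[:, 0] * X[:, 1] + rng.normal(scale=0.1, size=200)
    >>> gp = SymbolicTransformer(population_size=500, generations=5,
    ...                          n_components=5, random_state=0)
    >>> gp.fit_transform(X, y).shape      # doctest: +SKIP
    (200, 5)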
1392 | 1393 | """ 1394 | 1395 | def __init__(self, 1396 | population_size=1000, 1397 | hall_of_fame=100, 1398 | n_components=10, 1399 | generations=20, 1400 | tournament_size=20, 1401 | stopping_criteria=1.0, 1402 | const_range=(-1., 1.), 1403 | init_depth=(2, 6), 1404 | init_method='half and half', 1405 | function_set=('add', 'sub', 'mul', 'div'), 1406 | metric='pearson', 1407 | parsimony_coefficient=0.001, 1408 | p_crossover=0.9, 1409 | p_subtree_mutation=0.01, 1410 | p_hoist_mutation=0.01, 1411 | p_point_mutation=0.01, 1412 | p_point_replace=0.05, 1413 | max_samples=1.0, 1414 | feature_names=None, 1415 | warm_start=False, 1416 | low_memory=False, 1417 | n_jobs=1, 1418 | verbose=0, 1419 | random_state=None): 1420 | super(SymbolicTransformer, self).__init__( 1421 | population_size=population_size, 1422 | hall_of_fame=hall_of_fame, 1423 | n_components=n_components, 1424 | generations=generations, 1425 | tournament_size=tournament_size, 1426 | stopping_criteria=stopping_criteria, 1427 | const_range=const_range, 1428 | init_depth=init_depth, 1429 | init_method=init_method, 1430 | function_set=function_set, 1431 | metric=metric, 1432 | parsimony_coefficient=parsimony_coefficient, 1433 | p_crossover=p_crossover, 1434 | p_subtree_mutation=p_subtree_mutation, 1435 | p_hoist_mutation=p_hoist_mutation, 1436 | p_point_mutation=p_point_mutation, 1437 | p_point_replace=p_point_replace, 1438 | max_samples=max_samples, 1439 | feature_names=feature_names, 1440 | warm_start=warm_start, 1441 | low_memory=low_memory, 1442 | n_jobs=n_jobs, 1443 | verbose=verbose, 1444 | random_state=random_state) 1445 | 1446 | def __len__(self): 1447 | """Overloads `len` output to be the number of fitted components.""" 1448 | if not hasattr(self, '_best_programs'): 1449 | return 0 1450 | return self.n_components 1451 | 1452 | def __getitem__(self, item): 1453 | """Return the ith item of the fitted components.""" 1454 | if item >= len(self): 1455 | raise IndexError 1456 | return self._best_programs[item] 1457 | 1458 | def __str__(self): 1459 | """Overloads `print` output of the object to resemble LISP trees.""" 1460 | if not hasattr(self, '_best_programs'): 1461 | return self.__repr__() 1462 | output = str([gp.__str__() for gp in self]) 1463 | return output.replace("',", ",\n").replace("'", "") 1464 | 1465 | def transform(self, X): 1466 | """Transform X according to the fitted transformer. 1467 | 1468 | Parameters 1469 | ---------- 1470 | X : array-like, shape = [n_samples, n_features] 1471 | Input vectors, where n_samples is the number of samples 1472 | and n_features is the number of features. 1473 | 1474 | Returns 1475 | ------- 1476 | X_new : array-like, shape = [n_samples, n_components] 1477 | Transformed array. 1478 | 1479 | """ 1480 | if not hasattr(self, '_best_programs'): 1481 | raise NotFittedError('SymbolicTransformer not fitted.') 1482 | 1483 | X = check_array(X) 1484 | _, n_features = X.shape 1485 | if self.n_features_ != n_features: 1486 | raise ValueError('Number of features of the model must match the ' 1487 | 'input. Model n_features is %s and input ' 1488 | 'n_features is %s.' 1489 | % (self.n_features_, n_features)) 1490 | 1491 | X_new = np.array([gp.execute(X) for gp in self._best_programs]).T 1492 | 1493 | return X_new 1494 | 1495 | def fit_transform(self, X, y, sample_weight=None): 1496 | """Fit to data, then transform it. 
1497 | 1498 | Parameters 1499 | ---------- 1500 | X : array-like, shape = [n_samples, n_features] 1501 | Training vectors, where n_samples is the number of samples and 1502 | n_features is the number of features. 1503 | 1504 | y : array-like, shape = [n_samples] 1505 | Target values. 1506 | 1507 | sample_weight : array-like, shape = [n_samples], optional 1508 | Weights applied to individual samples. 1509 | 1510 | Returns 1511 | ------- 1512 | X_new : array-like, shape = [n_samples, n_components] 1513 | Transformed array. 1514 | 1515 | """ 1516 | return self.fit(X, y, sample_weight).transform(X) 1517 | --------------------------------------------------------------------------------
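
An illustrative closing sketch (not part of the repository): wiring the fork's 'stock_dedicated' metric into SymbolicRegressor. The synthetic arrays below only satisfy the hard-coded 18-feature assumption in `fit`; real runs should feed the stock-panel arrays produced by the data_processing module, which the stock fitness routine expects.

```python
# Illustrative only: random arrays stand in for the stock-panel data that
# the 'stock_dedicated' fitness (implemented in _program.py) actually expects.
import numpy as np
from gplearn.genetic import SymbolicRegressor

rng = np.random.RandomState(0)
X = rng.uniform(size=(1000, 18))   # 18 features, matching n_features_ = 18
y = rng.normal(size=1000)          # stand-in for forward returns

est = SymbolicRegressor(population_size=2000,
                        generations=5,
                        metric='stock_dedicated',   # fork-specific fitness
                        p_crossover=0.7,
                        p_subtree_mutation=0.1,
                        p_hoist_mutation=0.05,
                        p_point_mutation=0.1,
                        parsimony_coefficient=0.01,
                        random_state=0,
                        n_jobs=1)
est.fit(X, y)
print(est)   # best evolved formula, rendered as a LISP-style tree
```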