├── load_data
│   ├── constants.py
│   ├── treasuries.py
│   ├── treasuries_can.py
│   ├── mongodb.py
│   └── loader.py
├── __init__.py
├── README.md
├── utils.py
├── demo.py
├── functions.py
├── fitness.py
├── data_processing
│   └── DataProcessing
├── _program.py
└── genetic.py

/load_data/constants.py:
--------------------------------------------------------------------------------
1 | IP = "127.0.0.1"
2 | PORT = 27017
--------------------------------------------------------------------------------
/__init__.py:
--------------------------------------------------------------------------------
1 | '''
2 | Genetic Programming in Python, with a scikit-learn inspired API.
3 | 
4 | gplearn is a set of algorithms for learning genetic programming models.
5 | 
6 | '''
7 | __version__ = '0.4.1'
8 | __all__ = ['genetic', 'functions', 'fitness']
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # gplearn_stock
2 | ## What this code does
3 | A modified version of gplearn, intended mainly for mining stock data.
4 | ## Overview
5 | This code adjusts gplearn so that it is better suited to strategies for the Chinese A-share market.
6 | 
7 | gplearn is the most mature symbolic-regression implementation in Python. As a supervised-learning method, symbolic regression tries to discover a hidden mathematical formula that predicts the target variable from the feature variables.
8 | 
9 | 
10 | Symbolic regression is implemented with a genetic algorithm: a population of formulas is generated at random, and in each subsequent generation the fittest formulas replace the rest.
11 | 
12 | 
13 | As the generations accumulate, the formulas reproduce, mutate and evolve, steadily approximating the true distribution of the data.
14 | 
15 | 
16 | The key to applying this to the domestic secondary market is the fitness calculation: different fitness definitions produce different results.
17 | 
18 | 
19 | This project uses the expected return of a periodically rebalanced portfolio as the fitness against which all randomly generated formulas evolve.
20 | 
21 | 
22 | ## Usage
23 | Install gplearn, replace the corresponding modules with the files from this repository, and experiment with demo.py.
24 | ## Fitness calculation used by this code
25 | Rank the stocks by formula value and take the top 50 (or 100), open equal-weight positions, and rebalance every 5 trading days. Fitness is the mean return of the held stocks: the larger, the fitter, with no upper bound.
26 | See the stock_excute function in _program.py for the details; to use a different fitness, modify that part.
27 | ## Demo data: contact QQ 94006733 if you need it.
28 | Since QQ is not logged into often, please follow the WeChat official account "小王子的数量分析" and leave a message there.
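The fitness described above is implemented in the stock_excute function of _program.py, which is not included in this dump. Purely as an illustration of the idea, a single-rebalance version of the "top 50 by formula value, equal weight, mean forward return" score could be registered through gplearn's regular make_fitness hook; here y would hold the 5-day forward returns and y_pred the formula values for one rebalancing date (the names below are an illustrative sketch, not the repository's actual implementation):

    import numpy as np
    from gplearn.fitness import make_fitness

    def _top_n_mean_return(y, y_pred, w):
        # Rank stocks by formula value, hold the top 50 equal-weight, and
        # score the formula by the mean forward return of those holdings.
        top = np.argsort(y_pred)[::-1][:50]
        return float(np.mean(y[top]))

    stock_fitness = make_fitness(function=_top_n_mean_return,
                                 greater_is_better=True)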
--------------------------------------------------------------------------------
/utils.py:
--------------------------------------------------------------------------------
1 | """Utilities that are required by gplearn.
2 | 
3 | Most of these functions are slightly modified versions of some key utility
4 | functions from scikit-learn that gplearn depends upon. They reside here in
5 | order to maintain compatibility across different versions of scikit-learn.
6 | """
7 | 
8 | import numbers
9 | 
10 | import numpy as np
11 | from joblib import cpu_count
12 | 
13 | 
14 | def check_random_state(seed):
15 |     """Turn seed into a np.random.RandomState instance.
16 | 
17 |     Parameters
18 |     ----------
19 |     seed : None | int | instance of RandomState
20 |         If seed is None, return the RandomState singleton used by np.random.
21 |         If seed is an int, return a new RandomState instance seeded with seed.
22 |         If seed is already a RandomState instance, return it.
23 |         Otherwise raise ValueError.
24 | 
25 |     """
26 |     if seed is None or seed is np.random:
27 |         return np.random.mtrand._rand
28 |     if isinstance(seed, (numbers.Integral, np.integer)):
29 |         return np.random.RandomState(seed)
30 |     if isinstance(seed, np.random.RandomState):
31 |         return seed
32 |     raise ValueError('%r cannot be used to seed a numpy.random.RandomState'
33 |                      ' instance' % seed)
34 | 
35 | 
36 | def _get_n_jobs(n_jobs):
37 |     """Get number of jobs for the computation.
38 | 
39 |     This function reimplements the logic of joblib to determine the actual
40 |     number of jobs depending on the cpu count. If -1 all CPUs are used.
41 |     If 1 is given, no parallel computing code is used at all, which is useful
42 |     for debugging. For n_jobs below -1, (n_cpus + 1 + n_jobs) are used.
43 |     Thus for n_jobs = -2, all CPUs but one are used.
44 | 
45 |     Parameters
46 |     ----------
47 |     n_jobs : int
48 |         Number of jobs stated in joblib convention.
49 | 
50 |     Returns
51 |     -------
52 |     n_jobs : int
53 |         The actual number of jobs as a positive integer.
54 | 
55 |     """
56 |     if n_jobs < 0:
57 |         return max(cpu_count() + 1 + n_jobs, 1)
58 |     elif n_jobs == 0:
59 |         raise ValueError('Parameter n_jobs == 0 has no meaning.')
60 |     else:
61 |         return n_jobs
62 | 
63 | 
64 | def _partition_estimators(n_estimators, n_jobs):
65 |     """Private function used to partition estimators between jobs."""
66 |     # Compute the number of jobs
67 |     n_jobs = min(_get_n_jobs(n_jobs), n_estimators)
68 | 
69 |     # Partition estimators between jobs
70 |     n_estimators_per_job = (n_estimators // n_jobs) * np.ones(n_jobs,
71 |                                                               dtype=int)  # np.int was removed in NumPy 1.24
72 |     n_estimators_per_job[:n_estimators % n_jobs] += 1
73 |     starts = np.cumsum(n_estimators_per_job)
74 | 
75 |     return n_jobs, n_estimators_per_job.tolist(), [0] + starts.tolist()
--------------------------------------------------------------------------------
/demo.py:
--------------------------------------------------------------------------------
1 | # encoding:utf-8
2 | from load_data.loader import load_day_data, load_minute_data
3 | import numpy as np
4 | import pandas as pd
5 | import matplotlib.pyplot as plt
6 | from gplearn.utils import check_random_state
7 | from gplearn.genetic import SymbolicRegressor
8 | import time
9 | import numba
10 | import copy
11 | import h5py
12 | import gc
13 | from data_processing.DataProcessing import data_chuli, jinghua_data
14 | 
15 | 
16 | # Run the genetic evolution.
17 | if __name__ == '__main__':
18 |     '''
19 |     Since the data may need several rounds of processing, the processed
20 |     data is saved to disk first and reloaded below.
21 |     '''
22 | 
23 |     '''
24 |     stocks = list(pd.read_csv('C:\\Users\\94006\\Desktop\\HS300.csv')['HS300'])
25 |     start_time = '2010-06-30'
26 |     end_time = '2018-12-30'
27 |     stock_list = [str(i).zfill(6) for i in stocks]
28 |     day_or_minute = 'day'
29 |     metric = 'stock_dedicated'
30 | 
31 |     train_ratio = 0.7
32 |     adjust = True
33 | 
34 |     data_all = data_chuli(start_time = start_time,
35 |                           end_time = end_time,
36 |                           stock_list = stock_list,
37 |                           day_or_minute = day_or_minute,
38 |                           train_ratio = train_ratio,
39 |                           adjust = adjust)
40 |     (x_train, y_train, x_test, y_test) = data_all.time_adjust()  # build the training and test data
41 |     x_train_HS300 = 'D:\\HS300_x_train_datas150.npy'
42 |     y_train_HS300 = 'D:\\HS300_y_train_datas150.npy'
43 |     x_test_HS300 = 'D:\\HS300_x_test_datas150.npy'
44 |     y_test_HS300 = 'D:\\HS300_y_test_datas150.npy'
45 |     np.save(x_train_HS300, np.array(x_train))
46 |     np.save(y_train_HS300, np.array(y_train))
47 |     np.save(x_test_HS300, np.array(x_test))
48 |     np.save(y_test_HS300, np.array(y_test))
49 |     '''
50 | 
51 |     x_train = np.array(np.load('D:\\HS300_x_train_datas150.npy'))  # saved-data paths differ per user; adjust them to your own location
52 |     y_train = np.array(np.load('D:\\HS300_y_train_datas150.npy'))
53 |     x_test = np.array(np.load('D:\\HS300_x_test_datas150.npy'))
54 |     y_test = np.array(np.load('D:\\HS300_y_test_datas150.npy'))
55 |     (a, b, c, d) = jinghua_data(x_train, y_train, x_test, y_test)
56 | 
57 |     print(u'Data ready, starting evolution')
58 |     est_gp = SymbolicRegressor(population_size=5000,
59 |                                generations=8, stopping_criteria=10000,
60 |                                p_crossover=0.7, p_subtree_mutation=0.1,
61 |                                p_hoist_mutation=0.05, p_point_mutation=0.1,
62 |                                max_samples=0.9, verbose=1,
63 |                                parsimony_coefficient=0.01, random_state=0,
64 |                                metric='stock_dedicated',  # 'stock_dedicated' selects the stock-specific fitness
65 |                                n_jobs=2)  # build the symbolic regressor
66 |     print(u'Regressor constructed')
67 |     x_trains = a
68 |     y_trains = b
69 |     est_gp.fit(x_trains, y_trains)
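demo.py stops immediately after fit. As a minimal illustrative follow-up, assuming this fork keeps gplearn's standard estimator interface, the evolved formula can be inspected and applied to new data:

    # The best evolved program renders as a formula string,
    # e.g. "sub(div(X3, X7), log(X1))".
    print(est_gp._program)

    # predict() evaluates the formula on a 2-D (n_samples, n_features) array;
    # with the stock metric each row is one stock on one day, and the output
    # is that stock's factor value, to be ranked cross-sectionally.
    factor_values = est_gp.predict(np.array(a[0]))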
--------------------------------------------------------------------------------
/load_data/treasuries.py:
--------------------------------------------------------------------------------
1 | #
2 | # Copyright 2013 Quantopian, Inc.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | #     http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | from operator import itemgetter
16 | import re
17 | 
18 | import numpy as np
19 | import pandas as pd
20 | 
21 | 
22 | get_unit_and_periods = itemgetter('unit', 'periods')
23 | 
24 | 
25 | def parse_treasury_csv_column(column):
26 |     """
27 |     Parse a treasury CSV column into a more human-readable format.
28 | 
29 |     Columns start with 'RIFLGFC', followed by Y or M (year or month), followed
30 |     by a two-digit number signifying number of years/months, followed by _N.B.
31 |     We only care about the middle two entries, which we turn into a string like
32 |     3month or 30year.
33 |     """
34 |     column_re = re.compile(
35 |         r"^(?P<prefix>RIFLGFC)"
36 |         "(?P<unit>[YM])"
37 |         "(?P<periods>[0-9]{2})"
38 |         "(?P<suffix>_N.B)$"
39 |     )
40 | 
41 |     match = column_re.match(column)
42 |     if match is None:
43 |         raise ValueError("Couldn't parse CSV column %r." % column)
44 |     unit, periods = get_unit_and_periods(match.groupdict())
45 | 
46 |     # Roundtrip through int to coerce '06' into '6'.
47 |     return str(int(periods)) + ('year' if unit == 'Y' else 'month')
48 | 
49 | 
50 | def earliest_possible_date():
51 |     """
52 |     The earliest date for which we can load data from this module.
53 |     """
54 |     # The US Treasury actually has data going back further than this, but it's
55 |     # pretty rare to find pricing data going back that far, and there's no
56 |     # reason to make people download benchmarks back to 1950 that they'll never
57 |     # be able to use.
58 |     return pd.Timestamp('1980', tz='UTC')
59 | 
60 | 
61 | def get_treasury_data(start_date, end_date):
62 |     return pd.read_csv(
63 |         "http://www.federalreserve.gov/datadownload/Output.aspx"
64 |         "?rel=H15"
65 |         "&series=bf17364827e38702b42a58cf8eaa3f78"
66 |         "&lastObs="
67 |         "&from="  # An unbounded query is ~2x faster than specifying dates.
68 |         "&to="
69 |         "&filetype=csv"
70 |         "&label=omit"
71 |         "&layout=seriescolumn"
72 |         "&type=package",
73 |         skiprows=1,  # First row is a useless header.
74 |         parse_dates=['Time Period'],
75 |         na_values=['ND'],  # Presumably this stands for "No Data".
76 |         index_col=0,
77 |     ).loc[
78 |         start_date:end_date
79 |     ].dropna(
80 |         how='all'
81 |     ).rename(
82 |         columns=parse_treasury_csv_column
83 |     ).tz_localize('UTC') * 0.01  # Convert from 2.57% to 0.0257.
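A quick sanity check (not part of the original file) of the column mapping that get_treasury_data relies on:

    >>> parse_treasury_csv_column('RIFLGFCY10_N.B')
    '10year'
    >>> parse_treasury_csv_column('RIFLGFCM06_N.B')   # the int() round-trip drops the leading zero
    '6month'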
84 | 85 | 86 | def dataconverter(s): 87 | try: 88 | return float(s) / 100 89 | except: 90 | return np.nan 91 | 92 | 93 | def get_daily_10yr_treasury_data(): 94 | """Download daily 10 year treasury rates from the Federal Reserve and 95 | return a pandas.Series.""" 96 | url = "http://www.federalreserve.gov/datadownload/Output.aspx?rel=H15" \ 97 | "&series=bcb44e57fb57efbe90002369321bfb3f&lastObs=&from=&to=" \ 98 | "&filetype=csv&label=include&layout=seriescolumn" 99 | return pd.read_csv(url, header=5, index_col=0, names=['DATE', 'BC_10YEAR'], 100 | parse_dates=True, converters={1: dataconverter}, 101 | squeeze=True) 102 | -------------------------------------------------------------------------------- /load_data/treasuries_can.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2013 Quantopian, Inc. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | import pandas as pd 17 | import six 18 | from toolz import curry 19 | from toolz.curried.operator import add as prepend 20 | 21 | COLUMN_NAMES = { 22 | "V39063": '1month', 23 | "V39065": '3month', 24 | "V39066": '6month', 25 | "V39067": '1year', 26 | "V39051": '2year', 27 | "V39052": '3year', 28 | "V39053": '5year', 29 | "V39054": '7year', 30 | "V39055": '10year', 31 | # Bank of Canada refers to this as 'Long' Rate, approximately 30 years. 32 | "V39056": '30year', 33 | } 34 | BILL_IDS = ['V39063', 'V39065', 'V39066', 'V39067'] 35 | BOND_IDS = ['V39051', 'V39052', 'V39053', 'V39054', 'V39055', 'V39056'] 36 | 37 | 38 | @curry 39 | def _format_url(instrument_type, 40 | instrument_ids, 41 | start_date, 42 | end_date, 43 | earliest_allowed_date): 44 | """ 45 | Format a URL for loading data from Bank of Canada. 46 | """ 47 | return ( 48 | "http://www.bankofcanada.ca/stats/results/csv" 49 | "?lP=lookup_{instrument_type}_yields.php" 50 | "&sR={restrict}" 51 | "&se={instrument_ids}" 52 | "&dF={start}" 53 | "&dT={end}".format( 54 | instrument_type=instrument_type, 55 | instrument_ids='-'.join(map(prepend("L_"), instrument_ids)), 56 | restrict=earliest_allowed_date.strftime("%Y-%m-%d"), 57 | start=start_date.strftime("%Y-%m-%d"), 58 | end=end_date.strftime("%Y-%m-%d"), 59 | ) 60 | ) 61 | 62 | 63 | format_bill_url = _format_url('tbill', BILL_IDS) 64 | format_bond_url = _format_url('bond', BOND_IDS) 65 | 66 | 67 | def load_frame(url, skiprows): 68 | """ 69 | Load a DataFrame of data from a Bank of Canada site. 70 | """ 71 | return pd.read_csv( 72 | url, 73 | skiprows=skiprows, 74 | skipinitialspace=True, 75 | na_values=["Bank holiday", "Not available"], 76 | parse_dates=["Date"], 77 | index_col="Date", 78 | ).dropna(how='all') \ 79 | .tz_localize('UTC') \ 80 | .rename(columns=COLUMN_NAMES) 81 | 82 | 83 | def check_known_inconsistencies(bill_data, bond_data): 84 | """ 85 | There are a couple quirks in the data provided by Bank of Canada. 86 | Check that no new quirks have been introduced in the latest download. 
87 |     """
88 |     inconsistent_dates = bill_data.index.symmetric_difference(bond_data.index)  # pandas renamed Index.sym_diff to symmetric_difference
89 |     known_inconsistencies = [
90 |         # bill_data has an entry for 2010-02-15, which bond_data doesn't.
91 |         # bond_data has an entry for 2006-09-04, which bill_data doesn't.
92 |         # Both of these dates are bank holidays (Family Day and Labour Day,
93 |         # respectively).
94 |         pd.Timestamp('2006-09-04', tz='UTC'),
95 |         pd.Timestamp('2010-02-15', tz='UTC'),
96 |         # 2013-07-25 comes back as "Not available" from the bills endpoint.
97 |         # This date doesn't seem to be a bank holiday, but the previous
98 |         # calendar implementation dropped this entry, so we drop it as well.
99 |         # If someone cares deeply about the integrity of the Canadian trading
100 |         # calendar, they may want to consider forward-filling here rather than
101 |         # dropping the row.
102 |         pd.Timestamp('2013-07-25', tz='UTC'),
103 |     ]
104 |     unexpected_inconsistencies = inconsistent_dates.drop(known_inconsistencies)
105 |     if len(unexpected_inconsistencies):
106 |         in_bills = bill_data.index.difference(bond_data.index).difference(
107 |             known_inconsistencies
108 |         )
109 |         in_bonds = bond_data.index.difference(bill_data.index).difference(
110 |             known_inconsistencies
111 |         )
112 |         raise ValueError(
113 |             "Inconsistent dates for Canadian treasury bills vs bonds. \n"
114 |             "Dates with bills but not bonds: {in_bills}.\n"
115 |             "Dates with bonds but not bills: {in_bonds}.".format(
116 |                 in_bills=in_bills,
117 |                 in_bonds=in_bonds,
118 |             )
119 |         )
120 | 
121 | 
122 | def earliest_possible_date():
123 |     """
124 |     The earliest date for which we can load data from this module.
125 |     """
126 |     today = pd.Timestamp('now', tz='UTC').normalize()
127 |     # Bank of Canada only has the last 10 years of data at any given time.
128 |     return today.replace(year=today.year - 10)
129 | 
130 | 
131 | def get_treasury_data(start_date, end_date):
132 |     bill_data = load_frame(
133 |         format_bill_url(start_date, end_date, start_date),
134 |         # We skip fewer rows here because we query for fewer bill fields,
135 |         # which makes the header smaller.
136 |         skiprows=18,
137 |     )
138 | 
139 |     bond_data = load_frame(
140 |         format_bond_url(start_date, end_date, start_date),
141 |         skiprows=22,
142 |     )
143 |     check_known_inconsistencies(bill_data, bond_data)
144 | 
145 |     # dropna('any') removes the rows for which we only had data for one of
146 |     # bills/bonds.
147 |     out = pd.concat([bond_data, bill_data], axis=1).dropna(how='any')
148 |     assert set(out.columns) == set(six.itervalues(COLUMN_NAMES))
149 | 
150 |     # Multiply by 0.01 to convert from percentages to expected output format.
151 |     return out * 0.01
--------------------------------------------------------------------------------
/functions.py:
--------------------------------------------------------------------------------
1 | """The functions used to create programs.
2 | 
3 | The :mod:`gplearn.functions` module contains all of the functions used by
4 | gplearn programs. It also contains helper methods for a user to define their
5 | own custom functions.
6 | """
7 | 
8 | # Author: Trevor Stephens
9 | #
10 | # License: BSD 3 clause
11 | 
12 | import numpy as np
13 | from joblib import wrap_non_picklable_objects
14 | 
15 | __all__ = ['make_function']
16 | 
17 | 
18 | class _Function(object):
19 | 
20 |     """A representation of a mathematical relationship, a node in a program.
21 | 
22 |     This object is able to be called with NumPy vectorized arguments and return
23 |     a resulting vector based on a mathematical relationship.
24 | 25 | Parameters 26 | ---------- 27 | function : callable 28 | A function with signature function(x1, *args) that returns a Numpy 29 | array of the same shape as its arguments. 30 | 31 | name : str 32 | The name for the function as it should be represented in the program 33 | and its visualizations. 34 | 35 | arity : int 36 | The number of arguments that the ``function`` takes. 37 | 38 | """ 39 | 40 | def __init__(self, function, name, arity): 41 | self.function = function 42 | self.name = name 43 | self.arity = arity 44 | 45 | def __call__(self, *args): 46 | return self.function(*args) 47 | 48 | 49 | def make_function(function, name, arity, wrap=True): 50 | """Make a function node, a representation of a mathematical relationship. 51 | 52 | This factory function creates a function node, one of the core nodes in any 53 | program. The resulting object is able to be called with NumPy vectorized 54 | arguments and return a resulting vector based on a mathematical 55 | relationship. 56 | 57 | Parameters 58 | ---------- 59 | function : callable 60 | A function with signature `function(x1, *args)` that returns a Numpy 61 | array of the same shape as its arguments. 62 | 63 | name : str 64 | The name for the function as it should be represented in the program 65 | and its visualizations. 66 | 67 | arity : int 68 | The number of arguments that the `function` takes. 69 | 70 | wrap : bool, optional (default=True) 71 | When running in parallel, pickling of custom functions is not supported 72 | by Python's default pickler. This option will wrap the function using 73 | cloudpickle allowing you to pickle your solution, but the evolution may 74 | run slightly more slowly. If you are running single-threaded in an 75 | interactive Python session or have no need to save the model, set to 76 | `False` for faster runs. 77 | 78 | """ 79 | if not isinstance(arity, int): 80 | raise ValueError('arity must be an int, got %s' % type(arity)) 81 | if not isinstance(function, np.ufunc): 82 | if function.__code__.co_argcount != arity: 83 | raise ValueError('arity %d does not match required number of ' 84 | 'function arguments of %d.' 85 | % (arity, function.__code__.co_argcount)) 86 | if not isinstance(name, str): 87 | raise ValueError('name must be a string, got %s' % type(name)) 88 | if not isinstance(wrap, bool): 89 | raise ValueError('wrap must be an bool, got %s' % type(wrap)) 90 | #print (arity,'niaho') 91 | # Check output shape 92 | args = [np.ones(10) for _ in range(arity)] 93 | try: 94 | function(*args) 95 | except ValueError: 96 | raise ValueError('supplied function %s does not support arity of %d.' 97 | % (name, arity)) 98 | if not hasattr(function(*args), 'shape'): 99 | raise ValueError('supplied function %s does not return a numpy array.' 100 | % name) 101 | if function(*args).shape != (10,): 102 | raise ValueError('supplied function %s does not return same shape as ' 103 | 'input vectors.' % name) 104 | 105 | # Check closure for zero & negative input arguments 106 | args = [np.zeros(10) for _ in range(arity)] 107 | if not np.all(np.isfinite(function(*args))): 108 | raise ValueError('supplied function %s does not have closure against ' 109 | 'zeros in argument vectors.' % name) 110 | args = [-1 * np.ones(10) for _ in range(arity)] 111 | if not np.all(np.isfinite(function(*args))): 112 | raise ValueError('supplied function %s does not have closure against ' 113 | 'negatives in argument vectors.' 
% name) 114 | 115 | if wrap: 116 | return _Function(function=wrap_non_picklable_objects(function), 117 | name=name, 118 | arity=arity) 119 | return _Function(function=function, 120 | name=name, 121 | arity=arity) 122 | 123 | 124 | def _protected_division(x1, x2): 125 | """Closure of division (x1/x2) for zero denominator.""" 126 | with np.errstate(divide='ignore', invalid='ignore'): 127 | return np.where(np.abs(x2) > 0.001, np.divide(x1, x2), 1.) 128 | 129 | 130 | def _protected_sqrt(x1): 131 | """Closure of square root for negative arguments.""" 132 | return np.sqrt(np.abs(x1)) 133 | 134 | 135 | def _protected_log(x1): 136 | """Closure of log for zero arguments.""" 137 | with np.errstate(divide='ignore', invalid='ignore'): 138 | return np.where(np.abs(x1) > 0.001, np.log(np.abs(x1)), 0.) 139 | 140 | 141 | def _protected_inverse(x1): 142 | """Closure of log for zero arguments.""" 143 | with np.errstate(divide='ignore', invalid='ignore'): 144 | return np.where(np.abs(x1) > 0.001, 1. / x1, 0.) 145 | 146 | 147 | def _sigmoid(x1): 148 | """Special case of logistic function to transform to probabilities.""" 149 | with np.errstate(over='ignore', under='ignore'): 150 | return 1 / (1 + np.exp(-x1)) 151 | 152 | 153 | add2 = _Function(function=np.add, name='add', arity=2) 154 | sub2 = _Function(function=np.subtract, name='sub', arity=2) 155 | mul2 = _Function(function=np.multiply, name='mul', arity=2) 156 | div2 = _Function(function=_protected_division, name='div', arity=2) 157 | sqrt1 = _Function(function=_protected_sqrt, name='sqrt', arity=1) 158 | log1 = _Function(function=_protected_log, name='log', arity=1) 159 | neg1 = _Function(function=np.negative, name='neg', arity=1) 160 | inv1 = _Function(function=_protected_inverse, name='inv', arity=1) 161 | abs1 = _Function(function=np.abs, name='abs', arity=1) 162 | max2 = _Function(function=np.maximum, name='max', arity=2) 163 | min2 = _Function(function=np.minimum, name='min', arity=2) 164 | sin1 = _Function(function=np.sin, name='sin', arity=1) 165 | cos1 = _Function(function=np.cos, name='cos', arity=1) 166 | tan1 = _Function(function=np.tan, name='tan', arity=1) 167 | sig1 = _Function(function=_sigmoid, name='sig', arity=1) 168 | 169 | _function_map = {'add': add2, 170 | 'sub': sub2, 171 | 'mul': mul2, 172 | 'div': div2, 173 | 'sqrt': sqrt1, 174 | 'log': log1, 175 | 'abs': abs1, 176 | 'neg': neg1, 177 | 'inv': inv1, 178 | 'max': max2, 179 | 'min': min2, 180 | 'sin': sin1, 181 | 'cos': cos1, 182 | 'tan': tan1} 183 | -------------------------------------------------------------------------------- /fitness.py: -------------------------------------------------------------------------------- 1 | # encoding:utf-8 2 | """Metrics to evaluate the fitness of a program. 3 | 4 | The :mod:`gplearn.fitness` module contains some metric with which to evaluate 5 | the computer programs created by the :mod:`gplearn.genetic` module. 6 | """ 7 | 8 | # Author: Trevor Stephens 9 | # 10 | # License: BSD 3 clause 11 | 12 | import numbers 13 | import numpy as np 14 | from joblib import wrap_non_picklable_objects 15 | from scipy.stats import rankdata 16 | 17 | __all__ = ['make_fitness'] 18 | 19 | 20 | class _Fitness(object): 21 | 22 | """A metric to measure the fitness of a program. 23 | 24 | This object is able to be called with NumPy vectorized arguments and return 25 | a resulting floating point score quantifying the quality of the program's 26 | representation of the true relationship. 
27 | 28 | Parameters 29 | ---------- 30 | function : callable 31 | A function with signature function(y, y_pred, sample_weight) that 32 | returns a floating point number. Where `y` is the input target y 33 | vector, `y_pred` is the predicted values from the genetic program, and 34 | sample_weight is the sample_weight vector. 35 | 36 | greater_is_better : bool 37 | Whether a higher value from `function` indicates a better fit. In 38 | general this would be False for metrics indicating the magnitude of 39 | the error, and True for metrics indicating the quality of fit. 40 | 41 | """ 42 | 43 | def __init__(self, function, greater_is_better,stock_is = None): 44 | self.function = function 45 | self.stock_is = stock_is 46 | self.greater_is_better = greater_is_better 47 | self.sign = 1 if greater_is_better else -1 48 | 49 | def __call__(self, *args): 50 | return self.function(*args) 51 | 52 | 53 | def make_fitness(function, greater_is_better, wrap=True): 54 | """Make a fitness measure, a metric scoring the quality of a program's fit. 55 | 56 | This factory function creates a fitness measure object which measures the 57 | quality of a program's fit and thus its likelihood to undergo genetic 58 | operations into the next generation. The resulting object is able to be 59 | called with NumPy vectorized arguments and return a resulting floating 60 | point score quantifying the quality of the program's representation of the 61 | true relationship. 62 | 63 | Parameters 64 | ---------- 65 | function : callable 66 | A function with signature function(y, y_pred, sample_weight) that 67 | returns a floating point number. Where `y` is the input target y 68 | vector, `y_pred` is the predicted values from the genetic program, and 69 | sample_weight is the sample_weight vector. 70 | 71 | greater_is_better : bool 72 | Whether a higher value from `function` indicates a better fit. In 73 | general this would be False for metrics indicating the magnitude of 74 | the error, and True for metrics indicating the quality of fit. 75 | 76 | wrap : bool, optional (default=True) 77 | When running in parallel, pickling of custom metrics is not supported 78 | by Python's default pickler. This option will wrap the function using 79 | cloudpickle allowing you to pickle your solution, but the evolution may 80 | run slightly more slowly. If you are running single-threaded in an 81 | interactive Python session or have no need to save the model, set to 82 | `False` for faster runs. 83 | 84 | """ 85 | if not isinstance(greater_is_better, bool): 86 | raise ValueError('greater_is_better must be bool, got %s' 87 | % type(greater_is_better)) 88 | if not isinstance(wrap, bool): 89 | raise ValueError('wrap must be an bool, got %s' % type(wrap)) 90 | if function.__code__.co_argcount != 3: 91 | raise ValueError('function requires 3 arguments (y, y_pred, w),' 92 | ' got %d.' 
% function.__code__.co_argcount)
93 |     if not isinstance(function(np.array([1, 1]),
94 |                        np.array([2, 2]),
95 |                        np.array([1, 1])), numbers.Number):
96 |         raise ValueError('function must return a numeric.')
97 | 
98 |     if wrap:
99 |         return _Fitness(function=wrap_non_picklable_objects(function),
100 |                         greater_is_better=greater_is_better)
101 |     return _Fitness(function=function,
102 |                     greater_is_better=greater_is_better)
103 | 
104 | 
105 | def _weighted_pearson(y, y_pred, w):
106 |     """Calculate the weighted Pearson correlation coefficient."""
107 |     with np.errstate(divide='ignore', invalid='ignore'):
108 |         y_pred_demean = y_pred - np.average(y_pred, weights=w)
109 |         y_demean = y - np.average(y, weights=w)
110 |         corr = ((np.sum(w * y_pred_demean * y_demean) / np.sum(w)) /
111 |                 np.sqrt((np.sum(w * y_pred_demean ** 2) *
112 |                          np.sum(w * y_demean ** 2)) /
113 |                         (np.sum(w) ** 2)))
114 |     if np.isfinite(corr):
115 |         return np.abs(corr)
116 |     return 0.
117 | 
118 | 
119 | def _weighted_spearman(y, y_pred, w):
120 |     """Calculate the weighted Spearman correlation coefficient."""
121 |     y_pred_ranked = np.apply_along_axis(rankdata, 0, y_pred)
122 |     y_ranked = np.apply_along_axis(rankdata, 0, y)
123 |     return _weighted_pearson(y_pred_ranked, y_ranked, w)
124 | 
125 | 
126 | def _mean_absolute_error(y, y_pred, w):
127 |     """Calculate the mean absolute error."""
128 |     return np.average(np.abs(y_pred - y), weights=w)
129 | 
130 | 
131 | def _mean_square_error(y, y_pred, w):
132 |     """Calculate the mean square error."""
133 |     return np.average(((y_pred - y) ** 2), weights=w)
134 | 
135 | # Stock-dedicated metric: keep the original (y, y_pred, w) signature so the overall framework structure is untouched.
136 | def _stock_dedicated(y, y_pred, w):
137 |     return np.average(y_pred, weights=w)
138 | 
139 | def _root_mean_square_error(y, y_pred, w):
140 |     """Calculate the root mean square error."""
141 |     return np.sqrt(np.average(((y_pred - y) ** 2), weights=w))
142 | 
143 | 
144 | def _log_loss(y, y_pred, w):
145 |     """Calculate the log loss."""
146 |     eps = 1e-15
147 |     inv_y_pred = np.clip(1 - y_pred, eps, 1 - eps)
148 |     y_pred = np.clip(y_pred, eps, 1 - eps)
149 |     score = y * np.log(y_pred) + (1 - y) * np.log(inv_y_pred)
150 |     return np.average(-score, weights=w)
151 | 
152 | 
153 | weighted_pearson = _Fitness(function=_weighted_pearson,
154 |                             greater_is_better=True)
155 | weighted_spearman = _Fitness(function=_weighted_spearman,
156 |                              greater_is_better=True)
157 | mean_absolute_error = _Fitness(function=_mean_absolute_error,
158 |                                greater_is_better=False)
159 | mean_square_error = _Fitness(function=_mean_square_error,
160 |                              greater_is_better=False)
161 | root_mean_square_error = _Fitness(function=_root_mean_square_error,
162 |                                   greater_is_better=False)
163 | log_loss = _Fitness(function=_log_loss,
164 |                     greater_is_better=False)
165 | stock_dedicated = _Fitness(function=_stock_dedicated,
166 |                            greater_is_better=True,
167 |                            stock_is=True)
168 | 
169 | 
170 | 
171 | _fitness_map = {'pearson': weighted_pearson,
172 |                 'spearman': weighted_spearman,
173 |                 'mean absolute error': mean_absolute_error,
174 |                 'mse': mean_square_error,
175 |                 'rmse': root_mean_square_error,
176 |                 'log loss': log_loss,
177 |                 'stock_dedicated': stock_dedicated}
--------------------------------------------------------------------------------
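A quick numeric check of the metrics above (illustrative, not part of the original file): with unit weights, _weighted_pearson reduces to the absolute Pearson correlation, and _stock_dedicated is simply the weighted mean of the predictions:

    import numpy as np
    y = np.array([1., 2., 3., 4.])
    w = np.ones(4)
    print(_weighted_pearson(y, 2. * y + 1., w))  # ~1.0: a perfect linear relationship
    print(_stock_dedicated(y, 2. * y + 1., w))   # 6.0: mean of y_pred = (3+5+7+9)/4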
/data_processing/DataProcessing:
--------------------------------------------------------------------------------
1 | # encoding:utf-8
2 | from load_data.loader import load_day_data, load_minute_data
3 | import numpy as np
4 | import matplotlib.pyplot as plt
5 | from gplearn.utils import check_random_state
6 | from gplearn.genetic import SymbolicRegressor
7 | import time
8 | import numba
9 | import copy
10 | import h5py
11 | import gc
12 | 
13 | # ['open', 'high', 'low', 'close', 'volume', 'price', 'change', 'mean_five', 'highest_five', 'lowest_five',
14 | #  'mean_20', 'highest_20', 'lowest_20', '20_profit', '20_mean', '5_mean', '20_mean_vol', '5_mean_vol']
15 | from collections import OrderedDict
16 | time_now = time.time()
17 | import pandas as pd
18 | # Compute the maximum drawdown of a price path.
19 | # (the input is a list or an array)
20 | @numba.jit
21 | def MaxDrawdown(strategy):
22 | 
23 |     length = (strategy.shape)[0]
24 |     drawdown = []
25 |     py = strategy[0]
26 |     for i in range(1, length):
27 |         px = strategy[i]
28 |         py = max(strategy[:i])          # running peak up to (but excluding) i
29 |         drawdown.append(1 - (px / py))  # drop from that peak
30 |     if max(drawdown) > 0:
31 |         return max(drawdown)
32 |     else:
33 |         return 0
34 | 
35 | # Check whether a sequence contains np.nan.
36 | @numba.jit
37 | def kong_pan(xulie):
38 |     for i in xulie:
39 |         if abs(i) >= 0:   # NaN fails this comparison
40 |             pass
41 |         else:
42 |             return True
43 |     return False
44 | 
45 | # A class for preparing the stock data.
46 | class data_chuli(object):
47 |     def __init__(self, start_time, end_time, stock_list, day_or_minute='day', train_ratio=0.3, adjust=True):
48 |         self.start_time = start_time
49 |         self.end_time = end_time
50 |         self.stock_list = stock_list          # stock list
51 |         self.day_or_minute = day_or_minute    # daily or minute bars
52 |         self.train_ratio = train_ratio        # fraction of the data used for training
53 |         self.adjust = adjust                  # whether to use adjusted prices
54 | 
55 | 
56 |     def data_get(self):
57 |         '''
58 |         Fetch the raw stock data.
59 |         '''
60 |         print(self.day_or_minute)
61 |         if self.day_or_minute == 'day':
62 |             print(u'loading daily data')
63 |             [data, startdate, enddate] = load_day_data(stockList=self.stock_list,
64 |                                                        start=self.start_time,
65 |                                                        end=self.end_time)
66 |         else:
67 |             print(u'loading minute data')
68 |             [data, startdate, enddate] = load_minute_data(stockList=self.stock_list,
69 |                                                           start=self.start_time,
70 |                                                           end=self.end_time)
71 |         return data
72 | 
73 | 
74 |     def index_get(self, indexes='000001'):
75 |         data = load_day_data(indexes=indexes, start=self.start_time, end=self.end_time)
76 | 
77 |         #print (data[indexes].head())
78 |         return data[indexes]
79 | 
80 |     def adaptability_compute(self, cycle=5):
81 | 
82 |         '''
83 |         Compute the fitness inputs: the forward return over the holding
84 |         cycle and the maximum drawdown of the price path over that cycle.
85 |         '''
86 |         data = self.data_get()
87 |         new_data = OrderedDict()
88 |         for stocks in self.stock_list:
89 |             stock = data[stocks]
90 | 
91 |             closes = list(stock['close'])
92 |             close_list = []
93 |             try:
94 |                 for i in range(cycle):
95 |                     close_list.append(closes[i+1:] + [closes[-1] for t in range(i+1)])
96 |                 close_list = np.array(close_list).T
97 |                 t = stock.shift(-1*cycle)['close'] / stock['close']
98 |                 maxdowntown = [MaxDrawdown(close_list[i, :]) for i in range(len(close_list[:, 0]))]
99 |                 stock['after_5_profit'] = t
100 |                 stock['after_maxdowntown'] = maxdowntown
101 |                 new_data[stocks] = stock
102 |             except:
103 |                 pass
104 |         del data
105 |         gc.collect()
106 |         return new_data
107 |     def index_chuli(self, index_data):
108 | 
109 |         pindex_data = pd.DataFrame()
110 |         pindex_data['close'] = list(index_data['close'])
111 |         pindex_data['volume'] = list(index_data['volume'])
112 | 
113 |         # index returns over the previous 20 and 5 trading days
114 |         pindex_data['20_profit'] = pindex_data['close']/(pindex_data.shift(20)['close']) - 1
115 |         pindex_data['5_profit'] = pindex_data['close']/(pindex_data.shift(5)['close']) - 1
116 |         # today's price relative to the 5-day and 20-day moving averages
117 |         pindex_data_mean5 = pindex_data.rolling(5).mean()
118 |         pindex_data_mean20 = pindex_data.rolling(20).mean()
119 |         pindex_data['20_mean'] = (pindex_data['close'])/(pindex_data_mean20['close']) - 1
120 |         pindex_data['5_mean'] = (pindex_data['close'])/(pindex_data_mean5['close']) - 1
121 |         # today's volume relative to the 5-day and 20-day average volume
122 |         pindex_data['20_mean_vol'] = (pindex_data['volume'])/(pindex_data_mean20['volume']) - 1
123 |         pindex_data['5_mean_vol'] = (pindex_data['volume'])/(pindex_data_mean5['volume']) - 1
124 |         pindex_data['date'] = list(index_data['date'])
125 |         #print (pindex_data.head(10))
126 |         #print (list(pindex_data.columns))
127 |         return pindex_data
128 |     def factor_get(self):
129 |         '''
130 |         Build the base factor set. In this experiment the base factors are the
131 |         open/close/high/low prices, volume, and the 5-day/20-day mean, high and low.
132 |         '''
133 |         stock_data = self.adaptability_compute()
134 |         index_data = self.index_get()
135 |         index_data = self.index_chuli(index_data=index_data)
136 |         # Process the index to obtain: the return over the previous 20 and 5
137 |         # trading days and today's move,
138 |         # today's price vs its 20-day and 5-day means,
139 |         # and today's volume vs its 20-day and 5-day mean volume.
140 |         index_data = index_data[['date','20_profit','20_mean','5_mean','20_mean_vol','5_mean_vol']]
141 |         for stock in list(stock_data.keys()):
142 |             stock_data[stock]['mean_five'] = (stock_data[stock][['close','low']].rolling(5).mean())['close']
143 | 
144 |             stock_data[stock]['highest_five'] = (stock_data[stock][['high','low']].rolling(5).max())['high']
145 |             stock_data[stock]['lowest_five'] = (stock_data[stock][['close','low']].rolling(5).min())['low']
146 |             stock_data[stock]['mean_20'] = (stock_data[stock][['close','low']].rolling(20).mean())['close']
147 |             stock_data[stock]['highest_20'] = (stock_data[stock][['high','low']].rolling(20).max())['high']
148 |             stock_data[stock]['lowest_20'] = (stock_data[stock][['close','low']].rolling(20).min())['low']
149 |             stock_data[stock]['date'] = list(stock_data[stock].index)
150 |             stock_data[stock] = pd.merge(stock_data[stock], index_data, on='date', how='inner')
151 |         return stock_data
152 | 
153 |     # Split features and forward returns by date, dropping rows that contain NaN.
154 | 
155 |     def time_adjust(self):
156 |         all_data = self.factor_get()
157 |         index_data = self.index_get()
158 |         columns = list(all_data[self.stock_list[0]].columns)
159 |         adjusted_factor = ['after_5_profit','after_maxdowntown']
160 |         for i in adjusted_factor:
161 |             columns.remove(i)
162 |         columns.remove('code')
163 |         #print (columns)
164 |         time_list = list(index_data['date'])
165 |         changdu = len(time_list)   # number of trading days
166 |         t_columns = copy.deepcopy(columns)
167 |         t_columns.remove('date')
168 |         #print (list((all_data['000333'])['date']))
169 |         x_datas = [[] for i in range(len(time_list))]
170 |         y_datas = [[] for i in range(len(time_list))]
171 |         f = 0
172 |         nan_value = [np.nan for i in t_columns]
173 |         print(len(nan_value), t_columns)
174 |         for stock in list(all_data.keys()):
175 |             stockdata = all_data[stock]
176 |             x_data = stockdata[columns]
177 |             print(stock, f)
178 |             f += 1
179 |             y_data = stockdata[adjusted_factor + ['date']]
180 |             del stockdata
181 |             for i in range(len(time_list)):
182 |                 now_time = time_list[i]
183 |                 x_stkdata = x_data[x_data['date'] == now_time]
184 |                 y_stkdata = y_data[y_data['date'] == now_time]
185 |                 new_x = np.array(x_stkdata[t_columns])
186 | 
187 | 
188 |                 new_y = list(np.array(y_stkdata['after_5_profit']))
189 | 
190 |                 if len(new_y) == 0 or len(new_x) == 0:
191 |                     x_datas[i].append(nan_value)
192 |                     y_datas[i].append(np.nan)
193 | 
194 |                 else:
195 | 
196 |                     x_datas[i].append(list(new_x[0]))
197 |                     y_datas[i].append(new_y[0])
198 | 
199 |             del x_data, y_data
200 |             gc.collect()
201 |         x_train_data = x_datas[20:int(changdu*self.train_ratio)]   # drop the first 20 rows: the rolling features need a 20-day warm-up
202 |         y_train_data = y_datas[20:int(changdu*self.train_ratio)]
203 |         x_test_data = x_datas[int(changdu*self.train_ratio):-5]   # drop the last 5 rows: the 5-day forward return is undefined there
204 |         y_test_data = y_datas[int(changdu*self.train_ratio):-5]
205 |         return (x_train_data, y_train_data, x_test_data, y_test_data)
206 | 
207 | 
208 | 
209 |     # Select the train/test features and the fitness targets.
210 |     def train_test(self):
211 |         '''
212 |         Split the data into train/test x and y.
213 |         '''
214 |         all_data = self.factor_get()
215 |         columns = list(all_data[self.stock_list[0]].columns)
216 | 
217 |         adjusted_factor = ['after_5_profit','after_maxdowntown']
218 |         for i in adjusted_factor:
219 |             columns.remove(i)
220 |         columns.remove('date')
221 |         columns.remove('code')
222 |         new_stock_data_train = OrderedDict()
223 |         new_adjust_data_train = OrderedDict()
224 |         new_stock_data_test = OrderedDict()
225 |         new_adjust_data_test = OrderedDict()
226 | 
227 |         for stock in self.stock_list:
228 |             changdu = len(all_data[stock]['open'])
229 |             x_data = np.array(all_data[stock][columns])
230 |             y_data = np.array(all_data[stock][adjusted_factor])
231 |             new_stock_data_train[stock] = x_data[21:int(changdu*self.train_ratio), :]
232 |             new_adjust_data_train[stock] = y_data[21:int(changdu*self.train_ratio), :]
233 |             new_stock_data_test[stock] = x_data[int(changdu*self.train_ratio):, :-20]
234 |             new_adjust_data_test[stock] = y_data[int(changdu*self.train_ratio):, :-20]
235 | 
236 |         return (new_stock_data_train, new_adjust_data_train, new_stock_data_test, new_adjust_data_test)
237 | # Clean data that has been reloaded from disk.
238 | #@numba.jit
239 | def jinghua_data(x_train, y_train, x_test, y_test):
240 |     x_trains = [[] for i in range(len(x_train))]
241 |     y_trains = [[] for i in range(len(x_train))]
242 |     x_tests = [[] for i in range(len(x_test))]
243 |     y_tests = [[] for i in range(len(x_test))]
244 |     (trains_time, trains_stock, f) = x_train.shape
245 |     (test_time, test_stock, f) = x_test.shape
246 |     for i in range(trains_time):
247 |         for j in range(trains_stock):
248 | 
249 |             if not kong_pan(x_train[i, j]) and abs(y_train[i][j]) >= 0:
250 | 
251 |                 x_trains[i].append(list(x_train[i, j]))
252 |                 y_trains[i].append(y_train[i][j])
253 |     '''
254 |     for t in range(test_time):
255 |         for k in range(test_stock):
256 |             if not kong_pan(x_test[t,k]) or abs(y_test[t][k])>=0:
257 |                 x_tests[t].append(list(x_test[t,k]))
258 |                 y_tests[t].append(y_test[t][k])
259 |     '''
260 |     return (x_trains, y_trains, x_test, y_test)   # test-set cleaning above is commented out, so the test data passes through unchanged
261 | 
262 | # Run the genetic evolution.
263 | if __name__ == '__main__':
264 |     '''
265 |     stocks = list(pd.read_csv('C:\\Users\\94006\\Desktop\\HS300.csv')['HS300'])[0:150]
266 |     start_time = '2010-06-30'
267 |     end_time = '2018-12-30'
268 |     stock_list = [str(i).zfill(6) for i in stocks]
269 |     day_or_minute = 'day'
270 |     metric = 'stock_dedicated'
271 | 
272 |     train_ratio = 0.7
273 |     adjust = True
274 | 
275 |     data_all = data_chuli(start_time = start_time,
276 |                           end_time = end_time,
277 |                           stock_list = stock_list,
278 |                           day_or_minute = day_or_minute,
279 |                           train_ratio = train_ratio,
280 |                           adjust = adjust)
281 |     (x_train, y_train, x_test, y_test) = data_all.time_adjust()  # build the training and test data
282 |     print (u'ppppppppppppppppp')
283 |     x_train_HS300 = 'D:\\HS300_x_train_datas150.npy'
284 |     y_train_HS300 = 'D:\\HS300_y_train_datas150.npy'
285 |     x_test_HS300 = 'D:\\HS300_x_test_datas150.npy'
286 |     y_test_HS300 = 'D:\\HS300_y_test_datas150.npy'
287 |     np.save(x_train_HS300, np.array(x_train))
288 |     np.save(y_train_HS300, np.array(y_train))
289 |     np.save(x_test_HS300, np.array(x_test))
290 |     np.save(y_test_HS300, np.array(y_test))
291 |     '''
292 | 
293 |     x_train = 
np.array(np.load('D:\\HS300_x_train_datas150.npy')) 294 | y_train = np.array(np.load('D:\\HS300_y_train_datas150.npy')) 295 | x_test = np.array(np.load('D:\\HS300_x_test_datas150.npy')) 296 | y_test =np.array( np.load('D:\\HS300_y_test_datas150.npy')) 297 | print (x_train[:,0:2]) 298 | input() 299 | 300 | (a,b,c,d) = jinghua_data(x_train,y_train,x_test,y_test) 301 | 302 | print (u'数据准备完成,进入进化') 303 | est_gp = SymbolicRegressor(population_size=200, 304 | generations=8, stopping_criteria=10000, 305 | p_crossover=0.7, p_subtree_mutation=0.1, 306 | p_hoist_mutation=0.05, p_point_mutation=0.1, 307 | max_samples=0.9, verbose=1, 308 | parsimony_coefficient=0.01, random_state=0, 309 | metric= 'stock_dedicated', 310 | n_jobs=2)# 构建一个遗传进化的类 311 | print (u'类构件完成') 312 | input() 313 | x_trains = a 314 | #print (a) 315 | y_trains = b 316 | 317 | est_gp.fit(x_trains, y_trains) 318 | -------------------------------------------------------------------------------- /load_data/mongodb.py: -------------------------------------------------------------------------------- 1 | #encoding:utf-8 2 | import sys 3 | 4 | #verion1: get all companies data from tushare and store them in Mongodb 5 | import pymongo 6 | import datetime 7 | import tushare as ts 8 | import time 9 | import json 10 | import pandas as pd 11 | from collections import OrderedDict 12 | import pytz 13 | import types 14 | import requests 15 | from io import BytesIO, StringIO 16 | import os 17 | import click 18 | import re 19 | from os import listdir 20 | from os.path import isfile, join 21 | from os import walk 22 | import gc 23 | 24 | from pandas import DataFrame 25 | 26 | 27 | 28 | class LoadDataCVS: 29 | 30 | basedir="E:/data_new" 31 | stockdata=basedir+"/stock_data" 32 | indexdata=basedir+"/index_data" 33 | 34 | #treasurvity 35 | in_package_data = range(2002, 2018) 36 | DONWLOAD_URL = "http://yield.chinabond.com.cn/cbweb-mn/yc/downYearBzqx?year=%s&&wrjxCBFlag=0&&zblx=txy&ycDefId=%s" 37 | YIELD_MAIN_URL = 'http://yield.chinabond.com.cn/cbweb-mn/yield_main' 38 | # 39 | #'http://yield.chinabond.com.cn/cbweb-mn/yield_main' 40 | 41 | #'#http://yield.chinabond.com.cn/cbweb-mn/yield_main?locale=zh_CN','http://yield.chinabond.com.cn/cbweb-mn/yield_main?locale=zh_CN' 42 | 43 | 44 | 45 | def __init__(self,Ip,port): 46 | self.ip=Ip 47 | self.port=port 48 | 49 | ## connect to the data base 50 | def Conn(self): 51 | self.client = pymongo.MongoClient(self.ip,self.port) 52 | self.connection=self.client.stock #storage stock information 53 | self.index=self.client.index #storage index 54 | self.pool=self.client.pool #storate pool 55 | self.treasure=self.client.treasure 56 | self.minute_stock = self.client.minute_stock 57 | self.minute_index = self.client.minute_index 58 | #print self.connection.collection_names() 59 | #print self.index.collection_names() 60 | #print self.pool.collection_names() 61 | def Close(self): 62 | self.client.close() 63 | 64 | 65 | #store data information into database, do not always call this 66 | def storagedaily(self): 67 | #get the filelist 68 | onlyfiles = [ f for f in listdir(self.stockdata) if isfile(join(self.stockdata,f)) ] 69 | #read from using pandas 70 | for f in onlyfiles: 71 | df = pd.read_csv(self.stockdata+"/"+f) 72 | #print df.head() 73 | s=f.split('.') 74 | name = s[0][2:8] 75 | #print name 76 | records = json.loads(df.T.to_json()).values() 77 | for row in records: 78 | row['date'] = datetime.datetime.strptime(row['date'], "%Y-%m-%d") 79 | #print row 80 | #raw_input() 81 | print (name) 82 | 
self.connection[name].insert_many(records) 83 | 84 | #store index information into database,do not always call this 85 | 86 | def storageindex(self): 87 | #get the filelist 88 | onlyfiles = [ f for f in listdir(self.indexdata) if isfile(join(self.indexdata,f)) ] 89 | #read from using pandas 90 | for f in onlyfiles: 91 | df = pd.read_csv(self.indexdata+"/"+f) 92 | s=f.split('.') 93 | name = s[0][2:8] 94 | records = json.loads(df.T.to_json()).values() 95 | for row in records: 96 | row['date'] = datetime.datetime.strptime(row['date'], "%Y-%m-%d") 97 | print (name) 98 | self.index[name].insert_many(records) 99 | 100 | 101 | 102 | #storage stock pool into database 103 | def storagepool(self): 104 | #storage zz500 105 | df=ts.get_zz500s() 106 | self.pool['zz500'].insert_many(json.loads(df.to_json(orient='records'))) 107 | #hs300 108 | df=ts.get_hs300s() 109 | self.pool['hz300'].insert_many(json.loads(df.to_json(orient='records'))) 110 | #zh50 111 | df=ts.get_sz50s() 112 | self.pool['sz'].insert_many(json.loads(df.to_json(orient='records'))) 113 | #st 114 | df=ts.get_st_classified() 115 | self.pool['st'].insert_many(json.loads(df.to_json(orient='records'))) 116 | 117 | 118 | 119 | 120 | #get the particular stock list from data base 121 | def getstocklist(self,kind): 122 | ret=[] 123 | if kind=="hs300": 124 | for t in self.pool['hz300'].find(): 125 | ret.append(t['code']) 126 | if kind =="zz500": 127 | for t in self.pool['zz500'].find(): 128 | ret.append(t['code']) 129 | if kind=='sz50': 130 | for t in self.pool['sz'].find(): 131 | ret.append(t['code']) 132 | if kind =='st': 133 | for t in self.pool['st'].find(): 134 | ret.append(t['code']) 135 | if kind == 'all': 136 | for t in self.pool['all'].find(): 137 | ret.append(t['codes']) 138 | 139 | return ret 140 | 141 | #get daily stock information from database 142 | #return dataframe which contains the information we set in the parameters 143 | 144 | def getstockdaily(self,code,start='2000-01-01',end='2099-01-01'): 145 | total=[] 146 | startdate = datetime.datetime.strptime(start, "%Y-%m-%d") 147 | enddate=datetime.datetime.strptime(end, "%Y-%m-%d") 148 | series={"date":[],"open":[],"close":[],"high":[],"low":[],"volume":[],"prices":[],"change":[],"code":[]} 149 | #now_time=time.time() 150 | #print self.connection[code].find({"date": {"$gte": startdate,"$lt":enddate}}).sort("date") 151 | #new_time = time.time() 152 | #print new_time - now_time 153 | #tt = self.connection[code].find({},{'_id':0,'date':1}).sort('date',-1) 154 | #print tt[0]['date'] 155 | #raw_input() 156 | tt = self.connection[code].find({"date": {"$gte": startdate,"$lte":enddate}}).sort("date") 157 | for stockdaily in tt: 158 | series["date"].append(stockdaily["date"]) 159 | series["open"].append(stockdaily["open"]) 160 | series["close"].append(stockdaily["close"]) 161 | series["high"].append(stockdaily["high"]) 162 | series["low"].append(stockdaily["low"]) 163 | series["volume"].append(stockdaily["volume"]) 164 | series["prices"].append(stockdaily["adj_factor"]) 165 | series["change"].append(stockdaily["change"]) 166 | series["code"].append(stockdaily["code"]) 167 | #pp=time.time() 168 | del tt 169 | gc.collect() 170 | totaldata=zip(series['open'],series['high'],series['low'],series['close'],series['volume'],series["prices"],series["change"],series["code"]) 171 | df = pd.DataFrame(data=list(totaldata),index=series["date"],columns = ['open','high','low','close','volume','prices','change',"code"]) 172 | try: 173 | df['price'] = (df['close']*df['prices'])/(list(df['prices'])[-1]) 174 | 
df = df[['open','high','low','close','volume','price','change',"code"]] 175 | #print df.drop_duplicates() 176 | #raw_input() 177 | return df.drop_duplicates() 178 | except: 179 | df.columns = ['open','high','low','close','volume','price','change',"code"] 180 | return df.drop_duplicates() 181 | 182 | 183 | def getstockminute(self,code,start,end): 184 | startdate = datetime.datetime.strptime(start, "%Y-%m-%d") 185 | enddate=datetime.datetime.strptime(end, "%Y-%m-%d") 186 | series={"date":[],"open":[],"close":[],"high":[],"low":[],"volume":[],"prices":[],"change":[],"code":[]} 187 | tt_date = '1991-01-01' 188 | tt = self.minute_stock[code].find({"date": {"$gte": startdate,"$lte":enddate}}).sort("date") 189 | for stockdaily in tt: 190 | if tt_date != str(stockdaily["date"])[0:10]: 191 | time_day = datetime.datetime.strptime(str(stockdaily['date'])[0:10], "%Y-%m-%d") 192 | tt =self.connection[code].find({"date": {"$gte":time_day ,"$lte":time_day}})[0] 193 | tt_date = str(stockdaily["date"])[0:10] 194 | else: 195 | pass 196 | series["date"].append(stockdaily["date"]) 197 | series["open"].append(stockdaily["open"]) 198 | series["close"].append(stockdaily["close"]) 199 | series["high"].append(stockdaily["high"]) 200 | series["low"].append(stockdaily["low"]) 201 | series["volume"].append(stockdaily["vol"]) 202 | series["prices"].append(tt["adj_factor"]) 203 | series["change"].append(stockdaily["p_change"]) 204 | series["code"].append(stockdaily["code"]) 205 | #pp=time.time 206 | del tt 207 | gc.collect() 208 | totaldata=zip(series['open'],series['high'],series['low'],series['close'],series['volume'],series["prices"],series["change"],series["code"],series['date']) 209 | 210 | df = pd.DataFrame(data=list(totaldata),index=series["date"],columns = ['open','high','low','close','volume','prices','change',"code",'date']) 211 | df['change']= df['change']/100 212 | df['volume'] = df['volume']*100 213 | for factor in ['open','close','high','low','prices']: 214 | df[factor] = [float("%.2f"%i) for i in list(df[factor])] 215 | df = df.drop_duplicates(subset=['date']) 216 | #df.to_csv('E:\\stock_%sdatashujuqingkaung.csv'%list(df['code'])[0]) 217 | try: 218 | df['price'] = (df['close']*df['prices'])/(list(df['prices'])[-1]) 219 | df = df[['open','high','low','close','volume','price','change',"code"]] 220 | return df 221 | except: 222 | #df.drop_duplicates().fillna(method='pad').to_csv('E:\\stock_datashujuqingkaung.csv') 223 | df.columns = ['open','high','low','close','volume','price','change',"code"] 224 | return df 225 | 226 | 227 | 228 | def getBenchamark(self,code,start,end): 229 | #if it is timestamp type 230 | startdate=start 231 | enddate=end 232 | #print u'这里',start,end 233 | if type(start) is types.StringType: 234 | startdate = datetime.datetime.strptime(start, "%Y-%m-%d") 235 | if type(end) is types.StringType: 236 | enddate=datetime.datetime.strptime(end, "%Y-%m-%d") 237 | series={"date":[],"change":[]} 238 | for stockdaily in self.index[code].find({"date": {"$gte": startdate,"$lte":enddate}}).sort("date"): 239 | series["date"].append(stockdaily["date"]) 240 | series["change"].append(stockdaily["change"]) 241 | df=pd.Series(data=series["change"],index=series["date"]) 242 | return df.sort_index().tz_localize('UTC') 243 | 244 | def getindexdaily(self,code,start,end): 245 | total=[] 246 | startdate = datetime.datetime.strptime(start, "%Y-%m-%d") 247 | enddate=datetime.datetime.strptime(end, "%Y-%m-%d") 248 | series={"date":[],"open":[],"close":[],"high":[],"low":[],"volume":[]} 249 | 250 | for stockdaily 
in self.index[code].find({"date": {"$gte": startdate,"$lt":enddate}}).sort("date"): 251 | series["date"].append(stockdaily["date"]) 252 | series["open"].append(stockdaily["open"]) 253 | series["close"].append(stockdaily["close"]) 254 | series["high"].append(stockdaily["high"]) 255 | series["low"].append(stockdaily["low"]) 256 | series["volume"].append(stockdaily["volume"]) 257 | 258 | totaldata=zip(series['date'],series['open'],series['close'],series['high'],series['low'],series['volume']) 259 | df = pd.DataFrame(list(totaldata)) 260 | df.columns = ['date','open','close','high','low','volume'] 261 | #print (df.head()) 262 | #df.index=df.date 263 | return df 264 | 265 | def getindexminute(self,code,start,end): 266 | total=[] 267 | startdate = datetime.datetime.strptime(start, "%Y-%m-%d") 268 | enddate=datetime.datetime.strptime(end, "%Y-%m-%d") 269 | series={"date":[],"open":[],"close":[],"high":[],"low":[],"volume":[]} 270 | 271 | for stockdaily in self.minute_index[code].find({"date": {"$gte": startdate,"$lt":enddate}}).sort("date"): 272 | series["date"].append(stockdaily["date"]) 273 | series["open"].append(stockdaily["open"]) 274 | series["close"].append(stockdaily["close"]) 275 | series["high"].append(stockdaily["high"]) 276 | series["low"].append(stockdaily["low"]) 277 | series["volume"].append(stockdaily["vol"]) 278 | 279 | totaldata=zip(series['date'],series['open'],series['close'],series['high'],series['low'],series['volume']) 280 | df = pd.DataFrame(list(totaldata)) 281 | df.index=df.date 282 | return df.drop_duplicates() 283 | 284 | 285 | 286 | 287 | def get_data(self): 288 | 289 | in_package_data = range(2002, 2019) 290 | print (in_package_data) 291 | cur_year = datetime.datetime.now().year 292 | last_in_package_data = max(in_package_data) 293 | 294 | 295 | # download new data 296 | ''' 297 | to_downloads = range(last_in_package_data + 1, cur_year + 1) 298 | print to_downloads 299 | raw_input() 300 | 301 | # frist, get ycDefIds params 302 | response = requests.get(self.YIELD_MAIN_URL) 303 | 304 | matchs = re.search(r'\?ycDefIds=(.*?)\&', response.text) 305 | ycdefids = matchs.group(1) 306 | assert (ycdefids is not None) 307 | 308 | fetched_data = [] 309 | for year in to_downloads: 310 | print('Downloading from ' + self.DONWLOAD_URL % (year, ycdefids)) 311 | response = requests.get(self.DONWLOAD_URL % (year, ycdefids)) 312 | fetched_data.append(BytesIO(response.content)) 313 | 314 | # combine all data''' 315 | 316 | dfs = [] 317 | 318 | basedir = os.path.join(os.path.dirname(__file__), "xlsx") 319 | 320 | for i in in_package_data: 321 | dfs.append(pd.read_excel(os.path.join(basedir, "%d.xlsx" % i))) 322 | ''' 323 | for memfile in fetched_data: 324 | dfs.append(pd.read_excel(memfile)) 325 | ''' 326 | df = pd.concat(dfs) 327 | 328 | return df 329 | 330 | def get_pivot_data(self): 331 | 332 | df = self.get_data() 333 | return df.pivot(index=u'日期', columns=u'标准期限(年)', values=u'收益率(%)') 334 | 335 | 336 | 337 | def insert_zipline_treasure_format(self): 338 | self.treasure['treasure'].drop() 339 | pivot_data = self.get_pivot_data() 340 | #print pivot_data.tail() 341 | #raw_input() 342 | 343 | frame=pivot_data[[0.08,0.25,0.5,1,2,3,5,7,10,20,30]] 344 | frame['Time Period']=frame.index 345 | #print frame.head() 346 | frame['Time Period']= frame['Time Period'].astype('str') # [str(i) for i in list(frame['Time Period'])]# 347 | frame.columns=['1month', '3month','6month', '1year', '2year', '3year', '5year', '7year', '10year', '20year', '30year','Time Period'] 348 | records = 
json.loads(frame.T.to_json()).values() 349 | for row in records: 350 | temp=row['Time Period'] 351 | temp=temp.split('T')[0] 352 | row['Time Period'] = datetime.datetime.strptime(temp, "%Y-%m-%d") 353 | 354 | self.treasure['treasure'].insert_many(records) 355 | 356 | 357 | def read_treasure_from_mongodb(self,start,end): 358 | 359 | startdate=start 360 | enddate=end 361 | series={"Time Period":[],"1month":[],"3month":[],"6month":[],"1year":[],"2year":[],"3year":[],"5year":[],"7year":[],"10year":[],"20year":[],"30year":[]} 362 | if type(start) is types.StringType: 363 | startdate = datetime.datetime.strptime(start, "%Y-%m-%d") 364 | if type(end) is types.StringType: 365 | enddate=datetime.datetime.strptime(end, "%Y-%m-%d") 366 | for treasuredaily in self.treasure['treasure'].find({"Time Period": {"$gte": startdate,"$lt":enddate}}).sort("date"): 367 | series["Time Period"].append(treasuredaily["Time Period"]) 368 | series["1month"].append(treasuredaily["1month"]) 369 | series["3month"].append(treasuredaily["3month"]) 370 | series["6month"].append(treasuredaily["6month"]) 371 | series["1year"].append(treasuredaily["1year"]) 372 | series["2year"].append(treasuredaily["2year"]) 373 | series["3year"].append(treasuredaily["3year"]) 374 | series["5year"].append(treasuredaily["5year"]) 375 | series["7year"].append(treasuredaily["7year"]) 376 | series["10year"].append(treasuredaily["10year"]) 377 | series["20year"].append(treasuredaily["20year"]) 378 | series["30year"].append(treasuredaily["30year"]) 379 | totaldata=zip(series["1month"],series["3month"],series["6month"],series["1year"],series["2year"],series["3year"],series["5year"],series["7year"],series["10year"],series["20year"],series["30year"]) 380 | df = pd.DataFrame(data=list(totaldata),index=series["Time Period"],columns = ['1month', '3month','6month', '1year', '2year', '3year', '5year', '7year', '10year', '20year', '30year']) 381 | return df.sort_index().tz_localize('UTC') 382 | 383 | def storageStockName(self): 384 | totalstock=[] 385 | onlyfiles = [ f for f in listdir(self.stockdata) if isfile(join(self.stockdata,f)) ] 386 | for f in onlyfiles: 387 | s=f.split('.') 388 | name=s[0][2:8] 389 | totalstock.append(name) 390 | 391 | data = {'codes': totalstock} 392 | frame = DataFrame(data) 393 | 394 | self.pool['all'].insert_many(json.loads(frame.to_json(orient='records'))) 395 | print (frame) 396 | 397 | 398 | 399 | if __name__ == '__main__': 400 | l=LoadDataCVS('127.0.0.1',27017) 401 | l.Conn() 402 | #l.storagedaily() 403 | #l.storageindex() 404 | # l.storagepool() 405 | # l.storageStockName() 406 | #l.insert_zipline_treasure_format() 407 | #l.Close() 408 | 409 | #l.storageStockName() 410 | #print l.getstocklist('all') 411 | 412 | -------------------------------------------------------------------------------- /load_data/loader.py: -------------------------------------------------------------------------------- 1 | # encoding:utf-8 2 | # Copyright 2016 Quantopian, Inc. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | import os 16 | import datetime 17 | from collections import OrderedDict 18 | 19 | import logbook 20 | 21 | import constants 22 | 23 | import pandas as pd 24 | #from pandas_datareader import DataReader 25 | import pytz 26 | 27 | from six import iteritems 28 | from six.moves.urllib_error import HTTPError 29 | 30 | #from . benchmarks import get_benchmark_returns 31 | from mongodb import LoadDataCVS 32 | import treasuries, treasuries_can 33 | 34 | 35 | logger = logbook.Logger('Loader') 36 | 37 | # Mapping from index symbol to appropriate bond data 38 | INDEX_MAPPING = { 39 | '^GSPC': 40 | (treasuries, 'treasury_curves.csv', 'www.federalreserve.gov'), 41 | '^GSPTSE': 42 | (treasuries_can, 'treasury_curves_can.csv', 'bankofcanada.ca'), 43 | '^FTSE': # use US treasuries until UK bonds implemented 44 | (treasuries, 'treasury_curves.csv', 'www.federalreserve.gov'), 45 | } 46 | 47 | ONE_HOUR = pd.Timedelta(hours=1) 48 | 49 | 50 | def last_modified_time(path): 51 | """ 52 | Get the last modified time of path as a Timestamp. 53 | """ 54 | return pd.Timestamp(os.path.getmtime(path), unit='s', tz='UTC') 55 | 56 | 57 | def get_data_filepath(name): 58 | """ 59 | Returns a handle to data file. 60 | 61 | Creates containing directory, if needed. 62 | """ 63 | dr = data_root() 64 | 65 | if not os.path.exists(dr): 66 | os.makedirs(dr) 67 | 68 | return os.path.join(dr, name) 69 | 70 | 71 | def get_cache_filepath(name): 72 | cr = cache_root() 73 | if not os.path.exists(cr): 74 | os.makedirs(cr) 75 | 76 | return os.path.join(cr, name) 77 | 78 | 79 | def get_benchmark_filename(symbol): 80 | return "%s_benchmark.csv" % symbol 81 | 82 | 83 | def has_data_for_dates(series_or_df, first_date, last_date): 84 | """ 85 | Does `series_or_df` have data on or before first_date and on or after 86 | last_date? 87 | """ 88 | dts = series_or_df.index 89 | if not isinstance(dts, pd.DatetimeIndex): 90 | raise TypeError("Expected a DatetimeIndex, but got %s." % type(dts)) 91 | first, last = dts[[0, -1]] 92 | return (first <= first_date) and (last >= last_date) 93 | 94 | 95 | def load_market_data(trading_day, 96 | trading_days, 97 | bm_symbol='000001'): 98 | """ 99 | Load benchmark returns and treasury yield curves for the given calendar and 100 | benchmark symbol. 101 | 102 | Benchmarks are downloaded as a Series from Yahoo Finance. Treasury curves 103 | are US Treasury Bond rates and are downloaded from 'www.federalreserve.gov' 104 | by default. For Canadian exchanges, a loader for Canadian bonds from the 105 | Bank of Canada is also available. 106 | 107 | Results downloaded from the internet are cached in 108 | ~/.zipline/data. Subsequent loads will attempt to read from the cached 109 | files before falling back to redownload. 110 | 111 | Parameters 112 | ---------- 113 | trading_day : pandas.CustomBusinessDay, optional 114 | A trading_day used to determine the latest day for which we 115 | expect to have data. Defaults to an NYSE trading day. 116 | trading_days : pd.DatetimeIndex, optional 117 | A calendar of trading days. Also used for determining what cached 118 | dates we should expect to have cached. Defaults to the NYSE calendar. 119 | bm_symbol : str, optional 120 | Symbol for the benchmark index to load. Defaults to '^GSPC', the Yahoo 121 | ticker for the S&P 500. 
122 | 
123 |     Returns
124 |     -------
125 |     (benchmark_returns, treasury_curves) : (pd.Series, pd.DataFrame)
126 | 
127 |     Notes
128 |     -----
129 | 
130 |     Both return values are DatetimeIndexed with values dated to midnight in UTC
131 |     of each stored date. The columns of `treasury_curves` are:
132 | 
133 |     '1month', '3month', '6month',
134 |     '1year','2year','3year','5year','7year','10year','20year','30year'
160 |     """
161 |     first_date = trading_days[0]
162 |     now = pd.Timestamp.utcnow()
163 | 
164 |     # We expect to have benchmark and treasury data that's current up until
165 |     # **two** full trading days prior to the most recently completed trading
166 |     # day.
167 |     # Example:
168 |     # On Thu Oct 22 2015, the previous completed trading day is Wed Oct 21.
169 |     # However, data for Oct 21 doesn't become available until the early morning
170 |     # hours of Oct 22. This means that there are times on the 22nd at which we
171 |     # cannot reasonably expect to have data for the 21st available. To be
172 |     # conservative, we instead expect that at any time on the 22nd, we can
173 |     # download data for Tuesday the 20th, which is two full trading days prior
174 |     # to the date on which we're running a test.
175 |     # We'll attempt to download new data if the latest entry in our cache is
176 |     # before this date.
185 |     #print trading_days.get_loc(now, method='ffill')
186 |     #print trading_days[-1]
187 |     #last_date = trading_days[trading_days.get_loc(now, method='ffill') - 2]
188 |     #print last_date
189 |     last_date = trading_days[-1]
190 |     #raw_input()
191 | 
192 |     br = ensure_benchmark_data(
193 |         bm_symbol,
194 |         first_date,
195 |         last_date,
196 |         now,
197 |         # We need the trading_day to figure out the close prior to the first
198 |         # date so that we can compute returns for the first date.
199 |         trading_day,
200 |     )
201 | 
202 |     tc = ensure_treasury_data(
203 |         bm_symbol,
204 |         first_date,
205 |         last_date,
206 |         now,
207 |     )
208 | 
209 |     benchmark_returns = br[br.index.slice_indexer(first_date, last_date)]
210 |     treasury_curves = tc[tc.index.slice_indexer(first_date, last_date)]
211 |     return benchmark_returns, treasury_curves
212 | 
213 | 
214 | def ensure_benchmark_data(symbol, first_date, last_date, now, trading_day):
215 |     """
216 |     Ensure we have benchmark data for `symbol` from `first_date` to `last_date`
217 | 
218 |     Parameters
219 |     ----------
220 |     symbol : str
221 |         The symbol for the benchmark to load.
222 |     first_date : pd.Timestamp
223 |         First required date for the cache.
224 |     last_date : pd.Timestamp
225 |         Last required date for the cache.
226 |     now : pd.Timestamp
227 |         The current time. 
This is used to prevent repeated attempts to 228 | re-download data that isn't available due to scheduling quirks or other 229 | failures. 230 | trading_day : pd.CustomBusinessDay 231 | A trading day delta. Used to find the day before first_date so we can 232 | get the close of the day prior to first_date. 233 | 234 | We attempt to download data unless we already have data stored at the data 235 | cache for `symbol` whose first entry is before or on `first_date` and whose 236 | last entry is on or after `last_date`. 237 | 238 | If we perform a download and the cache criteria are not satisfied, we wait 239 | at least one hour before attempting a redownload. This is determined by 240 | comparing the current time to the result of os.path.getmtime on the cache 241 | path. 242 | """ 243 | 244 | # If the path does not exist, it means the first download has not happened 245 | # yet, so don't try to read from 'path'. 246 | 247 | try: 248 | data = get_benchmark_returns( 249 | symbol, 250 | first_date - trading_day, 251 | last_date, 252 | ) 253 | except (OSError, IOError, HTTPError): 254 | logger.exception('failed to cache the new benchmark returns') 255 | if not has_data_for_dates(data, first_date, last_date): 256 | logger.warn("Still don't have expected data after redownload!") 257 | return data 258 | 259 | 260 | def ensure_treasury_data(bm_symbol, first_date, last_date, now): 261 | """ 262 | Ensure we have treasury data from treasury module associated with 263 | `bm_symbol`. 264 | 265 | Parameters 266 | ---------- 267 | bm_symbol : str 268 | Benchmark symbol for which we're loading associated treasury curves. 269 | first_date : pd.Timestamp 270 | First date required to be in the cache. 271 | last_date : pd.Timestamp 272 | Last date required to be in the cache. 273 | now : pd.Timestamp 274 | The current time. This is used to prevent repeated attempts to 275 | re-download data that isn't available due to scheduling quirks or other 276 | failures. 277 | 278 | We attempt to download data unless we already have data stored in the cache 279 | for `module_name` whose first entry is before or on `first_date` and whose 280 | last entry is on or after `last_date`. 281 | 282 | If we perform a download and the cache criteria are not satisfied, we wait 283 | at least one hour before attempting a redownload. This is determined by 284 | comparing the current time to the result of os.path.getmtime on the cache 285 | path. 286 | """ 287 | # loader_module, filename, source = INDEX_MAPPING.get( 288 | # bm_symbol, INDEX_MAPPING['^GSPC'] 289 | # ) 290 | # first_date = max(first_date, loader_module.earliest_possible_date()) 291 | # path = get_data_filepath(filename) 292 | 293 | # # If the path does not exist, it means the first download has not happened 294 | # # yet, so don't try to read from 'path'. 295 | # if os.path.exists(path): 296 | # try: 297 | # data = pd.DataFrame.from_csv(path).tz_localize('UTC') 298 | # if has_data_for_dates(data, first_date, last_date): 299 | # return data 300 | 301 | # # Don't re-download if we've successfully downloaded and written a 302 | # # file in the last hour. 303 | # last_download_time = last_modified_time(path) 304 | # if (now - last_download_time) <= ONE_HOUR: 305 | # logger.warn( 306 | # "Refusing to download new treasury data because a " 307 | # "download succeeded at %s." 
% last_download_time 308 | # ) 309 | # return data 310 | 311 | # except (OSError, IOError, ValueError) as e: 312 | # # These can all be raised by various versions of pandas on various 313 | # # classes of malformed input. Treat them all as cache misses. 314 | # logger.info( 315 | # "Loading data for {path} failed with error [{error}].".format( 316 | # path=path, error=e, 317 | # ) 318 | # ) 319 | 320 | # try: 321 | # data = loader_module.get_treasury_data(first_date, last_date) 322 | # data.to_csv(path) 323 | # except (OSError, IOError, HTTPError): 324 | # logger.exception('failed to cache treasury data') 325 | # if not has_data_for_dates(data, first_date, last_date): 326 | # logger.warn("Still don't have expected data after redownload!") 327 | l=LoadDataCVS(constants.IP,constants.PORT) 328 | l.Conn() 329 | data=l.read_treasure_from_mongodb(first_date, last_date) 330 | l.Close() 331 | return data 332 | 333 | 334 | #提取分钟数据,这里我们提取 335 | def load_day_data(indexes=None,stockList=None,start=None,end=None,adjusted=True,rolling_count= 10): 336 | # 337 | """ 338 | load stocks from Mongo 339 | """ 340 | assert indexes is not None or stockList is not None, """ 341 | must specify stockList or indexes""" 342 | #对日期进行改造,提取的数据日期应该高于多于开始日期一个月,这样对于原数据有缓冲作用 343 | start_time = pd.Timestamp(start,tz='UTC') 344 | end_time = pd.Timestamp(end,tz='UTC') 345 | 346 | if start is None: 347 | start = "1990-01-01" 348 | 349 | if start is not None and end is not None: 350 | startdate = datetime.datetime.strptime(start, "%Y-%m-%d") 351 | enddate=datetime.datetime.strptime(end, "%Y-%m-%d") 352 | assert startdate < enddate, "start date is later than end date." 353 | 354 | data = OrderedDict() 355 | start = (datetime.datetime.strptime(start,'%Y-%m-%d')-datetime.timedelta(days=rolling_count+1)).strftime('%Y-%m-%d') 356 | 357 | l=LoadDataCVS(constants.IP,constants.PORT) 358 | l.Conn() 359 | 360 | if stockList=="hs300" or stockList=="zz500" or stockList=="sz50" or stockList=="all": 361 | stocks=l.getstocklist(stockList) 362 | else: 363 | stocks=stockList 364 | 365 | #print stocks 366 | 367 | if stocks is not None: 368 | for stock in stocks: 369 | stkd= l.getstockdaily(stock,start,end) 370 | if not adjusted: 371 | data[stock] = stkd 372 | else: 373 | adj_cols = ['open', 'high', 'low', 'close'] 374 | ratio = stkd['price']/stkd['close'] 375 | ratio_filtered = ratio.fillna(0).values 376 | for col in adj_cols: 377 | stkd[col] *= ratio_filtered 378 | data[stock] = stkd 379 | return [data,start_time,end_time] 380 | 381 | 382 | if indexes is not None: 383 | stkd= l.getindexdaily(indexes,start,end) 384 | data[indexes] = stkd 385 | return data 386 | ''' 387 | for name, ticker in items(indexes): 388 | print (name,ticker) 389 | logger.info('Loading index: {} ({})'.format(name, ticker)) 390 | stkd= l.getindexdaily(indexes,start,end) 391 | data[name] = stkd 392 | return data 393 | ''' 394 | 395 | 396 | 397 | ''' 398 | #['open','high','low','close','volume','price','change',"code"] 399 | print (data) 400 | panel = pd.Panel(data) 401 | panel.minor_axis = ['open', 'high', 'low', 'close', 'volume', 'price','change','code'] 402 | panel.major_axis = panel.major_axis.tz_localize(pytz.utc) 403 | #print panel[stocks[0]].head(5) 404 | 405 | #close the connection 406 | l.Close() 407 | 408 | # Adjust data 409 | if adjusted: 410 | adj_cols = ['open', 'high', 'low', 'close'] 411 | for ticker in panel.items: 412 | ratio = (panel[ticker]['price'] / panel[ticker]['close']) 413 | ratio_filtered = ratio.fillna(0).values 414 | for col in adj_cols: 415 | 
panel[ticker][col] *= ratio_filtered
416 | 
417 | 
418 |     return [panel,start_time,end_time]
419 | '''
420 | 
421 | 
422 | # Define a function to fetch minute-level data; minute bars also need to be price-adjusted
423 | def load_minute_data(indexes=None,stockList=None,start=None,end=None,adjusted=False,rolling_count= 10):
424 | 
425 |     """
426 |     load stocks from Mongo
427 |     """
428 |     assert indexes is not None or stockList is not None, (
429 |         "must specify stockList or indexes"
430 |     )
431 |     # Pad the date range: fetch data starting earlier than the requested start so the raw data has a buffer
432 | 
433 |     starts = start
434 |     ends =end
435 |     #start_time = pd.Timestamp(start,tz='UTC')
436 |     #end_time = pd.Timestamp(end,tz='UTC')
437 | 
438 |     if start is None:
439 |         start = "1990-01-01"
440 | 
441 |     if start is not None and end is not None:
442 |         startdate = datetime.datetime.strptime(start, "%Y-%m-%d")
443 |         enddate=datetime.datetime.strptime(end, "%Y-%m-%d")
444 |         assert startdate < enddate, "start date is later than end date."
445 | 
446 |     data = OrderedDict()
447 |     start = (datetime.datetime.strptime(start,'%Y-%m-%d')-datetime.timedelta(days=rolling_count+1)).strftime('%Y-%m-%d')
448 |     end = (datetime.datetime.strptime(end,'%Y-%m-%d')+datetime.timedelta(days=1)).strftime('%Y-%m-%d')
449 |     l=LoadDataCVS(constants.IP,constants.PORT)
450 |     l.Conn()
451 | 
452 |     if stockList=="hs300" or stockList=="zz500" or stockList=="sz50" or stockList=="all":
453 |         stocks=l.getstocklist(stockList)
454 |     else:
455 |         stocks=stockList
456 | 
457 |     #print stocks
458 | 
459 |     if stocks is not None:
460 |         for stock in stocks:
461 |             stkd= l.getstockminute(stock,start,end)
462 |             data[stock] = stkd
463 |             #print data[stock].head(5)
464 |             #print data[stock].tail(5)
465 | 
466 |     if indexes is not None:
467 |         for name, ticker in iteritems(indexes):
468 |             logger.info('Loading index: {} ({})'.format(name, ticker))
469 |             stkd= l.getindexminute(indexes,start,end)
470 |             data[name] = stkd
471 | 
472 |     #['open','high','low','close','volume','price','change',"code"]
473 |     panel = pd.Panel(data)
474 |     panel.minor_axis = ['open', 'high', 'low', 'close', 'volume', 'price','change','code']
475 |     panel.major_axis = panel.major_axis.tz_localize(pytz.utc)
476 |     #print panel[stocks[0]].head(5)
477 | 
478 |     #close the connection
479 |     l.Close()
480 | 
481 |     # Adjust data
482 |     if adjusted:
483 |         adj_cols = ['open', 'high', 'low', 'close']
484 |         for ticker in panel.items:
485 |             ratio = (panel[ticker]['price'] / panel[ticker]['close'])
486 |             ratio_filtered = ratio.fillna(0).values
487 |             for col in adj_cols:
488 |                 panel[ticker][col] *= ratio_filtered
489 | 
490 |     starts = str(starts) + ' 09:25:00'
491 |     start_time = pd.Timestamp(starts,tz='UTC')
492 |     ends = str(ends) + ' 09:25:00'
493 |     end_time = pd.Timestamp(ends,tz='UTC')
494 |     return [panel,start_time,end_time]
495 | 
496 | 
497 | 
498 | def _load_raw_yahoo_data(indexes=None, stocks=None, start=None, end=None):
499 |     """Load closing prices from yahoo finance.
500 | 
501 |     :Optional:
502 |     indexes : dict (Default: {'SPX': '^GSPC'})
503 |         Financial indexes to load.
504 |     stocks : list (Default: ['AAPL', 'GE', 'IBM', 'MSFT',
505 |                              'XOM', 'AA', 'JNJ', 'PEP', 'KO'])
506 |         Stock closing prices to load.
507 |     start : datetime (Default: datetime(1993, 1, 1, 0, 0, 0, 0, pytz.utc))
508 |         Retrieve prices from start date on.
509 |     end : datetime (Default: datetime(2002, 1, 1, 0, 0, 0, 0, pytz.utc))
510 |         Retrieve prices until end date. 
511 | 512 | :Note: 513 | This is based on code presented in a talk by Wes McKinney: 514 | http://wesmckinney.com/files/20111017/notebook_output.pdf 515 | """ 516 | assert indexes is not None or stocks is not None, """ 517 | must specify stocks or indexes""" 518 | 519 | if start is None: 520 | start = pd.datetime(1990, 1, 1, 0, 0, 0, 0, pytz.utc) 521 | 522 | if start is not None and end is not None: 523 | assert start < end, "start date is later than end date." 524 | 525 | data = OrderedDict() 526 | if stocks is not None: 527 | for stock in stocks: 528 | logger.info('Loading stock: {}'.format(stock)) 529 | stock_pathsafe = stock.replace(os.path.sep, '--') 530 | cache_filename = "{stock}-{start}-{end}.csv".format( 531 | stock=stock_pathsafe, 532 | start=start, 533 | end=end).replace(':', '-') 534 | cache_filepath = get_cache_filepath(cache_filename) 535 | if os.path.exists(cache_filepath): 536 | stkd = pd.DataFrame.from_csv(cache_filepath) 537 | else: 538 | stkd = DataReader(stock, 'yahoo', start, end).sort_index() 539 | stkd.to_csv(cache_filepath) 540 | data[stock] = stkd 541 | 542 | if indexes is not None: 543 | for name, ticker in iteritems(indexes): 544 | logger.info('Loading index: {} ({})'.format(name, ticker)) 545 | stkd = DataReader(ticker, 'yahoo', start, end).sort_index() 546 | data[name] = stkd 547 | 548 | return data 549 | 550 | 551 | def load_from_yahoo(indexes=None, 552 | stocks=None, 553 | start=None, 554 | end=None, 555 | adjusted=True): 556 | """ 557 | Loads price data from Yahoo into a dataframe for each of the indicated 558 | assets. By default, 'price' is taken from Yahoo's 'Adjusted Close', 559 | which removes the impact of splits and dividends. If the argument 560 | 'adjusted' is False, then the non-adjusted 'close' field is used instead. 561 | 562 | :param indexes: Financial indexes to load. 563 | :type indexes: dict 564 | :param stocks: Stock closing prices to load. 565 | :type stocks: list 566 | :param start: Retrieve prices from start date on. 567 | :type start: datetime 568 | :param end: Retrieve prices until end date. 569 | :type end: datetime 570 | :param adjusted: Adjust the price for splits and dividends. 571 | :type adjusted: bool 572 | 573 | """ 574 | data = _load_raw_yahoo_data(indexes, stocks, start, end) 575 | if adjusted: 576 | close_key = 'Adj Close' 577 | else: 578 | close_key = 'Close' 579 | df = pd.DataFrame({key: d[close_key] for key, d in iteritems(data)}) 580 | df.index = df.index.tz_localize(pytz.utc) 581 | return df 582 | 583 | 584 | def load_bars_from_yahoo(indexes=None, 585 | stocks=None, 586 | start=None, 587 | end=None, 588 | adjusted=True): 589 | """ 590 | Loads data from Yahoo into a panel with the following 591 | column names for each indicated security: 592 | 593 | - open 594 | - high 595 | - low 596 | - close 597 | - volume 598 | - price 599 | 600 | Note that 'price' is Yahoo's 'Adjusted Close', which removes the 601 | impact of splits and dividends. If the argument 'adjusted' is True, then 602 | the open, high, low, and close values are adjusted as well. 603 | 604 | :param indexes: Financial indexes to load. 605 | :type indexes: dict 606 | :param stocks: Stock closing prices to load. 607 | :type stocks: list 608 | :param start: Retrieve prices from start date on. 609 | :type start: datetime 610 | :param end: Retrieve prices until end date. 611 | :type end: datetime 612 | :param adjusted: Adjust open/high/low/close for splits and dividends. 613 | The 'price' field is always adjusted. 
614 | :type adjusted: bool 615 | 616 | """ 617 | data = _load_raw_yahoo_data(indexes, stocks, start, end) 618 | panel = pd.Panel(data) 619 | # Rename columns 620 | panel.minor_axis = ['open', 'high', 'low', 'close', 'volume', 'price'] 621 | panel.major_axis = panel.major_axis.tz_localize(pytz.utc) 622 | # Adjust data 623 | if adjusted: 624 | adj_cols = ['open', 'high', 'low', 'close'] 625 | for ticker in panel.items: 626 | ratio = (panel[ticker]['price'] / panel[ticker]['close']) 627 | ratio_filtered = ratio.fillna(0).values 628 | for col in adj_cols: 629 | panel[ticker][col] *= ratio_filtered 630 | return panel 631 | 632 | 633 | def load_prices_from_csv(filepath, identifier_col, tz='UTC'): 634 | data = pd.read_csv(filepath, index_col=identifier_col) 635 | data.index = pd.DatetimeIndex(data.index, tz=tz) 636 | data.sort_index(inplace=True) 637 | return data 638 | 639 | 640 | def load_prices_from_csv_folder(folderpath, identifier_col, tz='UTC'): 641 | data = None 642 | for file in os.listdir(folderpath): 643 | if '.csv' not in file: 644 | continue 645 | raw = load_prices_from_csv(os.path.join(folderpath, file), 646 | identifier_col, tz) 647 | if data is None: 648 | data = raw 649 | else: 650 | data = pd.concat([data, raw], axis=1) 651 | return data 652 | -------------------------------------------------------------------------------- /_program.py: -------------------------------------------------------------------------------- 1 | # encoding:utf-8 2 | """The underlying data structure used in gplearn. 3 | 4 | The :mod:`gplearn._program` module contains the underlying representation of a 5 | computer program. It is used for creating and evolving programs used in the 6 | :mod:`gplearn.genetic` module.print 7 | """ 8 | # input make_function X 9 | # Author: Trevor Stephens 10 | # 11 | # License: BSD 3 clause 12 | import line_profiler 13 | import os 14 | import sys 15 | from copy import copy 16 | import gc 17 | import numpy as np 18 | from sklearn.utils.random import sample_without_replacement 19 | from time import time 20 | from .functions import _Function 21 | from .utils import check_random_state 22 | 23 | 24 | class _Program(object): 25 | 26 | """A program-like representation of the evolved program. 27 | 28 | This is the underlying data-structure used by the public classes in the 29 | :mod:`gplearn.genetic` module. It should not be used directly by the user. 30 | 31 | Parameters 32 | ---------- 33 | function_set : list 34 | A list of valid functions to use in the program. 35 | 36 | arities : dict 37 | A dictionary of the form `{arity: [functions]}`. The arity is the 38 | number of arguments that the function takes, the functions must match 39 | those in the `function_set` parameter. 40 | 41 | init_depth : tuple of two ints 42 | The range of tree depths for the initial population of naive formulas. 43 | Individual trees will randomly choose a maximum depth from this range. 44 | When combined with `init_method='half and half'` this yields the well- 45 | known 'ramped half and half' initialization method. 46 | 47 | init_method : str 48 | - 'grow' : Nodes are chosen at random from both functions and 49 | terminals, allowing for smaller trees than `init_depth` allows. Tends 50 | to grow asymmetrical trees. 51 | - 'full' : Functions are chosen until the `init_depth` is reached, and 52 | then terminals are selected. Tends to grow 'bushy' trees. 53 | - 'half and half' : Trees are grown through a 50/50 mix of 'full' and 54 | 'grow', making for a mix of tree shapes in the initial population. 
55 | 56 | n_features : int 57 | The number of features in `X`. 58 | 59 | const_range : tuple of two floats 60 | The range of constants to include in the formulas. 61 | 62 | metric : _Fitness object 63 | The raw fitness metric. 64 | 65 | p_point_replace : float 66 | The probability that any given node will be mutated during point 67 | mutation. 68 | 69 | parsimony_coefficient : float 70 | This constant penalizes large programs by adjusting their fitness to 71 | be less favorable for selection. Larger values penalize the program 72 | more which can control the phenomenon known as 'bloat'. Bloat is when 73 | evolution is increasing the size of programs without a significant 74 | increase in fitness, which is costly for computation time and makes for 75 | a less understandable final result. This parameter may need to be tuned 76 | over successive runs. 77 | 78 | random_state : RandomState instance 79 | The random number generator. Note that ints, or None are not allowed. 80 | The reason for this being passed is that during parallel evolution the 81 | same program object may be accessed by multiple parallel processes. 82 | 83 | transformer : _Function object, optional (default=None) 84 | The function to transform the output of the program to probabilities, 85 | only used for the SymbolicClassifier. 86 | 87 | feature_names : list, optional (default=None) 88 | Optional list of feature names, used purely for representations in 89 | the `print` operation or `export_graphviz`. If None, then X0, X1, etc 90 | will be used for representations. 91 | 92 | program : list, optional (default=None) 93 | The flattened tree representation of the program. If None, a new naive 94 | random tree will be grown. If provided, it will be validated. 95 | 96 | Attributes 97 | ---------- 98 | program : list 99 | The flattened tree representation of the program. 100 | 101 | raw_fitness_ : float 102 | The raw fitness of the individual program. 103 | 104 | fitness_ : float 105 | The penalized fitness of the individual program. 106 | 107 | oob_fitness_ : float 108 | The out-of-bag raw fitness of the individual program for the held-out 109 | samples. Only present when sub-sampling was used in the estimator by 110 | specifying `max_samples` < 1.0. 111 | 112 | parents : dict, or None 113 | If None, this is a naive random program from the initial population. 114 | Otherwise it includes meta-data about the program's parent(s) as well 115 | as the genetic operations performed to yield the current program. This 116 | is set outside this class by the controlling evolution loops. 117 | 118 | depth_ : int 119 | The maximum depth of the program tree. 120 | 121 | length_ : int 122 | The number of functions and terminals in the program. 
123 | 
124 |     """
125 | 
126 |     def __init__(self,
127 |                  function_set,
128 |                  arities,
129 |                  init_depth,
130 |                  init_method,
131 |                  n_features,
132 |                  const_range,
133 |                  metric,
134 |                  p_point_replace,
135 |                  parsimony_coefficient,
136 |                  random_state,
137 |                  transformer=None,
138 |                  feature_names=None,
139 |                  program=None):
140 | 
141 |         self.function_set = function_set
142 |         self.arities = arities
143 |         self.init_depth = (init_depth[0], init_depth[1] + 1)
144 |         self.init_method = init_method
145 |         self.n_features = n_features
146 |         self.const_range = const_range
147 |         self.metric = metric
148 |         self.p_point_replace = p_point_replace
149 |         self.parsimony_coefficient = parsimony_coefficient
150 |         self.transformer = transformer
151 |         self.feature_names = feature_names
152 |         self.program = program
153 | 
154 |         if self.program is not None:
155 |             if not self.validate_program():
156 |                 raise ValueError('The supplied program is incomplete.')
157 |         else:
158 |             # Create a naive random program: if program is None, build one
159 |             self.program = self.build_program(random_state)
160 | 
161 |         self.raw_fitness_ = None
162 |         self.fitness_ = None
163 |         self.parents = None
164 |         self._n_samples = None
165 |         self._max_samples = None
166 |         self._indices_state = None
167 | 
168 |     def build_program(self, random_state):
169 |         """Build a naive random program.
170 | 
171 |         Parameters
172 |         ----------
173 |         random_state : RandomState instance
174 |             The random number generator.
175 | 
176 |         Returns
177 |         -------
178 |         program : list
179 |             The flattened tree representation of the program.
180 | 
181 |         """
182 |         if self.init_method == 'half and half':
183 |             method = ('full' if random_state.randint(2) else 'grow')
184 |         else:
185 |             method = self.init_method
186 |         max_depth = random_state.randint(*self.init_depth)
187 | 
188 |         # Start a program with a function to avoid degenerative programs
189 |         #print (len(self.function_set))
190 | 
191 |         function = random_state.randint(len(self.function_set))
192 |         # Randomly pick one function (add/sub/mul/div, ...) from the function set
193 |         function = self.function_set[function]
194 |         #print (function)
195 | 
196 |         program = [function]
197 |         terminal_stack = [function.arity] # function.arity is the number of arguments the function takes
198 |         while terminal_stack:
199 |             depth = len(terminal_stack)
200 |             choice = self.n_features + len(self.function_set)
201 |             choice = random_state.randint(choice)
202 |             #input()
203 |             # Determine if we are adding a function or terminal, i.e. keep growing the tree or close it off
204 |             if (depth < max_depth) and (method == 'full' or
205 |                     choice <= len(self.function_set)):
206 |                 #print ('here1')
207 |                 function = random_state.randint(len(self.function_set))
208 |                 function = self.function_set[function]
209 |                 program.append(function)
210 |                 terminal_stack.append(function.arity)
211 |             else:
212 |                 #print ('here2')
213 |                 # We need a terminal, add a variable or constant
214 |                 if self.const_range is not None:
215 |                     terminal = random_state.randint(self.n_features + 1)
216 |                 else:
217 |                     terminal = random_state.randint(self.n_features)
218 |                 if terminal == self.n_features:
219 |                     terminal = random_state.uniform(*self.const_range)
220 |                     if self.const_range is None:
221 |                         # We should never get here
222 |                         raise ValueError('A constant was produced with '
223 |                                          'const_range=None.')
224 |                 program.append(terminal)
225 |                 terminal_stack[-1] -= 1
226 |                 while terminal_stack[-1] == 0:
227 |                     terminal_stack.pop()
228 |                     if not terminal_stack:
229 |                         #print ('breaking out of the loop')
230 |                         return program
231 |                     terminal_stack[-1] -= 1
232 |         #
233 |         # We should never get here
234 |         return None
235 | 
236 |     def 
validate_program(self): 237 | """Rough check that the embedded program in the object is valid.""" 238 | terminals = [0] 239 | for node in self.program: 240 | if isinstance(node, _Function): 241 | terminals.append(node.arity) 242 | else: 243 | terminals[-1] -= 1 244 | while terminals[-1] == 0: 245 | terminals.pop() 246 | terminals[-1] -= 1 247 | return terminals == [-1] 248 | 249 | def __str__(self): 250 | """Overloads `print` output of the object to resemble a LISP tree.""" 251 | terminals = [0] 252 | output = '' 253 | for i, node in enumerate(self.program): 254 | #print (u'i',i,u'node','node') 255 | if isinstance(node, _Function): 256 | terminals.append(node.arity) 257 | output += node.name + '(' 258 | else: 259 | if isinstance(node, int): 260 | if self.feature_names is None: 261 | output += 'X%s' % node 262 | else: 263 | output += self.feature_names[node] 264 | else: 265 | output += '%.3f' % node 266 | terminals[-1] -= 1 267 | while terminals[-1] == 0: 268 | terminals.pop() 269 | terminals[-1] -= 1 270 | output += ')' 271 | if i != len(self.program) - 1: 272 | output += ', ' 273 | return output 274 | 275 | def export_graphviz(self, fade_nodes=None): 276 | """Returns a string, Graphviz script for visualizing the program. 277 | 278 | Parameters 279 | ---------- 280 | fade_nodes : list, optional 281 | A list of node indices to fade out for showing which were removed 282 | during evolution. 283 | 284 | Returns 285 | ------- 286 | output : string 287 | The Graphviz script to plot the tree representation of the program. 288 | 289 | """ 290 | terminals = [] 291 | if fade_nodes is None: 292 | fade_nodes = [] 293 | output = 'digraph program {\nnode [style=filled]\n' 294 | for i, node in enumerate(self.program): 295 | fill = '#cecece' 296 | if isinstance(node, _Function): 297 | if i not in fade_nodes: 298 | fill = '#136ed4' 299 | terminals.append([node.arity, i]) 300 | output += ('%d [label="%s", fillcolor="%s"] ;\n' 301 | % (i, node.name, fill)) 302 | else: 303 | if i not in fade_nodes: 304 | fill = '#60a6f6' 305 | if isinstance(node, int): 306 | if self.feature_names is None: 307 | feature_name = 'X%s' % node 308 | else: 309 | feature_name = self.feature_names[node] 310 | output += ('%d [label="%s", fillcolor="%s"] ;\n' 311 | % (i, feature_name, fill)) 312 | else: 313 | output += ('%d [label="%.3f", fillcolor="%s"] ;\n' 314 | % (i, node, fill)) 315 | if i == 0: 316 | # A degenerative program of only one node 317 | return output + '}' 318 | terminals[-1][0] -= 1 319 | terminals[-1].append(i) 320 | while terminals[-1][0] == 0: 321 | output += '%d -> %d ;\n' % (terminals[-1][1], 322 | terminals[-1][-1]) 323 | terminals[-1].pop() 324 | if len(terminals[-1]) == 2: 325 | parent = terminals[-1][-1] 326 | terminals.pop() 327 | if not terminals: 328 | return output + '}' 329 | terminals[-1].append(parent) 330 | terminals[-1][0] -= 1 331 | 332 | # We should never get here 333 | return None 334 | 335 | def _depth(self): 336 | """Calculates the maximum depth of the program tree.""" 337 | terminals = [0] 338 | depth = 1 339 | for node in self.program: 340 | if isinstance(node, _Function): 341 | terminals.append(node.arity) 342 | depth = max(len(terminals), depth) 343 | else: 344 | terminals[-1] -= 1 345 | while terminals[-1] == 0: 346 | terminals.pop() 347 | terminals[-1] -= 1 348 | return depth - 1 349 | 350 | def _length(self): 351 | """Calculates the number of functions and terminals in the program.""" 352 | return len(self.program) 353 | 354 | def execute(self, X): 355 | """Execute the program according 
to X.
356 | 
357 |         Parameters
358 |         ----------
359 |         X : {array-like}, shape = [n_samples, n_features]
360 |             Training vectors, where n_samples is the number of samples and
361 |             n_features is the number of features.
362 | 
363 |         Returns
364 |         -------
365 |         y_hats : array-like, shape = [n_samples]
366 |             The result of executing the program on X.
367 | 
368 |         """
369 |         # Check for single-node programs
370 |         #ts = time()
371 |         node = self.program[0]
372 |         if isinstance(node, float):
373 |             #print(time() -t,u'no1')
374 |             return np.repeat(node, X.shape[0])
375 |         if isinstance(node, int):
376 |             #print (time()-t,u'no2')
377 |             return X[:, node]
378 | 
379 |         apply_stack = []
380 | 
381 |         for node in self.program:
382 |             if isinstance(node, _Function):
383 |                 apply_stack.append([node])
384 |             else:
385 |                 # Lazily evaluate later
386 |                 apply_stack[-1].append(node)
387 | 
388 |             while len(apply_stack[-1]) == apply_stack[-1][0].arity + 1:
389 |                 function = apply_stack[-1][0]
390 |                 terminals = [np.repeat(t, X.shape[0]) if isinstance(t, float)
391 |                              else X[:, t] if isinstance(t, int)
392 |                              else t for t in apply_stack[-1][1:]]
393 | 
394 |                 intermediate_result = function(*terminals)
395 |                 if len(apply_stack) != 1:
396 |                     apply_stack.pop()
397 |                     apply_stack[-1].append(intermediate_result)
398 |                 else:
399 |                     return intermediate_result
400 |         # We should never get here
401 |         return None
402 | 
403 |     def jiasu(self,y_pred,y):
404 |         list_t = np.argsort(y_pred)  # indices sorted by ascending predicted score
405 |         tt =sum([y[i] for i in list_t[-50:]])  # sum the realised returns of the top-50 ranked stocks
406 |         return tt
407 |     def stock_excute(self,x,y):
408 |         '''
409 |         Evaluate the program on stock data and collect the return-based fitness inputs.
410 |         x: the per-day factor matrices of the stock universe
411 |         y: the per-day stock returns used as the fitness target
412 |         '''
413 |         shouyi = []  # "shouyi" (return): one entry per rebalancing date
414 |         for i in range(len(x)):
415 |             if i%5==0:  # rebalance every 5 trading days (see README)
416 |                 y_pred = self.execute(np.array(x[i]))
417 |                 shouyi.append(self.jiasu(y_pred,y[i]))
418 |                 del y_pred
419 |                 gc.collect()
420 |         return shouyi
421 | 
422 |     def get_all_indices(self, n_samples=None, max_samples=None,
423 |                         random_state=None):
424 |         """Get the indices on which to evaluate the fitness of a program.
425 | 
426 |         Parameters
427 |         ----------
428 |         n_samples : int
429 |             The number of samples.
430 | 
431 |         max_samples : int
432 |             The maximum number of samples to use.
433 | 
434 |         random_state : RandomState instance
435 |             The random number generator.
436 | 
437 |         Returns
438 |         -------
439 |         indices : array-like, shape = [n_samples]
440 |             The in-sample indices.
441 | 
442 |         not_indices : array-like, shape = [n_samples]
443 |             The out-of-sample indices.
444 | 445 | """ 446 | if self._indices_state is None and random_state is None: 447 | raise ValueError('The program has not been evaluated for fitness ' 448 | 'yet, indices not available.') 449 | 450 | if n_samples is not None and self._n_samples is None: 451 | self._n_samples = n_samples 452 | if max_samples is not None and self._max_samples is None: 453 | self._max_samples = max_samples 454 | if random_state is not None and self._indices_state is None: 455 | self._indices_state = random_state.get_state() 456 | 457 | indices_state = check_random_state(None) 458 | indices_state.set_state(self._indices_state) 459 | 460 | not_indices = sample_without_replacement( 461 | self._n_samples, 462 | self._n_samples - self._max_samples, 463 | random_state=indices_state) 464 | sample_counts = np.bincount(not_indices, minlength=self._n_samples) 465 | indices = np.where(sample_counts == 0)[0] 466 | 467 | return indices, not_indices 468 | 469 | def _indices(self): 470 | """Get the indices used to measure the program's fitness.""" 471 | return self.get_all_indices()[0] 472 | 473 | def raw_fitness(self, X, y, sample_weight): 474 | """Evaluate the raw fitness of the program according to X, y. 475 | 476 | Parameters 477 | ---------- 478 | X : {array-like}, shape = [n_samples, n_features] 479 | Training vectors, where n_samples is the number of samples and 480 | n_features is the number of features. 481 | 482 | y : array-like, shape = [n_samples] 483 | Target values. 484 | 485 | sample_weight : array-like, shape = [n_samples] 486 | Weights applied to individual samples. 487 | 488 | Returns 489 | ------- 490 | raw_fitness : float 491 | The raw fitness of the program. 492 | 493 | """ 494 | #print (self.metric.stock_is) 495 | if not self.metric.stock_is: 496 | y_pred = self.execute(X) 497 | else: 498 | y_pred = self.stock_excute(X,y) 499 | if self.transformer: 500 | y_pred = self.transformer(y_pred) 501 | sample_weight = [1 for i in range(len(y_pred))] 502 | raw_fitness = self.metric(y, y_pred, sample_weight) 503 | del X,y,y_pred 504 | gc.collect() 505 | return raw_fitness 506 | 507 | def fitness(self, parsimony_coefficient=None): 508 | """Evaluate the penalized fitness of the program according to X, y. 509 | 510 | Parameters 511 | ---------- 512 | parsimony_coefficient : float, optional 513 | If automatic parsimony is being used, the computed value according 514 | to the population. Otherwise the initialized value is used. 515 | 516 | Returns 517 | ------- 518 | fitness : float 519 | The penalized fitness of the program. 520 | 521 | """ 522 | if parsimony_coefficient is None: 523 | parsimony_coefficient = self.parsimony_coefficient 524 | 525 | penalty = parsimony_coefficient * len(self.program) * self.metric.sign 526 | return self.raw_fitness_ - penalty 527 | def get_subtree(self, random_state, program=None): 528 | """Get a random subtree from the program. 529 | 530 | Parameters 531 | ---------- 532 | random_state : RandomState instance 533 | The random number generator. 534 | 535 | program : list, optional (default=None) 536 | The flattened tree representation of the program. If None, the 537 | embedded tree in the object will be used. 538 | 539 | Returns 540 | ------- 541 | start, end : tuple of two ints 542 | The indices of the start and end of the random subtree. 543 | 544 | """ 545 | if program is None: 546 | program = self.program 547 | # Choice of crossover points follows Koza's (1992) widely used approach 548 | # of choosing functions 90% of the time and leaves 10% of the time. 
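        # The two lines below encode that 90/10 rule: each node gets weight
        # 0.9 (function) or 0.1 (terminal), the weights are normalised into a
        # cumulative distribution, and np.searchsorted maps a single uniform
        # draw onto a start node. The while loop that follows then walks the
        # flat program, extending `end` until the chosen subtree's arity
        # budget is exhausted.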
549 | probs = np.array([0.9 if isinstance(node, _Function) else 0.1 550 | for node in program]) 551 | probs = np.cumsum(probs / probs.sum()) 552 | start = np.searchsorted(probs, random_state.uniform()) 553 | 554 | stack = 1 555 | end = start 556 | while stack > end - start: 557 | node = program[end] 558 | if isinstance(node, _Function): 559 | stack += node.arity 560 | end += 1 561 | 562 | return start, end 563 | 564 | def reproduce(self): 565 | """Return a copy of the embedded program.""" 566 | return copy(self.program) 567 | 568 | def crossover(self, donor, random_state): 569 | """Perform the crossover genetic operation on the program. 570 | 571 | Crossover selects a random subtree from the embedded program to be 572 | replaced. A donor also has a subtree selected at random and this is 573 | inserted into the original parent to form an offspring. 574 | 575 | Parameters 576 | ---------- 577 | donor : list 578 | The flattened tree representation of the donor program. 579 | 580 | random_state : RandomState instance 581 | The random number generator. 582 | 583 | Returns 584 | ------- 585 | program : list 586 | The flattened tree representation of the program. 587 | 588 | """ 589 | # Get a subtree to replace 590 | start, end = self.get_subtree(random_state) 591 | removed = range(start, end) 592 | # Get a subtree to donate 593 | donor_start, donor_end = self.get_subtree(random_state, donor) 594 | donor_removed = list(set(range(len(donor))) - 595 | set(range(donor_start, donor_end))) 596 | # Insert genetic material from donor 597 | return (self.program[:start] + 598 | donor[donor_start:donor_end] + 599 | self.program[end:]), removed, donor_removed 600 | 601 | def subtree_mutation(self, random_state): 602 | """Perform the subtree mutation operation on the program. 603 | 604 | Subtree mutation selects a random subtree from the embedded program to 605 | be replaced. A donor subtree is generated at random and this is 606 | inserted into the original parent to form an offspring. This 607 | implementation uses the "headless chicken" method where the donor 608 | subtree is grown using the initialization methods and a subtree of it 609 | is selected to be donated to the parent. 610 | 611 | Parameters 612 | ---------- 613 | random_state : RandomState instance 614 | The random number generator. 615 | 616 | Returns 617 | ------- 618 | program : list 619 | The flattened tree representation of the program. 620 | 621 | """ 622 | # Build a new naive program 623 | chicken = self.build_program(random_state) 624 | # Do subtree mutation via the headless chicken method! 625 | return self.crossover(chicken, random_state) 626 | 627 | def hoist_mutation(self, random_state): 628 | """Perform the hoist mutation operation on the program. 629 | 630 | Hoist mutation selects a random subtree from the embedded program to 631 | be replaced. A random subtree of that subtree is then selected and this 632 | is 'hoisted' into the original subtrees location to form an offspring. 633 | This method helps to control bloat. 634 | 635 | Parameters 636 | ---------- 637 | random_state : RandomState instance 638 | The random number generator. 639 | 640 | Returns 641 | ------- 642 | program : list 643 | The flattened tree representation of the program. 
644 | 645 | """ 646 | # Get a subtree to replace 647 | start, end = self.get_subtree(random_state) 648 | subtree = self.program[start:end] 649 | # Get a subtree of the subtree to hoist 650 | sub_start, sub_end = self.get_subtree(random_state, subtree) 651 | hoist = subtree[sub_start:sub_end] 652 | # Determine which nodes were removed for plotting 653 | removed = list(set(range(start, end)) - 654 | set(range(start + sub_start, start + sub_end))) 655 | return self.program[:start] + hoist + self.program[end:], removed 656 | 657 | def point_mutation(self, random_state): 658 | """Perform the point mutation operation on the program. 659 | 660 | Point mutation selects random nodes from the embedded program to be 661 | replaced. Terminals are replaced by other terminals and functions are 662 | replaced by other functions that require the same number of arguments 663 | as the original node. The resulting tree forms an offspring. 664 | 665 | Parameters 666 | ---------- 667 | random_state : RandomState instance 668 | The random number generator. 669 | 670 | Returns 671 | ------- 672 | program : list 673 | The flattened tree representation of the program. 674 | 675 | """ 676 | program = copy(self.program) 677 | 678 | # Get the nodes to modify 679 | mutate = np.where(random_state.uniform(size=len(program)) < 680 | self.p_point_replace)[0] 681 | 682 | for node in mutate: 683 | if isinstance(program[node], _Function): 684 | arity = program[node].arity 685 | # Find a valid replacement with same arity 686 | replacement = len(self.arities[arity]) 687 | replacement = random_state.randint(replacement) 688 | replacement = self.arities[arity][replacement] 689 | program[node] = replacement 690 | else: 691 | # We've got a terminal, add a const or variable 692 | if self.const_range is not None: 693 | terminal = random_state.randint(self.n_features + 1) 694 | else: 695 | terminal = random_state.randint(self.n_features) 696 | if terminal == self.n_features: 697 | terminal = random_state.uniform(*self.const_range) 698 | if self.const_range is None: 699 | # We should never get here 700 | raise ValueError('A constant was produced with ' 701 | 'const_range=None.') 702 | program[node] = terminal 703 | 704 | return program, list(mutate) 705 | 706 | depth_ = property(_depth) 707 | length_ = property(_length) 708 | indices_ = property(_indices) 709 | -------------------------------------------------------------------------------- /genetic.py: -------------------------------------------------------------------------------- 1 | # encoding:utf-8 2 | """Genetic Programming in Python, with a scikit-learn inspired API 3 | 4 | The :mod:`gplearn.genetic` module implements Genetic Programming. These 5 | are supervised learning methods based on applying evolutionary operations on 6 | computer programs. 
7 | """
8 | # print,input make_function make_fitness
9 | # Author: Trevor Stephens
10 | # input print
11 | # License: BSD 3 clause
12 | 
13 | import itertools
14 | from abc import ABCMeta, abstractmethod
15 | from time import time
16 | from warnings import warn
17 | import gc
18 | 
19 | import numpy as np
20 | from joblib import Parallel, delayed
21 | from scipy.stats import rankdata
22 | from sklearn.base import BaseEstimator
23 | from sklearn.base import RegressorMixin, TransformerMixin, ClassifierMixin
24 | from sklearn.exceptions import NotFittedError
25 | from sklearn.utils.validation import check_X_y, check_array
26 | from sklearn.utils.multiclass import check_classification_targets
27 | 
28 | from ._program import _Program
29 | from .fitness import _fitness_map, _Fitness
30 | from .functions import _function_map, _Function, sig1 as sigmoid
31 | from .utils import _partition_estimators
32 | from .utils import check_random_state
33 | 
34 | __all__ = ['SymbolicRegressor', 'SymbolicClassifier', 'SymbolicTransformer']
35 | 
36 | MAX_INT = np.iinfo(np.int32).max
37 | 
38 | 
39 | def _parallel_evolve(n_programs, parents, X, y, sample_weight, seeds, params):
40 |     """Private function used to build a batch of programs within a job."""
41 |     ''' A single evolution routine '''
42 |     #n_samples, n_features = X.shape # original version; restore if needed
43 |     n_samples, n_features= len(X),18  # modified: X is a list of daily factor matrices with a fixed 18 features
44 |     # Unpack parameters
45 |     tournament_size = params['tournament_size']
46 |     function_set = params['function_set']
47 |     arities = params['arities']
48 |     init_depth = params['init_depth']
49 |     init_method = params['init_method']
50 |     const_range = params['const_range']
51 |     metric = params['_metric']
52 |     transformer = params['_transformer']
53 |     parsimony_coefficient = params['parsimony_coefficient']
54 |     method_probs = params['method_probs'] # cumulative operator probabilities
55 |     p_point_replace = params['p_point_replace']
56 |     max_samples = params['max_samples']
57 |     feature_names = params['feature_names']
58 | 
59 |     max_samples = int(max_samples * n_samples)
60 |     def _tournament():
61 |         """Find the fittest individual from a sub-population."""
62 |         contenders = random_state.randint(0, len(parents), tournament_size)
63 |         fitness = [parents[p].fitness_ for p in contenders]
64 |         if metric.greater_is_better:
65 |             parent_index = contenders[np.argmax(fitness)]
66 |         else:
67 |             parent_index = contenders[np.argmin(fitness)]
68 |         return parents[parent_index], parent_index
69 | 
70 |     # Build programs
71 |     programs = []
72 |     for i in range(n_programs):
73 |         print (i)
74 |         tt = time()
75 |         # Seed a fresh RandomState for this program; it decides which way evolution goes
76 |         random_state = check_random_state(seeds[i])
77 |         # If there is no parent generation, grow a new program; otherwise pick the next operation
78 |         if parents is None:
79 |             program = None
80 |             genome = None
81 |         else:
82 |             method = random_state.uniform()
83 |             parent, parent_index = _tournament()
84 | 
85 |             if method < method_probs[0]:
86 |                 # crossover
87 |                 # `parent` is the parent tree, `donor` is the donor tree
88 |                 donor, donor_index = _tournament()
89 |                 program, removed, remains = parent.crossover(donor.program,
90 |                                                              random_state)
91 |                 genome = {'method': 'Crossover',
92 |                           'parent_idx': parent_index,
93 |                           'parent_nodes': removed,
94 |                           'donor_idx': donor_index,
95 |                           'donor_nodes': remains}
96 |             elif method < method_probs[1]:
97 |                 # subtree mutation
98 |                 program, removed, _ = parent.subtree_mutation(random_state)
99 |                 genome = {'method': 'Subtree Mutation',
100 |                           'parent_idx': parent_index,
101 |                           'parent_nodes': removed}
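            # NOTE: method_probs holds the cumulative sums of p_crossover,
            # p_subtree_mutation, p_hoist_mutation and p_point_mutation
            # (np.cumsum in fit()), so the single uniform draw above performs
            # categorical sampling over the operators; any probability mass
            # left over falls through to plain reproduction in the final
            # else branch below.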
102 |             elif method < method_probs[2]:
103 |                 # hoist mutation
104 |                 program, removed = parent.hoist_mutation(random_state)
105 |                 genome = {'method': 'Hoist Mutation',
106 |                           'parent_idx': parent_index,
107 |                           'parent_nodes': removed}
108 |             elif method < method_probs[3]:
109 |                 # point mutation
110 |                 program, mutated = parent.point_mutation(random_state)
111 |                 genome = {'method': 'Point Mutation',
112 |                           'parent_idx': parent_index,
113 |                           'parent_nodes': mutated}
114 |             else:
115 |                 # reproduction: copy the parent unchanged
116 |                 program = parent.reproduce()
117 |                 genome = {'method': 'Reproduction',
118 |                           'parent_idx': parent_index,
119 |                           'parent_nodes': []}
120 |         #print (function_set)
121 |         #input()
122 |         program = _Program(function_set=function_set,
123 |                            arities=arities,
124 |                            init_depth=init_depth,
125 |                            init_method=init_method,
126 |                            n_features=n_features,
127 |                            metric=metric,
128 |                            transformer=transformer,
129 |                            const_range=const_range,
130 |                            p_point_replace=p_point_replace,
131 |                            parsimony_coefficient=parsimony_coefficient,
132 |                            feature_names=feature_names,
133 |                            random_state=random_state,
134 |                            program=program)
135 | 
136 |         program.parents = genome
137 | 
138 |         # Draw samples, using sample weights, and then fit
139 |         if sample_weight is None:
140 |             curr_sample_weight = np.ones((n_samples,))
141 |         else:
142 |             curr_sample_weight = sample_weight.copy()
143 |         oob_sample_weight = curr_sample_weight.copy()
144 | 
145 |         indices, not_indices = program.get_all_indices(n_samples,
146 |                                                        max_samples,
147 |                                                        random_state)
148 | 
149 |         curr_sample_weight[not_indices] = 0
150 |         oob_sample_weight[indices] = 0
151 |         bb = time()
152 |         #print ('time spent', time() - tt)
153 |         program.raw_fitness_ = program.raw_fitness(X, y, curr_sample_weight)
154 |         #print ('time here', time()-bb)
155 |         #a = time()
156 |         if max_samples < n_samples:
157 |             # Calculate OOB fitness
158 |             program.oob_fitness_ = program.raw_fitness(X, y, oob_sample_weight)
159 |             # print (time() - a, 'second raw_fitness call')
160 |         #input()
161 |         programs.append(program)
162 | 
163 |     return programs
164 | 
165 | 
166 | class BaseSymbolic(BaseEstimator, metaclass=ABCMeta):
167 | 
168 |     """Base class for symbolic regression / classification estimators.
169 | 
170 |     Warning: This class should not be used directly.
171 |     Use derived classes instead.
172 | 173 | """ 174 | 175 | @abstractmethod 176 | def __init__(self, 177 | population_size=1000, 178 | hall_of_fame=None, 179 | n_components=None, 180 | generations=20, 181 | tournament_size=20, 182 | stopping_criteria=0.0, 183 | const_range=(-1., 1.), 184 | init_depth=(2, 6), 185 | init_method='half and half', 186 | function_set=('add', 'sub', 'mul', 'div'), 187 | transformer=None, 188 | metric='mean absolute error', 189 | parsimony_coefficient=0.001, 190 | p_crossover=0.9, 191 | p_subtree_mutation=0.01, 192 | p_hoist_mutation=0.01, 193 | p_point_mutation=0.01, 194 | p_point_replace=0.05, 195 | max_samples=1.0, 196 | feature_names=None, 197 | warm_start=False, 198 | low_memory=False, 199 | n_jobs=1, 200 | verbose=0, 201 | random_state=None): 202 | 203 | self.population_size = population_size 204 | self.hall_of_fame = hall_of_fame 205 | self.n_components = n_components 206 | self.generations = generations 207 | self.tournament_size = tournament_size 208 | self.stopping_criteria = stopping_criteria 209 | self.const_range = const_range 210 | self.init_depth = init_depth 211 | self.init_method = init_method 212 | self.function_set = function_set 213 | self.transformer = transformer 214 | self.metric = metric 215 | self.parsimony_coefficient = parsimony_coefficient 216 | self.p_crossover = p_crossover 217 | self.p_subtree_mutation = p_subtree_mutation 218 | self.p_hoist_mutation = p_hoist_mutation 219 | self.p_point_mutation = p_point_mutation 220 | self.p_point_replace = p_point_replace 221 | self.max_samples = max_samples 222 | self.feature_names = feature_names 223 | self.warm_start = warm_start 224 | self.low_memory = low_memory 225 | self.n_jobs = n_jobs 226 | self.verbose = verbose 227 | self.random_state = random_state 228 | 229 | def _verbose_reporter(self, run_details=None): 230 | """A report of the progress of the evolution process. 231 | 232 | Parameters 233 | ---------- 234 | run_details : dict 235 | Information about the evolution. 236 | 237 | """ 238 | if run_details is None: 239 | print(' |{:^25}|{:^42}|'.format('Population Average', 240 | 'Best Individual')) 241 | print('-' * 4 + ' ' + '-' * 25 + ' ' + '-' * 42 + ' ' + '-' * 10) 242 | line_format = '{:>4} {:>8} {:>16} {:>8} {:>16} {:>16} {:>10}' 243 | print(line_format.format('Gen', 'Length', 'Fitness', 'Length', 244 | 'Fitness', 'OOB Fitness', 'Time Left')) 245 | 246 | else: 247 | # Estimate remaining time for run 248 | gen = run_details['generation'][-1] 249 | generation_time = run_details['generation_time'][-1] 250 | remaining_time = (self.generations - gen - 1) * generation_time 251 | if remaining_time > 60: 252 | remaining_time = '{0:.2f}m'.format(remaining_time / 60.0) 253 | else: 254 | remaining_time = '{0:.2f}s'.format(remaining_time) 255 | 256 | oob_fitness = 'N/A' 257 | line_format = '{:4d} {:8.2f} {:16g} {:8d} {:16g} {:>16} {:>10}' 258 | if self.max_samples < 1.0: 259 | oob_fitness = run_details['best_oob_fitness'][-1] 260 | line_format = '{:4d} {:8.2f} {:16g} {:8d} {:16g} {:16g} {:>10}' 261 | 262 | print(line_format.format(run_details['generation'][-1], 263 | run_details['average_length'][-1], 264 | run_details['average_fitness'][-1], 265 | run_details['best_length'][-1], 266 | run_details['best_fitness'][-1], 267 | oob_fitness, 268 | remaining_time)) 269 | 270 | def fit(self, X, y, sample_weight=None): 271 | """Fit the Genetic Program according to X, y. 
272 | 
273 |         Parameters
274 |         ----------
275 |         X : array-like, shape = [n_samples, n_features]
276 |             Training vectors, where n_samples is the number of samples and
277 |             n_features is the number of features.
278 | 
279 |         y : array-like, shape = [n_samples]
280 |             Target values.
281 | 
282 |         sample_weight : array-like, shape = [n_samples], optional
283 |             Weights applied to individual samples.
284 | 
285 |         Returns
286 |         -------
287 |         self : object
288 |             Returns self.
289 | 
290 |         """
291 |         random_state = check_random_state(self.random_state)
292 | 
293 |         # Check arrays
294 |         if isinstance(self, ClassifierMixin):
295 |             X, y = check_X_y(X, y, y_numeric=False)
296 |             check_classification_targets(y)
297 |             self.classes_, y = np.unique(y, return_inverse=True)
298 |             n_trim_classes = np.count_nonzero(np.bincount(y, sample_weight))
299 |             if n_trim_classes != 2:
300 |                 raise ValueError("y contains %d class after sample_weight "
301 |                                  "trimmed classes with zero weights, while 2 "
302 |                                  "classes are required."
303 |                                  % n_trim_classes)
304 |             self.n_classes_ = len(self.classes_)
305 |         else:
306 |             # regression path: X is a list of per-day factor matrices, so skip array validation
307 |             pass
308 |             #X, y = check_X_y(X, y, y_numeric=True) # original version; restore if needed
309 |         if sample_weight is not None:
310 |             sample_weight = check_array(sample_weight, ensure_2d=False)
311 |         # X has been restructured for the rolling stock test,
312 |         #_, self.n_features_ = X.shape # original version; restore if needed
313 |         self.n_features_ = 18 # modified: matches the fixed 18-factor stock dataset used here
314 | 
315 |         hall_of_fame = self.hall_of_fame
316 |         if hall_of_fame is None:
317 |             hall_of_fame = self.population_size
318 |         if hall_of_fame > self.population_size or hall_of_fame < 1:
319 |             raise ValueError('hall_of_fame (%d) must be less than or equal to '
320 |                              'population_size (%d).' % (self.hall_of_fame,
321 |                                                         self.population_size))
322 |         n_components = self.n_components
323 |         if n_components is None:
324 |             n_components = hall_of_fame
325 |         if n_components > hall_of_fame or n_components < 1:
326 |             raise ValueError('n_components (%d) must be less than or equal to '
327 |                              'hall_of_fame (%d).' % (self.n_components,
328 |                                                      self.hall_of_fame))
329 | 
330 |         self._function_set = []
331 |         for function in self.function_set:
332 |             #print (function)
333 |             if isinstance(function, str):
334 |                 if function not in _function_map:
335 |                     raise ValueError('invalid function name %s found in '
336 |                                      '`function_set`.' % function)
337 |                 self._function_set.append(_function_map[function])
338 |             elif isinstance(function, _Function):
339 |                 self._function_set.append(function)
340 |             else:
341 |                 raise ValueError('invalid type %s found in `function_set`.'
342 |                                  % type(function))
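        # Besides the built-in names above, a custom primitive can be passed
        # directly as a _Function. A minimal sketch, assuming gplearn's
        # make_function helper (the `_protected_exp` name is illustrative,
        # not part of this repo):
        #
        #   from gplearn.functions import make_function
        #   def _protected_exp(x):
        #       with np.errstate(over='ignore'):
        #           return np.where(np.abs(x) < 100., np.exp(x), 0.)
        #   exp = make_function(function=_protected_exp, name='exp', arity=1)
        #   est = SymbolicRegressor(function_set=('add', 'sub', 'mul', exp))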
343 |         if not self._function_set:
344 |             raise ValueError('No valid functions found in `function_set`.')
345 |
346 |         # For point-mutation to find a compatible replacement node
347 |         self._arities = {}
348 |         for function in self._function_set:
349 |             arity = function.arity
350 |             self._arities[arity] = self._arities.get(arity, [])
351 |             self._arities[arity].append(function)
352 |
353 |         if isinstance(self.metric, _Fitness):
354 |             self._metric = self.metric
355 |         elif isinstance(self, RegressorMixin):
356 |             if self.metric not in ('mean absolute error', 'mse', 'rmse',
357 |                                    'pearson', 'spearman', 'stock_dedicated'):
358 |                 raise ValueError('Unsupported metric: %s' % self.metric)
359 |             self._metric = _fitness_map[self.metric]
360 |         elif isinstance(self, ClassifierMixin):
361 |             if self.metric != 'log loss':
362 |                 raise ValueError('Unsupported metric: %s' % self.metric)
363 |             self._metric = _fitness_map[self.metric]
364 |         elif isinstance(self, TransformerMixin):
365 |             if self.metric not in ('pearson', 'spearman'):
366 |                 raise ValueError('Unsupported metric: %s' % self.metric)
367 |             self._metric = _fitness_map[self.metric]
368 |         if self.metric == 'stock_dedicated':
369 |             # Was `if self.metric in ('stock_dedicate'):` -- a misspelled
370 |             # substring test that could never match, so the flag was never set.
371 |             self._metric.stock_is = True
372 |         self._method_probs = np.array([self.p_crossover,
373 |                                        self.p_subtree_mutation,
374 |                                        self.p_hoist_mutation,
375 |                                        self.p_point_mutation])
376 |         self._method_probs = np.cumsum(self._method_probs)
377 |
378 |         if self._method_probs[-1] > 1:
379 |             raise ValueError('The sum of p_crossover, p_subtree_mutation, '
380 |                              'p_hoist_mutation and p_point_mutation should '
381 |                              'total to 1.0 or less.')
382 |
383 |         if self.init_method not in ('half and half', 'grow', 'full'):
384 |             raise ValueError('Valid program initialization methods include '
385 |                              '"grow", "full" and "half and half". Given %s.'
386 |                              % self.init_method)
387 |
388 |         if not((isinstance(self.const_range, tuple) and
389 |                 len(self.const_range) == 2) or self.const_range is None):
390 |             raise ValueError('const_range should be a tuple with length two, '
391 |                              'or None.')
392 |
393 |         if (not isinstance(self.init_depth, tuple) or
394 |                 len(self.init_depth) != 2):
395 |             raise ValueError('init_depth should be a tuple with length two.')
396 |         if self.init_depth[0] > self.init_depth[1]:
397 |             raise ValueError('init_depth should be in increasing numerical '
398 |                              'order: (min_depth, max_depth).')
399 |
400 |         if self.feature_names is not None:
401 |             if self.n_features_ != len(self.feature_names):
402 |                 raise ValueError('The supplied `feature_names` has different '
403 |                                  'length to n_features. Expected %d, got %d.'
404 |                                  % (self.n_features_, len(self.feature_names)))
405 |             for feature_name in self.feature_names:
406 |                 if not isinstance(feature_name, str):
407 |                     raise ValueError('invalid type %s found in '
408 |                                      '`feature_names`.' % type(feature_name))
409 |
410 |         if self.transformer is not None:
411 |             if isinstance(self.transformer, _Function):
412 |                 self._transformer = self.transformer
413 |             elif self.transformer == 'sigmoid':
414 |                 self._transformer = sigmoid
415 |             else:
416 |                 raise ValueError('Invalid `transformer`. Expected either '
417 |                                  '"sigmoid" or _Function object, got %s' %
418 |                                  type(self.transformer))
419 |             if self._transformer.arity != 1:
420 |                 raise ValueError('Invalid arity for `transformer`. Expected 1, '
421 |                                  'got %d.' % self._transformer.arity)
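        # --- Added annotation (not in the original source) -------------------
        # The metric resolution above also accepts a ready-made _Fitness
        # object, which bypasses the string lookup. A hedged sketch with a
        # hypothetical mean-return function (gplearn fitness callables take
        # (y, y_pred, sample_weight)):
        #
        #     from gplearn.fitness import make_fitness
        #
        #     def _mean_return(y, y_pred, w):
        #         ...  # e.g. rank by y_pred, hold the top names, average y
        #
        #     stock_metric = make_fitness(function=_mean_return,
        #                                 greater_is_better=True)
        #     est = SymbolicRegressor(metric=stock_metric)
        # ----------------------------------------------------------------------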
422 |
423 |         params = self.get_params()
424 |         params['_metric'] = self._metric
425 |         if hasattr(self, '_transformer'):
426 |             params['_transformer'] = self._transformer
427 |         else:
428 |             params['_transformer'] = None
429 |         params['function_set'] = self._function_set  # resolved operators (add/sub/mul/div by default)
430 |         params['arities'] = self._arities
431 |         params['method_probs'] = self._method_probs
432 |
433 |         if not self.warm_start or not hasattr(self, '_programs'):
434 |             # Free allocated memory, if any
435 |             self._programs = []
436 |             self.run_details_ = {'generation': [],
437 |                                  'average_length': [],
438 |                                  'average_fitness': [],
439 |                                  'best_length': [],
440 |                                  'best_fitness': [],
441 |                                  'best_oob_fitness': [],
442 |                                  'generation_time': []}
443 |
444 |         prior_generations = len(self._programs)
445 |         n_more_generations = self.generations - prior_generations
446 |
447 |         if n_more_generations < 0:
448 |             raise ValueError('generations=%d must be larger or equal to '
449 |                              'len(_programs)=%d when warm_start==True'
450 |                              % (self.generations, len(self._programs)))
451 |         elif n_more_generations == 0:
452 |             fitness = [program.raw_fitness_ for program in self._programs[-1]]
453 |             warn('Warm-start fitting without increasing generations does not '
454 |                  'fit new programs.')
455 |
456 |         if self.warm_start:
457 |             # Generate and discard seeds that would have been produced on the
458 |             # initial fit call.
459 |             for i in range(len(self._programs)):
460 |                 _ = random_state.randint(MAX_INT, size=self.population_size)
461 |
462 |         if self.verbose:
463 |             # Print header fields
464 |             self._verbose_reporter()
465 |
466 |         for gen in range(prior_generations, self.generations):
467 |             print(gen, u'current generation')
468 |
469 |             start_time = time()
470 |
471 |             if gen == 0:
472 |                 parents = None
473 |             else:
474 |                 parents = self._programs[gen - 1]
475 |
476 |             # Parallel loop
477 |             n_jobs, n_programs, starts = _partition_estimators(
478 |                 self.population_size, self.n_jobs)
479 |             seeds = random_state.randint(MAX_INT, size=self.population_size)
480 |
481 |             population = Parallel(n_jobs=n_jobs,
482 |                                   verbose=int(self.verbose > 1))(
483 |                 delayed(_parallel_evolve)(n_programs[i],
484 |                                           parents,
485 |                                           X,
486 |                                           y,
487 |                                           sample_weight,
488 |                                           seeds[starts[i]:starts[i + 1]],
489 |                                           params)
490 |                 for i in range(n_jobs))
491 |
492 |             # Reduce, maintaining order across different n_jobs
493 |             population = list(itertools.chain.from_iterable(population))
496 |             fitness = [program.raw_fitness_ for program in population]
497 |             length = [program.length_ for program in population]
498 |
499 |             parsimony_coefficient = None
500 |             if self.parsimony_coefficient == 'auto':
501 |                 parsimony_coefficient = (np.cov(length, fitness)[1, 0] /
502 |                                          np.var(length))
503 |
504 |             for program in population:
505 |                 program.fitness_ = program.fitness(parsimony_coefficient)
506 |
507 |             self._programs.append(population)
508 |             # Remove old programs that didn't make it into the new population.
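            # (Added note, not in the original source.) Each program records
            # its parents' population indices in `program.parents` under keys
            # containing 'idx'; walking generations newest-to-oldest, any
            # previous-generation slot never referenced as a parent is set to
            # None so its memory can be reclaimed.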
509 |             if not self.low_memory:
510 |                 for old_gen in np.arange(gen, 0, -1):
512 |                     indices = []
513 |                     for program in self._programs[old_gen]:
514 |                         if program is not None:
515 |                             for idx in program.parents:
516 |                                 if 'idx' in idx:
517 |                                     indices.append(program.parents[idx])
518 |                     indices = set(indices)
519 |                     for idx in range(self.population_size):
520 |                         if idx not in indices:
521 |                             self._programs[old_gen - 1][idx] = None
522 |             elif gen > 0:
523 |                 # Remove old generations
524 |                 self._programs[gen - 1] = None
525 |
526 |
527 |             if self._metric.greater_is_better:
528 |                 best_program = population[np.argmax(fitness)]
529 |             else:
530 |                 best_program = population[np.argmin(fitness)]
531 |
532 |             self.run_details_['generation'].append(gen)
533 |             self.run_details_['average_length'].append(np.mean(length))
534 |             self.run_details_['average_fitness'].append(np.mean(fitness))
535 |             self.run_details_['best_length'].append(best_program.length_)
536 |             self.run_details_['best_fitness'].append(best_program.raw_fitness_)
537 |             oob_fitness = np.nan
538 |             if self.max_samples < 1.0:
539 |                 oob_fitness = best_program.oob_fitness_
540 |             self.run_details_['best_oob_fitness'].append(oob_fitness)
541 |             generation_time = time() - start_time
542 |             self.run_details_['generation_time'].append(generation_time)
543 |
544 |             if self.verbose:
545 |                 self._verbose_reporter(self.run_details_)
546 |
547 |             # Check for early stopping
548 |             if self._metric.greater_is_better:
549 |                 best_fitness = fitness[np.argmax(fitness)]
550 |                 if best_fitness >= self.stopping_criteria:
551 |                     break
552 |             else:
553 |                 best_fitness = fitness[np.argmin(fitness)]
554 |                 if best_fitness <= self.stopping_criteria:
555 |                     break
556 |         if self.verbose > 1:  # was an unconditional print loop plus a blocking input()
557 |             for program in self._programs[-1]:
558 |                 print(program)  # dump the final generation for inspection
559 |         if isinstance(self, TransformerMixin):
560 |             # Find the best individuals in the final generation
561 |             fitness = np.array(fitness)
562 |             if self._metric.greater_is_better:
563 |                 hall_of_fame = fitness.argsort()[::-1][:self.hall_of_fame]
564 |             else:
565 |                 hall_of_fame = fitness.argsort()[:self.hall_of_fame]
566 |             evaluation = np.array([gp.execute(X) for gp in
567 |                                    [self._programs[-1][i] for
568 |                                     i in hall_of_fame]])
569 |             if self.metric == 'spearman':
570 |                 evaluation = np.apply_along_axis(rankdata, 1, evaluation)
571 |
572 |             with np.errstate(divide='ignore', invalid='ignore'):
573 |                 correlations = np.abs(np.corrcoef(evaluation))
574 |                 np.fill_diagonal(correlations, 0.)
575 |             components = list(range(self.hall_of_fame))
576 |             indices = list(range(self.hall_of_fame))
577 |             # Iteratively remove least fit individual of most correlated pair
578 |             while len(components) > self.n_components:
579 |                 most_correlated = np.unravel_index(np.argmax(correlations),
580 |                                                    correlations.shape)
581 |                 # The correlation matrix is sorted by fitness, so identifying
582 |                 # the least fit of the pair is simply getting the higher index
583 |                 worst = max(most_correlated)
584 |                 components.pop(worst)
585 |                 indices.remove(worst)
586 |                 correlations = correlations[:, indices][indices, :]
587 |                 indices = list(range(len(components)))
588 |             self._best_programs = [self._programs[-1][i] for i in
589 |                                    hall_of_fame[components]]
590 |
591 |         else:
592 |             # Find the best individual in the final generation
593 |             if self._metric.greater_is_better:
594 |                 self._program = self._programs[-1][np.argmax(fitness)]
595 |             else:
596 |                 self._program = self._programs[-1][np.argmin(fitness)]
597 |             print(self._program, u'best program')
598 |             # Debug dump: print the 100 highest raw-fitness values and the
599 |             # distinct programs behind them (consecutive duplicates skipped).
600 |             top_fitness = sorted(fitness)[-100:]
601 |             last_program = None
602 |             for value in top_fitness:
603 |                 print(value)
604 |                 program = self._programs[-1][fitness.index(value)]
605 |                 if program is not last_program:
606 |                     last_program = program
607 |                     print(program)
608 |         return self
609 |
610 |
611 | class SymbolicRegressor(BaseSymbolic, RegressorMixin):
612 |
613 |     """A Genetic Programming symbolic regressor.
614 |
615 |     A symbolic regressor is an estimator that begins by building a population
616 |     of naive random formulas to represent a relationship. The formulas are
617 |     represented as tree-like structures with mathematical functions being
618 |     recursively applied to variables and constants. Each successive generation
619 |     of programs is then evolved from the one that came before it by selecting
620 |     the fittest individuals from the population to undergo genetic operations
621 |     such as crossover, mutation or reproduction.
622 |
623 |     Parameters
624 |     ----------
625 |     population_size : integer, optional (default=1000)
626 |         The number of programs in each generation.
628 |
629 |     generations : integer, optional (default=20)
630 |         The number of generations to evolve.
631 |
632 |     tournament_size : integer, optional (default=20)
633 |         The number of programs that will compete to become part of the next
634 |         generation.
636 |
637 |     stopping_criteria : float, optional (default=0.0)
638 |         The metric value required in order to stop evolution early.
639 |
640 |     const_range : tuple of two floats, or None, optional (default=(-1., 1.))
641 |         The range of constants to include in the formulas. If None then no
642 |         constants will be included in the candidate programs.
643 |
644 |     init_depth : tuple of two ints, optional (default=(2, 6))
645 |         The range of tree depths for the initial population of naive formulas.
646 |         Individual trees will randomly choose a maximum depth from this range.
647 |         When combined with `init_method='half and half'` this yields the well-
648 |         known 'ramped half and half' initialization method.
649 |
650 |     init_method : str, optional (default='half and half')
651 |         - 'grow' : Nodes are chosen at random from both functions and
652 |           terminals, allowing for smaller trees than `init_depth` allows. Tends
653 |           to grow asymmetrical trees.
654 |         - 'full' : Functions are chosen until the `init_depth` is reached, and
655 |           then terminals are selected. Tends to grow 'bushy' trees.
656 |         - 'half and half' : Trees are grown through a 50/50 mix of 'full' and
657 |           'grow', making for a mix of tree shapes in the initial population.
658 |
659 |     function_set : iterable, optional (default=('add', 'sub', 'mul', 'div'))
660 |         The functions to use when building and evolving programs. This iterable
661 |         can include strings to indicate either individual functions as outlined
662 |         below, or you can also include your own functions as built using the
663 |         ``make_function`` factory from the ``functions`` module.
664 |
665 |         Available individual functions are:
666 |
667 |         - 'add' : addition, arity=2.
668 |         - 'sub' : subtraction, arity=2.
669 |         - 'mul' : multiplication, arity=2.
670 |         - 'div' : protected division where a denominator near-zero returns 1.,
671 |           arity=2.
672 |         - 'sqrt' : protected square root where the absolute value of the
673 |           argument is used, arity=1.
674 |         - 'log' : protected log where the absolute value of the argument is
675 |           used and a near-zero argument returns 0., arity=1.
676 |         - 'abs' : absolute value, arity=1.
677 |         - 'neg' : negative, arity=1.
678 |         - 'inv' : protected inverse where a near-zero argument returns 0.,
679 |           arity=1.
680 |         - 'max' : maximum, arity=2.
681 |         - 'min' : minimum, arity=2.
682 |         - 'sin' : sine (radians), arity=1.
683 |         - 'cos' : cosine (radians), arity=1.
684 |         - 'tan' : tangent (radians), arity=1.
685 |
686 |     metric : str, optional (default='mean absolute error')
687 |         The name of the raw fitness metric. Available options include:
688 |
689 |         - 'mean absolute error'.
690 |         - 'mse' for mean squared error.
691 |         - 'rmse' for root mean squared error.
692 |         - 'pearson', for Pearson's product-moment correlation coefficient.
693 |         - 'spearman' for Spearman's rank-order correlation coefficient.
694 |         - 'stock_dedicated' for the stock-rebalancing return fitness added by this fork.
695 |
696 |         Note that 'pearson' and 'spearman' will not directly predict the target
697 |         but could be useful as value-added features in a second-step estimator.
698 |         This generates one engineered feature at a time; the
699 |         SymbolicTransformer can create multiple features at once.
700 |
701 |     parsimony_coefficient : float or "auto", optional (default=0.001)
702 |         This constant penalizes large programs by adjusting their fitness to
703 |         be less favorable for selection. Larger values penalize the program
704 |         more which can control the phenomenon known as 'bloat'. Bloat is when
705 |         evolution is increasing the size of programs without a significant
706 |         increase in fitness, which is costly for computation time and makes for
707 |         a less understandable final result. This parameter may need to be tuned
708 |         over successive runs.
709 |
710 |         If "auto" the parsimony coefficient is recalculated for each generation
711 |         using c = Cov(l,f)/Var(l), where Cov(l,f) is the covariance between
712 |         program size l and program fitness f in the population, and Var(l) is
713 |         the variance of program sizes.
714 |
715 |     p_crossover : float, optional (default=0.9)
716 |         The probability of performing crossover on a tournament winner.
717 |         Crossover takes the winner of a tournament and selects a random subtree
718 |         from it to be replaced. A second tournament is performed to find a
719 |         donor. The donor also has a subtree selected at random and this is
720 |         inserted into the original parent to form an offspring in the next
721 |         generation.
722 |
723 |     p_subtree_mutation : float, optional (default=0.01)
724 |         The probability of performing subtree mutation on a tournament winner.
725 |         Subtree mutation takes the winner of a tournament and selects a random
726 |         subtree from it to be replaced. A donor subtree is generated at random
727 |         and this is inserted into the original parent to form an offspring in
728 |         the next generation.
729 |
730 |     p_hoist_mutation : float, optional (default=0.01)
731 |         The probability of performing hoist mutation on a tournament winner.
732 |         Hoist mutation takes the winner of a tournament and selects a random
733 |         subtree from it. A random subtree of that subtree is then selected
734 |         and this is 'hoisted' into the original subtree's location to form an
735 |         offspring in the next generation. This method helps to control bloat.
736 |
737 |     p_point_mutation : float, optional (default=0.01)
738 |         The probability of performing point mutation on a tournament winner.
739 |         Point mutation takes the winner of a tournament and selects random
740 |         nodes from it to be replaced. Terminals are replaced by other terminals
741 |         and functions are replaced by other functions that require the same
742 |         number of arguments as the original node. The resulting tree forms an
743 |         offspring in the next generation.
744 |
745 |     Note : The above genetic operation probabilities must sum to less than
746 |         one. The balance of probability is assigned to 'reproduction', where a
747 |         tournament winner is cloned and enters the next generation unmodified.
748 |
749 |     p_point_replace : float, optional (default=0.05)
750 |         For point mutation only, the probability that any given node will be
751 |         mutated.
752 |
753 |     max_samples : float, optional (default=1.0)
754 |         The fraction of samples to draw from X to evaluate each program on.
755 |
756 |     feature_names : list, optional (default=None)
757 |         Optional list of feature names, used purely for representations in
758 |         the `print` operation or `export_graphviz`. If None, then X0, X1, etc
759 |         will be used for representations.
760 |
761 |     warm_start : bool, optional (default=False)
762 |         When set to ``True``, reuse the solution of the previous call to fit
763 |         and add more generations to the evolution, otherwise, just fit a new
764 |         evolution.
765 |
766 |     low_memory : bool, optional (default=False)
767 |         When set to ``True``, only the current generation is retained. Parent
768 |         information is discarded. For very large populations or runs with many
769 |         generations, this can result in substantial memory use reduction.
770 |
771 |     n_jobs : integer, optional (default=1)
772 |         The number of jobs to run in parallel for `fit`. If -1, then the number
773 |         of jobs is set to the number of cores.
774 |
775 |     verbose : int, optional (default=0)
776 |         Controls the verbosity of the evolution building process.
777 |
778 |     random_state : int, RandomState instance or None, optional (default=None)
779 |         If int, random_state is the seed used by the random number generator;
780 |         If RandomState instance, random_state is the random number generator;
781 |         If None, the random number generator is the RandomState instance used
782 |         by `np.random`.
783 |
784 |     Attributes
785 |     ----------
786 |     run_details_ : dict
787 |         Details of the evolution process. Includes the following elements:
788 |
789 |         - 'generation' : The generation index.
790 |         - 'average_length' : The average program length of the generation.
791 |         - 'average_fitness' : The average program fitness of the generation.
792 |         - 'best_length' : The length of the best program in the generation.
793 |         - 'best_fitness' : The fitness of the best program in the generation.
794 | - 'best_oob_fitness' : The out of bag fitness of the best program in 795 | the generation (requires `max_samples` < 1.0). 796 | - 'generation_time' : The time it took for the generation to evolve. 797 | 798 | See Also 799 | -------- 800 | SymbolicTransformer 801 | 802 | References 803 | ---------- 804 | .. [1] J. Koza, "Genetic Programming", 1992. 805 | 806 | .. [2] R. Poli, et al. "A Field Guide to Genetic Programming", 2008. 807 | 808 | """ 809 | 810 | def __init__(self, 811 | population_size=1000, 812 | generations=20, 813 | tournament_size=20, 814 | stopping_criteria=0.0, 815 | const_range=(-1., 1.), 816 | init_depth=(2, 6), 817 | init_method='half and half', 818 | function_set=('add', 'sub', 'mul', 'div'), 819 | metric='mean absolute error', 820 | parsimony_coefficient=0.001, 821 | p_crossover=0.9, 822 | p_subtree_mutation=0.01, 823 | p_hoist_mutation=0.01, 824 | p_point_mutation=0.01, 825 | p_point_replace=0.05, 826 | max_samples=1.0, 827 | feature_names=None, 828 | warm_start=False, 829 | low_memory=False, 830 | n_jobs=1, 831 | verbose=0, 832 | random_state=None): 833 | super(SymbolicRegressor, self).__init__( 834 | population_size=population_size, 835 | generations=generations, 836 | tournament_size=tournament_size, 837 | stopping_criteria=stopping_criteria, 838 | const_range=const_range, 839 | init_depth=init_depth, 840 | init_method=init_method, 841 | function_set=function_set, 842 | metric=metric, 843 | parsimony_coefficient=parsimony_coefficient, 844 | p_crossover=p_crossover, 845 | p_subtree_mutation=p_subtree_mutation, 846 | p_hoist_mutation=p_hoist_mutation, 847 | p_point_mutation=p_point_mutation, 848 | p_point_replace=p_point_replace, 849 | max_samples=max_samples, 850 | feature_names=feature_names, 851 | warm_start=warm_start, 852 | low_memory=low_memory, 853 | n_jobs=n_jobs, 854 | verbose=verbose, 855 | random_state=random_state) 856 | 857 | def __str__(self): 858 | """Overloads `print` output of the object to resemble a LISP tree.""" 859 | if not hasattr(self, '_program'): 860 | return self.__repr__() 861 | return self._program.__str__() 862 | 863 | def predict(self, X): 864 | """Perform regression on test vectors X. 865 | 866 | Parameters 867 | ---------- 868 | X : array-like, shape = [n_samples, n_features] 869 | Input vectors, where n_samples is the number of samples 870 | and n_features is the number of features. 871 | 872 | Returns 873 | ------- 874 | y : array, shape = [n_samples] 875 | Predicted values for X. 876 | 877 | """ 878 | if not hasattr(self, '_program'): 879 | raise NotFittedError('SymbolicRegressor not fitted.') 880 | 881 | X = check_array(X) 882 | _, n_features = X.shape 883 | if self.n_features_ != n_features: 884 | raise ValueError('Number of features of the model must match the ' 885 | 'input. Model n_features is %s and input ' 886 | 'n_features is %s.' 887 | % (self.n_features_, n_features)) 888 | 889 | y = self._program.execute(X) 890 | 891 | return y 892 | 893 | 894 | class SymbolicClassifier(BaseSymbolic, ClassifierMixin): 895 | 896 | """A Genetic Programming symbolic classifier. 897 | 898 | A symbolic classifier is an estimator that begins by building a population 899 | of naive random formulas to represent a relationship. The formulas are 900 | represented as tree-like structures with mathematical functions being 901 | recursively applied to variables and constants. 
Each successive generation
902 |     of programs is then evolved from the one that came before it by selecting
903 |     the fittest individuals from the population to undergo genetic operations
904 |     such as crossover, mutation or reproduction.
905 |
906 |     Parameters
907 |     ----------
908 |     population_size : integer, optional (default=1000)
909 |         The number of programs in each generation.
910 |
911 |     generations : integer, optional (default=20)
912 |         The number of generations to evolve.
913 |
914 |     tournament_size : integer, optional (default=20)
915 |         The number of programs that will compete to become part of the next
916 |         generation.
917 |
918 |     stopping_criteria : float, optional (default=0.0)
919 |         The metric value required in order to stop evolution early.
920 |
921 |     const_range : tuple of two floats, or None, optional (default=(-1., 1.))
922 |         The range of constants to include in the formulas. If None then no
923 |         constants will be included in the candidate programs.
924 |
925 |     init_depth : tuple of two ints, optional (default=(2, 6))
926 |         The range of tree depths for the initial population of naive formulas.
927 |         Individual trees will randomly choose a maximum depth from this range.
928 |         When combined with `init_method='half and half'` this yields the well-
929 |         known 'ramped half and half' initialization method.
930 |
931 |     init_method : str, optional (default='half and half')
932 |         - 'grow' : Nodes are chosen at random from both functions and
933 |           terminals, allowing for smaller trees than `init_depth` allows. Tends
934 |           to grow asymmetrical trees.
935 |         - 'full' : Functions are chosen until the `init_depth` is reached, and
936 |           then terminals are selected. Tends to grow 'bushy' trees.
937 |         - 'half and half' : Trees are grown through a 50/50 mix of 'full' and
938 |           'grow', making for a mix of tree shapes in the initial population.
939 |
940 |     function_set : iterable, optional (default=('add', 'sub', 'mul', 'div'))
941 |         The functions to use when building and evolving programs. This iterable
942 |         can include strings to indicate either individual functions as outlined
943 |         below, or you can also include your own functions as built using the
944 |         ``make_function`` factory from the ``functions`` module.
945 |
946 |         Available individual functions are:
947 |
948 |         - 'add' : addition, arity=2.
949 |         - 'sub' : subtraction, arity=2.
950 |         - 'mul' : multiplication, arity=2.
951 |         - 'div' : protected division where a denominator near-zero returns 1.,
952 |           arity=2.
953 |         - 'sqrt' : protected square root where the absolute value of the
954 |           argument is used, arity=1.
955 |         - 'log' : protected log where the absolute value of the argument is
956 |           used and a near-zero argument returns 0., arity=1.
957 |         - 'abs' : absolute value, arity=1.
958 |         - 'neg' : negative, arity=1.
959 |         - 'inv' : protected inverse where a near-zero argument returns 0.,
960 |           arity=1.
961 |         - 'max' : maximum, arity=2.
962 |         - 'min' : minimum, arity=2.
963 |         - 'sin' : sine (radians), arity=1.
964 |         - 'cos' : cosine (radians), arity=1.
965 |         - 'tan' : tangent (radians), arity=1.
966 |
967 |     transformer : str, optional (default='sigmoid')
968 |         The name of the function through which the raw decision function is
969 |         passed. This function will transform the raw decision function into
970 |         probabilities of each class.
971 |
972 |         This can also be replaced by your own functions as built using the
973 |         ``make_function`` factory from the ``functions`` module.
974 |
975 |     metric : str, optional (default='log loss')
976 |         The name of the raw fitness metric. Available options include:
977 |
978 |         - 'log loss' aka binary cross-entropy loss.
979 |
980 |     parsimony_coefficient : float or "auto", optional (default=0.001)
981 |         This constant penalizes large programs by adjusting their fitness to
982 |         be less favorable for selection. Larger values penalize the program
983 |         more which can control the phenomenon known as 'bloat'. Bloat is when
984 |         evolution is increasing the size of programs without a significant
985 |         increase in fitness, which is costly for computation time and makes for
986 |         a less understandable final result. This parameter may need to be tuned
987 |         over successive runs.
988 |
989 |         If "auto" the parsimony coefficient is recalculated for each generation
990 |         using c = Cov(l,f)/Var(l), where Cov(l,f) is the covariance between
991 |         program size l and program fitness f in the population, and Var(l) is
992 |         the variance of program sizes.
993 |
994 |     p_crossover : float, optional (default=0.9)
995 |         The probability of performing crossover on a tournament winner.
996 |         Crossover takes the winner of a tournament and selects a random subtree
997 |         from it to be replaced. A second tournament is performed to find a
998 |         donor. The donor also has a subtree selected at random and this is
999 |         inserted into the original parent to form an offspring in the next
1000 |         generation.
1001 |
1002 |     p_subtree_mutation : float, optional (default=0.01)
1003 |         The probability of performing subtree mutation on a tournament winner.
1004 |         Subtree mutation takes the winner of a tournament and selects a random
1005 |         subtree from it to be replaced. A donor subtree is generated at random
1006 |         and this is inserted into the original parent to form an offspring in
1007 |         the next generation.
1008 |
1009 |     p_hoist_mutation : float, optional (default=0.01)
1010 |         The probability of performing hoist mutation on a tournament winner.
1011 |         Hoist mutation takes the winner of a tournament and selects a random
1012 |         subtree from it. A random subtree of that subtree is then selected
1013 |         and this is 'hoisted' into the original subtree's location to form an
1014 |         offspring in the next generation. This method helps to control bloat.
1015 |
1016 |     p_point_mutation : float, optional (default=0.01)
1017 |         The probability of performing point mutation on a tournament winner.
1018 |         Point mutation takes the winner of a tournament and selects random
1019 |         nodes from it to be replaced. Terminals are replaced by other terminals
1020 |         and functions are replaced by other functions that require the same
1021 |         number of arguments as the original node. The resulting tree forms an
1022 |         offspring in the next generation.
1023 |
1024 |     Note : The above genetic operation probabilities must sum to less than
1025 |         one. The balance of probability is assigned to 'reproduction', where a
1026 |         tournament winner is cloned and enters the next generation unmodified.
1027 |
1028 |     p_point_replace : float, optional (default=0.05)
1029 |         For point mutation only, the probability that any given node will be
1030 |         mutated.
1031 |
1032 |     max_samples : float, optional (default=1.0)
1033 |         The fraction of samples to draw from X to evaluate each program on.
1034 |
1035 |     feature_names : list, optional (default=None)
1036 |         Optional list of feature names, used purely for representations in
1037 |         the `print` operation or `export_graphviz`.
If None, then X0, X1, etc 1038 | will be used for representations. 1039 | 1040 | warm_start : bool, optional (default=False) 1041 | When set to ``True``, reuse the solution of the previous call to fit 1042 | and add more generations to the evolution, otherwise, just fit a new 1043 | evolution. 1044 | 1045 | low_memory : bool, optional (default=False) 1046 | When set to ``True``, only the current generation is retained. Parent 1047 | information is discarded. For very large populations or runs with many 1048 | generations, this can result in substantial memory use reduction. 1049 | 1050 | n_jobs : integer, optional (default=1) 1051 | The number of jobs to run in parallel for `fit`. If -1, then the number 1052 | of jobs is set to the number of cores. 1053 | 1054 | verbose : int, optional (default=0) 1055 | Controls the verbosity of the evolution building process. 1056 | 1057 | random_state : int, RandomState instance or None, optional (default=None) 1058 | If int, random_state is the seed used by the random number generator; 1059 | If RandomState instance, random_state is the random number generator; 1060 | If None, the random number generator is the RandomState instance used 1061 | by `np.random`. 1062 | 1063 | Attributes 1064 | ---------- 1065 | run_details_ : dict 1066 | Details of the evolution process. Includes the following elements: 1067 | 1068 | - 'generation' : The generation index. 1069 | - 'average_length' : The average program length of the generation. 1070 | - 'average_fitness' : The average program fitness of the generation. 1071 | - 'best_length' : The length of the best program in the generation. 1072 | - 'best_fitness' : The fitness of the best program in the generation. 1073 | - 'best_oob_fitness' : The out of bag fitness of the best program in 1074 | the generation (requires `max_samples` < 1.0). 1075 | - 'generation_time' : The time it took for the generation to evolve. 1076 | 1077 | See Also 1078 | -------- 1079 | SymbolicTransformer 1080 | 1081 | References 1082 | ---------- 1083 | .. [1] J. Koza, "Genetic Programming", 1992. 1084 | 1085 | .. [2] R. Poli, et al. "A Field Guide to Genetic Programming", 2008. 
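
    Examples
    --------
    An illustrative sketch, not from the original author. Note that this fork
    hard-codes ``n_features_ = 18`` inside ``fit``, so prediction expects
    18-column input unless the original shape check is restored:

    >>> import numpy as np
    >>> rng = np.random.RandomState(0)
    >>> X = rng.uniform(size=(200, 18))
    >>> y = (X[:, 0] + X[:, 1] > 1.0).astype(int)
    >>> clf = SymbolicClassifier(population_size=500, generations=5,
    ...                          random_state=0)
    >>> clf.fit(X, y)                     # doctest: +SKIP
    >>> clf.predict_proba(X[:5]).shape    # doctest: +SKIP
    (5, 2)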
1086 | 1087 | """ 1088 | 1089 | def __init__(self, 1090 | population_size=1000, 1091 | generations=20, 1092 | tournament_size=20, 1093 | stopping_criteria=0.0, 1094 | const_range=(-1., 1.), 1095 | init_depth=(2, 6), 1096 | init_method='half and half', 1097 | function_set=('add', 'sub', 'mul', 'div'), 1098 | transformer='sigmoid', 1099 | metric='log loss', 1100 | parsimony_coefficient=0.001, 1101 | p_crossover=0.9, 1102 | p_subtree_mutation=0.01, 1103 | p_hoist_mutation=0.01, 1104 | p_point_mutation=0.01, 1105 | p_point_replace=0.05, 1106 | max_samples=1.0, 1107 | feature_names=None, 1108 | warm_start=False, 1109 | low_memory=False, 1110 | n_jobs=1, 1111 | verbose=0, 1112 | random_state=None): 1113 | super(SymbolicClassifier, self).__init__( 1114 | population_size=population_size, 1115 | generations=generations, 1116 | tournament_size=tournament_size, 1117 | stopping_criteria=stopping_criteria, 1118 | const_range=const_range, 1119 | init_depth=init_depth, 1120 | init_method=init_method, 1121 | function_set=function_set, 1122 | transformer=transformer, 1123 | metric=metric, 1124 | parsimony_coefficient=parsimony_coefficient, 1125 | p_crossover=p_crossover, 1126 | p_subtree_mutation=p_subtree_mutation, 1127 | p_hoist_mutation=p_hoist_mutation, 1128 | p_point_mutation=p_point_mutation, 1129 | p_point_replace=p_point_replace, 1130 | max_samples=max_samples, 1131 | feature_names=feature_names, 1132 | warm_start=warm_start, 1133 | low_memory=low_memory, 1134 | n_jobs=n_jobs, 1135 | verbose=verbose, 1136 | random_state=random_state) 1137 | 1138 | def __str__(self): 1139 | """Overloads `print` output of the object to resemble a LISP tree.""" 1140 | if not hasattr(self, '_program'): 1141 | return self.__repr__() 1142 | return self._program.__str__() 1143 | 1144 | def predict_proba(self, X): 1145 | """Predict probabilities on test vectors X. 1146 | 1147 | Parameters 1148 | ---------- 1149 | X : array-like, shape = [n_samples, n_features] 1150 | Input vectors, where n_samples is the number of samples 1151 | and n_features is the number of features. 1152 | 1153 | Returns 1154 | ------- 1155 | proba : array, shape = [n_samples, n_classes] 1156 | The class probabilities of the input samples. The order of the 1157 | classes corresponds to that in the attribute `classes_`. 1158 | 1159 | """ 1160 | if not hasattr(self, '_program'): 1161 | raise NotFittedError('SymbolicClassifier not fitted.') 1162 | 1163 | X = check_array(X) 1164 | _, n_features = X.shape 1165 | if self.n_features_ != n_features: 1166 | raise ValueError('Number of features of the model must match the ' 1167 | 'input. Model n_features is %s and input ' 1168 | 'n_features is %s.' 1169 | % (self.n_features_, n_features)) 1170 | 1171 | scores = self._program.execute(X) 1172 | proba = self._transformer(scores) 1173 | proba = np.vstack([1 - proba, proba]).T 1174 | return proba 1175 | 1176 | def predict(self, X): 1177 | """Predict classes on test vectors X. 1178 | 1179 | Parameters 1180 | ---------- 1181 | X : array-like, shape = [n_samples, n_features] 1182 | Input vectors, where n_samples is the number of samples 1183 | and n_features is the number of features. 1184 | 1185 | Returns 1186 | ------- 1187 | y : array, shape = [n_samples,] 1188 | The predicted classes of the input samples. 1189 | 1190 | """ 1191 | proba = self.predict_proba(X) 1192 | return self.classes_.take(np.argmax(proba, axis=1), axis=0) 1193 | 1194 | 1195 | class SymbolicTransformer(BaseSymbolic, TransformerMixin): 1196 | 1197 | """A Genetic Programming symbolic transformer. 
1198 |
1199 |     A symbolic transformer is a supervised transformer that begins by building
1200 |     a population of naive random formulas to represent a relationship. The
1201 |     formulas are represented as tree-like structures with mathematical
1202 |     functions being recursively applied to variables and constants. Each
1203 |     successive generation of programs is then evolved from the one that came
1204 |     before it by selecting the fittest individuals from the population to
1205 |     undergo genetic operations such as crossover, mutation or reproduction.
1206 |     The final population is searched for the fittest individuals with the least
1207 |     correlation to one another.
1208 |
1209 |     Parameters
1210 |     ----------
1211 |     population_size : integer, optional (default=1000)
1212 |         The number of programs in each generation.
1213 |
1214 |     hall_of_fame : integer, or None, optional (default=100)
1215 |         The number of fittest programs to compare from when finding the
1216 |         least-correlated individuals for the n_components. If `None`, the
1217 |         entire final generation will be used.
1218 |
1219 |     n_components : integer, or None, optional (default=10)
1220 |         The number of best programs to return after searching the hall_of_fame
1221 |         for the least-correlated individuals. If `None`, the entire
1222 |         hall_of_fame will be used.
1223 |
1224 |     generations : integer, optional (default=20)
1225 |         The number of generations to evolve.
1226 |
1227 |     tournament_size : integer, optional (default=20)
1228 |         The number of programs that will compete to become part of the next
1229 |         generation.
1230 |
1231 |     stopping_criteria : float, optional (default=1.0)
1232 |         The metric value required in order to stop evolution early.
1233 |
1234 |     const_range : tuple of two floats, or None, optional (default=(-1., 1.))
1235 |         The range of constants to include in the formulas. If None then no
1236 |         constants will be included in the candidate programs.
1237 |
1238 |     init_depth : tuple of two ints, optional (default=(2, 6))
1239 |         The range of tree depths for the initial population of naive formulas.
1240 |         Individual trees will randomly choose a maximum depth from this range.
1241 |         When combined with `init_method='half and half'` this yields the well-
1242 |         known 'ramped half and half' initialization method.
1243 |
1244 |     init_method : str, optional (default='half and half')
1245 |         - 'grow' : Nodes are chosen at random from both functions and
1246 |           terminals, allowing for smaller trees than `init_depth` allows. Tends
1247 |           to grow asymmetrical trees.
1248 |         - 'full' : Functions are chosen until the `init_depth` is reached, and
1249 |           then terminals are selected. Tends to grow 'bushy' trees.
1250 |         - 'half and half' : Trees are grown through a 50/50 mix of 'full' and
1251 |           'grow', making for a mix of tree shapes in the initial population.
1252 |
1253 |     function_set : iterable, optional (default=('add', 'sub', 'mul', 'div'))
1254 |         The functions to use when building and evolving programs. This iterable
1255 |         can include strings to indicate either individual functions as outlined
1256 |         below, or you can also include your own functions as built using the
1257 |         ``make_function`` factory from the ``functions`` module.
1258 |
1259 |         Available individual functions are:
1260 |
1261 |         - 'add' : addition, arity=2.
1262 |         - 'sub' : subtraction, arity=2.
1263 |         - 'mul' : multiplication, arity=2.
1264 |         - 'div' : protected division where a denominator near-zero returns 1.,
1265 |           arity=2.
1266 |         - 'sqrt' : protected square root where the absolute value of the
1267 |           argument is used, arity=1.
1268 |         - 'log' : protected log where the absolute value of the argument is
1269 |           used and a near-zero argument returns 0., arity=1.
1270 |         - 'abs' : absolute value, arity=1.
1271 |         - 'neg' : negative, arity=1.
1272 |         - 'inv' : protected inverse where a near-zero argument returns 0.,
1273 |           arity=1.
1274 |         - 'max' : maximum, arity=2.
1275 |         - 'min' : minimum, arity=2.
1276 |         - 'sin' : sine (radians), arity=1.
1277 |         - 'cos' : cosine (radians), arity=1.
1278 |         - 'tan' : tangent (radians), arity=1.
1279 |
1280 |     metric : str, optional (default='pearson')
1281 |         The name of the raw fitness metric. Available options include:
1282 |
1283 |         - 'pearson', for Pearson's product-moment correlation coefficient.
1284 |         - 'spearman' for Spearman's rank-order correlation coefficient.
1285 |
1286 |     parsimony_coefficient : float or "auto", optional (default=0.001)
1287 |         This constant penalizes large programs by adjusting their fitness to
1288 |         be less favorable for selection. Larger values penalize the program
1289 |         more which can control the phenomenon known as 'bloat'. Bloat is when
1290 |         evolution is increasing the size of programs without a significant
1291 |         increase in fitness, which is costly for computation time and makes for
1292 |         a less understandable final result. This parameter may need to be tuned
1293 |         over successive runs.
1294 |
1295 |         If "auto" the parsimony coefficient is recalculated for each generation
1296 |         using c = Cov(l,f)/Var(l), where Cov(l,f) is the covariance between
1297 |         program size l and program fitness f in the population, and Var(l) is
1298 |         the variance of program sizes.
1299 |
1300 |     p_crossover : float, optional (default=0.9)
1301 |         The probability of performing crossover on a tournament winner.
1302 |         Crossover takes the winner of a tournament and selects a random subtree
1303 |         from it to be replaced. A second tournament is performed to find a
1304 |         donor. The donor also has a subtree selected at random and this is
1305 |         inserted into the original parent to form an offspring in the next
1306 |         generation.
1307 |
1308 |     p_subtree_mutation : float, optional (default=0.01)
1309 |         The probability of performing subtree mutation on a tournament winner.
1310 |         Subtree mutation takes the winner of a tournament and selects a random
1311 |         subtree from it to be replaced. A donor subtree is generated at random
1312 |         and this is inserted into the original parent to form an offspring in
1313 |         the next generation.
1314 |
1315 |     p_hoist_mutation : float, optional (default=0.01)
1316 |         The probability of performing hoist mutation on a tournament winner.
1317 |         Hoist mutation takes the winner of a tournament and selects a random
1318 |         subtree from it. A random subtree of that subtree is then selected
1319 |         and this is 'hoisted' into the original subtree's location to form an
1320 |         offspring in the next generation. This method helps to control bloat.
1321 |
1322 |     p_point_mutation : float, optional (default=0.01)
1323 |         The probability of performing point mutation on a tournament winner.
1324 |         Point mutation takes the winner of a tournament and selects random
1325 |         nodes from it to be replaced. Terminals are replaced by other terminals
1326 |         and functions are replaced by other functions that require the same
1327 |         number of arguments as the original node. The resulting tree forms an
1328 |         offspring in the next generation.
1329 | 1330 | Note : The above genetic operation probabilities must sum to less than 1331 | one. The balance of probability is assigned to 'reproduction', where a 1332 | tournament winner is cloned and enters the next generation unmodified. 1333 | 1334 | p_point_replace : float, optional (default=0.05) 1335 | For point mutation only, the probability that any given node will be 1336 | mutated. 1337 | 1338 | max_samples : float, optional (default=1.0) 1339 | The fraction of samples to draw from X to evaluate each program on. 1340 | 1341 | feature_names : list, optional (default=None) 1342 | Optional list of feature names, used purely for representations in 1343 | the `print` operation or `export_graphviz`. If None, then X0, X1, etc 1344 | will be used for representations. 1345 | 1346 | warm_start : bool, optional (default=False) 1347 | When set to ``True``, reuse the solution of the previous call to fit 1348 | and add more generations to the evolution, otherwise, just fit a new 1349 | evolution. 1350 | 1351 | low_memory : bool, optional (default=False) 1352 | When set to ``True``, only the current generation is retained. Parent 1353 | information is discarded. For very large populations or runs with many 1354 | generations, this can result in substantial memory use reduction. 1355 | 1356 | n_jobs : integer, optional (default=1) 1357 | The number of jobs to run in parallel for `fit`. If -1, then the number 1358 | of jobs is set to the number of cores. 1359 | 1360 | verbose : int, optional (default=0) 1361 | Controls the verbosity of the evolution building process. 1362 | 1363 | random_state : int, RandomState instance or None, optional (default=None) 1364 | If int, random_state is the seed used by the random number generator; 1365 | If RandomState instance, random_state is the random number generator; 1366 | If None, the random number generator is the RandomState instance used 1367 | by `np.random`. 1368 | 1369 | Attributes 1370 | ---------- 1371 | run_details_ : dict 1372 | Details of the evolution process. Includes the following elements: 1373 | 1374 | - 'generation' : The generation index. 1375 | - 'average_length' : The average program length of the generation. 1376 | - 'average_fitness' : The average program fitness of the generation. 1377 | - 'best_length' : The length of the best program in the generation. 1378 | - 'best_fitness' : The fitness of the best program in the generation. 1379 | - 'best_oob_fitness' : The out of bag fitness of the best program in 1380 | the generation (requires `max_samples` < 1.0). 1381 | - 'generation_time' : The time it took for the generation to evolve. 1382 | 1383 | See Also 1384 | -------- 1385 | SymbolicRegressor 1386 | 1387 | References 1388 | ---------- 1389 | .. [1] J. Koza, "Genetic Programming", 1992. 1390 | 1391 | .. [2] R. Poli, et al. "A Field Guide to Genetic Programming", 2008. 
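
    Examples
    --------
    An illustrative sketch, not from the original author; the fork's
    hard-coded ``n_features_ = 18`` in ``fit`` applies here as well:

    >>> import numpy as np
    >>> rng = np.random.RandomState(0)
    >>> X = rng.uniform(size=(200, 18))
    >>> y = X[:, 0] * X[:, 1] + rng.normal(scale=0.1, size=200)
    >>> gp = SymbolicTransformer(population_size=500, generations=5,
    ...                          n_components=5, random_state=0)
    >>> gp.fit_transform(X, y).shape      # doctest: +SKIP
    (200, 5)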
1392 | 1393 | """ 1394 | 1395 | def __init__(self, 1396 | population_size=1000, 1397 | hall_of_fame=100, 1398 | n_components=10, 1399 | generations=20, 1400 | tournament_size=20, 1401 | stopping_criteria=1.0, 1402 | const_range=(-1., 1.), 1403 | init_depth=(2, 6), 1404 | init_method='half and half', 1405 | function_set=('add', 'sub', 'mul', 'div'), 1406 | metric='pearson', 1407 | parsimony_coefficient=0.001, 1408 | p_crossover=0.9, 1409 | p_subtree_mutation=0.01, 1410 | p_hoist_mutation=0.01, 1411 | p_point_mutation=0.01, 1412 | p_point_replace=0.05, 1413 | max_samples=1.0, 1414 | feature_names=None, 1415 | warm_start=False, 1416 | low_memory=False, 1417 | n_jobs=1, 1418 | verbose=0, 1419 | random_state=None): 1420 | super(SymbolicTransformer, self).__init__( 1421 | population_size=population_size, 1422 | hall_of_fame=hall_of_fame, 1423 | n_components=n_components, 1424 | generations=generations, 1425 | tournament_size=tournament_size, 1426 | stopping_criteria=stopping_criteria, 1427 | const_range=const_range, 1428 | init_depth=init_depth, 1429 | init_method=init_method, 1430 | function_set=function_set, 1431 | metric=metric, 1432 | parsimony_coefficient=parsimony_coefficient, 1433 | p_crossover=p_crossover, 1434 | p_subtree_mutation=p_subtree_mutation, 1435 | p_hoist_mutation=p_hoist_mutation, 1436 | p_point_mutation=p_point_mutation, 1437 | p_point_replace=p_point_replace, 1438 | max_samples=max_samples, 1439 | feature_names=feature_names, 1440 | warm_start=warm_start, 1441 | low_memory=low_memory, 1442 | n_jobs=n_jobs, 1443 | verbose=verbose, 1444 | random_state=random_state) 1445 | 1446 | def __len__(self): 1447 | """Overloads `len` output to be the number of fitted components.""" 1448 | if not hasattr(self, '_best_programs'): 1449 | return 0 1450 | return self.n_components 1451 | 1452 | def __getitem__(self, item): 1453 | """Return the ith item of the fitted components.""" 1454 | if item >= len(self): 1455 | raise IndexError 1456 | return self._best_programs[item] 1457 | 1458 | def __str__(self): 1459 | """Overloads `print` output of the object to resemble LISP trees.""" 1460 | if not hasattr(self, '_best_programs'): 1461 | return self.__repr__() 1462 | output = str([gp.__str__() for gp in self]) 1463 | return output.replace("',", ",\n").replace("'", "") 1464 | 1465 | def transform(self, X): 1466 | """Transform X according to the fitted transformer. 1467 | 1468 | Parameters 1469 | ---------- 1470 | X : array-like, shape = [n_samples, n_features] 1471 | Input vectors, where n_samples is the number of samples 1472 | and n_features is the number of features. 1473 | 1474 | Returns 1475 | ------- 1476 | X_new : array-like, shape = [n_samples, n_components] 1477 | Transformed array. 1478 | 1479 | """ 1480 | if not hasattr(self, '_best_programs'): 1481 | raise NotFittedError('SymbolicTransformer not fitted.') 1482 | 1483 | X = check_array(X) 1484 | _, n_features = X.shape 1485 | if self.n_features_ != n_features: 1486 | raise ValueError('Number of features of the model must match the ' 1487 | 'input. Model n_features is %s and input ' 1488 | 'n_features is %s.' 1489 | % (self.n_features_, n_features)) 1490 | 1491 | X_new = np.array([gp.execute(X) for gp in self._best_programs]).T 1492 | 1493 | return X_new 1494 | 1495 | def fit_transform(self, X, y, sample_weight=None): 1496 | """Fit to data, then transform it. 
1497 | 1498 | Parameters 1499 | ---------- 1500 | X : array-like, shape = [n_samples, n_features] 1501 | Training vectors, where n_samples is the number of samples and 1502 | n_features is the number of features. 1503 | 1504 | y : array-like, shape = [n_samples] 1505 | Target values. 1506 | 1507 | sample_weight : array-like, shape = [n_samples], optional 1508 | Weights applied to individual samples. 1509 | 1510 | Returns 1511 | ------- 1512 | X_new : array-like, shape = [n_samples, n_components] 1513 | Transformed array. 1514 | 1515 | """ 1516 | return self.fit(X, y, sample_weight).transform(X) 1517 | --------------------------------------------------------------------------------
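
An illustrative closing sketch (not part of the repository): wiring the fork's 'stock_dedicated' metric into SymbolicRegressor. The synthetic arrays below only satisfy the hard-coded 18-feature assumption in `fit`; real runs should feed the stock-panel arrays produced by the data_processing module, which the stock fitness routine expects.

```python
# Illustrative only: random arrays stand in for the stock-panel data that
# the 'stock_dedicated' fitness (implemented in _program.py) actually expects.
import numpy as np
from gplearn.genetic import SymbolicRegressor

rng = np.random.RandomState(0)
X = rng.uniform(size=(1000, 18))   # 18 features, matching n_features_ = 18
y = rng.normal(size=1000)          # stand-in for forward returns

est = SymbolicRegressor(population_size=2000,
                        generations=5,
                        metric='stock_dedicated',   # fork-specific fitness
                        p_crossover=0.7,
                        p_subtree_mutation=0.1,
                        p_hoist_mutation=0.05,
                        p_point_mutation=0.1,
                        parsimony_coefficient=0.01,
                        random_state=0,
                        n_jobs=1)
est.fit(X, y)
print(est)   # best evolved formula, rendered as a LISP-style tree
```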