├── .idea
│   ├── Modeling_Preparation.iml
│   ├── markdown-exported-files.xml
│   ├── markdown-navigator.xml
│   ├── markdown-navigator
│   │   └── profiles_settings.xml
│   ├── misc.xml
│   ├── modules.xml
│   ├── vcs.xml
│   └── workspace.xml
├── .vscode
│   └── tags
├── README.md
├── dataset
│   ├── SZIndex.csv
│   ├── SZIndex.desc
│   ├── abalone.txt
│   ├── ablone.names
│   ├── auto.csv
│   ├── auto.mat
│   ├── auto_1.csv
│   └── international-airline-passengers.csv
├── plot
│   ├── cluster_plot.R
│   ├── datafile
│   │   ├── beijing.png
│   │   ├── beijingDots.png
│   │   ├── c_dijishi.dta
│   │   ├── c_seven.dta
│   │   ├── c_sheng1.dta
│   │   ├── d_dijishi.dta
│   │   ├── d_seven.dta
│   │   ├── d_sheng1.dta
│   │   └── word_vector.txt
│   ├── google_map_api.py
│   ├── plot.R
│   └── spatial.do
├── 优化模型
│   ├── PSO.py
│   ├── genetic_algorithm.py
│   ├── sa_tsp_example.py
│   ├── simulated_annealing.py
│   └── simulated_annealing.pyc
├── 小工具
│   ├── Association_rules.py
│   ├── data_clean.py
│   ├── due_date_calculate.py
│   ├── lasso_regression.m
│   ├── ridgeRegression_func1.m
│   ├── ridge_regression.m
│   ├── trade_account.py
│   └── 二分法期权计算器.cs
├── 评价模型
│   ├── EntropyWeight.m
│   ├── PPE.asv
│   ├── PPE.m
│   ├── SOM.py
│   ├── cluster.py
│   ├── constraint.m
│   ├── get_Q.m
│   ├── optimal_tools.png
│   ├── pso_optimal.asv
│   ├── pso_optimal.m
│   └── som_data.txt
├── 赛题整理
│   └── 赛题整理.md
└── 预测模型
    ├── GM1_1.m
    ├── HMM.py
    ├── LSTM_predict.py
    ├── ML_classify_model.py
    ├── PLSR.m
    ├── SVR.py
    ├── decision_tree.py
    ├── evaluate.py
    ├── neural_network.m
    └── neural_network.py
/.idea/Modeling_Preparation.iml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/.idea/markdown-exported-files.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/.idea/markdown-navigator.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/.idea/markdown-navigator/profiles_settings.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/.idea/misc.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/.idea/modules.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/.idea/vcs.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/.vscode/tags:
--------------------------------------------------------------------------------
1 | !_TAG_FILE_FORMAT 2 /extended format; --format=1 will not append ;" to lines/
2 | !_TAG_FILE_SORTED 1 /0=unsorted, 1=sorted, 2=foldcase/
3 | !_TAG_PROGRAM_AUTHOR Darren Hiebert /dhiebert@users.sourceforge.net/
4 | !_TAG_PROGRAM_NAME Exuberant Ctags //
5 | !_TAG_PROGRAM_URL http://ctags.sourceforge.net /official site/
6 | !_TAG_PROGRAM_VERSION 5.9~svn20110310 //
7 | Association_rules.py ../小工具/Association_rules.py 1;" kind:file line:1
8 | City ../优化模型/sa_tsp_example.py /^class City():$/;" kind:class line:9
9 | Cluster ../评价模型/cluster.py /^class Cluster:$/;" kind:class line:8
10 | DIRECTION_BUY ../小工具/trade_account.py /^DIRECTION_BUY = 0$/;" kind:variable line:11
11 | DIRECTION_SELL ../小工具/trade_account.py /^DIRECTION_SELL = 1$/;" kind:variable line:12
12 | Evaluate ../预测模型/evaluate.py /^class Evaluate:$/;" kind:class line:20
13 | GA ../优化模型/genetic_algorithm.py /^ GA = GeneticAlgorithm()$/;" kind:variable line:129
14 | Gene ../优化模型/genetic_algorithm.py /^class Gene():$/;" kind:class line:16
15 | GeneticAlgorithm ../优化模型/genetic_algorithm.py /^class GeneticAlgorithm:$/;" kind:class line:62
16 | Gmap ../plot/google_map_api.py /^def Gmap(centerLat,centerLon,zoomS,pixelS,size,dark,saveAddress):$/;" kind:function line:10
17 | Graph ../优化模型/sa_tsp_example.py /^class Graph:$/;" kind:class line:16
18 | HMM.py ../预测模型/HMM.py 1;" kind:file line:1
19 | INSTRUMENT_FUTURE ../小工具/trade_account.py /^INSTRUMENT_FUTURE = 1$/;" kind:variable line:9
20 | INSTRUMENT_OPTION ../小工具/trade_account.py /^INSTRUMENT_OPTION = 0$/;" kind:variable line:8
21 | K_means ../评价模型/cluster.py /^ def K_means(self, K, axis=0):$/;" kind:member line:16
22 | LSTM_predict.py ../预测模型/LSTM_predict.py 1;" kind:file line:1
23 | M ../plot/google_map_api.py /^ M = {}$/;" kind:variable line:57
24 | ML_classify_model.py ../预测模型/ML_classify_model.py 1;" kind:file line:1
25 | MyGaussianHMM ../预测模型/HMM.py /^def MyGaussianHMM():$/;" kind:function line:106
26 | MyMultinomialHMM ../预测模型/HMM.py /^def MyMultinomialHMM():$/;" kind:function line:46
27 | MySOM ../评价模型/SOM.py /^class MySOM:$/;" kind:class line:25
28 | NeuralNetwork ../预测模型/neural_network.py /^class NeuralNetwork:$/;" kind:class line:130
29 | Node ../预测模型/decision_tree.py /^class Node:$/;" kind:class line:6
30 | OPENTYPE_CLOSE ../小工具/trade_account.py /^OPENTYPE_CLOSE = 1 $/;" kind:variable line:15
31 | OPENTYPE_OPEN ../小工具/trade_account.py /^OPENTYPE_OPEN = 0$/;" kind:variable line:14
32 | PSO ../优化模型/PSO.py /^class PSO():$/;" kind:class line:39
33 | PSO.py ../优化模型/PSO.py 1;" kind:file line:1
34 | PositionQueue ../小工具/trade_account.py /^class PositionQueue:$/;" kind:class line:27
35 | SOM.py ../评价模型/SOM.py 1;" kind:file line:1
36 | SVR.py ../预测模型/SVR.py 1;" kind:file line:1
37 | SimulatedAnnealing ../优化模型/simulated_annealing.py /^class SimulatedAnnealing:$/;" kind:class line:49
38 | TYPE_CONTINUE ../预测模型/evaluate.py /^TYPE_CONTINUE = 2 # 实际值与预测值均为连续$/;" kind:variable line:17
39 | TYPE_DISCRETE ../预测模型/evaluate.py /^TYPE_DISCRETE = 0 # 实际值与预测值均为离散$/;" kind:variable line:15
40 | TYPE_DISCRETE_2 ../预测模型/evaluate.py /^TYPE_DISCRETE_2 =1 # 实际值为离散,预测值为连续 logistic$/;" kind:variable line:16
41 | TradeAccount ../小工具/trade_account.py /^class TradeAccount:$/;" kind:class line:65
42 | Tree ../预测模型/decision_tree.py /^class Tree:$/;" kind:class line:17
43 | ValueCalculate ../小工具/trade_account.py /^class ValueCalculate():$/;" kind:class line:168
44 | Wind2Df ../小工具/due_date_calculate.py /^def Wind2Df(wind_data):$/;" kind:function line:8
45 | Wind2Df ../预测模型/neural_network.py /^def Wind2Df(wind_data):$/;" kind:function line:230
46 | X ../预测模型/SVR.py /^X = boston.data$/;" kind:variable line:9
47 | X_test ../预测模型/SVR.py /^X_test = ss_X.transform(X_test)$/;" kind:variable line:24
48 | X_train ../预测模型/SVR.py /^X_train = ss_X.fit_transform(X_train)$/;" kind:variable line:23
49 | __doc__ ../优化模型/genetic_algorithm.py /^ __doc__ = "个体基因类,存储单个基因"$/;" kind:variable line:18
50 | __init__ ../优化模型/PSO.py /^ def __init__(self, particle_number=10, variable_number=1):$/;" kind:member line:41
51 | __init__ ../优化模型/genetic_algorithm.py /^ def __init__(self):$/;" kind:member line:64
52 | __init__ ../优化模型/genetic_algorithm.py /^ def __init__(self, gene_length=10, float_length=4):$/;" kind:member line:20
53 | __init__ ../优化模型/sa_tsp_example.py /^ def __init__(self):$/;" kind:member line:18
54 | __init__ ../优化模型/sa_tsp_example.py /^ def __init__(self, x, y):$/;" kind:member line:11
55 | __init__ ../优化模型/simulated_annealing.py /^ def __init__(self, func):$/;" kind:member line:51
56 | __init__ ../小工具/trade_account.py /^ def __init__(self):$/;" kind:member line:29
57 | __init__ ../小工具/trade_account.py /^ def __init__(self, capital_list, init_capital):$/;" kind:member line:170
58 | __init__ ../小工具/trade_account.py /^ def __init__(self, init_capital):$/;" kind:member line:67
59 | __init__ ../评价模型/SOM.py /^ def __init__(self, df, mapsize, initialization = 'random'):$/;" kind:member line:26
60 | __init__ ../评价模型/cluster.py /^ def __init__(self, df):$/;" kind:member line:10
61 | __init__ ../预测模型/ML_classify_model.py /^ def __init__(self, train_x, train_y):$/;" kind:member line:7
62 | __init__ ../预测模型/decision_tree.py /^ def __init__(self, df):$/;" kind:member line:23
63 | __init__ ../预测模型/decision_tree.py /^ def __init__(self, feature, df):$/;" kind:member line:8
64 | __init__ ../预测模型/evaluate.py /^ def __init__(self, true_array, predict_array, pred_type = TYPE_DISCRETE):$/;" kind:member line:22
65 | __init__ ../预测模型/neural_network.py /^ def __init__(self, input_layer, hide_layer, output_layer, df):$/;" kind:member line:132
66 | __slots__ ../优化模型/sa_tsp_example.py /^ __slots__ = ("X", "Y")$/;" kind:variable line:10
67 | _generate_data ../预测模型/neural_network.py /^def _generate_data():$/;" kind:function line:19
68 | _pca ../评价模型/cluster.py /^ def _pca(self):$/;" kind:member line:65
69 | accuracy ../预测模型/evaluate.py /^ def accuracy(self):$/;" kind:member line:28
70 | add ../小工具/trade_account.py /^ def add(self, instrument, direction, price):$/;" kind:member line:34
71 | add_city ../优化模型/sa_tsp_example.py /^ def add_city(self, city):$/;" kind:member line:26
72 | auto_cluster ../评价模型/cluster.py /^ def auto_cluster(self):$/;" kind:member line:71
73 | ax ../plot/google_map_api.py /^ ax = plt.subplot(111)$/;" kind:variable line:65
74 | ax ../小工具/data_clean.py /^ ax = plt.subplot(111)$/;" kind:variable line:179
75 | ax1 ../小工具/due_date_calculate.py /^ax1 = fig.add_subplot(211)$/;" kind:variable line:61
76 | ax2 ../小工具/due_date_calculate.py /^ax2 = fig.add_subplot(212)$/;" kind:variable line:62
77 | begin ../优化模型/genetic_algorithm.py /^ def begin(self):$/;" kind:member line:103
78 | begin ../优化模型/simulated_annealing.py /^ def begin(self):$/;" kind:member line:66
79 | bin2dec ../优化模型/genetic_algorithm.py /^ def bin2dec(self):$/;" kind:member line:34
80 | boston ../预测模型/SVR.py /^boston = load_boston()$/;" kind:variable line:3
81 | chinese_province_list ../优化模型/sa_tsp_example.py /^ chinese_province_list = [$/;" kind:variable line:146
82 | choose_gene ../优化模型/genetic_algorithm.py /^ def choose_gene(self, rand):$/;" kind:member line:81
83 | city_a ../优化模型/sa_tsp_example.py /^ city_a = City(0, 0)$/;" kind:variable line:101
84 | city_b ../优化模型/sa_tsp_example.py /^ city_b = City(0, 1)$/;" kind:variable line:102
85 | city_c ../优化模型/sa_tsp_example.py /^ city_c = City(1, 0)$/;" kind:variable line:103
86 | city_d ../优化模型/sa_tsp_example.py /^ city_d = City(1, 1)$/;" kind:variable line:104
87 | city_list ../优化模型/sa_tsp_example.py /^ city_list = [$/;" kind:variable line:106
88 | classify_report ../预测模型/evaluate.py /^ def classify_report(self):$/;" kind:member line:67
89 | clu ../评价模型/cluster.py /^ clu = Cluster(df)$/;" kind:variable line:82
90 | cluster ../评价模型/SOM.py /^ def cluster(self, n):$/;" kind:member line:65
91 | cluster.py ../评价模型/cluster.py 1;" kind:file line:1
92 | cluster_plot ../评价模型/cluster.py /^ def cluster_plot(self, label):$/;" kind:member line:55
93 | confusion_matrix ../预测模型/evaluate.py /^ def confusion_matrix(self):$/;" kind:member line:43
94 | confusion_matrix_plot ../预测模型/evaluate.py /^ def confusion_matrix_plot(self, cmap=plt.cm.Blues):$/;" kind:member line:46
95 | create_dataset ../预测模型/LSTM_predict.py /^def create_dataset(dataset, look_back=1):$/;" kind:function line:23
96 | cross ../优化模型/genetic_algorithm.py /^ def cross(gene1, gene2):$/;" kind:member line:44
97 | data ../plot/google_map_api.py /^ data = {'23.157105_113.256031': 5,$/;" kind:variable line:53
98 | data ../评价模型/SOM.py /^ data = fetch_california_housing()$/;" kind:variable line:96
99 | data ../评价模型/SOM.py /^ data = np.column_stack([data.data, data.target])$/;" kind:variable line:99
100 | data_clean.py ../小工具/data_clean.py 1;" kind:file line:1
101 | data_pre_handle ../预测模型/neural_network.py /^ def data_pre_handle(self, df):$/;" kind:member line:137
102 | data_set ../预测模型/decision_tree.py /^ data_set = [$/;" kind:variable line:72
103 | dataframe ../预测模型/LSTM_predict.py /^dataframe = read_csv('..\/dataset\/international-airline-passengers.csv', usecols=[1], engine='python', skipfooter=3)$/;" kind:variable line:40
104 | dataset ../预测模型/LSTM_predict.py /^dataset = dataframe.values$/;" kind:variable line:41
105 | dataset ../预测模型/LSTM_predict.py /^dataset = dataset.astype('float32')$/;" kind:variable line:42
106 | dataset ../预测模型/LSTM_predict.py /^dataset = scaler.fit_transform(dataset)$/;" kind:variable line:45
107 | decision_tree ../预测模型/ML_classify_model.py /^ def decision_tree(self):$/;" kind:member line:26
108 | decision_tree.py ../预测模型/decision_tree.py 1;" kind:file line:1
109 | descr ../评价模型/SOM.py /^ descr = data.DESCR$/;" kind:variable line:97
110 | df ../小工具/data_clean.py /^ df = fill_na(df)$/;" kind:variable line:161
111 | df ../小工具/data_clean.py /^ df = label_encode(df, ['make', 'foreign'])$/;" kind:variable line:155
112 | df ../小工具/data_clean.py /^ df = pd.read_csv("\/home\/ray\/Documents\/suibe\/2017\/建模\/Modeling_Preparation\/dataset\/auto.csv")$/;" kind:variable line:151
113 | df ../小工具/data_clean.py /^ df = standardize(df, ['make','foreign']) # 这两列是分类变量,不需要标准化$/;" kind:variable line:164
114 | df ../评价模型/SOM.py /^ df = pd.DataFrame(data)$/;" kind:variable line:100
115 | df ../评价模型/cluster.py /^ df = df.dropna(axis=0)$/;" kind:variable line:81
116 | df ../评价模型/cluster.py /^ df = pd.read_csv("\/home\/ray\/Documents\/suibe\/2017\/建模\/Modeling_Preparation\/dataset\/auto_1.csv")$/;" kind:variable line:80
117 | df ../预测模型/ML_classify_model.py /^ df = df.dropna(axis=0)$/;" kind:variable line:64
118 | df ../预测模型/ML_classify_model.py /^ df = pd.read_csv("..\/dataset\/auto_1.csv")$/;" kind:variable line:63
119 | df ../预测模型/decision_tree.py /^ df = pd.DataFrame(data_set)$/;" kind:variable line:84
120 | df ../预测模型/neural_network.py /^ df = Wind2Df(w.wst("IC1709.CFE",$/;" kind:variable line:249
121 | df2 ../小工具/data_clean.py /^ df2 = winsorize(df,1,99)$/;" kind:variable line:177
122 | df_all ../小工具/due_date_calculate.py /^df_all = pd.merge(df_if00, df_if01, left_index=True, right_index=True)$/;" kind:variable line:46
123 | df_all1 ../小工具/due_date_calculate.py /^df_all1 = df_all.copy()$/;" kind:variable line:54
124 | df_columns ../小工具/data_clean.py /^ df_columns = df.columns$/;" kind:variable line:152
125 | df_if00 ../小工具/due_date_calculate.py /^df_if00 = Wind2Df(w.wsi("IF00.CFE", "close, volume", "2016-02-01 09:30:00", "2017-08-16 13:48:43", "periodstart=09:30:00;periodend=15:00:00"))$/;" kind:variable line:41
126 | df_if01 ../小工具/due_date_calculate.py /^df_if01 = Wind2Df(w.wsi("IF01.CFE", "close, volume", "2016-02-01 09:30:00", "2017-08-16 13:48:43", "periodstart=09:30:00;periodend=15:00:00"))$/;" kind:variable line:42
127 | display ../小工具/trade_account.py /^ def display(self):$/;" kind:member line:266
128 | display ../小工具/trade_account.py /^ def display(self):$/;" kind:member line:61
129 | display ../预测模型/evaluate.py /^ def display(self):$/;" kind:member line:123
130 | display_node ../预测模型/decision_tree.py /^ def display_node(self, node, depth):$/;" kind:member line:61
131 | draw_cluster_map ../评价模型/SOM.py /^ def draw_cluster_map(self):$/;" kind:member line:59
132 | draw_hit_map ../评价模型/SOM.py /^ def draw_hit_map(self):$/;" kind:member line:53
133 | draw_input_weights ../评价模型/SOM.py /^ def draw_input_weights(self):$/;" kind:member line:47
134 | drop_duplicate ../小工具/data_clean.py /^def drop_duplicate(df, columns=[]):$/;" kind:function line:129
135 | due_date_calculate.py ../小工具/due_date_calculate.py 1;" kind:file line:1
136 | end_trade ../小工具/trade_account.py /^ def end_trade(self):$/;" kind:member line:163
137 | eva_0 ../预测模型/evaluate.py /^ eva_0 = Evaluate(true_y_0, pred_y_0, TYPE_DISCRETE)$/;" kind:variable line:157
138 | eva_1 ../预测模型/evaluate.py /^ eva_1 = Evaluate(true_y_1, pred_y_1, TYPE_DISCRETE_2)$/;" kind:variable line:158
139 | eva_2 ../预测模型/evaluate.py /^ eva_2 = Evaluate(true_y_2, pred_y_2, TYPE_CONTINUE)$/;" kind:variable line:159
140 | evaluate.py ../预测模型/evaluate.py 1;" kind:file line:1
141 | exeTime ../优化模型/simulated_annealing.py /^def exeTime(func):$/;" kind:function line:35
142 | explained_variance ../预测模型/evaluate.py /^ def explained_variance(self):$/;" kind:member line:104
143 | f1 ../预测模型/evaluate.py /^ def f1(self):$/;" kind:member line:38
144 | fig ../小工具/due_date_calculate.py /^fig = plt.figure()$/;" kind:variable line:60
145 | filename ../plot/google_map_api.py /^ filename = ".\/datafile\/beijing.png"$/;" kind:variable line:48
146 | fill_na ../小工具/data_clean.py /^def fill_na(df, excep_columns=[], how='mean'):$/;" kind:function line:27
147 | fnn_begin ../预测模型/neural_network.py /^def fnn_begin():$/;" kind:function line:125
148 | func ../优化模型/PSO.py /^ def func(self, array):$/;" kind:member line:56
149 | future_parameter ../小工具/trade_account.py /^future_parameter = {$/;" kind:variable line:17
150 | gen_due_date ../小工具/due_date_calculate.py /^def gen_due_date(year, month):$/;" kind:function line:20
151 | gen_new_sequence ../优化模型/sa_tsp_example.py /^def gen_new_sequence(sequence):$/;" kind:function line:60
152 | gen_new_x ../优化模型/simulated_annealing.py /^ def gen_new_x(self, x_before, T):$/;" kind:member line:59
153 | gen_tree ../预测模型/decision_tree.py /^ def gen_tree(self, node):$/;" kind:member line:42
154 | gene_pop ../优化模型/genetic_algorithm.py /^ def gene_pop(self):$/;" kind:member line:94
155 | genetic_algorithm.py ../优化模型/genetic_algorithm.py 1;" kind:file line:1
156 | get_annual_return ../小工具/trade_account.py /^ def get_annual_return(self):$/;" kind:member line:190
157 | get_average_return ../小工具/trade_account.py /^ def get_average_return(self):$/;" kind:member line:193
158 | get_best_gene ../优化模型/genetic_algorithm.py /^ def get_best_gene(self):$/;" kind:member line:77
159 | get_cluster_label ../评价模型/SOM.py /^ def get_cluster_label(self):$/;" kind:member line:68
160 | get_continue_lose_times ../小工具/trade_account.py /^ def get_continue_lose_times(self):$/;" kind:member line:240
161 | get_continue_win_times ../小工具/trade_account.py /^ def get_continue_win_times(self):$/;" kind:member line:229
162 | get_distance ../优化模型/sa_tsp_example.py /^ def get_distance(city1, city2):$/;" kind:member line:23
163 | get_due_date ../小工具/due_date_calculate.py /^def get_due_date(date):$/;" kind:function line:28
164 | get_feature ../预测模型/decision_tree.py /^ def get_feature(self, df):$/;" kind:member line:28
165 | get_fit_value ../优化模型/genetic_algorithm.py /^ def get_fit_value(self):$/;" kind:member line:74
166 | get_fit_value ../优化模型/genetic_algorithm.py /^ def get_fit_value(self, func):$/;" kind:member line:58
167 | get_fnn ../预测模型/neural_network.py /^ def get_fnn(self, i, h, o):$/;" kind:member line:153
168 | get_fnn ../预测模型/neural_network.py /^def get_fnn():$/;" kind:function line:42
169 | get_label ../评价模型/SOM.py /^ def get_label(self):$/;" kind:member line:79
170 | get_lose_times ../小工具/trade_account.py /^ def get_lose_times(self):$/;" kind:member line:211
171 | get_max_drawdown ../小工具/trade_account.py /^ def get_max_drawdown(self):$/;" kind:member line:251
172 | get_max_lose ../小工具/trade_account.py /^ def get_max_lose(self):$/;" kind:member line:226
173 | get_max_win ../小工具/trade_account.py /^ def get_max_win(self):$/;" kind:member line:223
174 | get_neurons ../评价模型/SOM.py /^ def get_neurons(self):$/;" kind:member line:72
175 | get_return_list ../小工具/trade_account.py /^ def get_return_list(self):$/;" kind:member line:177
176 | get_return_volatility ../小工具/trade_account.py /^ def get_return_volatility(self):$/;" kind:member line:199
177 | get_sharp_ratio ../小工具/trade_account.py /^ def get_sharp_ratio(self):$/;" kind:member line:261
178 | get_shortest_distance ../优化模型/sa_tsp_example.py /^def get_shortest_distance(graph):$/;" kind:function line:67
179 | get_total_distance ../优化模型/sa_tsp_example.py /^ def get_total_distance(self, sequence = None):$/;" kind:member line:37
180 | get_total_return ../小工具/trade_account.py /^ def get_total_return(self):$/;" kind:member line:187
181 | get_total_trade_times ../小工具/trade_account.py /^ def get_total_trade_times(self):$/;" kind:member line:196
182 | get_train_data ../预测模型/neural_network.py /^ def get_train_data(self, input_layer, output_layer):$/;" kind:member line:179
183 | get_train_data ../预测模型/neural_network.py /^def get_train_data():$/;" kind:function line:70
184 | get_win_lose_ratio ../小工具/trade_account.py /^ def get_win_lose_ratio(self):$/;" kind:member line:218
185 | get_win_ratio ../小工具/trade_account.py /^ def get_win_ratio(self):$/;" kind:member line:215
186 | get_win_times ../小工具/trade_account.py /^ def get_win_times(self):$/;" kind:member line:207
187 | google_map_api.py ../plot/google_map_api.py 1;" kind:file line:1
188 | graph ../优化模型/sa_tsp_example.py /^ graph = Graph()$/;" kind:variable line:180
189 | hamming_distance ../预测模型/evaluate.py /^ def hamming_distance(self):$/;" kind:member line:96
190 | have_position ../小工具/trade_account.py /^ def have_position(self, instrument, direction):$/;" kind:member line:49
191 | hierarchial_plot ../评价模型/cluster.py /^ def hierarchial_plot(self, Z):$/;" kind:member line:41
192 | hierarchical ../评价模型/cluster.py /^ def hierarchical(self):$/;" kind:member line:30
193 | im ../plot/google_map_api.py /^ im = Image.open(filename)#np.flipud(plt.imread(filename))$/;" kind:variable line:64
194 | initial_gene ../优化模型/genetic_algorithm.py /^ def initial_gene(self):$/;" kind:member line:25
195 | initial_particle ../优化模型/PSO.py /^ def initial_particle(self):$/;" kind:member line:65
196 | interpolate_na ../小工具/data_clean.py /^def interpolate_na(df, excep_columns=[], how='lagrange'):$/;" kind:function line:46
197 | is_due_date ../小工具/due_date_calculate.py /^def is_due_date(date):$/;" kind:function line:14
198 | jaccard_distance ../预测模型/evaluate.py /^ def jaccard_distance(self):$/;" kind:member line:100
199 | kappa_score ../预测模型/evaluate.py /^ def kappa_score(self):$/;" kind:member line:71
200 | knn ../预测模型/ML_classify_model.py /^ def knn(self, k=3):$/;" kind:member line:12
201 | label ../评价模型/cluster.py /^ label = clu.K_means(4)$/;" kind:variable line:84
202 | label2 ../评价模型/cluster.py /^ label2 = clu.hierarchical()$/;" kind:variable line:87
203 | label_encode ../小工具/data_clean.py /^def label_encode(df, encode_column=[]):$/;" kind:function line:114
204 | latLonToPixelXY ../plot/google_map_api.py /^def latLonToPixelXY(lat,lon,zoomS):$/;" kind:function line:28
205 | linear_svr ../预测模型/SVR.py /^linear_svr = SVR(kernel = 'linear')$/;" kind:variable line:30
206 | linear_svr_y_predict ../预测模型/SVR.py /^linear_svr_y_predict = linear_svr.predict(X_test)$/;" kind:variable line:34
207 | logistic ../预测模型/ML_classify_model.py /^ def logistic(self):$/;" kind:member line:19
208 | look_back ../预测模型/LSTM_predict.py /^look_back = 2$/;" kind:variable line:51
209 | mc ../预测模型/ML_classify_model.py /^ mc = myclassify(df.iloc[:, 0:10], df.iloc[:,-1])$/;" kind:variable line:65
210 | mean_absolute_error ../预测模型/evaluate.py /^ def mean_absolute_error(self):$/;" kind:member line:112
211 | mean_squared_error ../预测模型/evaluate.py /^ def mean_squared_error(self):$/;" kind:member line:108
212 | median_absolute_error ../预测模型/evaluate.py /^ def median_absolute_error(self):$/;" kind:member line:116
213 | model ../预测模型/LSTM_predict.py /^model = Sequential()$/;" kind:variable line:59
214 | mutation ../优化模型/genetic_algorithm.py /^ def mutation(self):$/;" kind:member line:51
215 | my_som ../评价模型/SOM.py /^ my_som = MySOM(df, (20,20))$/;" kind:variable line:103
216 | myclassify ../预测模型/ML_classify_model.py /^class myclassify():$/;" kind:class line:6
217 | naive_bayes ../预测模型/ML_classify_model.py /^ def naive_bayes(self):$/;" kind:member line:33
218 | names ../评价模型/SOM.py /^ names = data.feature_names+["HouseValue"]$/;" kind:variable line:98
219 | neural_network.py ../预测模型/neural_network.py 1;" kind:file line:1
220 | newFunc ../优化模型/simulated_annealing.py /^ def newFunc(*args, **args2):$/;" kind:function line:36
221 | nn ../预测模型/neural_network.py /^ nn = NeuralNetwork(15, 15, 1, df)$/;" kind:variable line:252
222 | normalize ../小工具/data_clean.py /^def normalize(df, excep_columns=[]):$/;" kind:function line:100
223 | option_parameter ../小工具/trade_account.py /^option_parameter = {$/;" kind:variable line:23
224 | order_future ../小工具/trade_account.py /^ def order_future(self, price, instrument, direction, open_type, amount=1):$/;" kind:member line:74
225 | order_option ../小工具/trade_account.py /^ def order_option(self, price, instrument, direction, open_type, amount):$/;" kind:member line:118
226 | ployinterp_column ../小工具/data_clean.py /^ def ployinterp_column(s, n, k=5):$/;" kind:function line:59
227 | poly_svr ../预测模型/SVR.py /^poly_svr = SVR(kernel = 'poly')$/;" kind:variable line:36
228 | poly_svr_y_predict ../预测模型/SVR.py /^poly_svr_y_predict = poly_svr.predict(X_test)$/;" kind:variable line:38
229 | pop ../小工具/trade_account.py /^ def pop(self, instrument, direction):$/;" kind:member line:42
230 | pred_y_0 ../预测模型/evaluate.py /^ pred_y_0 = [1,0,1,1,0,1,0,1]$/;" kind:variable line:149
231 | pred_y_1 ../预测模型/evaluate.py /^ pred_y_1 = [1, 0.8, 0.2, 1.2, 0, 1.0, 0, 1.7, 2.1, 3.1]$/;" kind:variable line:152
232 | pred_y_2 ../预测模型/evaluate.py /^ pred_y_2 = [1, 0, 1, 1.2, 0, 1, 0, 1]$/;" kind:variable line:155
233 | predict ../评价模型/SOM.py /^ def predict(self, x):$/;" kind:member line:87
234 | predict ../预测模型/neural_network.py /^ def predict(self):$/;" kind:member line:212
235 | predict_num ../预测模型/ML_classify_model.py /^ predict_num = -3$/;" kind:variable line:69
236 | print_error ../评价模型/SOM.py /^ def print_error(self):$/;" kind:member line:42
237 | pso ../优化模型/PSO.py /^ pso = PSO()$/;" kind:variable line:103
238 | pso_begin ../优化模型/PSO.py /^ def pso_begin(self):$/;" kind:member line:96
239 | r_square ../预测模型/evaluate.py /^ def r_square(self):$/;" kind:member line:120
240 | rbf_svr ../预测模型/SVR.py /^rbf_svr = SVR(kernel = 'rbf')$/;" kind:variable line:40
241 | rbf_svr_y_predict ../预测模型/SVR.py /^rbf_svr_y_predict = rbf_svr.predict(X_test)$/;" kind:variable line:42
242 | recall ../预测模型/evaluate.py /^ def recall(self):$/;" kind:member line:33
243 | replace_outlier ../小工具/data_clean.py /^def replace_outlier(df):$/;" kind:function line:133
244 | reset_distance ../优化模型/sa_tsp_example.py /^ def reset_distance(self):$/;" kind:member line:34
245 | result ../优化模型/sa_tsp_example.py /^ result = get_shortest_distance(graph)$/;" kind:variable line:183
246 | result_list ../小工具/Association_rules.py /^result_list = []$/;" kind:variable line:32
247 | roc_plot ../预测模型/evaluate.py /^ def roc_plot(self, title='Receiver operating characteristic plot'):$/;" kind:member line:79
248 | roc_score ../预测模型/evaluate.py /^ def roc_score(self):$/;" kind:member line:76
249 | sa ../优化模型/simulated_annealing.py /^ sa = SimulatedAnnealing('')$/;" kind:variable line:93
250 | sa_tsp_example.py ../优化模型/sa_tsp_example.py 1;" kind:file line:1
251 | sample ../plot/google_map_api.py /^def sample(lis,amount):$/;" kind:function line:37
252 | scaler ../预测模型/LSTM_predict.py /^scaler = MinMaxScaler(feature_range=(0, 1))$/;" kind:variable line:44
253 | simulated_annealing.py ../优化模型/simulated_annealing.py 1;" kind:file line:1
254 | single_predict ../预测模型/neural_network.py /^ def single_predict(self, x_array):$/;" kind:member line:227
255 | split_by_part ../预测模型/neural_network.py /^ def split_by_part(DS, proportion=0.9):$/;" kind:function line:191
256 | ss_X ../预测模型/SVR.py /^ss_X = StandardScaler()$/;" kind:variable line:20
257 | ss_y ../预测模型/SVR.py /^ss_y = StandardScaler()$/;" kind:variable line:21
258 | standardize ../小工具/data_clean.py /^def standardize(df, excep_columns=[]):$/;" kind:function line:88
259 | svm ../预测模型/ML_classify_model.py /^ def svm(self):$/;" kind:member line:40
260 | svm_cv ../预测模型/ML_classify_model.py /^ def svm_cv(self):$/;" kind:member line:47
261 | temp ../小工具/Association_rules.py /^ temp = '-'.join([k for k in i[0]]) + ',' + '-'.join([k for k in i[1]]) + ','$/;" kind:variable line:34
262 | temp ../小工具/data_clean.py /^ temp = df['price'].values$/;" kind:variable line:172
263 | testPredict ../预测模型/LSTM_predict.py /^testPredict = model.predict(testX)$/;" kind:variable line:66
264 | testPredict ../预测模型/LSTM_predict.py /^testPredict = scaler.inverse_transform(testPredict)$/;" kind:variable line:70
265 | testPredictPlot ../预测模型/LSTM_predict.py /^testPredictPlot = numpy.empty_like(dataset)$/;" kind:variable line:82
266 | testScore ../预测模型/LSTM_predict.py /^testScore = math.sqrt(mean_squared_error(testY[0], testPredict[:,0]))$/;" kind:variable line:75
267 | testX ../预测模型/LSTM_predict.py /^testX = numpy.reshape(testX, (testX.shape[0], 1, testX.shape[1]))$/;" kind:variable line:56
268 | testY ../预测模型/LSTM_predict.py /^testY = scaler.inverse_transform([testY])$/;" kind:variable line:71
269 | test_case ../小工具/Association_rules.py /^test_case = [$/;" kind:variable line:13
270 | test_size ../预测模型/LSTM_predict.py /^test_size = len(dataset) - train_size$/;" kind:variable line:48
271 | trade_account.py ../小工具/trade_account.py 1;" kind:file line:1
272 | train ../评价模型/SOM.py /^ def train(self):$/;" kind:member line:39
273 | train ../预测模型/neural_network.py /^ def train(self, times = 1000):$/;" kind:member line:208
274 | trainPredict ../预测模型/LSTM_predict.py /^trainPredict = model.predict(trainX)$/;" kind:variable line:65
275 | trainPredict ../预测模型/LSTM_predict.py /^trainPredict = scaler.inverse_transform(trainPredict)$/;" kind:variable line:68
276 | trainPredictPlot ../预测模型/LSTM_predict.py /^trainPredictPlot = numpy.empty_like(dataset)$/;" kind:variable line:78
277 | trainScore ../预测模型/LSTM_predict.py /^trainScore = math.sqrt(mean_squared_error(trainY[0], trainPredict[:,0]))$/;" kind:variable line:73
278 | trainX ../预测模型/LSTM_predict.py /^trainX = numpy.reshape(trainX, (trainX.shape[0], 1, trainX.shape[1]))$/;" kind:variable line:55
279 | trainY ../预测模型/LSTM_predict.py /^trainY = scaler.inverse_transform([trainY])$/;" kind:variable line:69
280 | train_and_predict ../预测模型/neural_network.py /^def train_and_predict(fnn, dataTrain, dataTest):$/;" kind:function line:88
281 | train_size ../预测模型/LSTM_predict.py /^train_size = int(len(dataset) * 0.67)$/;" kind:variable line:47
282 | tree ../预测模型/decision_tree.py /^ tree = Tree(df)$/;" kind:variable line:86
283 | true_y_0 ../预测模型/evaluate.py /^ true_y_0 = [1,1,0,1,0,1,1,1]$/;" kind:variable line:148
284 | true_y_1 ../预测模型/evaluate.py /^ true_y_1 = [1, 1, 0, 1, 0, 1, 1, 0, 1, 1]$/;" kind:variable line:151
285 | true_y_2 ../预测模型/evaluate.py /^ true_y_2 = [1, 1, 0.9, 1.1, 0.1, 1, 1, 0]$/;" kind:variable line:154
286 | update_particle ../优化模型/PSO.py /^ def update_particle(self):$/;" kind:member line:79
287 | vote ../预测模型/decision_tree.py /^ def vote(df, columns_name, value):$/;" kind:member line:38
288 | winsorize ../小工具/data_clean.py /^def winsorize(df, low_q=1, up_q=99):$/;" kind:function line:138
289 | x ../优化模型/simulated_annealing.py /^ x = sa.begin()$/;" kind:variable line:94
290 | y ../预测模型/SVR.py /^y = boston.target$/;" kind:variable line:10
291 | y_test ../预测模型/SVR.py /^y_test = ss_y.transform(y_test)$/;" kind:variable line:26
292 | y_train ../预测模型/SVR.py /^y_train = ss_y.fit_transform(y_train)$/;" kind:variable line:25
293 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Modeling_Preparation
2 | Preparation for mathematical modeling contests: one algorithm a day, plus all kinds of reusable utility functions
3 |
4 | > /dataset
5 |
6 | - **abalone.txt** Abalone dataset: data
7 | - **ablone.names** Abalone dataset: variable descriptions
8 | - **auto.csv** The classic Stata auto dataset (1978 automobiles), original version
9 | - **auto_1.csv** auto dataset with the string categories replaced by numeric codes
10 | - **auto.mat** auto dataset in MATLAB format
11 | - **SZIndex.csv** Shanghai Composite Index dataset
12 | - **SZIndex.desc** Description of the Shanghai Composite Index dataset
13 | - **international-airline-passengers.csv** International airline passengers: monthly totals in thousands. Jan 49 – Dec 60
14 |
15 | > /优化模型 (optimization models)
16 |
17 | - **genetic_algorithm.py** Genetic algorithm
18 | - **PSO.py** Particle swarm optimization
19 | - **simulated_annealing.py** Simulated annealing; see the sketch below
20 | - **sa_tsp_example.py** Simulated annealing applied to the TSP
21 |
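A minimal, self-contained Python sketch of the accept/reject scheme that simulated annealing uses (the repo's `SimulatedAnnealing` class wraps a user-supplied `func`; the test function and all tuning constants below are illustrative):

```python
import math
import random

def simulated_annealing(f, x0, T=100.0, T_min=1e-3, alpha=0.98, steps=50):
    """Minimize f starting from x0; returns (best x, f(x))."""
    x, fx = x0, f(x0)
    while T > T_min:
        for _ in range(steps):
            x_new = x + random.uniform(-1.0, 1.0) * T   # neighborhood shrinks as T cools
            fx_new = f(x_new)
            delta = fx_new - fx
            # always accept improvements; accept worse moves with prob exp(-delta / T)
            if delta < 0 or random.random() < math.exp(-delta / T):
                x, fx = x_new, fx_new
        T *= alpha                                       # geometric cooling schedule
    return x, fx

# a 1-D test function with many local minima
print(simulated_annealing(lambda x: x * x + 10 * math.sin(5 * x), x0=5.0))
```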
22 | > /小工具 (small utilities)
23 |
24 | - **due_date_calculate.py** Wind API calls + expiry-date calculation for options/futures
25 | - **lasso_regression.m** Lasso regression
26 | - **ridge_regression.m** Ridge regression, main script
27 | - **ridgeRegression_func1.m** Ridge regression, helper function 1
28 | - **trade_account.py**
29 |   - position queue class
30 |   - simulated trading account class, supporting paper trading of options and futures
31 |   - net-value metrics class: takes a net-value series and returns Sharpe ratio, annualized return, etc.
32 | - **二分法期权计算器.cs** Uses the Black-Scholes formula and bisection to solve for a chosen option quantity (premium / strike / risk-free rate / expiry / implied volatility / underlying price); see the sketch below
33 | - **Association_rules.py** Apriori algorithm for association rules, plus FP-growth for mining frequent itemsets
34 | - **data_clean.py** Data preprocessing
35 |
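The C# source of 二分法期权计算器.cs is not shown in this export; as an illustration of the recipe it names, here is a minimal Python sketch that backs implied volatility out of a Black-Scholes call price by bisection (all numbers are made up; the other quantities in the list can be solved for the same way):

```python
import math

def norm_cdf(x):
    # standard normal CDF via the error function
    return 0.5 * (1.0 + math.erf(x / math.sqrt(2.0)))

def bs_call(S, K, r, T, sigma):
    """Black-Scholes price of a European call option."""
    d1 = (math.log(S / K) + (r + 0.5 * sigma ** 2) * T) / (sigma * math.sqrt(T))
    d2 = d1 - sigma * math.sqrt(T)
    return S * norm_cdf(d1) - K * math.exp(-r * T) * norm_cdf(d2)

def implied_vol(price, S, K, r, T, lo=1e-6, hi=5.0, tol=1e-8):
    """Bisect on sigma: the call price is increasing in sigma, so the root is unique."""
    while hi - lo > tol:
        mid = 0.5 * (lo + hi)
        if bs_call(S, K, r, T, mid) > price:
            hi = mid
        else:
            lo = mid
    return 0.5 * (lo + hi)

price = bs_call(S=100, K=100, r=0.03, T=0.5, sigma=0.25)
print(implied_vol(price, S=100, K=100, r=0.03, T=0.5))  # recovers ~0.25
```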
36 | > /评价模型 (evaluation models)
37 |
38 | - **PPE.m** Projection pursuit, main script; determines the weights
39 | - **get_Q.m** Projection pursuit: computes the objective-function value
40 | - **constraint.m** Projection pursuit: constraints, for input to the Optimization Toolbox
41 | - **pso_optimal.m** Particle swarm optimization of the projection pursuit problem
42 | - **optimal_tools.png** How to use the Optimization Toolbox: parameter input
43 | - **EntropyWeight.m** Entropy weight method for determining weights
44 | - **SOM.py** Self-organizing map: neural-network clustering
45 | - **cluster.py** K-means and hierarchical clustering, including PCA dimensionality reduction; see the sketch below
46 |
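A minimal sklearn sketch of the K-means-plus-PCA combination cluster.py covers (the iris data here is just a stand-in for the repo's own datasets):

```python
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.datasets import load_iris
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale

X = scale(load_iris().data)                  # standardize features before clustering

labels = KMeans(n_clusters=3, n_init=10).fit_predict(X)

# PCA here is only for visualization: project onto the first two principal components
X2 = PCA(n_components=2).fit_transform(X)
plt.scatter(X2[:, 0], X2[:, 1], c=labels)
plt.xlabel("PC1")
plt.ylabel("PC2")
plt.show()
```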
47 | > /预测模型 (prediction models)
48 |
49 | - **decision_tree.py** Decision tree, implemented from scratch
50 | - **ML_classify_model.py** Survey of machine-learning classification models, via sklearn
51 | - **neural_network.py** BP neural network for continuous-value prediction
52 | - **neural_network.m** BP neural network, via the MATLAB Neural Network Toolbox
53 | - **SVR.py** Support vector regression for continuous-value prediction
54 | - **HMM.py** Hidden Markov models
55 | - **evaluate.py** Evaluation of prediction performance
56 | - **GM1_1.m** Grey prediction, GM(1,1); see the sketch below
57 | - **LSTM_predict.py** LSTM (long short-term memory) network prediction model
58 | - **PLSR.m** Canonical correlation analysis / partial least squares regression: studies relationships between sets of variables (especially many-to-many) and makes predictions
59 |
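GM1_1.m itself is MATLAB and not reproduced in this export; for reference, a minimal Python sketch of the textbook GM(1,1) recipe: accumulate the series, fit the development coefficient a and grey input b by least squares, then invert the accumulation (the sample series is made up):

```python
import numpy as np

def gm11(x0, n_forecast=3):
    """Textbook grey model GM(1,1): fit on x0, forecast n_forecast extra steps."""
    x0 = np.asarray(x0, dtype=float)
    x1 = np.cumsum(x0)                                   # accumulated series (1-AGO)
    z1 = 0.5 * (x1[1:] + x1[:-1])                        # background values
    B = np.column_stack([-z1, np.ones(len(z1))])
    a, b = np.linalg.lstsq(B, x0[1:], rcond=None)[0]     # grey equation x0(k) + a*z1(k) = b
    k = np.arange(len(x0) + n_forecast)
    x1_hat = (x0[0] - b / a) * np.exp(-a * k) + b / a    # time-response function
    return np.concatenate([[x0[0]], np.diff(x1_hat)])    # inverse AGO -> x0 forecasts

print(gm11([71.1, 72.4, 72.4, 72.1, 71.4, 72.0, 71.6]))
```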
60 | > Some Notes
61 | - Hierarchical (two-level) optimization
62 |   - Example 1: hierarchical crew/fleet scheduling, with two levels of optimization: first, an aircraft-usage-minimization model finds the minimal set of flight strings covering every flight on each day; second, a maintenance-opportunity-maximization model builds one-week aircraft routes covering all flight strings, followed by simulation
63 |   - Example 2: route planning across China's 5A scenic spots, in two levels: first, optimize the tour within each province (or within each cluster of spots obtained by clustering) as a TSP; second, optimize the ordering across provinces; see the sketch after this list
64 | - Canonical correlation
65 |   - "A Study of Factors Influencing Farmers' Income Based on Granger Causality Tests and Canonical Correlation"
66 |   - "A Survey of Canonical Correlation Analysis"
67 |   - "A Speech Emotion Recognition Model Based on Kernel Canonical Correlation Analysis and Support Vector Machines"
68 |
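Example 2's two-level idea fits in a short sketch: cluster the spots, order the clusters by a tour over their centroids, then tour the points inside each cluster (the coordinates are made up, and a greedy nearest-neighbor heuristic stands in for the SA solver in /优化模型):

```python
import numpy as np
from sklearn.cluster import KMeans

def nearest_neighbor_tour(points, start=0):
    """Greedy nearest-neighbor TSP heuristic; returns a visit order."""
    todo = set(range(len(points)))
    todo.remove(start)
    tour = [start]
    while todo:
        last = points[tour[-1]]
        nxt = min(todo, key=lambda j: np.linalg.norm(points[j] - last))
        todo.remove(nxt)
        tour.append(nxt)
    return tour

# level 1: group attractions into regional clusters
rng = np.random.default_rng(0)
pts = rng.uniform(0, 100, size=(40, 2))            # made-up attraction coordinates
labels = KMeans(n_clusters=4, n_init=10).fit_predict(pts)

# level 2a: order the clusters by a tour over their centroids
centroids = np.array([pts[labels == c].mean(axis=0) for c in range(4)])
cluster_order = nearest_neighbor_tour(centroids)

# level 2b: tour the points inside each cluster, concatenating cluster by cluster
route = []
for c in cluster_order:
    idx = np.where(labels == c)[0]
    sub = nearest_neighbor_tour(pts[idx])
    route.extend(int(idx[j]) for j in sub)
print(route)
```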
69 |
70 | > TODO LIST:
71 |
72 | - Granger causality tests, time-series methods (done with Stata)
73 | - Dual-population genetic algorithms (get familiar)
74 | - Multi-vehicle routing problem (http://blog.csdn.net/wangqiuyun/article/details/7664995)
75 | - Minimum spanning tree (http://blog.csdn.net/heisediwei/article/details/50326847)
76 | - MTSP, the multiple traveling salesman problem (plenty of literature)
77 | - System dynamics (get familiar)
78 | - Membership functions (get familiar)
79 | - Continuous-interval ordered weighted averaging (C-OWA) operator (not yet studied)
80 | - Interpolation and curve fitting in MATLAB (Curve Fitting Toolbox and the interp1 function)
81 | - LINGO (abandoned)
82 | - Grey relational analysis for variable screening (beware its pitfalls in variable selection)
83 | - LaTeX compilation and content fill-in
84 | - Spatial plotting: review
85 | - Various plots with R ggplot2: review
86 |
--------------------------------------------------------------------------------
/dataset/SZIndex.desc:
--------------------------------------------------------------------------------
1 | Shanghai Composite Index data, 2010-04-09 to 2017-08-31
2 |
3 | columns:
4 | one-day log-return difference
5 | five-day log-return difference
6 | same-day log high-low price spread
7 | daily trading volume
8 | log margin-balance difference
9 | trade date
10 | closing price
11 |
12 | The first 5 columns are standardized with sklearn's preprocessing.scale
13 |
14 |
15 |
--------------------------------------------------------------------------------
/dataset/ablone.names:
--------------------------------------------------------------------------------
1 | 1. Title of Database: Abalone data
2 |
3 | 2. Sources:
4 |
5 | (a) Original owners of database:
6 | Marine Resources Division
7 | Marine Research Laboratories - Taroona
8 | Department of Primary Industry and Fisheries, Tasmania
9 | GPO Box 619F, Hobart, Tasmania 7001, Australia
10 | (contact: Warwick Nash +61 02 277277, wnash@dpi.tas.gov.au)
11 |
12 | (b) Donor of database:
13 | Sam Waugh (Sam.Waugh@cs.utas.edu.au)
14 | Department of Computer Science, University of Tasmania
15 | GPO Box 252C, Hobart, Tasmania 7001, Australia
16 |
17 | (c) Date received: December 1995
18 |
19 |
20 | 3. Past Usage:
21 |
22 | Sam Waugh (1995) "Extending and benchmarking Cascade-Correlation", PhD
23 | thesis, Computer Science Department, University of Tasmania.
24 |
25 | -- Test set performance (final 1044 examples, first 3133 used for training):
26 | 24.86% Cascade-Correlation (no hidden nodes)
27 | 26.25% Cascade-Correlation (5 hidden nodes)
28 | 21.5% C4.5
29 | 0.0% Linear Discriminate Analysis
30 | 3.57% k=5 Nearest Neighbour
31 | (Problem encoded as a classification task)
32 |
33 | -- Data set samples are highly overlapped. Further information is required
34 | to separate completely using affine combinations. Other restrictions
35 | to data set examined.
36 |
37 | David Clark, Zoltan Schreter, Anthony Adams "A Quantitative Comparison of
38 | Dystal and Backpropagation", submitted to the Australian Conference on
39 | Neural Networks (ACNN'96). Data set treated as a 3-category classification
40 | problem (grouping ring classes 1-8, 9 and 10, and 11 on).
41 |
42 | -- Test set performance (3133 training, 1044 testing as above):
43 | 64% Backprop
44 | 55% Dystal
45 | -- Previous work (Waugh, 1995) on same data set:
46 | 61.40% Cascade-Correlation (no hidden nodes)
47 | 65.61% Cascade-Correlation (5 hidden nodes)
48 | 59.2% C4.5
49 | 32.57% Linear Discriminate Analysis
50 | 62.46% k=5 Nearest Neighbour
51 |
52 |
53 | 4. Relevant Information Paragraph:
54 |
55 | Predicting the age of abalone from physical measurements. The age of
56 | abalone is determined by cutting the shell through the cone, staining it,
57 | and counting the number of rings through a microscope -- a boring and
58 | time-consuming task. Other measurements, which are easier to obtain, are
59 | used to predict the age. Further information, such as weather patterns
60 | and location (hence food availability) may be required to solve the problem.
61 |
62 | From the original data examples with missing values were removed (the
63 | majority having the predicted value missing), and the ranges of the
64 | continuous values have been scaled for use with an ANN (by dividing by 200).
65 |
66 | Data comes from an original (non-machine-learning) study:
67 |
68 | Warwick J Nash, Tracy L Sellers, Simon R Talbot, Andrew J Cawthorn and
69 | Wes B Ford (1994) "The Population Biology of Abalone (_Haliotis_
70 | species) in Tasmania. I. Blacklip Abalone (_H. rubra_) from the North
71 | Coast and Islands of Bass Strait", Sea Fisheries Division, Technical
72 | Report No. 48 (ISSN 1034-3288)
73 |
74 |
75 | 5. Number of Instances: 4177
76 |
77 |
78 | 6. Number of Attributes: 8
79 |
80 |
81 | 7. Attribute information:
82 |
83 | Given is the attribute name, attribute type, the measurement unit and a
84 | brief description. The number of rings is the value to predict: either
85 | as a continuous value or as a classification problem.
86 |
87 | Name Data Type Meas. Description
88 | ---- --------- ----- -----------
89 | Sex nominal M, F, and I (infant)
90 | Length continuous mm Longest shell measurement
91 | Diameter continuous mm perpendicular to length
92 | Height continuous mm with meat in shell
93 | Whole weight continuous grams whole abalone
94 | Shucked weight continuous grams weight of meat
95 | Viscera weight continuous grams gut weight (after bleeding)
96 | Shell weight continuous grams after being dried
97 | Rings integer +1.5 gives the age in years
98 |
99 | Statistics for numeric domains:
100 |
101 | Length Diam Height Whole Shucked Viscera Shell Rings
102 | Min 0.075 0.055 0.000 0.002 0.001 0.001 0.002 1
103 | Max 0.815 0.650 1.130 2.826 1.488 0.760 1.005 29
104 | Mean 0.524 0.408 0.140 0.829 0.359 0.181 0.239 9.934
105 | SD 0.120 0.099 0.042 0.490 0.222 0.110 0.139 3.224
106 | Correl 0.557 0.575 0.557 0.540 0.421 0.504 0.628 1.0
107 |
108 |
109 | 8. Missing Attribute Values: None
110 |
111 |
112 | 9. Class Distribution:
113 |
114 | Class Examples
115 | ----- --------
116 | 1 1
117 | 2 1
118 | 3 15
119 | 4 57
120 | 5 115
121 | 6 259
122 | 7 391
123 | 8 568
124 | 9 689
125 | 10 634
126 | 11 487
127 | 12 267
128 | 13 203
129 | 14 126
130 | 15 103
131 | 16 67
132 | 17 58
133 | 18 42
134 | 19 32
135 | 20 26
136 | 21 14
137 | 22 6
138 | 23 9
139 | 24 2
140 | 25 1
141 | 26 1
142 | 27 2
143 | 29 1
144 | ----- ----
145 | Total 4177
146 |
--------------------------------------------------------------------------------
/dataset/auto.csv:
--------------------------------------------------------------------------------
1 | make,price,mpg,rep78,headroom,trunk,weight,length,turn,displacement,gear_ratio,foreign
2 | "AMC Concord",4099,22,3,2.5,11,2930,186,40,121,3.58,"Domestic"
3 | "AMC Pacer",4749,17,3,3.0,11,3350,173,40,258,2.53,"Domestic"
4 | "AMC Spirit",3799,22,,3.0,12,2640,168,35,121,3.08,"Domestic"
5 | "Buick Century",4816,20,3,4.5,16,3250,196,40,196,2.93,"Domestic"
6 | "Buick Electra",7827,15,4,4.0,20,4080,222,43,350,2.41,"Domestic"
7 | "Buick LeSabre",5788,18,3,4.0,21,3670,218,43,231,2.73,"Domestic"
8 | "Buick Opel",4453,26,,3.0,10,2230,170,34,304,2.87,"Domestic"
9 | "Buick Regal",5189,20,3,2.0,16,3280,200,42,196,2.93,"Domestic"
10 | "Buick Riviera",10372,16,3,3.5,17,3880,207,43,231,2.93,"Domestic"
11 | "Buick Skylark",4082,19,3,3.5,13,3400,200,42,231,3.08,"Domestic"
12 | "Cad. Deville",11385,14,3,4.0,20,4330,221,44,425,2.28,"Domestic"
13 | "Cad. Eldorado",14500,14,2,3.5,16,3900,204,43,350,2.19,"Domestic"
14 | "Cad. Seville",15906,21,3,3.0,13,4290,204,45,350,2.24,"Domestic"
15 | "Chev. Chevette",3299,29,3,2.5,9,2110,163,34,231,2.93,"Domestic"
16 | "Chev. Impala",5705,16,4,4.0,20,3690,212,43,250,2.56,"Domestic"
17 | "Chev. Malibu",4504,22,3,3.5,17,3180,193,31,200,2.73,"Domestic"
18 | "Chev. Monte Carlo",5104,22,2,2.0,16,3220,200,41,200,2.73,"Domestic"
19 | "Chev. Monza",3667,24,2,2.0,7,2750,179,40,151,2.73,"Domestic"
20 | "Chev. Nova",3955,19,3,3.5,13,3430,197,43,250,2.56,"Domestic"
21 | "Dodge Colt",3984,30,5,2.0,8,2120,163,35,98,3.54,"Domestic"
22 | "Dodge Diplomat",4010,18,2,4.0,17,3600,206,46,318,2.47,"Domestic"
23 | "Dodge Magnum",5886,16,2,4.0,17,3600,206,46,318,2.47,"Domestic"
24 | "Dodge St. Regis",6342,17,2,4.5,21,3740,220,46,225,2.94,"Domestic"
25 | "Ford Fiesta",4389,28,4,1.5,9,1800,147,33,98,3.15,"Domestic"
26 | "Ford Mustang",4187,21,3,2.0,10,2650,179,43,140,3.08,"Domestic"
27 | "Linc. Continental",11497,12,3,3.5,22,4840,233,51,400,2.47,"Domestic"
28 | "Linc. Mark V",13594,12,3,2.5,18,4720,230,48,400,2.47,"Domestic"
29 | "Linc. Versailles",13466,14,3,3.5,15,3830,201,41,302,2.47,"Domestic"
30 | "Merc. Bobcat",3829,22,4,3.0,9,2580,169,39,140,2.73,"Domestic"
31 | "Merc. Cougar",5379,14,4,3.5,16,4060,221,48,302,2.75,"Domestic"
32 | "Merc. Marquis",6165,15,3,3.5,23,3720,212,44,302,2.26,"Domestic"
33 | "Merc. Monarch",4516,18,3,3.0,15,3370,198,41,250,2.43,"Domestic"
34 | "Merc. XR-7",6303,14,4,3.0,16,4130,217,45,302,2.75,"Domestic"
35 | "Merc. Zephyr",3291,20,3,3.5,17,2830,195,43,140,3.08,"Domestic"
36 | "Olds 98",8814,21,4,4.0,20,4060,220,43,350,2.41,"Domestic"
37 | "Olds Cutl Supr",5172,19,3,2.0,16,3310,198,42,231,2.93,"Domestic"
38 | "Olds Cutlass",4733,19,3,4.5,16,3300,198,42,231,2.93,"Domestic"
39 | "Olds Delta 88",4890,18,4,4.0,20,3690,218,42,231,2.73,"Domestic"
40 | "Olds Omega",4181,19,3,4.5,14,3370,200,43,231,3.08,"Domestic"
41 | "Olds Starfire",4195,24,1,2.0,10,2730,180,40,151,2.73,"Domestic"
42 | "Olds Toronado",10371,16,3,3.5,17,4030,206,43,350,2.41,"Domestic"
43 | "Plym. Arrow",4647,28,3,2.0,11,3260,170,37,156,3.05,"Domestic"
44 | "Plym. Champ",4425,34,5,2.5,11,1800,157,37,86,2.97,"Domestic"
45 | "Plym. Horizon",4482,25,3,4.0,17,2200,165,36,105,3.37,"Domestic"
46 | "Plym. Sapporo",6486,26,,1.5,8,2520,182,38,119,3.54,"Domestic"
47 | "Plym. Volare",4060,18,2,5.0,16,3330,201,44,225,3.23,"Domestic"
48 | "Pont. Catalina",5798,18,4,4.0,20,3700,214,42,231,2.73,"Domestic"
49 | "Pont. Firebird",4934,18,1,1.5,7,3470,198,42,231,3.08,"Domestic"
50 | "Pont. Grand Prix",5222,19,3,2.0,16,3210,201,45,231,2.93,"Domestic"
51 | "Pont. Le Mans",4723,19,3,3.5,17,3200,199,40,231,2.93,"Domestic"
52 | "Pont. Phoenix",4424,19,,3.5,13,3420,203,43,231,3.08,"Domestic"
53 | "Pont. Sunbird",4172,24,2,2.0,7,2690,179,41,151,2.73,"Domestic"
54 | "Audi 5000",9690,17,5,3.0,15,2830,189,37,131,3.20,"Foreign"
55 | "Audi Fox",6295,23,3,2.5,11,2070,174,36,97,3.70,"Foreign"
56 | "BMW 320i",9735,25,4,2.5,12,2650,177,34,121,3.64,"Foreign"
57 | "Datsun 200",6229,23,4,1.5,6,2370,170,35,119,3.89,"Foreign"
58 | "Datsun 210",4589,35,5,2.0,8,2020,165,32,85,3.70,"Foreign"
59 | "Datsun 510",5079,24,4,2.5,8,2280,170,34,119,3.54,"Foreign"
60 | "Datsun 810",8129,21,4,2.5,8,2750,184,38,146,3.55,"Foreign"
61 | "Fiat Strada",4296,21,3,2.5,16,2130,161,36,105,3.37,"Foreign"
62 | "Honda Accord",5799,25,5,3.0,10,2240,172,36,107,3.05,"Foreign"
63 | "Honda Civic",4499,28,4,2.5,5,1760,149,34,91,3.30,"Foreign"
64 | "Mazda GLC",3995,30,4,3.5,11,1980,154,33,86,3.73,"Foreign"
65 | "Peugeot 604",12990,14,,3.5,14,3420,192,38,163,3.58,"Foreign"
66 | "Renault Le Car",3895,26,3,3.0,10,1830,142,34,79,3.72,"Foreign"
67 | "Subaru",3798,35,5,2.5,11,2050,164,36,97,3.81,"Foreign"
68 | "Toyota Celica",5899,18,5,2.5,14,2410,174,36,134,3.06,"Foreign"
69 | "Toyota Corolla",3748,31,5,3.0,9,2200,165,35,97,3.21,"Foreign"
70 | "Toyota Corona",5719,18,5,2.0,11,2670,175,36,134,3.05,"Foreign"
71 | "VW Dasher",7140,23,4,2.5,12,2160,172,36,97,3.74,"Foreign"
72 | "VW Diesel",5397,41,5,3.0,15,2040,155,35,90,3.78,"Foreign"
73 | "VW Rabbit",4697,25,4,3.0,15,1930,155,35,89,3.78,"Foreign"
74 | "VW Scirocco",6850,25,4,2.0,16,1990,156,36,97,3.78,"Foreign"
75 | "Volvo 260",11995,17,5,2.5,14,3170,193,37,163,2.98,"Foreign"
76 |
--------------------------------------------------------------------------------
/dataset/auto.mat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Yangruipis/ModelingPreparation/1663c8153fb97049ae716e2f0dab27d457bcbc9c/dataset/auto.mat
--------------------------------------------------------------------------------
/dataset/auto_1.csv:
--------------------------------------------------------------------------------
1 | price,mpg,rep78,headroom,trunk,weight,length,turn,displacement,gear_ratio,foreign
2 | 4099,22,3,2.5,11,2930,186,40,121,3.58,1
3 | 4749,17,3,3,11,3350,173,40,258,2.53,1
4 | 3799,22,,3,12,2640,168,35,121,3.08,1
5 | 4816,20,3,4.5,16,3250,196,40,196,2.93,1
6 | 7827,15,4,4,20,4080,222,43,350,2.41,1
7 | 5788,18,3,4,21,3670,218,43,231,2.73,1
8 | 4453,26,,3,10,2230,170,34,304,2.87,1
9 | 5189,20,3,2,16,3280,200,42,196,2.93,1
10 | 10372,16,3,3.5,17,3880,207,43,231,2.93,1
11 | 4082,19,3,3.5,13,3400,200,42,231,3.08,1
12 | 11385,14,3,4,20,4330,221,44,425,2.28,1
13 | 14500,14,2,3.5,16,3900,204,43,350,2.19,1
14 | 15906,21,3,3,13,4290,204,45,350,2.24,1
15 | 3299,29,3,2.5,9,2110,163,34,231,2.93,1
16 | 5705,16,4,4,20,3690,212,43,250,2.56,1
17 | 4504,22,3,3.5,17,3180,193,31,200,2.73,1
18 | 5104,22,2,2,16,3220,200,41,200,2.73,1
19 | 3667,24,2,2,7,2750,179,40,151,2.73,1
20 | 3955,19,3,3.5,13,3430,197,43,250,2.56,1
21 | 3984,30,5,2,8,2120,163,35,98,3.54,1
22 | 4010,18,2,4,17,3600,206,46,318,2.47,1
23 | 5886,16,2,4,17,3600,206,46,318,2.47,1
24 | 6342,17,2,4.5,21,3740,220,46,225,2.94,1
25 | 4389,28,4,1.5,9,1800,147,33,98,3.15,1
26 | 4187,21,3,2,10,2650,179,43,140,3.08,1
27 | 11497,12,3,3.5,22,4840,233,51,400,2.47,1
28 | 13594,12,3,2.5,18,4720,230,48,400,2.47,1
29 | 13466,14,3,3.5,15,3830,201,41,302,2.47,1
30 | 3829,22,4,3,9,2580,169,39,140,2.73,1
31 | 5379,14,4,3.5,16,4060,221,48,302,2.75,1
32 | 6165,15,3,3.5,23,3720,212,44,302,2.26,1
33 | 4516,18,3,3,15,3370,198,41,250,2.43,1
34 | 6303,14,4,3,16,4130,217,45,302,2.75,1
35 | 3291,20,3,3.5,17,2830,195,43,140,3.08,1
36 | 8814,21,4,4,20,4060,220,43,350,2.41,1
37 | 5172,19,3,2,16,3310,198,42,231,2.93,1
38 | 4733,19,3,4.5,16,3300,198,42,231,2.93,1
39 | 4890,18,4,4,20,3690,218,42,231,2.73,1
40 | 4181,19,3,4.5,14,3370,200,43,231,3.08,1
41 | 4195,24,1,2,10,2730,180,40,151,2.73,1
42 | 10371,16,3,3.5,17,4030,206,43,350,2.41,1
43 | 4647,28,3,2,11,3260,170,37,156,3.05,1
44 | 4425,34,5,2.5,11,1800,157,37,86,2.97,1
45 | 4482,25,3,4,17,2200,165,36,105,3.37,1
46 | 6486,26,,1.5,8,2520,182,38,119,3.54,1
47 | 4060,18,2,5,16,3330,201,44,225,3.23,1
48 | 5798,18,4,4,20,3700,214,42,231,2.73,1
49 | 4934,18,1,1.5,7,3470,198,42,231,3.08,1
50 | 5222,19,3,2,16,3210,201,45,231,2.93,1
51 | 4723,19,3,3.5,17,3200,199,40,231,2.93,1
52 | 4424,19,,3.5,13,3420,203,43,231,3.08,1
53 | 4172,24,2,2,7,2690,179,41,151,2.73,1
54 | 9690,17,5,3,15,2830,189,37,131,3.2,0
55 | 6295,23,3,2.5,11,2070,174,36,97,3.7,0
56 | 9735,25,4,2.5,12,2650,177,34,121,3.64,0
57 | 6229,23,4,1.5,6,2370,170,35,119,3.89,0
58 | 4589,35,5,2,8,2020,165,32,85,3.7,0
59 | 5079,24,4,2.5,8,2280,170,34,119,3.54,0
60 | 8129,21,4,2.5,8,2750,184,38,146,3.55,0
61 | 4296,21,3,2.5,16,2130,161,36,105,3.37,0
62 | 5799,25,5,3,10,2240,172,36,107,3.05,0
63 | 4499,28,4,2.5,5,1760,149,34,91,3.3,0
64 | 3995,30,4,3.5,11,1980,154,33,86,3.73,0
65 | 12990,14,,3.5,14,3420,192,38,163,3.58,0
66 | 3895,26,3,3,10,1830,142,34,79,3.72,0
67 | 3798,35,5,2.5,11,2050,164,36,97,3.81,0
68 | 5899,18,5,2.5,14,2410,174,36,134,3.06,0
69 | 3748,31,5,3,9,2200,165,35,97,3.21,0
70 | 5719,18,5,2,11,2670,175,36,134,3.05,0
71 | 7140,23,4,2.5,12,2160,172,36,97,3.74,0
72 | 5397,41,5,3,15,2040,155,35,90,3.78,0
73 | 4697,25,4,3,15,1930,155,35,89,3.78,0
74 | 6850,25,4,2,16,1990,156,36,97,3.78,0
75 | 11995,17,5,2.5,14,3170,193,37,163,2.98,0
76 |
--------------------------------------------------------------------------------
/dataset/international-airline-passengers.csv:
--------------------------------------------------------------------------------
1 | "Month","International airline passengers: monthly totals in thousands. Jan 49 ? Dec 60"
2 | "1949-01",112
3 | "1949-02",118
4 | "1949-03",132
5 | "1949-04",129
6 | "1949-05",121
7 | "1949-06",135
8 | "1949-07",148
9 | "1949-08",148
10 | "1949-09",136
11 | "1949-10",119
12 | "1949-11",104
13 | "1949-12",118
14 | "1950-01",115
15 | "1950-02",126
16 | "1950-03",141
17 | "1950-04",135
18 | "1950-05",125
19 | "1950-06",149
20 | "1950-07",170
21 | "1950-08",170
22 | "1950-09",158
23 | "1950-10",133
24 | "1950-11",114
25 | "1950-12",140
26 | "1951-01",145
27 | "1951-02",150
28 | "1951-03",178
29 | "1951-04",163
30 | "1951-05",172
31 | "1951-06",178
32 | "1951-07",199
33 | "1951-08",199
34 | "1951-09",184
35 | "1951-10",162
36 | "1951-11",146
37 | "1951-12",166
38 | "1952-01",171
39 | "1952-02",180
40 | "1952-03",193
41 | "1952-04",181
42 | "1952-05",183
43 | "1952-06",218
44 | "1952-07",230
45 | "1952-08",242
46 | "1952-09",209
47 | "1952-10",191
48 | "1952-11",172
49 | "1952-12",194
50 | "1953-01",196
51 | "1953-02",196
52 | "1953-03",236
53 | "1953-04",235
54 | "1953-05",229
55 | "1953-06",243
56 | "1953-07",264
57 | "1953-08",272
58 | "1953-09",237
59 | "1953-10",211
60 | "1953-11",180
61 | "1953-12",201
62 | "1954-01",204
63 | "1954-02",188
64 | "1954-03",235
65 | "1954-04",227
66 | "1954-05",234
67 | "1954-06",264
68 | "1954-07",302
69 | "1954-08",293
70 | "1954-09",259
71 | "1954-10",229
72 | "1954-11",203
73 | "1954-12",229
74 | "1955-01",242
75 | "1955-02",233
76 | "1955-03",267
77 | "1955-04",269
78 | "1955-05",270
79 | "1955-06",315
80 | "1955-07",364
81 | "1955-08",347
82 | "1955-09",312
83 | "1955-10",274
84 | "1955-11",237
85 | "1955-12",278
86 | "1956-01",284
87 | "1956-02",277
88 | "1956-03",317
89 | "1956-04",313
90 | "1956-05",318
91 | "1956-06",374
92 | "1956-07",413
93 | "1956-08",405
94 | "1956-09",355
95 | "1956-10",306
96 | "1956-11",271
97 | "1956-12",306
98 | "1957-01",315
99 | "1957-02",301
100 | "1957-03",356
101 | "1957-04",348
102 | "1957-05",355
103 | "1957-06",422
104 | "1957-07",465
105 | "1957-08",467
106 | "1957-09",404
107 | "1957-10",347
108 | "1957-11",305
109 | "1957-12",336
110 | "1958-01",340
111 | "1958-02",318
112 | "1958-03",362
113 | "1958-04",348
114 | "1958-05",363
115 | "1958-06",435
116 | "1958-07",491
117 | "1958-08",505
118 | "1958-09",404
119 | "1958-10",359
120 | "1958-11",310
121 | "1958-12",337
122 | "1959-01",360
123 | "1959-02",342
124 | "1959-03",406
125 | "1959-04",396
126 | "1959-05",420
127 | "1959-06",472
128 | "1959-07",548
129 | "1959-08",559
130 | "1959-09",463
131 | "1959-10",407
132 | "1959-11",362
133 | "1959-12",405
134 | "1960-01",417
135 | "1960-02",391
136 | "1960-03",419
137 | "1960-04",461
138 | "1960-05",472
139 | "1960-06",535
140 | "1960-07",622
141 | "1960-08",606
142 | "1960-09",508
143 | "1960-10",461
144 | "1960-11",390
145 | "1960-12",432
146 |
147 | International airline passengers: monthly totals in thousands. Jan 49 – Dec 60
148 |
149 |
--------------------------------------------------------------------------------
/plot/cluster_plot.R:
--------------------------------------------------------------------------------
1 | library(devtools)
2 | library(wordVectors)
3 | library(showtext)
4 | library(ggplot2)
5 |
6 |
7 | df <- read.csv("./datafile/word_vector.txt",header = TRUE)
8 |
9 | a <- cosineDist(df[1],df[2])
10 | a <- cov(df,df)
11 | r<- matrix(1-a,nrow=40,dimnames=list(colnames(df),colnames(df)))
12 | hc <- hclust(as.dist(r),method = "complete")
13 |
14 | pdf(file = 'fujia_7.pdf', width = 12, height = 8)
15 | plot(hc)
16 | dev.off()
17 | ## dist
18 | #euclidean Euclidean distance: square the differences, sum, take the square root
19 | #maximum Chebyshev distance
20 | #manhattan Manhattan (absolute-value) distance
21 | #canberra Canberra (Lance) distance
22 | #minkowski Minkowski distance; the p value must be specified
23 | #binary distance for binary/qualitative variables
24 |
25 | ##method
26 | #single single linkage (shortest distance)
27 | #complete complete linkage (longest distance)
28 | #median median linkage
29 | #mcquitty McQuitty's similarity method
30 | #average average linkage
31 | #centroid centroid linkage
32 | #ward Ward's minimum-variance method
33 |
34 | install.packages("ape",repos = 'http://mirrors.ustc.edu.cn/CRAN/')
35 | library(ape)
36 | plot(as.phylo(hc), type = "fan")
37 | plot(as.phylo(hc), type = "fan", tip.color = hsv(runif(15, 0.65,
38 | 0.95), 1, 1, 0.7), edge.color = hsv(runif(10, 0.65, 0.75), 1, 1, 0.7), edge.width = runif(20,
39 | 0.5, 3), use.edge.length = TRUE, col = "gray80")
40 |
--------------------------------------------------------------------------------
/plot/datafile/beijing.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Yangruipis/ModelingPreparation/1663c8153fb97049ae716e2f0dab27d457bcbc9c/plot/datafile/beijing.png
--------------------------------------------------------------------------------
/plot/datafile/beijingDots.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Yangruipis/ModelingPreparation/1663c8153fb97049ae716e2f0dab27d457bcbc9c/plot/datafile/beijingDots.png
--------------------------------------------------------------------------------
/plot/datafile/c_dijishi.dta:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Yangruipis/ModelingPreparation/1663c8153fb97049ae716e2f0dab27d457bcbc9c/plot/datafile/c_dijishi.dta
--------------------------------------------------------------------------------
/plot/datafile/c_seven.dta:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Yangruipis/ModelingPreparation/1663c8153fb97049ae716e2f0dab27d457bcbc9c/plot/datafile/c_seven.dta
--------------------------------------------------------------------------------
/plot/datafile/c_sheng1.dta:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Yangruipis/ModelingPreparation/1663c8153fb97049ae716e2f0dab27d457bcbc9c/plot/datafile/c_sheng1.dta
--------------------------------------------------------------------------------
/plot/datafile/d_dijishi.dta:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Yangruipis/ModelingPreparation/1663c8153fb97049ae716e2f0dab27d457bcbc9c/plot/datafile/d_dijishi.dta
--------------------------------------------------------------------------------
/plot/datafile/d_seven.dta:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Yangruipis/ModelingPreparation/1663c8153fb97049ae716e2f0dab27d457bcbc9c/plot/datafile/d_seven.dta
--------------------------------------------------------------------------------
/plot/datafile/d_sheng1.dta:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Yangruipis/ModelingPreparation/1663c8153fb97049ae716e2f0dab27d457bcbc9c/plot/datafile/d_sheng1.dta
--------------------------------------------------------------------------------
/plot/google_map_api.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 |
4 | import urllib
5 | import numpy as np
6 | from cStringIO import StringIO
7 | import matplotlib.pyplot as plt
8 | from PIL import Image
9 |
10 | def Gmap(centerLat,centerLon,zoomS,pixelS,size,dark,saveAddress):
11 | # fetch a static-map .png of the area of interest
12 | url = 'http://maps.googleapis.com/maps/api/staticmap?sensor=false'\
13 | +'&size='+str(size)+'x'+str(size)+'¢er='+str(centerLat)+','\
14 | +str(centerLon)+'&zoom='+str(zoomS)+'&scale='+str(pixelS)\
15 | +'&maptype=terrain' # satellite 卫星图
16 | if dark==True:
17 | url = url+'&style=feature:all|element:all|saturation:-10|lightness:20'
18 | print url
19 | # without an API key, just open the printed URL and save the image manually
20 |
21 | # buffer = StringIO(urllib.urlopen(url).read())
22 | # image = Image.open(buffer)
23 | # if saveAddress:
24 | # image.save(saveAddress)
25 | # else:
26 | # image.show()
27 |
28 | def latLonToPixelXY(lat,lon,zoomS):
29 | mapW = 256*2**zoomS+0.0
30 | mapH = 256*2**zoomS+0.0
31 | x = (lon+180)*(mapW/360)# get x value
32 | latRad = lat*np.pi/180# convert from degrees to radians
33 | mercN = np.log(np.tan((np.pi/4)+(latRad/2)))# get y value
34 | y = (mapH/2)-(mapW*mercN/(2*np.pi))
35 | return x,y
36 |
37 | def sample(lis,amount):
38 | # subsample when there are too many points to plot
39 | import random
40 | num_set = set()
41 | while(len(num_set) < amount):
--------------------------------------------------------------------------------
/优化模型/genetic_algorithm.py:
--------------------------------------------------------------------------------
91 | ... > rand:
92 | return i
93 |
94 | def gene_pop(self):
95 | """
96 | eliminate the least-fit gene from the pool
97 | :return:
98 | """
99 | min_index = self.fit_value.index(min(self.fit_value))
100 | self.genes.pop(min_index)
101 | self.fit_value.pop(min_index)
102 |
103 | def begin(self):
104 | for i in range(1000):
105 | index1 = self.choose_gene(random.random())
106 | index2 = self.choose_gene(random.random())
107 | while index1 == index2:
108 | index2 = self.choose_gene(random.random())
109 |
110 | if random.random() < self.mutation_prob:
111 | self.genes[index1].mutation()
112 | self.genes[index2].mutation()
113 |
114 | if random.random() < self.cross_prob:
115 | Gene.cross(self.genes[index1], self.genes[index2])
116 |
117 | self.get_fit_value()
118 | # self.gene_pop()
119 |
120 | result = self.get_best_gene()
121 | print len(self.genes), result[0].bin2dec(), result[1]
122 |
123 |
124 |
125 | if __name__ == '__main__':
126 | # gene1 = Gene()
127 | # print gene1.bin_value
128 | # print gene1.bin2dec()
129 | GA = GeneticAlgorithm()
130 | GA.begin()
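The top of genetic_algorithm.py (the Gene class and the body of choose_gene) is cut off above; from the surviving tail (`... > rand:` followed by `return i`), choose_gene evidently performs roulette-wheel (fitness-proportionate) selection. A minimal standalone sketch of that step, assuming a list of non-negative fitness values:

    import random

    def roulette_select(fit_values):
        # pick one index with probability proportional to its fitness
        total = float(sum(fit_values))
        rand = random.random()
        cumulative = 0.0
        for i, fit in enumerate(fit_values):
            cumulative += fit / total
            if cumulative > rand:
                return i
        return len(fit_values) - 1  # guard against floating-point round-off

    print(roulette_select([1.0, 3.0, 6.0]))  # index 2 comes up ~60% of the time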
--------------------------------------------------------------------------------
/优化模型/sa_tsp_example.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | import numpy as np
3 | import random
4 | import copy
5 | from simulated_annealing import exeTime
6 | import matplotlib.pyplot as plt
7 |
8 |
9 | class City():
10 | __slots__ = ("X", "Y")
11 | def __init__(self, x, y):
12 | self.X = x
13 | self.Y = y
14 |
15 |
16 | class Graph:
17 |
18 | def __init__(self):
19 | self.city_list = []
20 | self.total_distance = 0
21 |
22 | @staticmethod
23 | def get_distance(city1, city2):
24 | return np.sqrt((city1.X - city2.X) ** 2 + (city1.Y - city2.Y) ** 2)
25 |
26 | def add_city(self, city):
27 | if isinstance(city, City):
28 | self.city_list.append(city)
29 | elif isinstance(city, list):
30 | self.city_list += city
31 | else:
32 | print 'add_city: expected a City or a list of City'
33 |
34 | def reset_distance(self):
35 | self.total_distance = 0
36 |
37 | def get_total_distance(self, sequence = None):
38 | if len(self.city_list) < 2:
39 | print "Please add at least two cities!"
40 | else:
41 | distance = 0
42 | if sequence is None:
43 | for i,city in enumerate(self.city_list[:-1]):
44 | distance += self.get_distance(city, self.city_list[i+1])
45 |
46 | distance += self.get_distance(self.city_list[0], self.city_list[-1])
47 |
48 | elif sorted(sequence) == range(len(self.city_list)):
49 | self.reset_distance()
50 | for i,j in enumerate(sequence[:-1]):
51 | distance += self.get_distance(self.city_list[j], self.city_list[sequence[i+1]])
52 |
53 | distance += self.get_distance(self.city_list[sequence[0]], self.city_list[sequence[-1]])
54 |
55 | else:
56 | print 'Wrong Sequence'
57 | return distance
58 |
59 |
60 | def gen_new_sequence(sequence):
61 | sequence1 = copy.copy(sequence)
62 | swap_number1, swap_number2 = random.sample(sequence1, 2) # sampled values double as indices, since sequence is a permutation of range(n)
63 | sequence1[swap_number1], sequence1[swap_number2] = sequence1[swap_number2], sequence1[swap_number1]
64 | return copy.copy(sequence1)
65 |
66 | #@exeTime
67 | def get_shortest_distance(graph):
68 | T0 = 1000
69 | T_min = 1e-5
70 | delta = 0.9
71 | K = 10
72 | sequence = range(len(graph.city_list))
73 | distance = graph.get_total_distance(sequence)
74 | distance_list = []
75 | T = T0
76 | while T > T_min:
77 | for i in range(K):
78 | distance_list.append(distance)
79 | new_sequence = gen_new_sequence(sequence)
80 |
81 | new_distance = graph.get_total_distance(new_sequence)
82 |
83 | delta_E = new_distance - distance
84 | if delta_E < 0:
85 | distance = new_distance
86 | sequence = new_sequence
87 | break
88 | else:
89 | p_k = np.exp(- delta_E / T)
90 | if random.random() < p_k:
91 | distance = new_distance
92 | sequence = new_sequence
93 | break
94 | T *= delta
95 | return sequence, distance, distance_list
96 |
97 |
98 |
99 | if __name__ == '__main__':
100 |
101 | city_a = City(0, 0)
102 | city_b = City(0, 1)
103 | city_c = City(1, 0)
104 | city_d = City(1, 1)
105 |
106 | city_list = [
107 | City(0, 0),
108 | City(1, 0),
109 | City(2, 0),
110 | City(3, 0),
111 | City(4, 0),
112 | City(5, 2),
113 | City(0, 3),
114 | City(0, 4),
115 | City(0, 5),
116 | City(0, 6),
117 | City(1, 2),
118 | City(4, 3),
119 | City(50, 6),
120 | City(2, 3),
121 | City(1, 4),
122 | City(3, 16),
123 | City(3, 12),
124 | City(1, 12),
125 | City(12, 21),
126 | City(7, 8),
127 | City(5, 0),
128 | City(1, 9),
129 | City(2, 7),
130 | City(3, 7),
131 | City(10, 11),
132 | City(11, 1),
133 | City(17, 3),
134 | City(15, 3),
135 | City(22, 16),
136 | City(15, 1),
137 | City(8, 5),
138 | City(3, 1),
139 | City(2, 9),
140 | City(1, 9),
141 | City(9, 3),
142 | City(14, 1),
143 | City(12, 12),
144 | ]
145 | # coordinates of 31 Chinese provincial capitals; the known optimum is below 15500
146 | chinese_province_list = [
147 | City(1304,2312),
148 | City(3639,1315),
149 | City(4177,2244),
150 | City(3712,1399),
151 | City(3488,1535),
152 | City(3326,1556),
153 | City(3238,1229),
154 | City(4196,1004),
155 | City(4312,790),
156 | City(4386,570),
157 | City(3007,1970),
158 | City(2562,1756),
159 | City(2788,1491),
160 | City(2381,1676),
161 | City(1332,695),
162 | City(3715,1678),
163 | City(3918,2179),
164 | City(4061,2370),
165 | City(3780,2212),
166 | City(3676,2578),
167 | City(4029,2838),
168 | City(4263,2931),
169 | City(3429,1908),
170 | City(3507,2367),
171 | City(3394,2643),
172 | City(3439,3201),
173 | City(2935,3240),
174 | City(3140,3550),
175 | City(2545,2357),
176 | City(2778,2826),
177 | City(2370,2975)
178 | ]
179 |
180 | graph = Graph()
181 | #graph.add_city([city_a, city_b, city_c, city_d])
182 | graph.add_city(chinese_province_list)
183 | result = get_shortest_distance(graph)
184 | plt.plot(result[2])
185 | plt.show()
186 |
187 |
188 |
189 |
190 | # result_list = []
191 | # for i in range(100):
192 | # result = get_shortest_distance(graph)
193 | # result_list.append(result[1])
194 | # print result[1]
195 | # result_list.sort()
196 | # print result_list
197 |
198 |
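gen_new_sequence above perturbs the tour by swapping two cities. A commonly stronger neighborhood for TSP annealing is 2-opt, which reverses a segment of the tour; a minimal sketch (not part of the original file):

    import copy
    import random

    def gen_new_sequence_2opt(sequence):
        # reverse a random sub-segment of the tour (2-opt move)
        seq = copy.copy(sequence)
        i, j = sorted(random.sample(range(len(seq)), 2))
        seq[i:j + 1] = reversed(seq[i:j + 1])
        return seq

    print(gen_new_sequence_2opt(list(range(8))))

It can be swapped in for gen_new_sequence inside get_shortest_distance unchanged, since it also returns a permutation.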
--------------------------------------------------------------------------------
/优化模型/simulated_annealing.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf8 -*-
2 | import numpy as np
3 | import time
4 | import random
5 |
6 | """
7 | Pseudocode:
8 | =================================================
9 | randomly generate an initial solution x with objective value f(x)
10 | initial temperature T0 1e10
11 | stopping temperature T_min 1e-8
12 | cooling rate delta 0.9
13 | iterations per temperature K 100
14 |
15 | T = T0
16 | while T > T_min:
17 | for i in range(K):
18 | x' = gen(x)
19 | if f(x') < f(x):
20 | x = x'
21 | else:
22 | delta_E = f(x') - f(x)
23 | P_k = e^{-delta_E / T}   (Metropolis acceptance, as implemented below)
24 | rand = random.random()
25 | if rand < P_k:
26 | x = x'
27 | else:
28 | pass
29 | T *= delta
30 |
31 | =================================================
32 | """
33 |
34 |
35 | def exeTime(func):
36 | def newFunc(*args, **args2):
37 | t0 = time.time()
38 | print "%s, {%s} start" % (time.strftime("%X", time.localtime()), func.__name__)
39 | print '------------------- begin ------------------------'
40 | back = func(*args, **args2)
41 | print '-------------------- end -------------------------'
42 | print "%s, {%s} end" % (time.strftime("%X", time.localtime()), func.__name__)
43 | print "%.8fs taken for {%s}" % (time.time() - t0, func.__name__)
44 | return back
45 |
46 | return newFunc
47 |
48 |
49 | class SimulatedAnnealing:
50 |
51 | def __init__(self, func=None):
52 | self.T0 = 1000
53 | self.T_min = 1e-8
54 | self.delta = 0.99
55 | self.K = 10000
56 | self.x_range = (0, 100)
57 | self.func = func if callable(func) else (lambda x: (x-20)**2 if x <= 50 else (x-80)**2 + 30) # default test objective: bimodal, minimum 0 at x = 20
58 |
59 | def gen_new_x(self, x_before, T):
60 | while 1:
61 | x_after = x_before + (random.random() * 2 - 1) * T
62 | if self.x_range[0] <= x_after <= self.x_range[1]:
63 | return x_after
64 |
65 | @exeTime
66 | def begin(self):
67 | x = random.randint(self.x_range[0], self.x_range[1])
68 | f = self.func(x)
69 | T = self.T0
70 | while T > self.T_min:
71 | for i in range(self.K):
72 | new_x = self.gen_new_x(x, T)
73 | f_x = self.func(new_x)
74 | delta_E = f_x - f
75 | #
76 | if delta_E < 0:
77 | f = f_x
78 | x = new_x
79 | break
80 | else:
81 | #p_k = 1.0 / (1 + np.exp(- delta_E / self.func(T)))
82 | p_k = np.exp(- delta_E / T)
83 | if random.random() < p_k:
84 | f = f_x
85 | x = new_x
86 | break
87 | T *= self.delta
88 |
89 | return x
90 |
91 |
92 | if __name__ == '__main__':
93 | sa = SimulatedAnnealing()
94 | x = sa.begin()
95 | print x, sa.func(x)
96 |
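With the constructor fixed above to honor a caller-supplied objective (line 57), the class can be pointed at any one-dimensional function over x_range; a usage sketch:

    sa = SimulatedAnnealing(lambda x: (x - 30) ** 2 + 5)  # single minimum at x = 30
    x = sa.begin()
    print(x)  # should land near 30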
--------------------------------------------------------------------------------
/优化模型/simulated_annealing.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Yangruipis/ModelingPreparation/1663c8153fb97049ae716e2f0dab27d457bcbc9c/优化模型/simulated_annealing.pyc
--------------------------------------------------------------------------------
/小工具/Association_rules.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 |
3 | """
4 | mine frequent itemsets more efficiently via an FP-tree (FP-growth)
5 |
6 | git:
7 | http://github.com/enaeseth/python-fp-growth/
8 | """
9 |
10 | from fp_growth import find_frequent_itemsets
11 | from apriori import *
12 |
13 | test_case = [
14 | ['a','b'],
15 | ['b','c','d'],
16 | ['a','b','d','e'],
17 | ['a','d','e'],
18 | ['a','b','c'],
19 | ['a','b','c','d'],
20 | ['a'],
21 | ['a','b','c'],
22 | ['a','b','d'],
23 | ['b','c','e'],
24 | ]
25 |
26 | # ================ Approach 1: faster frequent-itemset mining with FP-growth ====================
27 | for item, support in find_frequent_itemsets(test_case, 2, True):
28 | print item, support
29 |
30 |
31 | # ================ Approach 2: generate rules and rank them by their support measure =================
32 | result_list = []
33 | for i in my_apriori(test_case):
34 | temp = '-'.join([k for k in i[0]]) + ',' + '-'.join([k for k in i[1]]) + ','
35 | result_list.append((temp, i[2]))
36 |
37 | result_list.sort(key = lambda x: x[1], reverse= False)
38 |
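my_apriori is imported from a local apriori module not included in this dump, so the return format used above (antecedent, consequent, measure) is inferred from the loop. For reference, the support of an itemset is simply the fraction of transactions that contain it; a minimal sketch with a hypothetical helper:

    def support(itemset, transactions):
        # fraction of transactions that contain every item in itemset
        itemset = set(itemset)
        hits = sum(1 for t in transactions if itemset.issubset(t))
        return hits * 1.0 / len(transactions)

    print(support(['a', 'b'], test_case))  # 6 of the 10 transactions -> 0.6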
--------------------------------------------------------------------------------
/小工具/data_clean.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 |
3 | """
4 | Data cleaning, covering
5 | - missing-value imputation
6 | - mean / median / mode fill Done
7 | - interpolation fill Done
8 | - outlier handling: winsorization
9 | - min-max normalization Done
10 | - standardization Done
11 | - binarization No Need
12 | - categorical variable encoding
13 | - ordinal No Need
14 | - nominal Done
15 | - regex validation (raw text only)
16 | - de-duplication
17 | - dropping invalid values No Need
18 | - cross-field consistency checks No Need
19 |
20 | """
21 |
22 | import sklearn.preprocessing as sp
23 | import pandas as pd
24 | import numpy as np
25 | import matplotlib.pyplot as plt
26 |
27 | def fill_na(df, excep_columns=[], how='mean'):
28 | """
29 | fill missing values
30 | :param how:
31 | = 'mean'
32 | = 'median'
33 | = 'most_frequent'
34 | """
35 | select_columns = [i for i in df.columns if i not in excep_columns]
36 | df_temp = df.loc[:, select_columns]
37 |
38 | imp = sp.Imputer(missing_values='NaN', strategy=how, axis=0)
39 | imp.fit(df_temp)
40 | result = imp.transform(df_temp)
41 | for i in range(result.shape[1]):
42 | df[select_columns[i]] = result[:, i]
43 |
44 | return df
45 |
46 | def interpolate_na(df, excep_columns=[], how='lagrange'):
47 | """
48 |
49 | :param df:
50 | :param how:
51 | lagrange  Lagrange polynomial interpolation
52 | spline    spline interpolation
53 | :return:
54 | """
55 | select_columns = [i for i in df.columns if i not in excep_columns]
56 |
57 | if how == 'lagrange':
58 | from scipy.interpolate import lagrange
59 | def ployinterp_column(s, n, k=5):
60 | set1 = set(range(len(s)))
61 | set2 = set(list(range(n - k, n)) + list(range(n + 1, n + 1 + k)))
62 | x = list(set1 & set2)
63 | y = s[x] # values of the k neighbors on each side
64 | x = np.array(x)[pd.notnull(y)]
65 | y = y[pd.notnull(y)] # drop missing neighbors
66 | lagrange_result = lagrange(x, y)
67 | return lagrange_result(n) # evaluate the interpolant at the missing index
68 | for column in select_columns:
69 | ds = df.loc[:,column].values
70 | if isinstance(ds[0], int) or isinstance(ds[0], float):
71 | for j in range(len(ds)):
72 | if pd.isnull(ds[j]):
73 | ds[j] = ployinterp_column(ds,j)
74 | df[column] = ds
75 | return df
76 | elif how == 'spline':
77 | from scipy.interpolate import spline
78 | for column in select_columns:
79 | ds = df.loc[:,column].values
80 | if isinstance(ds[0], int) or isinstance(ds[0], float):
81 | target_index= np.arange(len(ds))
82 | index = target_index[pd.notnull(ds)]
83 | ds_notnull = ds[pd.notnull(ds)]
84 | new_ds = spline(index, ds_notnull, target_index)
85 | df[column] = new_ds
86 | return df
87 |
88 | def standardize(df, excep_columns=[]):
89 | """
90 | standardize columns (z-score), assuming approximate normality
91 | """
92 | select_columns = [i for i in df.columns if i not in excep_columns]
93 | df_temp = df.loc[:, select_columns]
94 | scaler = sp.StandardScaler().fit(df_temp)
95 | result = scaler.transform(df_temp)
96 | for i in range(result.shape[1]):
97 | df[select_columns[i]] = result[:, i]
98 | return df
99 |
100 | def normalize(df, excep_columns=[]):
101 | """
102 | min-max normalization: rescale each column to [0, 1]
103 | """
104 | select_columns = [i for i in df.columns if i not in excep_columns]
105 | df_temp = df.loc[:, select_columns]
106 | min_max_scaler = sp.MinMaxScaler()
107 | min_max_scaler.fit_transform(df_temp)
108 | result = min_max_scaler.transform(df_temp)
109 | for i in range(result.shape[1]):
110 | df[select_columns[i]] = result[:, i]
111 | return df
112 |
113 |
114 | def label_encode(df, encode_column=[]):
115 | """
116 | encode categorical labels as integers; note: only for nominal (unordered) labels
117 | :param df: DataFrame
118 | :param encode_column: list of column names
119 | :return: DataFrame
120 | """
121 | le = sp.LabelEncoder()
122 | for column in encode_column:
123 | # convert non-numeric labels to integers
124 | ds = df.loc[:, column].values
125 | le.fit(ds)
126 | df[column] = le.transform(ds) # array([2, 2, 1])
127 | return df
128 |
129 | def drop_duplicate(df, columns=[]):
130 | return df.drop_duplicates(subset=columns)
131 |
132 |
133 | def replace_outlier(df):
134 | # buggy and untested; use winsorize() below instead
135 | result = sp.robust_scale(df, with_scaling=False, with_centering=False)
136 | return pd.DataFrame(result)
137 |
138 | def winsorize(df, low_q=1, up_q=99):
139 | temp_df = df.copy()
140 | for column in temp_df.columns:
141 | ds = temp_df[column].values
142 | if isinstance(ds[0], int) or isinstance(ds[0], float):
143 | lower_bound = np.percentile(ds, low_q)
144 | upper_bound = np.percentile(ds, up_q)
145 | ds = map(lambda x: lower_bound if x < lower_bound else upper_bound if x > upper_bound else x, ds)
146 | temp_df[column] = ds
147 | return temp_df
148 |
149 |
150 | if __name__ == '__main__':
151 | df = pd.read_csv("/home/ray/Documents/suibe/2017/建模/Modeling_Preparation/dataset/auto.csv")
152 | df_columns = df.columns
153 |
154 | # encode categorical variables
155 | df = label_encode(df, ['make', 'foreign'])
156 |
157 | # this data set is not a time series, so interpolation makes little sense; fill with the sample mean instead
158 | # spline interpolation fill:
159 | # df = interpolate_na(df, ['rep78'], how='spline')
160 | # mean fill:
161 | df = fill_na(df)
162 |
163 | # standardize
164 | df = standardize(df, ['make','foreign']) # categorical columns, excluded from standardization
165 |
166 | # normalize
167 | # df = normalize(df, ['make','foreign']) # categorical columns, excluded from normalization
168 |
169 | # de-duplicate
170 | # df = drop_duplicate(df, ['foreign', 'rep78'])
171 |
172 | temp = df['price'].values
173 | temp[0] = 5
174 | df['price'] =temp
175 |
176 | # outliers
177 | df2 = winsorize(df,1,99)
178 |
179 | ax = plt.subplot(111)
180 | ax.scatter(df.index, df.price.values, color='r', label='1')
181 | ax.plot(df2.index, df2.price, color='b', label='2')
182 | ax.legend(['1','2'])
183 | plt.show()
184 |
185 |
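winsorize above rebuilds each column with a Python-2 map call. An equivalent vectorized sketch using pandas' clip (an alternative, not a change to the file):

    import numpy as np

    def winsorize_clip(df, low_q=1, up_q=99):
        # clip each numeric column to its [low_q, up_q] percentile range
        out = df.copy()
        for col in out.select_dtypes(include=[np.number]).columns:
            lo, hi = np.percentile(out[col], [low_q, up_q])
            out[col] = out[col].clip(lo, hi)
        return out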
--------------------------------------------------------------------------------
/小工具/due_date_calculate.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | from WindPy import *
3 | import matplotlib.pyplot as plt
4 | import datetime
5 | import numpy as np
6 |
7 |
8 | def Wind2Df(wind_data):
9 | df = pd.DataFrame(wind_data.Data).T
10 | df.columns = wind_data.Fields
11 | df.index = wind_data.Times
12 | return df
13 |
14 | def is_due_date(date): # expiry is the third Friday, which always falls on day 15-21
15 | if 15 <= date.day <= 21:
16 | if date.weekday() == 4:
17 | return True
18 | return False
19 |
20 | def gen_due_date(year, month):
21 | date0 = datetime.date(year,month, 1)
22 | for i in range(31):
23 | date0 = date0 + datetime.timedelta(1)
24 | if is_due_date(date0):
25 | return date0
26 | return None
27 |
28 | def get_due_date(date):
29 | due_date_this_month = gen_due_date(date.year, date.month)
30 | if date.month != 12:
31 | due_date_next_month = gen_due_date(date.year, date.month + 1)
32 | else:
33 | due_date_next_month = gen_due_date(date.year + 1, 1) # wrap to January of the next year
34 | if date > due_date_this_month:
35 | return due_date_next_month
36 | else:
37 | return due_date_this_month
38 |
39 | w.start()
40 |
41 | df_if00 = Wind2Df(w.wsi("IF00.CFE", "close, volume", "2016-02-01 09:30:00", "2017-08-16 13:48:43", "periodstart=09:30:00;periodend=15:00:00"))
42 | df_if01 = Wind2Df(w.wsi("IF01.CFE", "close, volume", "2016-02-01 09:30:00", "2017-08-16 13:48:43", "periodstart=09:30:00;periodend=15:00:00"))
43 | df_if00.columns = ['close0', 'volume0']
44 | df_if01.columns = ['close1', 'volume1']
45 |
46 | df_all = pd.merge(df_if00, df_if01, left_index=True, right_index=True)
47 |
48 | df_all['diff'] = df_all.close0 - df_all.close1
49 | df_all['date'] = map(lambda x: x.date(), df_all.index)
50 | df_all['due_time'] = map(lambda x: get_due_date(x.date()), df_all.index)
51 | df_all['t'] = map(lambda x,y: (y - x.date() ).days, df_all.index, df_all.due_time)
52 |
53 |
54 | df_all1 = df_all.copy()
55 | df_all1.index = range(df_all1.shape[0])
56 | df_all1['0day_diff'] = map(lambda y,x: y if x == 0 else np.nan, df_all1['diff'], df_all1['t'])
57 | df_all1['1day_diff'] = map(lambda y,x: y if x == 1 else np.nan, df_all1['diff'], df_all1['t'])
58 | df_all1['2day_diff'] = map(lambda y,x: y if x == 2 else np.nan, df_all1['diff'], df_all1['t'])
59 |
60 | fig = plt.figure()
61 | ax1 = fig.add_subplot(211)
62 | ax2 = fig.add_subplot(212)
63 | df_all1[['volume0', 'volume1']].plot(ax = ax1)
64 | df_all1[['diff','0day_diff','1day_diff','2day_diff' ]].plot(ax = ax2)
65 | plt.show()
66 |
67 |
68 |
69 |
70 |
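is_due_date and gen_due_date above scan day by day for the expiry date, the third Friday of the month. The same date can be computed directly with the standard library; a sketch:

    import calendar
    import datetime

    def third_friday(year, month):
        # the first Friday falls on day 1-7, so the third is 14 days later
        first_weekday = calendar.weekday(year, month, 1)  # Mon=0 ... Sun=6
        first_friday = 1 + (calendar.FRIDAY - first_weekday) % 7
        return datetime.date(year, month, first_friday + 14)

    print(third_friday(2017, 8))  # 2017-08-18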
--------------------------------------------------------------------------------
/小工具/lasso_regression.m:
--------------------------------------------------------------------------------
1 | % Lasso regression
2 | %
3 | % # lasso (least squares with an L1 penalty)
4 | % lasso is used for variable selection
5 | %
6 | % http://blog.csdn.net/sinat_26917383/article/details/52092040 # regularization; lasso vs ridge
7 | % matlab implementation: http://cn.mathworks.com/help/stats/lasso.html
8 | %
9 |
10 | % lasso
11 | data = csvread('auto_1.csv',1,0);
12 | X = data(:,1:10);
13 | Y = data(:, 11);
14 | weight = lasso(X,Y);
15 |
16 | % plot
17 | hold on
18 | axis([0 100 -0.6 0.1]);
19 | xlabel log(lam);
20 | ylabel weights;
21 | y = zeros(1,100);
22 | for i = 1:10
23 | x = 0:99;
24 | y(1,:) = weight(i,:);
25 | plot(x,y);
26 | %legend(int2str( i));
27 | end
28 | legend('1','2','3','4','5','6','7','8','9','10');
29 | hold off
30 |
31 | % Interpreting the result:
32 | % by default, lasso returns the coefficients for 100 lambda values as an n-row, 100-column matrix (n = number of predictors)
33 | % lasso is used for variable screening: coefficients of irrelevant variables shrink toward 0, relevant ones do not
34 |
35 | % lasso with options
36 | % 1. lasso & ridge (elastic net)
37 | % mostly ridge with a 1% lasso share (Alpha = 0.01), evaluated over 20 lambda values
38 | lasso(data(:,1:10), data(:,11), 'Alpha', 0.01, 'NumLambda', 20)
39 |
40 |
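The same screening exercise can be reproduced in Python; a sketch with scikit-learn's lasso_path, assuming auto_1.csv has the layout used above (ten predictors followed by the response):

    import numpy as np
    import pandas as pd
    import matplotlib.pyplot as plt
    from sklearn.linear_model import lasso_path

    data = pd.read_csv('auto_1.csv').values
    X, y = data[:, :10], data[:, 10]

    # coefficient paths over a descending grid of penalty strengths
    alphas, coefs, _ = lasso_path(X, y, n_alphas=100)
    for i in range(coefs.shape[0]):
        plt.plot(np.log(alphas), coefs[i])  # one trace per predictor
    plt.xlabel('log(alpha)')
    plt.ylabel('weights')
    plt.show()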
--------------------------------------------------------------------------------
/小工具/ridgeRegression_func1.m:
--------------------------------------------------------------------------------
1 | function [ w ] = ridgeRegression_func1( x, y, lam )
2 | xTx = x'*x;
3 | [m,n] = size(xTx);
4 | temp = xTx + eye(m,n)*lam;
5 | if det(temp) == 0
6 | disp('This matrix is singular, cannot do inverse');
7 | end
8 | w = temp \ (x'*y); % backslash solve is more stable than an explicit inverse
9 | end
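For reference, the same closed-form estimate w = (x'x + lam*I)^(-1) x'y in NumPy, using a linear solve rather than an explicit inverse (a sketch, not part of the repo):

    import numpy as np

    def ridge_weights(X, y, lam):
        # solve (X'X + lam*I) w = X'y
        A = X.T.dot(X) + lam * np.eye(X.shape[1])
        return np.linalg.solve(A, X.T.dot(y))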
--------------------------------------------------------------------------------
/小工具/ridge_regression.m:
--------------------------------------------------------------------------------
1 | %% Ridge Regression
2 |
3 | % # ridge regression (least squares with an L2 penalty)
4 | % - a biased estimator, but keeps the RSS small while making the parameters much more stable
5 | % - adds a perturbation (second-order regularization) term to OLS so the problem is well-posed
6 | % - intended for samples that carry too little information for OLS, where the uniqueness condition fails;
7 | %   it trades some information and precision for more realistic, more reliable coefficients, and fits ill-conditioned data better than OLS
8 | %
9 | % http://blog.csdn.net/google19890102/article/details/27228279
10 | % http://f.dataguru.cn/thread-598486-1-1.html
11 |
12 | % load data
13 | data = csvread('auto_1.csv', 1,0);
14 | [m,n] = size(data);
15 |
16 | dataX = data(:,1:10); % features
17 | dataY = data(:,11); % labels
18 |
19 | % standardize
20 | yMeans = mean(dataY);
21 | for i = 1:m
22 | yMat(i,:) = dataY(i,:)-yMeans;
23 | end
24 |
25 | xMeans = mean(dataX);
26 | xVars = var(dataX);
27 | for i = 1:m
28 | xMat(i,:) = (dataX(i,:) - xMeans)./xVars;
29 | end
30 |
31 | % run 30 times
32 | testNum = 30;
33 | weights = zeros(testNum, n-1);
34 | for i = 1:testNum
35 | w = ridgeRegression_func1(xMat, yMat, exp(i-10));
36 | weights(i,:) = w';
37 | end
38 |
39 | % plot the ridge trace as lam varies
40 | % choosing lambda: by inspection, pick a value near the "elbow", where the betas have stabilized but the total RSS is still small
41 | % variable selection: drop variables whose beta stays near 0 throughout
42 | hold on
43 | axis([-9 20 -1.0 2.5]);
44 | xlabel log(lam);
45 | ylabel weights;
46 | for i = 1:n-1
47 | x = -9:20;
48 | y(1,:) = weights(:,i)';
49 | plot(x,y);
50 | end
51 |
52 | % reading the result:
53 | % each row corresponds to one lambda value and holds every predictor's coefficient beta_i(lambda) at that lambda
--------------------------------------------------------------------------------
/小工具/trade_account.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | from WindPy import *
3 | import pandas as pd
4 | import numpy as np
5 | from collections import defaultdict
6 | import pickle
7 |
8 | INSTRUMENT_OPTION = 0
9 | INSTRUMENT_FUTURE = 1
10 |
11 | DIRECTION_BUY = 0
12 | DIRECTION_SELL = 1
13 |
14 | OPENTYPE_OPEN = 0
15 | OPENTYPE_CLOSE = 1
16 |
17 | future_parameter = {
18 | 'interest_rate' : 0.0000231,
19 | 'deposit_rate' : 0.2,
20 | 'point_value' : 300
21 | }
22 |
23 | option_parameter = {
24 | 'interest' : 2.5
25 | }
26 |
27 | class PositionQueue:
28 |
29 | def __init__(self):
30 | self.buy_queue_dict = defaultdict(list)
31 | self.sell_queue_dict = defaultdict(list)
32 | self.positions = defaultdict(int)
33 |
34 | def add(self, instrument, direction, price):
35 | self.positions[(instrument, direction)] += 1
36 |
37 | if direction == DIRECTION_BUY:
38 | self.buy_queue_dict[instrument].append(price)
39 | else:
40 | self.sell_queue_dict[instrument].append(price)
41 |
42 | def pop(self, instrument, direction):
43 | self.positions[(instrument, direction)] -= 1
44 | if direction == DIRECTION_SELL:
45 | self.buy_queue_dict[instrument].pop(0)
46 | else:
47 | self.sell_queue_dict[instrument].pop(0)
48 |
49 | def have_position(self, instrument, direction): # closing a SELL consumes long (buy) positions, and vice versa
50 | if direction == DIRECTION_SELL:
51 | if len(self.buy_queue_dict[instrument]) > 0:
52 | return True
53 | else:
54 | return False
55 | else:
56 | if len(self.sell_queue_dict[instrument]) > 0:
57 | return True
58 | else:
59 | return False
60 |
61 | def display(self):
62 | pass
63 |
64 |
65 | class TradeAccount:
66 |
67 | def __init__(self, init_capital):
68 | self.capital = init_capital
69 | self.init_capital = init_capital
70 | self.deposit = 0
71 | self.cash = init_capital
72 | self.position = PositionQueue()
73 |
74 | def order_future(self, price, instrument, direction, open_type, amount=1):
75 | if open_type == OPENTYPE_OPEN:
76 | deposit = amount * price * future_parameter['point_value'] * future_parameter['deposit_rate']
77 | interest = amount * price * future_parameter['point_value'] * future_parameter['interest_rate']
78 | self.deposit += deposit
79 | self.cash -= deposit + interest
80 | self.capital -= interest
81 | for i in range(amount):
82 | self.position.add(instrument, direction, price)
83 |
84 | elif open_type == OPENTYPE_CLOSE:
85 | if direction == DIRECTION_BUY:
86 | interest = price * future_parameter['point_value'] * future_parameter['interest_rate']
87 | for i in range(amount):
88 | to_be_offset_list = self.position.sell_queue_dict[instrument]
89 | if to_be_offset_list == []:
90 | print 'Not enough short futures positions; %s close requests left unfilled' % (amount-i)
91 | break
92 | else:
93 | deposit = to_be_offset_list[0] * future_parameter['point_value'] * future_parameter['deposit_rate']
94 | earn = (to_be_offset_list[0] - price) * future_parameter['point_value']
95 | self.deposit -= deposit
96 | self.cash += deposit + earn - interest
97 | self.capital += earn - interest
98 | self.position.pop(instrument, direction)
99 |
100 | elif direction == DIRECTION_SELL:
101 | interest = price * future_parameter['point_value'] * future_parameter['interest_rate']
102 | for i in range(amount):
103 | to_be_offset_list = self.position.buy_queue_dict[instrument]
104 | if to_be_offset_list == []:
105 | print 'Not enough long futures positions; %s close requests left unfilled' % (amount - i)
106 | break
107 | else:
108 | deposit = to_be_offset_list[0] * future_parameter['point_value'] * future_parameter[
109 | 'deposit_rate']
110 | earn = (price - to_be_offset_list[0]) * future_parameter['point_value']
111 | self.deposit -= deposit
112 | self.cash += deposit + earn - interest
113 | self.capital += earn - interest
114 | self.position.pop(instrument, direction)
115 | else:
116 | raise Exception('No Such Open Type')
117 |
118 | def order_option(self, price, instrument, direction, open_type, amount):
119 | # no fee when opening a short option position
120 | if open_type == OPENTYPE_OPEN:
121 | if direction == DIRECTION_BUY:
122 | interest = option_parameter['interest'] * amount
123 | cost = price * 10000 * amount
124 | self.cash += - cost - interest
125 | self.capital += - interest
126 | for i in range(amount):
127 | self.position.buy_queue_dict[instrument].append(price)
128 |
129 | elif direction == DIRECTION_SELL:
130 | interest = 0
131 | get = price * 10000 * amount
132 | self.cash += get - interest
133 | self.capital += - interest
134 | for i in range(amount):
135 | self.position.sell_queue_dict[instrument].append(price)
136 |
137 | elif open_type == OPENTYPE_CLOSE:
138 | interest = option_parameter['interest']
139 | if direction == DIRECTION_BUY:
140 | for i in range(amount):
141 | to_be_offset_list = self.position.sell_queue_dict[instrument]
142 | if to_be_offset_list == []:
143 | print 'Not enough short option positions; %s close requests left unfilled' % (amount - i)
144 | break
145 | else:
146 | earn = (to_be_offset_list[0] - price) * 10000
147 | self.cash += - price * 10000 - interest
148 | self.capital += earn - interest
149 | self.position.pop(instrument, direction)
150 |
151 | elif direction == DIRECTION_SELL:
152 | for i in range(amount):
153 | to_be_offset_list = self.position.buy_queue_dict[instrument]
154 | if to_be_offset_list == []:
155 | print 'Not enough long option positions; %s close requests left unfilled' % (amount - i)
156 | break
157 | else:
158 | earn = (price - to_be_offset_list[0]) * 10000
159 | self.cash += price * 10000 - interest
160 | self.capital += earn - interest
161 | self.position.pop(instrument, direction)
162 |
163 | def end_trade(self):
164 | with open('account_record', 'wb') as f:
165 | pickle.dump(self, f)
166 |
167 |
168 | class ValueCalculate():
169 |
170 | def __init__(self, capital_list, init_capital):
171 | self._init_capital = init_capital
172 | self.capital_list = capital_list
173 | self.return_list = []
174 | self.profit_list = []
175 | self.get_return_list()
176 |
177 | def get_return_list(self):
178 | for i, capital in enumerate(self.capital_list):
179 | if i == 0:
180 | self.return_list.append((capital - self._init_capital) / self._init_capital)
181 | self.profit_list.append(capital - self._init_capital)
182 | else:
183 | self.return_list.append((capital - self.capital_list[i-1]) / self.capital_list[i-1])
184 | self.profit_list.append(capital - self.capital_list[i-1])
185 |
186 |
187 | def get_total_return(self):
188 | return (self.capital_list[-1] - self._init_capital) / self._init_capital
189 |
190 | def get_annual_return(self):
191 | return self.get_total_return() / len(self.capital_list) * 250.0
192 |
193 | def get_average_return(self):
194 | return np.mean(self.return_list)
195 |
196 | def get_total_trade_times(self):
197 | return "%s / %s" % ((self.get_win_times() + self.get_lose_times()), len(self.capital_list))
198 |
199 | def get_return_volatility(self):
200 | rit_bar = self.get_average_return()
201 | sum_temp = 0
202 | for i in self.return_list:
203 | sum_temp += np.square(i - rit_bar)
204 | volatility = np.sqrt((250.0 / (len(self.capital_list) - 1)) * sum_temp)
205 | return volatility
206 |
207 | def get_win_times(self):
208 | win_list = [i for i in self.return_list if i > 0]
209 | return len(win_list)
210 |
211 | def get_lose_times(self):
212 | lose_list = [i for i in self.return_list if i < 0]
213 | return len(lose_list)
214 |
215 | def get_win_ratio(self):
216 | return self.get_win_times() * 1.0 / (self.get_win_times() + self.get_lose_times())
217 |
218 | def get_win_lose_ratio(self):
219 | win_sum = np.sum([i for i in self.profit_list if i > 0])
220 | lose_sum = np.sum([i for i in self.profit_list if i < 0])
221 | return - win_sum * 1.0 / lose_sum
222 |
223 | def get_max_win(self):
224 | return max([i for i in self.profit_list if i > 0]) / self._init_capital
225 |
226 | def get_max_lose(self):
227 | return -min([i for i in self.profit_list if i < 0]) / self._init_capital
228 |
229 | def get_continue_win_times(self):
230 | time_count_list = []
231 | temp = 0
232 | for i, returns in enumerate(self.return_list):
233 | if returns > 0:
234 | temp += 1
235 | else:
236 | time_count_list.append(temp)
237 | temp = 0
238 | return max(time_count_list)
239 |
240 | def get_continue_lose_times(self):
241 | time_count_list = []
242 | temp = 0
243 | for i, returns in enumerate(self.return_list):
244 | if returns < 0:
245 | temp += 1
246 | else:
247 | time_count_list.append(temp)
248 | temp = 0
249 | return max(time_count_list)
250 |
251 | def get_max_drawdown(self):
252 | drawdown_list = []
253 | for i, capital in enumerate(self.capital_list):
254 | new_capital_list = self.capital_list[:i]
255 | if len(new_capital_list) > 0:
256 | max_capital_past = max(new_capital_list)
257 | drawdown = 1 - capital * 1.0 / max_capital_past # force float division under Python 2
258 | drawdown_list.append(drawdown)
259 | return max(drawdown_list)
260 |
261 | def get_sharp_ratio(self):
262 | volatility = self.get_return_volatility()
263 | sharp = ((self.capital_list[-1] - self._init_capital) / self._init_capital - 0.03) / volatility
264 | return sharp
265 |
266 | def display(self):
267 | print "总收益率: ", self.get_total_return()
268 | print "年化收益率: ", self.get_annual_return()
269 | print "日均收益率: ", self.get_average_return()
270 | print "总交易次数: ", self.get_total_trade_times()
271 | print "收益波动率: ", self.get_return_volatility()
272 | print "获胜次数: ", self.get_win_times()
273 | print "失败次数: ", self.get_lose_times()
274 | print "胜率: ", self.get_win_ratio()
275 | print "盈亏比: ", self.get_win_lose_ratio()
276 | print "单次最大盈利: ", self.get_max_win()
277 | print "单次最大亏损: ", self.get_max_lose()
278 | print "最大连胜次数: ", self.get_continue_win_times()
279 | print "最大连负次数: ", self.get_continue_lose_times()
280 | print "最大回撤: ", self.get_max_drawdown()
281 | print "夏普比率: ", self.get_sharp_ratio()
282 |
283 | if __name__ == "__main__":
284 | pass
285 |
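get_max_drawdown above rescans the whole history at every step (quadratic time); with a running peak it is a single pass. A NumPy sketch of the same quantity:

    import numpy as np

    def max_drawdown(capital_list):
        # largest peak-to-trough decline as a fraction of the running peak
        capital = np.asarray(capital_list, dtype=float)
        running_peak = np.maximum.accumulate(capital)
        return (1.0 - capital / running_peak).max()

    print(max_drawdown([100, 120, 90, 110, 80]))  # (120 - 80) / 120 = 0.333...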
--------------------------------------------------------------------------------
/小工具/二分法期权计算器.cs:
--------------------------------------------------------------------------------
1 | using System;
2 | using System.Collections.Generic;
3 | using System.ComponentModel;
4 | using System.Data;
5 | using System.Drawing;
6 | using System.Linq;
7 | using System.Text;
8 | using System.Threading.Tasks;
9 | using System.Windows.Forms;
10 |
11 | namespace 期权计算器
12 | {
13 | public partial class Form1 : Form
14 | {
15 | public Form1()
16 | {
17 | InitializeComponent();
18 | }
19 |
20 | private static double CumDensity(double z)
21 | {
22 | double p = 0.3275911;
23 | double a1 = 0.254829592;
24 | double a2 = -0.284496736;
25 | double a3 = 1.421413741;
26 | double a4 = -1.453152027;
27 | double a5 = 1.061405429;
28 |
29 | int sign;
30 | if (z < 0.0)
31 | sign = -1;
32 | else
33 | sign = 1;
34 |
35 | double x = Math.Abs(z) / Math.Sqrt(2.0);
36 | double t = 1.0 / (1.0 + p * x);
37 | double erf = 1.0 - (((((a5 * t + a4) * t) + a3)
38 | * t + a2) * t + a1) * t * Math.Exp(-x * x);
39 | return 0.5 * (1.0 + sign * erf);
40 | }
41 |
42 | private double get_value(double[] double_array )
43 | {
44 | double underlying_price = double_array[0];
45 | double strike_price = double_array[1];
46 | double due_time = double_array[2];
47 | double rate = double_array[3];
48 | double vol = double_array[4];
49 |
50 | double d1 = (Math.Log(underlying_price / strike_price) + (rate + Math.Pow(vol, 2) / 2) * due_time) / (vol * Math.Sqrt(due_time));
51 | double d2 = d1 - vol * Math.Sqrt(due_time);
52 | return underlying_price * CumDensity(d1) - strike_price * Math.Exp(-rate * due_time) * CumDensity(d2);
53 | }
54 |
55 | private List<int> has_one_null(string[] str)
56 | {
57 | List<int> null_index = new List<int>();
58 | for (int i = 0; i < str.Length; i++)
59 | {
60 | if (str[i].Trim() == string.Empty)
61 | {
62 | null_index.Add(i);
63 | }
64 | }
65 | return null_index;
66 | }
67 |
68 | private double dichotomy_cal(int index, double upper, double lower, double[] param, bool positive, double price)
69 | {
70 | param[index] = (upper + lower) / 2.0;
71 | double pre;
72 | int count = 0;
73 | while (true) {
74 | count++;
75 | double price1 = get_value(param);
76 | pre = param[index];
77 | if (positive)
78 | {
79 | if (price1 < price)
80 | {
81 | lower = param[index];
82 | param[index] = (lower + upper) / 2.0;
83 | }
84 | else if (price1 > price)
85 | {
86 | upper = param[index];
87 | param[index] = (lower + upper) / 2.0;
88 | }
89 | else
90 | return param[index];
91 | if (Math.Abs(param[index] - pre) < 1e-5)
92 | return param[index];
93 | if(count > 10000)
94 | return 9999;
95 | }
96 | else
97 | {
98 | if (price1 > price)
99 | {
100 | lower = param[index];
101 | param[index] = (lower + upper) / 2.0;
102 | }
103 | else if (price1 < price)
104 | {
105 | upper = param[index];
106 | param[index] = (lower + upper) / 2.0;
107 | }
108 | else
109 | return param[index];
110 | if (Math.Abs(param[index] - pre) < 1e-5)
111 | return param[index];
112 | if (count > 10000)
113 | return 9999;
114 | }
115 |
116 | }
117 | }
118 |
119 |
120 | private double up, sp, t, r, vol, p;
121 |
122 |
123 |
124 | private void button1_Click(object sender, EventArgs e)
125 | {
126 | textBox_result.Text = "";
127 |
128 | string underlying_price = textBox_up.Text;
129 | string strike_price = textBox_sp.Text;
130 | string due_time = textBox_t.Text;
131 | string rate = textBox_r.Text;
132 | string volatility = textBox_v.Text;
133 | string price = textBox_price.Text;
134 |
135 | up = sp = t = r = vol = p = 9999;
136 |
137 | if (underlying_price != String.Empty)
138 | up = Convert.ToDouble(underlying_price);
139 | if (strike_price != String.Empty)
140 | sp = Convert.ToDouble(strike_price);
141 | if (due_time != String.Empty)
142 | t = Convert.ToDouble(due_time) / 365;
143 | if (rate != String.Empty)
144 | r = Convert.ToDouble(rate);
145 | if (volatility != String.Empty)
146 | vol = Convert.ToDouble(volatility);
147 | if (price != String.Empty)
148 | p = Convert.ToDouble(price);
149 |
150 |
151 | string[] string_array = new string[6] { underlying_price, strike_price, due_time, rate, volatility, price };
152 | var null_index = has_one_null(string_array);
153 |
154 | if (null_index.Count != 1)
155 | {
156 | MessageBox.Show("只能空一个待求参数!");
157 | }
158 | else
159 | {
160 | switch (null_index[0])
161 | {
162 | case 5:
163 | var result5 = get_value(new double[] { up, sp, t, r, vol });
164 | textBox_result.Text = Math.Round(result5, 4).ToString();
165 | break;
166 | case 4:
167 | // vol
168 | var result4 = dichotomy_cal(4, 100, 0, new double[] { up, sp, t, r, vol }, true, p);
169 | if (result4 != 9999)
170 | textBox_result.Text = Math.Round(result4, 4).ToString();
171 | else
172 | MessageBox.Show("深度实值认购期权隐波无解!");
173 | break;
174 | case 3:
175 | // rate
176 | var result3 = dichotomy_cal(3, 1.0, 0, new double[] { up, sp, t, r, vol }, true, p);
177 | textBox_result.Text = Math.Round(result3, 4).ToString();
178 | break;
179 | case 2:
180 | // due_time
181 | var result2 = dichotomy_cal(2, 100, 0, new double[] { up, sp, t, r, vol }, true, p);
182 | textBox_result.Text = Math.Round(result2 * 365).ToString();
183 | break;
184 | case 1:
185 | //strike price
186 | var result1 = dichotomy_cal(1, 10, 0, new double[] { up, sp, t, r, vol }, false, p);
187 | textBox_result.Text = Math.Round(result1, 4).ToString();
188 | break;
189 | case 0:
190 | //underlying price
191 | var result0 = dichotomy_cal(0, 10, 0, new double[] { up, sp, t, r, vol }, true, p);
192 | textBox_result.Text = Math.Round(result0, 4).ToString();
193 | break;
194 | }
195 | }
196 |
197 |
198 |
199 | }
200 | }
201 | }
202 |
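The C# form wires Black-Scholes pricing and bisection into a WinForms UI. The numerical core is compact in Python; a sketch of the most common case, backing out implied volatility from a call price (standard normal CDF via math.erf):

    import math

    def bs_call(S, K, T, r, vol):
        # Black-Scholes price of a European call
        d1 = (math.log(S / K) + (r + vol ** 2 / 2.0) * T) / (vol * math.sqrt(T))
        d2 = d1 - vol * math.sqrt(T)
        N = lambda z: 0.5 * (1.0 + math.erf(z / math.sqrt(2.0)))
        return S * N(d1) - K * math.exp(-r * T) * N(d2)

    def implied_vol(price, S, K, T, r, lo=1e-6, hi=5.0, tol=1e-8):
        # bisection: the call price is increasing in vol, so bracket and halve
        for _ in range(200):
            mid = (lo + hi) / 2.0
            if bs_call(S, K, T, r, mid) < price:
                lo = mid
            else:
                hi = mid
            if hi - lo < tol:
                break
        return (lo + hi) / 2.0

    print(implied_vol(10.45, 100.0, 100.0, 1.0, 0.05))  # about 0.20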
--------------------------------------------------------------------------------
/评价模型/EntropyWeight.m:
--------------------------------------------------------------------------------
1 | function weights = EntropyWeight(R)
2 | %% entropy weight method: compute indicator weights; R is the input matrix, returns the weight vector weights
3 |
4 | [rows,cols]=size(R); % size of the input: rows = number of objects, cols = number of indicators
5 | k=1/log(rows); % normalizing constant k
6 |
7 | f=zeros(rows,cols); % initialize fij
8 | sumBycols=sum(R,1); % column sums of the input matrix (a 1*cols row vector)
9 | % compute fij
10 | for i=1:rows
11 | for j=1:cols
12 | f(i,j)=R(i,j)./sumBycols(1,j);
13 | end
14 | end
15 |
16 | lnfij=zeros(rows,cols); % initialize lnfij
17 | % compute lnfij
18 | for i=1:rows
19 | for j=1:cols
20 | if f(i,j)==0
21 | lnfij(i,j)=0;
22 | else
23 | lnfij(i,j)=log(f(i,j));
24 | end
25 | end
26 | end
27 |
28 | Hj=-k*(sum(f.*lnfij,1)); % entropy Hj of each indicator
29 | weights=(1-Hj)/(cols-sum(Hj));
30 | end
31 |
32 |
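A NumPy transcription of the same computation (a sketch; R must be non-negative, with rows as objects and columns as indicators):

    import numpy as np

    def entropy_weight(R):
        R = np.asarray(R, dtype=float)
        rows, cols = R.shape
        k = 1.0 / np.log(rows)
        f = R / R.sum(axis=0)  # column-normalized proportions
        safe = np.where(f > 0, f, 1.0)  # so that 0 * log(0) contributes 0
        H = -k * (f * np.log(safe)).sum(axis=0)  # entropy of each indicator
        return (1.0 - H) / (cols - H.sum())  # weights sum to 1

    print(entropy_weight(np.array([[1.0, 2.0], [2.0, 1.0], [3.0, 3.0]])))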
--------------------------------------------------------------------------------
/评价模型/PPE.asv:
--------------------------------------------------------------------------------
1 | % this program uses the PPE (projection pursuit evaluation) method to judge the quality of a second-hand car
2 |
3 | load('auto.mat')
4 | this_size = size(auto_dataset);
5 | n = this_size(1);
6 | p = this_size(2);
7 |
8 |
9 | standard_data = zeros(74,11);
10 | for j = 1:p
11 | for i = 1:n
12 | if(ismember(j, [1,2,4,5,6,7]) == 1)
13 | standard_data(i,j) = (auto_dataset(i,j) - min(auto_dataset(:,j))) / ...
14 | (max(auto_dataset(:,j)) - min(auto_dataset(:,j)));
15 | else
16 | standard_data(i,j) = (max(auto_dataset(:,j)) - auto_dataset(i,j)) / ...
17 | (max(auto_dataset(:,j)) - min(auto_dataset(:,j)));
18 | end
19 | end
20 | end
21 |
22 | alpha = zeros(1,p);
23 |
24 |
25 | function result = get_z(alpha, n, p, data, R)
26 | z = zeros(n,1);
27 | for i = 1:n
28 | sum = 0;
29 | for j = 1:p
30 | sum = sum + alpha(j) * data(i,j);
31 | end
32 | z(i) = sum;
33 | end
34 | S_alpha = std(z);
35 | sum_d = 0;
36 | for i = 1:n
37 | for j = 1:n % pairwise over samples, not indicators
38 | u = 0;
39 | temp = R - abs(z(i) - z(j));
40 | if(temp>=0)
41 | u = 1;
42 | end
43 | sum_d = sum_d + temp * u;
44 | end
45 | end
46 | result = S_alpha * sum_d;
47 | end
--------------------------------------------------------------------------------
/评价模型/PPE.m:
--------------------------------------------------------------------------------
1 | % this program uses the PPE (projection pursuit evaluation) method to judge the quality of a second-hand car
2 |
3 | data= csvread('question4.csv',1,3);
4 | this_size = size(data);
5 | global n p standard_data;
6 | p = this_size(2);
7 | n = this_size(1);
8 | standard_data = zeros(23,3);
9 | for j = 1:p
10 | for i = 1:n
11 | if(ismember(j, [2,3]) == 1)
12 | standard_data(i,j) = (data(i,j) - min(data(:,j))) / ...
13 | (max(data(:,j)) - min(data(:,j)));
14 | else
15 | standard_data(i,j) = (max(data(:,j)) - data(i,j)) / ...
16 | (max(data(:,j)) - min(data(:,j)));
17 | end
18 | end
19 | end
20 |
21 | alpha = zeros(1,p);
22 | for j = 1:p
23 | alpha(j) = 1/p;
24 | end
25 |
26 | %[a] = get_Q(alpha);
27 | [value_list,best_a,b] = pso_optimal(100,3);
28 |
29 | % load('auto.mat')
30 | % this_size = size(auto_dataset);
31 | % global n p standard_data;
32 | % n = this_size(1);
33 | % p = this_size(2);
34 | % standard_data = zeros(74,11);
35 | %
36 | % % premnmx() normalization
37 | % for j = 1:p
38 | % for i = 1:n
39 | % if(ismember(j, [1,2,4,5,6,7]) == 1)
40 | % standard_data(i,j) = (auto_dataset(i,j) - min(auto_dataset(:,j))) / ...
41 | % (max(auto_dataset(:,j)) - min(auto_dataset(:,j)));
42 | % else
43 | % standard_data(i,j) = (max(auto_dataset(:,j)) - auto_dataset(i,j)) / ...
44 | % (max(auto_dataset(:,j)) - min(auto_dataset(:,j)));
45 | % end
46 | % end
47 | % end
48 | %
49 | % % initialize the starting point
50 | % alpha = zeros(1,p);
51 | % for j = 1:p
52 | % alpha(j) = 1/p;
53 | % end
54 | %
55 | % %[a] = get_Q(alpha);
56 | % [best_a,b] = pso_optimal(100);
57 | %
58 | % Z=zeros(n,1);
59 | % for i=1:n
60 | % Z(i)=abs(sum(best_a.*standard_data(i,:)));
61 | % end
62 | % Z=abs(Z);
63 | %
64 | % figure % scatter plot of projection values
65 | % plot(abs(Z),'bd','LineWidth',1,'MarkerEdgeColor','k','MarkerFaceColor','b','MarkerSize',5);
66 | % %axis([1,12,0,2.5]); % set axis bounds as needed
67 | % grid on
68 | % xlabel(' ','FontName','TimesNewRoman','FontSize',12);
69 | % ylabel('Projective Value','FontName','Times New Roman','Fontsize',12);
70 | % figure
71 | % [newZ,I]=sort(Z);
72 | % plot(abs(newZ),'bd','LineWidth',1,'MarkerEdgeColor','k','MarkerFaceColor','b','MarkerSize',5);
73 | % %axis([1,12,0,2.5]); % set axis bounds as needed
74 | % grid on
75 | % xlabel(' ','FontName','TimesNewRoman','FontSize',12);
76 | % ylabel('Projective Value','FontName','Times New Roman','Fontsize',12);
77 | %
78 | % disp('best projection vector:')
79 | % disp(best_a);
--------------------------------------------------------------------------------
/评价模型/SOM.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Self Organizing Maps for clustering
3 | """
4 | references
5 | theory:
6 | - http://www.cnblogs.com/sylvanas2012/p/5117056.html
7 | - http://www.68dl.com/research/2014/0922/9129.html
8 | matlab:
9 | - http://blog.sina.com.cn/s/blog_906d892d0102vxfv.html
10 | - http://blog.csdn.net/bwangk/article/details/53300622
11 | - https://cn.mathworks.com/help/nnet/ug/cluster-with-self-organizing-map-neural-network.html
12 | python:
13 | - http://blog.csdn.net/chenge_j/article/details/72537568 (a from-scratch implementation)
14 | - https://github.com/sevamoo/SOMPY (the package used here)
15 | """
16 |
17 | import numpy as np
18 | from matplotlib import pyplot as plt
19 | from sompy.sompy import SOMFactory
20 | from sklearn.datasets import fetch_california_housing
21 | import pandas as pd
22 | from collections import Counter
23 |
24 |
25 | class MySOM:
26 | def __init__(self, df, mapsize, initialization = 'random'):
27 | """
28 |
29 | :param df: DataFrame
30 | :param mapsize: output-layer dimensions, usually 2-D, e.g. (20, 20)
31 | :param initialization: "PCA" or "random", the weight initialization method
32 | - PCA initializes weights from the principal components; see sompy.codebook.pca_linear_initialization
33 | - random initializes weights with random numbers
34 | """
35 | self.data = np.array(df)
36 | self.sm = SOMFactory().build(self.data, mapsize=mapsize, initialization=initialization, component_names=df.columns)
37 | self.train()
38 |
39 | def train(self):
40 | self.sm.train(n_job=1,verbose=False, train_rough_len=2, train_finetune_len=5)
41 |
42 | def print_error(self):
43 | topographic_error = self.sm.calculate_topographic_error()
44 | quantization_error = np.mean(self.sm._bmu[1])
45 | print ("Topographic error = %s; Quantization error = %s" % (topographic_error, quantization_error))
46 |
47 | def draw_input_weights(self):
48 | from sompy.visualization.mapview import View2D
49 | view2D = View2D(10, 10, "rand data", text_size=10)
50 | view2D.show(self.sm, col_sz=4, which_dim="all", desnormalize=True)
51 | plt.show()
52 |
53 | def draw_hit_map(self):
54 | from sompy.visualization.bmuhits import BmuHitsView
55 | vhts = BmuHitsView(4, 4, "Hits Map", text_size=12)
56 | vhts.show(self.sm, anotate=True, onlyzeros=False, labelsize=12, cmap="Greys", logaritmic=False)
57 | plt.show()
58 |
59 | def draw_cluster_map(self):
60 | from sompy.visualization.hitmap import HitMapView
61 | hits = HitMapView(20, 20, "Clustering", text_size=12)
62 | hits.show(self.sm)
63 | plt.show()
64 |
65 | def cluster(self, n):
66 | self.sm.cluster(n)
67 |
68 | def get_cluster_label(self):
69 | # length equals mapsize[0] * mapsize[1]
70 | return self.sm.cluster_labels
71 |
72 | def get_neurons(self):
73 | """
74 | get the winning neuron (BMU) for each original sample; the package does not expose this, so we read it ourselves
75 | :return: array, length = self.df.shape[0]
76 | """
77 | return self.sm._bmu[0]
78 |
79 | def get_label(self):
80 | """
81 | get the cluster label for each original sample; the package does not expose this, so we map BMUs to labels ourselves
82 | :return: array, length = self.df.shape[0]
83 | """
84 | neurons_label_dict = {i:j for i,j in enumerate(self.sm.cluster_labels)}
85 | return np.array([neurons_label_dict[i] for i in self.sm._bmu[0]])
86 |
87 | def predict(self, x):
88 | """
89 | treat the labels as y and fit any supervised learner (left unimplemented)
90 | :param x:
91 | :return:
92 | """
93 | pass
94 |
95 | if __name__ == '__main__':
96 | data = fetch_california_housing()
97 | descr = data.DESCR
98 | names = data.feature_names+["HouseValue"]
99 | data = np.column_stack([data.data, data.target])
100 | df = pd.DataFrame(data)
101 | df.columns = names
102 |
103 | my_som = MySOM(df, (20,20))
104 | my_som.draw_input_weights()
105 | my_som.draw_hit_map()
106 |
107 | my_som.cluster(5)
108 | my_som.draw_cluster_map()
109 | print my_som.get_label()[:10]
110 | print Counter(my_som.get_label())
111 |
112 | my_som.predict(np.array(df.iloc[0]))
--------------------------------------------------------------------------------
/评价模型/cluster.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 |
3 | import numpy as np
4 | import matplotlib.pyplot as plt
5 | import pandas as pd
6 | from sklearn.decomposition import PCA
7 |
8 | class Cluster:
9 |
10 | def __init__(self, df):
11 | from scipy.cluster.vq import whiten
12 | self.df = df
13 | self.data = whiten(df)
14 | self.sample_names = np.array(df.index)
15 |
16 | def K_means(self, K, axis=0):
17 | from scipy.cluster.vq import kmeans, vq
18 | # kmeans returns a pair: the cluster centroids and the distortion (loss)
19 | if axis == 0:
20 | # cluster the samples
21 | centroid, distortion = kmeans(self.data, K)
22 | # vq assigns every point to its nearest centroid; it also returns a pair, and [0] holds the labels
23 | label, distortion = vq(self.data, centroid)
24 | else:
25 | # cluster the variables instead (work on the transpose)
26 | centroid, distortion = kmeans(self.data.T, K)
27 | label, distortion = vq(self.data.T, centroid)
28 | return label
29 |
30 | def hierarchical(self):
31 | import scipy.cluster.hierarchy as sch
32 | # pairwise distance matrix, Euclidean here:
33 | disMat = sch.distance.pdist(self.data, 'euclidean')
34 | # hierarchical clustering:
35 | Z = sch.linkage(disMat, method='average')
36 | self.hierarchial_plot(Z)
37 | # extract flat clusters from the linkage matrix Z:
38 | cluster = sch.fcluster(Z, 1, 'inconsistent')
39 | return cluster
40 |
41 | def hierarchial_plot(self, Z):
42 | import scipy.cluster.hierarchy as sch
43 | # draw the hierarchy as a dendrogram; labels is the array of sample names, i.e. self.sample_names
44 | sch.dendrogram(Z, labels=self.sample_names, orientation='right')
45 | plt.tick_params(
46 | axis='x', # apply to the x axis
47 | which='both', # both major and minor tick labels
48 | bottom='off', # hide bottom-edge labels
49 | top='off', # hide top-edge labels
50 | labelbottom='off')
51 | plt.tight_layout() # compact plot layout
52 | plt.show()
53 | # plt.savefig('plot_dendrogram.png')
54 |
55 | def cluster_plot(self, label):
56 | # cluster results are easiest to visualize in 2-D; for higher dimensions, reduce with PCA first
57 | pca_result = self._pca()
58 | color = ['r', 'y', 'k', 'g', 'm'] * 10
59 | for i in range(max(label)+1):
60 | idx = np.where(label==i)
61 | plt.scatter(pca_result[idx, 0], pca_result[idx, 1], marker='o',label = str(i), color=color[i])
62 | plt.legend([u"Class: "+ str(i) for i in range(max(label) + 1)])
63 | plt.show()
64 |
65 | def _pca(self):
66 | pca = PCA(n_components=2) # n_components='mle' picks the count automatically; we keep two for plotting
67 | pca.fit(self.data)
68 | print "variance_ratio:", pca.explained_variance_ratio_
69 | return pca.transform(self.data)
70 |
71 | def auto_cluster(self):
72 | # hierarchical clustering first to pick K, then K-means with that K
73 | hierarchical_cluster = self.hierarchical()
74 | K = max(hierarchical_cluster)
75 | labels = self.K_means(K)
76 | self.cluster_plot(labels)
77 |
78 |
79 | if __name__ == '__main__':
80 | df = pd.read_csv("/home/ray/Documents/suibe/2017/建模/Modeling_Preparation/dataset/auto_1.csv")
81 | df = df.dropna(axis=0)
82 | clu = Cluster(df)
83 |
84 | label = clu.K_means(4)
85 | clu.cluster_plot(label)
86 |
87 | label2 = clu.hierarchical()
88 | clu.cluster_plot(label2)
89 |
--------------------------------------------------------------------------------
/评价模型/constraint.m:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Yangruipis/ModelingPreparation/1663c8153fb97049ae716e2f0dab27d457bcbc9c/评价模型/constraint.m
--------------------------------------------------------------------------------
/评价模型/get_Q.m:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Yangruipis/ModelingPreparation/1663c8153fb97049ae716e2f0dab27d457bcbc9c/评价模型/get_Q.m
--------------------------------------------------------------------------------
/评价模型/optimal_tools.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Yangruipis/ModelingPreparation/1663c8153fb97049ae716e2f0dab27d457bcbc9c/评价模型/optimal_tools.png
--------------------------------------------------------------------------------
/评价模型/pso_optimal.asv:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Yangruipis/ModelingPreparation/1663c8153fb97049ae716e2f0dab27d457bcbc9c/评价模型/pso_optimal.asv
--------------------------------------------------------------------------------
/评价模型/pso_optimal.m:
--------------------------------------------------------------------------------
1 | function [value_list, optimal_position, optimal_value] = pso_optimal(itertimes, variable_number)
2 | w = 0.8;
3 | c1 = 2; c2 = 2;
4 | r1 = 0.25;
5 | r2 = 0.75;
6 | particle_number = 1000; % number of particles
7 | % variable_number is passed in as an argument
8 | X = zeros(particle_number, variable_number);
9 | V = zeros(particle_number, variable_number);
10 | particle_optimal_position = zeros(particle_number, variable_number);
11 | optimal_position = zeros(1, variable_number);
12 | optimal_value = 1e10;
13 | x_range = [0,1];
14 | value_list = [];
15 | % initialize the information of each particle
16 |
17 | for i = 1:particle_number
18 | for j = 1:variable_number
19 | X(i,j) = x_range(1) + (x_range(2) - x_range(1)) * rand();
20 | V(i,j) = 0;
21 | end
22 | X(i,:) = X(i,:) / norm(X(i,:));
23 | particle_optimal_position(i,:) = X(i,:);
24 | temp_value = get_Q(X(i,:));
25 | if(temp_value < optimal_value)
26 | optimal_position = X(i,:);
27 | optimal_value = temp_value;
28 | end
29 | end
30 | count = 0;
31 | % update the particle
32 | for iter = 1:itertimes
33 | for i = 1:particle_number
34 | V(i,:) = w * V(i,:) + c1 * r1 * (particle_optimal_position(i,:) - ...
35 | X(i,:)) + c2 * r2 * (optimal_position - X(i,:));
36 | X(i,:) = X(i,:) + V(i,:);
37 | X(i,:) = X(i,:) / norm(X(i,:));
38 | if ( x_range(1) <= min(X(i,:)) && x_range(2) >= max(X(i,:)))
39 | value_before = get_Q(particle_optimal_position(i,:));
40 | value_now = get_Q(X(i,:));
41 | if(value_now < value_before)
42 | particle_optimal_position(i,:) = X(i,:);
43 | end
44 |
45 | if (value_now < optimal_value)
46 | optimal_position =X(i,:);
47 | optimal_value = value_now;
48 | count = count + 1;
49 | value_list(count) = optimal_value;
50 | end
51 | end
52 | end
53 | end
54 |
55 |
56 | end
--------------------------------------------------------------------------------
/评价模型/som_data.txt:
--------------------------------------------------------------------------------
1 | 0.697,0.46
2 | 0.774,0.376
3 | 0.634,0.264
4 | 0.608,0.318
5 | 0.556,0.215
6 | 0.403,0.237
7 | 0.481,0.149
8 | 0.437,0.211
9 | 0.666,0.091
10 | 0.243,0.267
11 | 0.245,0.057
12 | 0.343,0.099
13 | 0.639,0.161
14 | 0.657,0.198
15 | 0.36,0.37
16 | 0.593,0.042
17 | 0.719,0.103
18 | 0.359,0.188
19 | 0.339,0.241
20 | 0.282,0.257
21 | 0.748,0.232
22 | 0.714,0.346
23 | 0.483,0.312
24 | 0.478,0.437
25 | 0.525,0.369
26 | 0.751,0.489
27 | 0.532,0.472
28 | 0.473,0.376
29 | 0.725,0.445
30 | 0.446,0.459
--------------------------------------------------------------------------------
/赛题整理/赛题整理.md:
--------------------------------------------------------------------------------
1 | # 2016 Problem E
2 |
3 | 1. Many factors influence grain planting area; their relationships are complex and may differ across grain varieties and regions. Build an **indicator system** for the factors affecting grain planting area and a mathematical model of planting area, discuss and evaluate the rationality of the indicator system, study the relationships among the factors, and test and analyze the credibility and reliability of the results.
4 | - Spearman rank correlation test, principal component regression
5 | - nonparametric Spearman correlation test, partial least squares regression
6 | - structural equation model, confirmatory factor analysis and path analysis
7 | - Kolmogorov-Smirnov test, correlation tests with principal component factor analysis, Granger causality test, likelihood ratio test, unit root and cointegration tests
8 | - multiple linear regression, correlation coefficients, residuals and significance levels
9 | 2. Scholars disagree about the effect of the minimum grain purchase price policy. Build an **evaluation model for the policy's implementation effect**, then apply it, accounting for variety and regional differences, to compare the policy's effect across several major grain-producing provinces.
10 | - /
11 | - principal component analysis, mixed linear model
12 | - triangular fuzzy numbers
13 | - composite evaluation index model of policy effect, with weights determined by particle swarm optimization and projection pursuit
14 | - principal component analysis
15 | 3. The market purchase price is the price at which grain enterprises buy grain, set by supply and demand through market adjustment. Together with the minimum purchase price it forms the grain price system, a relatively independent instrument of macro price regulation. Use data analysis or mathematical modeling to explore the **particular regularities** of China's grain prices.
16 | - market purchase price theory with partial adjustment models: a supply model, an enterprise purchase volume model and a market purchase price model
17 | - a supply-demand-price linkage model built on supply and demand theory, ARIMA model
18 | - cobweb model, ARCH-family models
19 | - partial equilibrium model with a positive feedback system
20 | - /
21 | 4. Combining the studies above with the policy's original intent, build a reasonable **pricing model** for the minimum purchase price, use it to assess the rationality of the minimum purchase prices announced by the NDRC during the 12th Five-Year Plan, and **predict** a reasonable range for the 2017 minimum purchase price.
22 | - a linear programming model for the minimum purchase price, with grain output as the objective and price volatility, fiscal expenditure, inventory and planting area as constraints
23 | - optimization model
24 | - GARCH model, univariate second-order difference equation (DDE) model, support vector machine (SVM) forecasts, and a Markov-chain time-varying-weight combination forecast (HM-TWA)
25 | - a multi-objective pricing model based on a genetic algorithm with normally distributed random numbers and the ordered weighted averaging (OWA) operator
26 | - /
27 |
28 | # 2014 Problem B
29 |
30 | 1. Given the encodings of 9445 loci on a possibly disease-related chromosome segment for 1000 samples (genotype.dat) and each sample's status for hereditary disease A (phenotype.txt), design or adopt a method to find the locus or loci most likely to cause the disease, with theoretical justification.
31 |
32 | 2. Using the same disease-A information (phenotype.txt): there are 300 genes, and the loci contained in each gene are listed in the 300 dat files under gene_info (locus encodings in genotype.dat). Since a gene can be viewed as a set of loci, the association between a disease and a gene can show up through that set or any of its subsets. Find the gene or genes most likely related to the disease, with reasons.
33 |
34 | 3. The encodings of the 9445 loci are in genotype.dat. In practice researchers often treat a group of related traits or diseases as a whole and look for the loci or genes associated with the group. Using the 10 related traits of the 1000 samples in multi_phenos.txt together with the 9445 locus encodings (genotype.dat), find the loci associated with the 10 traits in multi_phenos.txt.
35 |
36 | # 2014 Problem D
37 | 1. Select the main fruit and vegetable varieties, build models with several methods to estimate their consumption, and study the trend.
38 |
39 | 2. Assess whether Chinese residents' current annual intake of minerals, vitamins, dietary fiber and other nutrients is reasonable. Given recent fruit and vegetable consumption trends, will residents' nutritional health improve or worsen by 2020? Give solid support for your conclusion.
40 |
41 | 3. Provide reasonable annual per-capita consumption levels of the main fruit and vegetable products for today's Chinese residents (possibly by region and season), so that nutritional needs can be met at low purchase cost (assuming prices of each variety move reasonably along their existing trends).
42 |
43 | 4. Build a model to recompute the reasonable annual per-capita consumption of the main fruit and vegetable products, and give an adjustment strategy for China's fruit and vegetable production up to 2020.
44 |
45 | # 2013 Problem F
46 |
47 | 1. A macro-level mathematical model of the income and expenditure of China's urban and rural residents' pension system (including the new rural pension scheme).
48 |
49 | 2. Based on your understanding of the pension gap and reasonable estimates of future conditions: when will the contradiction between urban-rural pension income and expenditure be sharpest, and how severe will it be? Considering the income-doubling plan proposed at the 18th Party Congress, which parts of your model need adjustment?
50 |
51 | 3. Use simulation to find reasonable ranges for the replacement rate and contribution rate that keep China's pension system sustainable; determine what policy measures should be taken in the transition period, before the sharpest contradiction arrives, to ensure a smooth transition, and simulate and forecast their effects.
--------------------------------------------------------------------------------
/预测模型/GM1_1.m:
--------------------------------------------------------------------------------
1 | %%%% GM(1,1) grey prediction model
2 |
3 | input = [132,92,118,130,187,207,213,284,301,333];
4 | predict_times = 10;
5 |
6 | sum_input = cumsum(input);
7 |
8 | B = ones(length(input)-1,2);
9 | for i = 1:length(input)-1
10 | B(i,1) = -(sum_input(i) + sum_input(i+1)) / 2.0;
11 | end
12 | Y = input(2:end);
13 |
14 | a_hat = inv(B' * B) * B' * Y';
15 | a = a_hat(1);
16 | u = a_hat(2);
17 |
18 | result_length = length(input) + predict_times;
19 | sum_result = zeros(result_length, 1);
20 | result = zeros(result_length, 1);
21 | for i = 1:result_length
22 | sum_result(i) = (input(1) - u / a) * exp(-a * (i - 1)) + u / a;
23 | if(i == 1)
24 | result(i) = sum_result(i);
25 | else
26 | result(i) = sum_result(i) - sum_result(i-1);
27 | end
28 | end
29 | x_1 = 1:length(input);
30 | x_2 = 1:result_length;
31 |
32 | plot(x_1, input, '.b', x_2, result, 'ro--')
33 | legend('Actual Value', 'Predicted Value')
34 | title('GM(1,1) Predict Result')
35 |
36 |
37 |
38 |
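The same fit in NumPy, mirroring the MATLAB above (a sketch): accumulate the series, estimate a and u by least squares on the whitening equation, then difference the fitted cumulative series back:

    import numpy as np

    def gm11(x, predict_times):
        x = np.asarray(x, dtype=float)
        x1 = np.cumsum(x)  # accumulated (AGO) series
        B = np.column_stack([-(x1[:-1] + x1[1:]) / 2.0,  # background values
                             np.ones(len(x) - 1)])
        a, u = np.linalg.lstsq(B, x[1:], rcond=None)[0]
        k = np.arange(len(x) + predict_times)
        x1_hat = (x[0] - u / a) * np.exp(-a * k) + u / a  # fitted cumulative series
        return np.diff(np.concatenate(([0.0], x1_hat)))  # inverse AGO

    print(gm11([132, 92, 118, 130, 187, 207, 213, 284, 301, 333], 10))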
--------------------------------------------------------------------------------
/预测模型/HMM.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 |
3 | """
4 | Theory:
5 |
6 | Identify the hidden states and the emission probabilities from hidden to observed states, then predict future hidden states (
7 | Method 1: knowing the current hidden state, infer the next period's hidden-state probabilities and their observation distributions
8 | Method 2: given the current observation at t_1, find the most similar past period t_2 and predict by analogy with t_2's next observation
9 | )
10 |
11 | Model parameters: hidden-state transition matrix, hidden-to-observation probabilities (emission / confusion matrix), initial hidden-state probabilities
12 |
13 | By observation type:
14 | - MultinomialHMM: discrete observations
15 | - GaussianHMM: continuous observations; 1-D observations are assumed normal, n-D observations jointly normal
16 | - GMMHMM: continuous observations as well, modeled with Gaussian mixtures
17 |
18 | By problem type:
19 | 1. model fully known (transition and emission matrices): compute the probability of an observed sequence
20 | 2. model fully known: infer the hidden-state sequence behind an observed sequence
21 | 3. model unknown, observations only: estimate the whole model, i.e. both probability matrices (or distributions) and the initial hidden-state distribution
22 |
23 | Corresponding algorithms:
24 | 1. forward / backward algorithms
25 | 2. Viterbi algorithm
26 | 3. Baum-Welch algorithm
27 | ref:
28 | https://www.zhihu.com/question/20962240
29 |
30 | python:
31 | ref:
32 | http://www.cnblogs.com/pinard/p/7001397.html
33 | https://uqer.io/community/share/56ec30bf228e5b887be50b35 # quant application
34 | http://blog.csdn.net/baskbeast/article/details/51218777 # quant application
35 |
36 | """
37 |
38 | import hmmlearn
39 | import pandas as pd
40 | import numpy as np
41 | import matplotlib.pyplot as plt
42 | import warnings
43 | warnings.filterwarnings("ignore")
44 |
45 |
46 | def MyMultinomialHMM():
47 | from hmmlearn import hmm
48 |
49 |     # discrete-observation case
50 | states = ["box 1", "box 2", "box3"]
51 | n_states = len(states)
52 |
53 | observations = ["red", "white"]
54 | n_observations = len(observations)
55 |
56 | start_probability = np.array([0.2, 0.4, 0.4])
57 |
58 | transition_probability = np.array([
59 | [0.5, 0.2, 0.3],
60 | [0.3, 0.5, 0.2],
61 | [0.2, 0.3, 0.5]
62 | ])
63 |
64 | emission_probability = np.array([
65 | [0.5, 0.5],
66 | [0.4, 0.6],
67 | [0.7, 0.3]
68 | ])
69 |
70 | model = hmm.MultinomialHMM(n_components=n_states)
71 | model.startprob_ = start_probability
72 | model.transmat_ = transition_probability
73 | model.emissionprob_ = emission_probability
74 |
75 | # question 2
76 |     seen = np.array([[0, 1, 0, 1, 0, 0, 1]]).T  # observation sequence
77 | logprob, box = model.decode(seen, algorithm="viterbi")
78 | print "The ball picked:", ", ".join(map(lambda x: observations[x], seen.T.reshape(7)))
79 | print "The hidden box", ", ".join(map(lambda x: states[x], box))
80 |
81 | box2 = model.predict(seen)
82 | print "The ball picked:", ", ".join(map(lambda x: observations[x], seen.T.reshape(7)))
83 | print "The hidden box", ", ".join(map(lambda x: states[x], box2))
84 |
85 | # question 1
86 | print np.exp(model.score(seen))
87 |
88 | # question 3
89 |
90 | # states = ["box 1", "box 2", "box3"]
91 |     n_states = 3 # parameter 1: number of hidden states
92 |     X2 = np.array([[0, 1, 0, 1], [0, 0, 0, 1], [1, 0, 1, 1]]) # parameter 2: observation sequences
93 |
94 | model2 = hmm.MultinomialHMM(n_components=n_states, n_iter=20, tol=0.01)
95 | model2.fit(X2)
96 | for i in range(10):
97 |         # Baum-Welch is an EM-based approximation, so fit several times and keep the model that gives X2 the highest probability
98 | model2.fit(X2)
99 | print model2.startprob_
100 | print model2.transmat_
101 | print model2.emissionprob_
102 | print np.exp(model2.score(X2))
103 | print model2.sample(10)
104 | print model2.predict(X2.reshape([3, 4, 1])[1])
105 |
106 | def MyGaussianHMM():
107 | from hmmlearn.hmm import GaussianHMM
108 |     df = pd.read_csv("/home/ray/Documents/suibe/2017/建模/Modeling_Preparation/dataset/SZIndex.csv", header=None)
109 | df.head()
110 | X = np.array(df.iloc[:, 0:5])
111 |
112 |     # Part 1: model unknown -- solve problem 3
113 |     model = GaussianHMM(n_components=6, covariance_type="diag", n_iter=1000) # diagonal covariance matrices
114 |     """
115 |     Parameter notes:
116 |     covariance_type:
117 |         "spherical" : one shared variance per state, off-diagonals zero (for scarce data, when richer structures cannot be estimated)
118 |         "diag"      : free main diagonal, off-diagonals zero (the usual compromise)
119 |         "full"      : all elements free (when there are enough data to estimate them)
120 |     """
121 |     model.fit(X)
122 |     print "Hidden states: ", model.predict(X) # hidden state for each day
123 |     print "Number of features: %s" % model.n_features
124 |     print "Number of hidden states: %s" % model.n_components
125 |     print "Start probabilities: ", model.startprob_
126 |     print "Hidden-state transition matrix:", model.transmat_
127 |     ## Each state's observation distribution is assumed Gaussian, so the means form an n_components x n_features matrix
128 |     print "Emission (means):", model.means_
129 |     print "Emission (covariances):", model.covars_
130 |
131 |     ## plotting
132 | hidden_states = model.predict(X)
133 | tradeDate = df.iloc[:, 5].values
134 | closeIndex = df.iloc[:, 6].values
135 | plt.figure(figsize=(15, 8))
136 | for i in range(model.n_components):
137 | idx = (hidden_states == i)
138 | plt.plot_date(pd.to_datetime(tradeDate[idx]), closeIndex[idx], '.', label='%dth hidden state' % i, lw=1)
139 | plt.legend()
140 | plt.grid(1)
141 | plt.show()
142 |
143 |     # Part 2: model known -- solve problems 1 and 2
144 |
145 |     ## reuse the model fitted above
146 |     ### problem 1
147 |     print "Probability of observing this day's vector: %s" % np.exp(model.score(X[0:1]))  # score expects a 2-D array
148 |     ### problem 2
149 |     log_prob, state = model.decode(X[:10], algorithm="viterbi")
150 |     print "Most likely hidden-state sequence for the first ten days:", state
151 |
152 |     ## specify the model parameters by hand
153 |     ### a case with 2 features and 4 hidden states
154 | startprob = np.array([0.6, 0.3, 0.1, 0.0])
155 | # The transition matrix, note that there are no transitions possible
156 | # between component 1 and 3
157 | transmat = np.array([[0.7, 0.2, 0.0, 0.1],
158 | [0.3, 0.5, 0.2, 0.0],
159 | [0.0, 0.3, 0.5, 0.2],
160 | [0.2, 0.0, 0.2, 0.6]])
161 | # The means of each component
162 | means = np.array([[0.0, 0.0],
163 | [0.0, 11.0],
164 | [9.0, 10.0],
165 | [11.0, -1.0]])
166 | # The covariance of each component
167 | covars = .5 * np.tile(np.identity(2), (4, 1, 1))
168 | model2 = GaussianHMM(n_components=4, covariance_type="full", n_iter=1000)
169 | model2.startprob_ = startprob
170 | model2.transmat_ = transmat
171 | model2.means_ = means
172 | model2.covars_ = covars
173 |
174 | if __name__ == '__main__':
175 | MyGaussianHMM()
176 | pass
177 |
--------------------------------------------------------------------------------
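The docstring's "Method 1" (predict the next period from the current hidden state) is not implemented above; a minimal sketch of it for a fitted GaussianHMM, assuming `model` and `X` as in MyGaussianHMM:

```python
import numpy as np

def predict_next_observation(model, X):
    """Method 1: expected next observation under a fitted GaussianHMM."""
    current_state = model.predict(X)[-1]               # most likely hidden state today
    next_state_probs = model.transmat_[current_state]  # distribution of tomorrow's state
    expected_obs = np.dot(next_state_probs, model.means_)  # probability-weighted state means
    return next_state_probs, expected_obs
```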
/预测模型/LSTM_predict.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | """
3 | ref:
4 |     # predicting the next value of a series from its previous values
5 | https://machinelearningmastery.com/time-series-prediction-lstm-recurrent-neural-networks-python-keras/
6 | # theory
7 | https://deeplearning4j.org/lstm.html#long
8 |
9 | """
10 | # LSTM for international airline passengers problem with regression framing
11 | import numpy
12 | import matplotlib.pyplot as plt
13 | from pandas import read_csv
14 | import math
15 | from keras.models import Sequential
16 | from keras.layers import Dense
17 | from keras.layers import LSTM
18 | from sklearn.preprocessing import MinMaxScaler
19 | from sklearn.metrics import mean_squared_error
20 | # convert an array of values into a dataset matrix
21 |
22 |
23 | def create_dataset(dataset, look_back=1):
24 | dataX, dataY = [], []
25 | for i in range(len(dataset)-look_back-1):
26 | a = dataset[i:(i+look_back), 0]
27 | dataX.append(a)
28 | dataY.append(dataset[i + look_back, 0])
29 | return numpy.array(dataX), numpy.array(dataY)
30 |
31 |
32 | # fix random seed for reproducibility
33 | numpy.random.seed(7)
34 | # load the dataset
35 |
36 | #df = read_csv('../dataset/SZIndex.csv',header=-1)
37 | #dataset = df[6].values
38 | #dataset = dataset.reshape(dataset.shape[0], 1)
39 |
40 | dataframe = read_csv('../dataset/international-airline-passengers.csv', usecols=[1], engine='python', skipfooter=3)
41 | dataset = dataframe.values
42 | dataset = dataset.astype('float32')
43 | # normalize the dataset
44 | scaler = MinMaxScaler(feature_range=(0, 1))
45 | dataset = scaler.fit_transform(dataset)
46 | # split into train and test sets
47 | train_size = int(len(dataset) * 0.67)
48 | test_size = len(dataset) - train_size
49 | train, test = dataset[0:train_size,:], dataset[train_size:len(dataset),:]
50 | # reshape into X=t and Y=t+1
51 | look_back = 2
52 | trainX, trainY = create_dataset(train, look_back)
53 | testX, testY = create_dataset(test, look_back)
54 | # reshape input to be [samples, time steps, features]
55 | trainX = numpy.reshape(trainX, (trainX.shape[0], 1, trainX.shape[1]))
56 | testX = numpy.reshape(testX, (testX.shape[0], 1, testX.shape[1]))
57 |
58 | # create and fit the LSTM network
59 | model = Sequential()
60 | model.add(LSTM(4, input_shape=(1, look_back)))
61 | model.add(Dense(1))
62 | model.compile(loss='mean_squared_error', optimizer='adam')
63 | model.fit(trainX, trainY, epochs=100, batch_size=1, verbose=2)
64 | # make predictions
65 | trainPredict = model.predict(trainX)
66 | testPredict = model.predict(testX)
67 | # invert predictions
68 | trainPredict = scaler.inverse_transform(trainPredict)
69 | trainY = scaler.inverse_transform([trainY])
70 | testPredict = scaler.inverse_transform(testPredict)
71 | testY = scaler.inverse_transform([testY])
72 | # calculate root mean squared error
73 | trainScore = math.sqrt(mean_squared_error(trainY[0], trainPredict[:,0]))
74 | print('Train Score: %.2f RMSE' % (trainScore))
75 | testScore = math.sqrt(mean_squared_error(testY[0], testPredict[:,0]))
76 | print('Test Score: %.2f RMSE' % (testScore))
77 | # shift train predictions for plotting
78 | trainPredictPlot = numpy.empty_like(dataset)
79 | trainPredictPlot[:, :] = numpy.nan
80 | trainPredictPlot[look_back:len(trainPredict)+look_back, :] = trainPredict
81 | # shift test predictions for plotting
82 | testPredictPlot = numpy.empty_like(dataset)
83 | testPredictPlot[:, :] = numpy.nan
84 | testPredictPlot[len(trainPredict)+(look_back*2)+1:len(dataset)-1, :] = testPredict
85 | # plot baseline and predictions
86 | plt.plot(scaler.inverse_transform(dataset))
87 | plt.plot(trainPredictPlot)
88 | plt.plot(testPredictPlot)
89 | plt.show()
--------------------------------------------------------------------------------
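To make the windowing concrete, a tiny worked example of the create_dataset function above (values chosen arbitrarily):

```python
import numpy

series = numpy.array([[10.], [20.], [30.], [40.], [50.]])
X, y = create_dataset(series, look_back=2)
# X -> [[10. 20.]     y -> [30. 40.]
#       [20. 30.]]
# Each row of X holds look_back consecutive values; y holds the value that follows.
# Note the loop bound len(dataset)-look_back-1 leaves the last pair (30,40)->50 unused.
```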
/预测模型/ML_classify_model.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | from sklearn import cross_validation
3 | import numpy as np
4 | import pandas as pd
5 |
6 | class myclassify():
7 | def __init__(self, train_x, train_y):
8 | self.x = train_x
9 | self.y = train_y
10 | self.cv_time = 10
11 |
12 | def knn(self, k=3):
13 | from sklearn import neighbors
14 | knn_model = neighbors.KNeighborsClassifier(n_neighbors=k)
15 | scores = cross_validation.cross_val_score(knn_model, self.x, self.y, cv=self.cv_time)
16 | knn_model.fit(self.x, self.y)
17 | return np.mean(scores), knn_model
18 |
19 | def logistic(self):
20 | from sklearn.linear_model import LogisticRegression
21 | logit_model = LogisticRegression()
22 | scores = cross_validation.cross_val_score(logit_model, self.x, self.y, cv=self.cv_time)
23 | logit_model.fit(self.x, self.y)
24 | return np.mean(scores), logit_model
25 |
26 | def decision_tree(self):
27 | from sklearn import tree
28 | dt_model = tree.DecisionTreeClassifier(criterion='entropy')
29 | scores = cross_validation.cross_val_score(dt_model, self.x, self.y, cv=self.cv_time)
30 | dt_model.fit(self.x, self.y)
31 | return np.mean(scores), dt_model
32 |
33 | def naive_bayes(self):
34 | from sklearn.naive_bayes import MultinomialNB
35 | nb_model = MultinomialNB()
36 | scores = cross_validation.cross_val_score(nb_model, self.x, self.y, cv=self.cv_time)
37 | nb_model.fit(self.x, self.y)
38 | return np.mean(scores), nb_model
39 |
40 | def svm(self):
41 | from sklearn.svm import SVC
42 | model = SVC(kernel='rbf', probability=True)
43 | scores = cross_validation.cross_val_score(model, self.x, self.y, cv=self.cv_time)
44 | model.fit(self.x, self.y)
45 | return np.mean(scores), model
46 |
47 | def svm_cv(self):
48 | from sklearn.grid_search import GridSearchCV
49 | from sklearn.svm import SVC
50 | model = SVC(kernel='rbf', probability=True)
51 | param_grid = {'C': [1e-3, 1e-2, 1e-1, 1, 10, 100, 1000], 'gamma': [0.001, 0.0001]}
52 | grid_search = GridSearchCV(model, param_grid, n_jobs=1, verbose=1)
53 | grid_search.fit(self.x, self.y)
54 | best_parameters = grid_search.best_estimator_.get_params()
55 | for para, val in list(best_parameters.items()):
56 | print(para, val)
57 | model = SVC(kernel='rbf', C=best_parameters['C'], gamma=best_parameters['gamma'], probability=True)
58 | scores = cross_validation.cross_val_score(model, self.x, self.y, cv=self.cv_time)
59 | model.fit(self.x, self.y)
60 | return scores, model
61 |
62 | if __name__ == '__main__':
63 | df = pd.read_csv("../dataset/auto_1.csv")
64 | df = df.dropna(axis=0)
65 | mc = myclassify(df.iloc[:, 0:10], df.iloc[:,-1])
66 | #scores, model = mc.knn(3)
67 | #scores, model = mc.svm()
68 | scores, model = mc.svm_cv()
69 | predict_num = -3
70 |     print scores, model.predict(df.iloc[predict_num, 0:10].values.reshape(1, -1))[0], df.iloc[predict_num, -1]
71 |
72 |
73 |
--------------------------------------------------------------------------------
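This file targets pre-0.18 scikit-learn: the `cross_validation` and `grid_search` modules were deprecated in 0.18 and removed in 0.20. On a current scikit-learn the same calls live in `model_selection`; a sketch of the equivalents, where `X, y` stand for the training arrays:

```python
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.svm import SVC

model = SVC(kernel='rbf', probability=True)
scores = cross_val_score(model, X, y, cv=10)   # replaces cross_validation.cross_val_score
grid = GridSearchCV(model, {'C': [0.1, 1, 10], 'gamma': [1e-3, 1e-4]}, n_jobs=1, verbose=1)
grid.fit(X, y)                                 # replaces grid_search.GridSearchCV
```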
/预测模型/PLSR.m:
--------------------------------------------------------------------------------
1 | clc;clear
2 | Y=[ 0.1 0.5 0.7
3 | 0.2 0.6 0.4
4 | 0.3 0.7 0.5
5 | 0.4 0.6 0.3
6 | 0.5 0.8 0.2
7 | 0.6 0.3 0.5
8 | 0.4 0.7 0.6
9 | 0.3 0.5 0.7];
10 | X=[0.2876 0.6173 0.9647 1.1936 1.0636 0.7332 0.5441 0.6247 0.7421 0.7052
11 | 0.2653 0.5167 0.8403 1.0435 1.008 0.7396 0.5344 0.5675 0.6312 0.5368
12 | 0.3833 0.7089 1.0544 1.2805 1.2524 0.8886 0.6596 0.6815 0.75 0.6671
13 | 0.3957 0.6853 0.9204 1.0648 1.0486 0.7999 0.5579 0.5381 0.5698 0.469
14 | 0.472 0.7413 1.0124 1.2202 1.2297 0.9699 0.6646 0.635 0.6254 0.4978
15 | 0.6268 0.9851 1.1633 1.1629 1.0128 0.7123 0.5161 0.482 0.5194 0.4909
16 | 0.4921 0.8723 1.2407 1.4583 1.3631 1.0073 0.7341 0.7032 0.8171 0.7228
17 | 0.4308 0.8232 1.146 1.309 1.1767 0.8207 0.5852 0.6604 0.7677 0.7237];
18 | %X0=[0.4089 0.6996 0.8712 1.0159 0.9638 0.7115 0.5112 0.4722 0.5059 0.4343];
19 |
20 | [A,B,r,U,V,stats] = canoncorr(X,Y);
21 | % A: (num X variables) x (num canonical variables); column i gives each X variable's coefficient in the i-th canonical variable -- the larger the coefficient, the larger the influence
22 | % B: (num Y variables) x (num canonical variables); column i gives each Y variable's coefficient in the i-th canonical variable
23 | % r: vector with one entry per canonical variable pair; entry i is the (maximal) correlation between the i-th canonical variables of X and Y
24 | % var( X * A(:,1)) = var( X * A(:,2)) = var( Y * B(:,1)) = 1
25 | % U = (X - repmat(mean(X), size(X,1), 1)) * A
26 | % V = (Y - repmat(mean(Y), size(Y,1), 1)) * B
27 | % stats: test statistics, see https://cn.mathworks.com/help/stats/canoncorr.html
28 |
29 |
30 |
31 | [XL,YL,XS,YS,BETA,PCTVAR,MSE] = plsregress(X,Y,7);
32 | % X is 8x10, Y is 8x3
33 | % Here 7 components are extracted (the default is n_samples - 1; pick the number from the cumulative explained-variance plot), so XL is a 10x7 matrix
34 | % Column i of XL holds each variable's coefficient on the i-th component, for 7 components
35 | % XS is 8x7; column i holds each sample's score on the i-th component
36 | % YL and YS are the analogous quantities for Y
37 | % BETA holds each predictor's coefficients on the responses, including the intercept; to get yhat, prepend a column of ones to X
38 | % PCTVAR holds each component's explained variance: row 1 for X, row 2 for Y
39 | % MSE holds the mean squared errors: row 1 for X, row 2 for Y
40 | % https://cn.mathworks.com/help/stats/plsregress.html
41 |
42 | plot(1:size(PCTVAR, 2),cumsum(100*PCTVAR(1,:)),'-bo');
43 | xlabel('Number of PLS components');
44 | ylabel('Percent Variance Explained in x');
45 |
46 | plot(1:size(PCTVAR, 2),cumsum(100*PCTVAR(2,:)),'-bo');
47 | xlabel('Number of PLS components');
48 | ylabel('Percent Variance Explained in y');
49 |
50 | Ytest = [ones(size(X,1),1) X]*BETA;
51 | residuals = Y-Ytest;
52 | stem(residuals)
53 | xlabel('Observation');
54 | ylabel('Residual');
55 |
--------------------------------------------------------------------------------
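A rough scikit-learn counterpart of the plsregress call above, useful for cross-checking from Python. A sketch: X and Y are the same 8x10 and 8x3 arrays, and since sklearn uses NIPALS while MATLAB uses SIMPLS, the loadings can differ slightly even though predictions should agree closely.

```python
import numpy as np
from sklearn.cross_decomposition import PLSRegression

pls = PLSRegression(n_components=7)
pls.fit(X, Y)
Y_hat = pls.predict(X)        # analogous to [ones(size(X,1),1) X]*BETA
residuals = Y - Y_hat         # analogous to the stem plot of Y - Ytest
```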
/预测模型/SVR.py:
--------------------------------------------------------------------------------
1 | from sklearn.datasets import load_boston
2 |
3 | boston = load_boston()
4 |
5 | from sklearn.cross_validation import train_test_split
6 |
7 | import numpy as np
8 |
9 | X = boston.data
10 | y = boston.target
11 |
12 | X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 33, test_size = 0.25)
13 |
14 | print 'The max target value is: ', np.max(boston.target)
15 | print 'The min target value is: ', np.min(boston.target)
16 | print 'The average target value is: ', np.mean(boston.target)
17 |
18 | from sklearn.preprocessing import StandardScaler
19 |
20 | ss_X = StandardScaler()
21 | ss_y = StandardScaler()
22 |
23 | X_train = ss_X.fit_transform(X_train)
24 | X_test = ss_X.transform(X_test)
25 | y_train = ss_y.fit_transform(y_train)
26 | y_test = ss_y.transform(y_test)
27 |
28 | from sklearn.svm import SVR
29 |
30 | linear_svr = SVR(kernel = 'linear')
31 |
32 | linear_svr.fit(X_train, y_train)
33 |
34 | linear_svr_y_predict = linear_svr.predict(X_test)
35 |
36 | poly_svr = SVR(kernel = 'poly')
37 | poly_svr.fit(X_train, y_train)
38 | poly_svr_y_predict = poly_svr.predict(X_test)
39 |
40 | rbf_svr = SVR(kernel = 'rbf')
41 | rbf_svr.fit(X_train, y_train)
42 | rbf_svr_y_predict = rbf_svr.predict(X_test)
43 |
44 | from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
45 |
46 | print 'R-squared value of linear SVR is: ', linear_svr.score(X_test, y_test)
47 | print 'The mean squared error of linear SVR is: ', mean_squared_error(ss_y.inverse_transform(y_test), ss_y.inverse_transform(linear_svr_y_predict))
48 | print 'The mean absolute error of linear SVR is: ', mean_absolute_error(ss_y.inverse_transform(y_test), ss_y.inverse_transform(linear_svr_y_predict))
49 |
50 | print 'R-squared of poly SVR is: ', poly_svr.score(X_test, y_test)
51 | print 'the value of mean squared error of poly SVR is: ', mean_squared_error(ss_y.inverse_transform(y_test), ss_y.inverse_transform(poly_svr_y_predict))
52 | print 'the value of mean absolute error of poly SVR is: ', mean_absolute_error(ss_y.inverse_transform(y_test), ss_y.inverse_transform(poly_svr_y_predict))
53 |
54 | print 'R-squared of rbf SVR is: ', rbf_svr.score(X_test, y_test)
55 | print 'the value of mean squared error of rbf SVR is: ', mean_squared_error(ss_y.inverse_transform(y_test), ss_y.inverse_transform(rbf_svr_y_predict))
56 | print 'the value of mean absolute error of rbf SVR is: ', mean_absolute_error(ss_y.inverse_transform(y_test), ss_y.inverse_transform(rbf_svr_y_predict))
57 |
--------------------------------------------------------------------------------
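Two version notes on this script: `sklearn.cross_validation` moved to `model_selection` in 0.18, and on scikit-learn >= 0.19 the StandardScaler calls above reject the 1-D targets (`load_boston` itself was removed in 1.2). A sketch of the adjusted target scaling, keeping the same variable names:

```python
from sklearn.preprocessing import StandardScaler

ss_y = StandardScaler()
# scalers require 2-D input, so reshape the 1-D targets into a single column
y_train = ss_y.fit_transform(y_train.reshape(-1, 1)).ravel()
y_test = ss_y.transform(y_test.reshape(-1, 1)).ravel()
```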
/预测模型/decision_tree.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf8 -*-
2 | import pandas as pd
3 | from collections import Counter
4 |
5 |
6 | class Node:
7 |
8 | def __init__(self, feature, df):
9 | self.feature = feature
10 | self.df = df
11 | self.left = None
12 | self.right = None
13 | self.feature_value = None
14 | self.label_value = None
15 |
16 |
17 | class Tree:
18 |
19 | """
20 |     Make sure the DataFrame's label column is named 'label'. Note that get_feature picks the feature whose own value distribution is most mixed (largest Gini), a simplification of CART's label-conditional Gini gain (sketched after this file).
21 | """
22 |
23 | def __init__(self, df):
24 | self.df = df
25 | feature_name = self.get_feature(self.df)
26 | self.init_node = Node(feature_name, self.df)
27 |
28 | def get_feature(self, df):
29 | gini = {}
30 | for i in df.columns:
31 | if i != 'label':
32 | value_count_dict = df[i].value_counts()
33 | sums = value_count_dict.values.sum()
34 | gini[i] = 1 - sum([(j * 1.0 / sums)**2 for j in value_count_dict.values])
35 | return max(gini, key=gini.get)
36 |
37 | @staticmethod
38 | def vote(df, columns_name, value):
39 | label_data = df.loc[df[columns_name] == value, 'label'].values
40 | return Counter(label_data).most_common()[0][0]
41 |
42 | def gen_tree(self, node):
43 | df = node.df
44 | feature_name = self.get_feature(df)
45 | feature_value_set = list(set(df[feature_name].values))
46 | if len(feature_value_set) > 2:
47 | raise ValueError
48 | elif len(feature_value_set) == 1:
49 | node.label_value = self.vote(df, feature_name, feature_value_set[0])
50 | return
51 | elif len(feature_value_set) == 2:
52 | left_node = Node(feature_name, df.loc[df[feature_name] == feature_value_set[0]])
53 | left_node.feature_value = feature_value_set[0]
54 | right_node = Node(feature_name, df.loc[df[feature_name] == feature_value_set[1]])
55 | right_node.feature_value = feature_value_set[1]
56 | node.left = left_node
57 | node.right = right_node
58 | self.gen_tree(left_node)
59 | self.gen_tree(right_node)
60 |
61 | def display_node(self, node, depth):
62 |         if node.left is None:
63 | print "%slabel:%s" % ((depth - 1) *'\t|---' + '', node.label_value)
64 | else:
65 | print "%sfeature: %s, value: %s" % (depth * '\t' + '|---', node.left.feature, node.left.feature_value)
66 | self.display_node(node.left,depth+1)
67 | print "%sfeature: %s, value: %s" % (depth * '\t' +'|---', node.right.feature, node.right.feature_value)
68 | self.display_node(node.right,depth+1)
69 |
70 |
71 | if __name__ == '__main__':
72 | data_set = [
73 | [1, 0, 1, 1],
74 | [0, 1, 1, 1],
75 | [0, 0, 0, 0],
76 | [1, 1, 1, 1],
77 | [0, 0, 0, 0],
78 | [0, 1, 0, 1],
79 | [1, 0, 1, 1],
80 | [0, 0, 0, 0],
81 | [0, 1, 0, 0],
82 | [0, 0, 0, 0]
83 | ]
84 | df = pd.DataFrame(data_set)
85 | df.columns = ['house', 'marriage', 'wage', 'label']
86 | tree = Tree(df)
87 | tree.gen_tree(tree.init_node)
88 | tree.display_node(tree.init_node, 0)
89 |
--------------------------------------------------------------------------------
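For contrast with the heuristic in get_feature above, a sketch of the standard CART criterion: choose the feature whose split minimizes the label Gini impurity weighted by branch size (same 'label' column convention):

```python
from collections import Counter

def gini_impurity(labels):
    """Gini impurity of an array of class labels."""
    n = float(len(labels))
    return 1.0 - sum((c / n) ** 2 for c in Counter(labels).values())

def best_feature_cart(df):
    """CART-style selection: minimize the size-weighted label Gini over the split."""
    scores = {}
    n = float(len(df))
    for col in df.columns:
        if col == 'label':
            continue
        weighted = 0.0
        for value in df[col].unique():
            subset = df.loc[df[col] == value, 'label'].values
            weighted += (len(subset) / n) * gini_impurity(subset)
        scores[col] = weighted
    return min(scores, key=scores.get)
```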
/预测模型/evaluate.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 |
3 | """
4 | Evaluate the prediction quality of all the forecasting models
5 | ref:
6 | http://blog.csdn.net/sinat_26917383/article/details/75199996?locationNum=3&fps=1
7 | http://scikit-learn.org/stable/auto_examples/model_selection/plot_roc.html
8 | https://www.zhihu.com/question/30643044
9 | """
10 |
11 | from sklearn import metrics
12 | import matplotlib.pyplot as plt
13 | import numpy as np
14 |
15 | TYPE_DISCRETE = 0    # true and predicted values both discrete
16 | TYPE_DISCRETE_2 = 1  # true values discrete, predicted values continuous (e.g. logistic scores)
17 | TYPE_CONTINUE = 2    # true and predicted values both continuous
18 |
19 |
20 | class Evaluate:
21 |
22 | def __init__(self, true_array, predict_array, pred_type = TYPE_DISCRETE):
23 | self.type = pred_type
24 | self.true_array = np.array(true_array)
25 | self.pred_array = np.array(predict_array)
26 |
27 | @property
28 | def accuracy(self):
29 |         # macro-averaged precision (note: despite the property name, this calls precision_score)
30 |         # other averaging options: None, 'micro', 'macro', 'weighted', 'samples'
31 | return metrics.precision_score(self.true_array, self.pred_array, average='macro')
32 | @property
33 | def recall(self):
34 |         # macro-averaged recall
35 | return metrics.recall_score(self.true_array, self.pred_array, average='macro')
36 |
37 | @property
38 | def f1(self):
39 |         # F1 score, the harmonic mean of precision and recall
40 | return metrics.f1_score(self.true_array, self.pred_array, average='weighted')
41 |
42 | @property
43 | def confusion_matrix(self):
44 | return metrics.confusion_matrix(self.true_array, self.pred_array)
45 |
46 | def confusion_matrix_plot(self, cmap=plt.cm.Blues):
47 | """Matplotlib绘制混淆矩阵图
48 | parameters
49 | ----------
50 | y_truth: 真实的y的值, 1d array
51 | y_predict: 预测的y的值, 1d array
52 | cmap: 画混淆矩阵图的配色风格, 使用cm.Blues,更多风格请参考官网
53 | """
54 |         cm = metrics.confusion_matrix(self.true_array, self.pred_array)
55 |         plt.matshow(cm, cmap=cmap) # confusion matrix image
56 |         plt.colorbar() # color scale
57 |
58 |         for x in range(len(cm)): # annotate each cell with its count
59 |             for y in range(len(cm)):
60 |                 plt.annotate(cm[x, y], xy=(x, y), horizontalalignment='center', verticalalignment='center')
61 |
62 |         plt.ylabel('True label') # axis labels
63 |         plt.xlabel('Predicted label')
64 |         plt.show() # display the plot
65 |
66 | @property
67 | def classify_report(self):
68 | return metrics.classification_report(self.true_array, self.pred_array)
69 |
70 | @property
71 | def kappa_score(self):
72 |         # the kappa score lies in (-1, 1); above 0.8 indicates a good classifier, 0 or below a poor one
73 | return metrics.cohen_kappa_score(self.true_array, self.pred_array)
74 |
75 | @property
76 | def roc_score(self):
77 | return metrics.roc_auc_score(self.true_array, self.pred_array)
78 |
79 | def roc_plot(self, title='Receiver operating characteristic plot'):
80 |         # binary classification only; for multiclass, plot one-vs-rest curves (class 1 vs. rest, class 2 vs. rest, ...), as sketched after this file
81 | fpr, tpr, _ = metrics.roc_curve(self.true_array, self.pred_array)
82 | plt.figure()
83 | # lw : line width
84 | plt.plot(fpr, tpr, color='darkorange',
85 | lw=2, label='ROC curve (area = %0.2f)' % self.roc_score)
86 | plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
87 | plt.xlim([0.0, 1.0])
88 | plt.ylim([0.0, 1.05])
89 | plt.xlabel('False Positive Rate')
90 | plt.ylabel('True Positive Rate')
91 | plt.title(title)
92 | plt.legend(loc="lower right")
93 | plt.show()
94 |
95 | @property
96 | def hamming_distance(self):
97 | return metrics.hamming_loss(self.true_array, self.pred_array)
98 |
99 | @property
100 | def jaccard_distance(self):
101 | return metrics.jaccard_similarity_score(self.true_array, self.pred_array)
102 |
103 | @property
104 | def explained_variance(self):
105 | return metrics.explained_variance_score(self.true_array, self.pred_array)
106 |
107 | @property
108 | def mean_squared_error(self):
109 | return metrics.mean_squared_error(self.true_array, self.pred_array)
110 |
111 | @property
112 | def mean_absolute_error(self):
113 | return metrics.mean_absolute_error(self.true_array, self.pred_array)
114 |
115 | @property
116 | def median_absolute_error(self):
117 | return metrics.median_absolute_error(self.true_array, self.pred_array)
118 |
119 | @property
120 | def r_square(self):
121 | return metrics.r2_score(self.true_array, self.pred_array)
122 |
123 | def display(self):
124 | if self.type == TYPE_DISCRETE:
125 | print "accuracy : %s" % self.accuracy
126 | print "recall : %s" % self.recall
127 | print "F1 : %s" % self.f1
128 | print "confusion_matrix : \n %s" % self.confusion_matrix
129 | print "kappa : %s" % self.kappa_score
130 | print "ROC score : %s" % self.roc_score
131 | print "report : \n %s" % self.classify_report
132 | print "hamming loss : %s" % self.hamming_distance
133 | print "jaccard distance : %s" % self.jaccard_distance
134 | self.confusion_matrix_plot()
135 | self.roc_plot()
136 | elif self.type == TYPE_DISCRETE_2:
137 | print "ROC score : %s" % self.roc_score
138 | self.roc_plot()
139 |
140 | print "mean_squared_error : %s" % self.mean_squared_error
141 | print "mean_absolute_error : %s" % self.mean_absolute_error
142 | print "median_absolute_error : %s" % self.median_absolute_error
143 | print "explained_variance : %s" % self.explained_variance
144 | print "r_square : %s" % self.r_square
145 |
146 |
147 | if __name__ == '__main__':
148 | true_y_0 = [1,1,0,1,0,1,1,1]
149 | pred_y_0 = [1,0,1,1,0,1,0,1]
150 |
151 | true_y_1 = [1, 1, 0, 1, 0, 1, 1, 0, 1, 1]
152 | pred_y_1 = [1, 0.8, 0.2, 1.2, 0, 1.0, 0, 1.7, 2.1, 3.1]
153 |
154 | true_y_2 = [1, 1, 0.9, 1.1, 0.1, 1, 1, 0]
155 | pred_y_2 = [1, 0, 1, 1.2, 0, 1, 0, 1]
156 |
157 | eva_0 = Evaluate(true_y_0, pred_y_0, TYPE_DISCRETE)
158 | eva_1 = Evaluate(true_y_1, pred_y_1, TYPE_DISCRETE_2)
159 | eva_2 = Evaluate(true_y_2, pred_y_2, TYPE_CONTINUE)
160 |
161 | eva_0.display()
162 | eva_1.display()
163 | eva_2.display()
164 |
165 |
166 |
167 |
168 |
169 |
170 |
171 |
172 |
173 |
174 |
175 |
176 |
177 |
178 |
179 |
180 |
181 |
--------------------------------------------------------------------------------
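The one-vs-rest multiclass ROC mentioned in roc_plot's comment, as a minimal sketch; it assumes per-class probability scores y_score of shape (n_samples, n_classes), e.g. from a classifier's predict_proba:

```python
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.preprocessing import label_binarize

def multiclass_roc_plot(y_true, y_score, classes):
    """One-vs-rest ROC curves: one curve per class (class i vs. the rest)."""
    y_bin = label_binarize(y_true, classes=classes)
    plt.figure()
    for i, cls in enumerate(classes):
        fpr, tpr, _ = metrics.roc_curve(y_bin[:, i], y_score[:, i])
        plt.plot(fpr, tpr, lw=2, label='class %s (AUC = %0.2f)' % (cls, metrics.auc(fpr, tpr)))
    plt.plot([0, 1], [0, 1], lw=2, linestyle='--')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.legend(loc='lower right')
    plt.show()
```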
/预测模型/neural_network.m:
--------------------------------------------------------------------------------
1 |
2 | % ------------------------- EXAMPLE 1 -----------------------
3 | P=[3.2 3.2 3 3.2 3.2 3.4 3.2 3 3.2 3.2 3.2 3.9 3.1 3.2;
4 | 9.6 10.3 9 10.3 10.1 10 9.6 9 9.6 9.2 9.5 9 9.5 9.7;
5 | 3.45 3.75 3.5 3.65 3.5 3.4 3.55 3.5 3.55 3.5 3.4 3.1 3.6 3.45;
6 | 2.15 2.2 2.2 2.2 2 2.15 2.14 2.1 2.1 2.1 2.15 2 2.1 2.15;
7 | 140 120 140 150 80 130 130 100 130 140 115 80 90 130;
8 | 2.8 3.4 3.5 2.8 1.5 3.2 3.5 1.8 3.5 2.5 2.8 2.2 2.7 4.6;
9 | 11 10.9 11.4 10.8 11.3 11.5 11.8 11.3 11.8 11 11.9 13 11.1 10.85;
10 | 50 70 50 80 50 60 65 40 65 50 50 50 70 70];
11 | T=[2.24 2.33 2.24 2.32 2.2 2.27 2.2 2.26 2.2 2.24 2.24 2.2 2.2 2.35];
12 | [p1,minp,maxp,t1,mint,maxt]=premnmx(P,T); % normalize inputs and targets to [-1, 1]
13 |
14 |
15 | net=newff(minmax(P),[8,6,1],{'tansig','tansig','purelin'},'trainlm');
16 | net.trainParam.epochs = 5000;
17 | net.trainParam.goal=0.0000001;
18 | [net,tr]=train(net,p1,t1);
19 |
20 |
21 | a=[3.0;9.3;3.3;2.05;100;2.8;11.2;50];
22 | a=tramnmx(a,minp,maxp); % map the new sample with the training-set ranges (premnmx would rescale by the sample's own min/max)
23 | %run the sample through the trained network
24 | b=sim(net,a);
25 | c=postmnmx(b,mint,maxt);
26 | disp(c)
27 |
28 | % ------------------------- EXAMPLE 2 -----------------------
29 | clear;
30 | clc;
31 |
32 | P=[110 0.807 240 0.2 15 1 18 2 1.5;
33 | 110 2.865 240 0.1 15 2 12 1 2;
34 | 110 2.59 240 0.1 12 4 24 1 1.5;
35 | 220 0.6 240 0.3 12 3 18 2 1;
36 | 220 3 240 0.3 25 3 21 1 1.5;
37 | 110 1.562 240 0.3 15 3 18 1 1.5;
38 | 110 0.547 240 0.3 15 1 9 2 1.5;
39 | 0 1.318 300 0.1 15 2 18 1 2];
40 |
41 | T=[54248 162787 168380 314797;
42 | 28614 63958 69637 82898;
43 | 86002 402710 644415 328084;
44 | 230802 445102 362823 335913;
45 | 60257 127892 76753 73541;
46 | 34615 93532 80762 110049;
47 | 56783 172907 164548 144040;
48 | 907 117437 120368 130179];
49 | m=max(max(P));
50 | n=max(max(T));
51 | P=P'/m;
52 | T=T'/n;
53 | %-------------------------------------------------------------------------%
54 | pr(1:9,1)=0; %range matrix for the input vectors
55 | pr(1:9,2)=1;
56 | bpnet=newff(pr,[12 4],{'logsig', 'logsig'}, 'traingdx', 'learngdm');
57 | %build the BP network: 12 hidden neurons, 4 output neurons
58 | %transferFcn 'logsig': sigmoid transfer function in the hidden layer
59 | %transferFcn 'logsig': sigmoid transfer function in the output layer
60 | %trainFcn 'traingdx': gradient descent with momentum and an adaptive learning rate
61 | %learnFcn 'learngdm': gradient descent learning with an added momentum term
62 | bpnet.trainParam.epochs=1000; %maximum number of training epochs: 1000
63 | bpnet.trainParam.goal=0.001;  %target training error: 0.001
64 | bpnet.trainParam.show=10;     %display progress every 10 iterations
65 | bpnet.trainParam.lr=0.05;     %learning rate: 0.05
66 | bpnet=train(bpnet,P,T);
67 | %-------------------------------------------------------------------------
68 | p=[110 1.318 300 0.1 15 2 18 1 2];
69 | p=p'/m;
70 |
71 | r=sim(bpnet,p);
72 | R=r'*n;
73 | display(R);
--------------------------------------------------------------------------------
/预测模型/neural_network.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | import numpy as np
3 | import matplotlib.pyplot as plt
4 | import pandas as pd
5 | from sklearn import preprocessing # data standardization
6 |
7 | from pybrain.structure import *
8 | from pybrain.datasets import SupervisedDataSet
9 | from pybrain.supervised.trainers import BackpropTrainer
10 |
11 | """
12 | Neural network for predicting continuous values
13 | Install pybrain first: > pip install pybrain
14 | ref:
15 | http://blog.csdn.net/u010900574/article/details/51290855
16 | """
17 |
18 |
19 | def _generate_data():
20 | """
21 |     Generate the dataset
22 |     inputs are u(k-1) and y(k-1); output is y(k)
23 | """
24 | # u = np.random.uniform(-1,1,200)
25 | # y=[]
26 | # former_y_value = 0
27 | # for i in np.arange(0,200):
28 | # y.append(former_y_value)
29 | # next_y_value = (29.0 / 40) * np.sin(
30 | # (16.0 * u[i] + 8 * former_y_value) / (3.0 + 4.0 * (u[i] ** 2) + 4 * (former_y_value ** 2))) \
31 | # + (2.0 / 10) * u[i] + (2.0 / 10) * former_y_value
32 | # former_y_value = next_y_value
33 | # return u,y
34 | u1 = np.random.uniform(-np.pi,np.pi,200)
35 | u2 = np.random.uniform(-1,1,200)
36 | y = np.zeros(200)
37 | for i in range(200):
38 | value = np.sin(u1[i]) + u2[i]
39 | y[i] = value
40 | return u1, u2, y
41 |
42 | def get_fnn():
43 | """
44 |     Build the layers
45 |     input layer: 2 units
46 |     hidden layer: 10 units
47 |     output layer: 1 unit
48 | """
49 |     # create a neural network
50 | fnn = FeedForwardNetwork()
51 | # claim the layer
52 | inLayer = LinearLayer(2, name='inLayer')
53 | hiddenLayer0 = SigmoidLayer(10, name='hiddenLayer0')
54 | outLayer = LinearLayer(1, name='outLayer')
55 | # add three layers to the neural network
56 | fnn.addInputModule(inLayer)
57 | fnn.addModule(hiddenLayer0)
58 | fnn.addOutputModule(outLayer)
59 | # link three layers
60 | in_to_hidden0 = FullConnection(inLayer, hiddenLayer0)
61 | hidden0_to_out = FullConnection(hiddenLayer0, outLayer)
62 | # add the links to neural network
63 | fnn.addConnection(in_to_hidden0)
64 | fnn.addConnection(hidden0_to_out)
65 | # make neural network come into effect
66 | fnn.sortModules()
67 |
68 | return fnn
69 |
70 | def get_train_data():
71 | # definite the dataset as two input , one output
72 | DS = SupervisedDataSet(2, 1)
73 |
74 | u1, u2, y = _generate_data()
75 | # add data element to the dataset
76 | for i in np.arange(199):
77 | DS.addSample([u1[i], u2[i]], [y[i + 1]])
78 |
79 | # you can get your input/output this way
80 | # X = DS['input']
81 | # Y = DS['target']
82 |
83 | # split the dataset into train dataset and test dataset
84 | dataTrain, dataTest = DS.splitWithProportion(0.8)
85 |
86 | return dataTrain, dataTest
87 |
88 | def train_and_predict(fnn, dataTrain, dataTest):
89 | # train the NN
90 | # we use BP Algorithm
91 |     # verbose=True prints the total error during training
92 | trainer = BackpropTrainer(fnn, dataTrain, verbose=True, learningrate=0.01)
93 | # set the epoch times to make the NN fit
94 | trainer.trainUntilConvergence(maxEpochs=1000)
95 |
96 | xTest, yTest = dataTest['input'], dataTest['target']
97 |     predict_result = []
98 |     for i in np.arange(len(xTest)):
99 |         predict_result.append(fnn.activate(xTest[i])[0])
100 |     print(predict_result)
101 |
102 |     plt.figure()
103 |     plt.plot(np.arange(0, len(xTest)), predict_result, 'ro--', label='predict number')
104 | plt.plot(np.arange(0, len(xTest)), yTest, 'ko-', label='true number')
105 | plt.legend()
106 | plt.xlabel("x")
107 | plt.ylabel("y")
108 | plt.show()
109 |
110 | # for mod in fnn.modules:
111 | # print ("Module:", mod.name)
112 | # if mod.paramdim > 0:
113 | # print ("--parameters:", mod.params)
114 | # for conn in fnn.connections[mod]:
115 | # print ("-connection to", conn.outmod.name)
116 | # if conn.paramdim > 0:
117 | # print ("- parameters", conn.params)
118 | # if hasattr(fnn, "recurrentConns"):
119 | # print ("Recurrent connections")
120 | # for conn in fnn.recurrentConns:
121 | # print ("-", conn.inmod.name, " to", conn.outmod.name)
122 | # if conn.paramdim > 0:
123 | # print ("- parameters", conn.params)
124 |
125 | def fnn_begin():
126 | fnn = get_fnn()
127 | dataTrain, dataTest = get_train_data()
128 | train_and_predict(fnn, dataTrain, dataTest)
129 |
130 | class NeuralNetwork:
131 |
132 | def __init__(self, input_layer, hide_layer, output_layer, df):
133 | self.fnn = self.get_fnn(input_layer, hide_layer, output_layer)
134 | self.df = self.data_pre_handle(df)
135 | self.get_train_data(input_layer, output_layer)
136 |
137 | def data_pre_handle(self, df):
138 | """
139 |         1. drop columns with no analytical value
140 |         2. fill in missing values
141 |         3. drop invalid values
142 |         4. encode categorical variables
143 |         5. standardize all variables
144 |
145 | """
146 |         #df['类别'] = df['类别'].astype('category') # saves memory
147 | df = df.dropna(axis=0)
148 | for column in df.columns:
149 |             # standardize (zero mean, unit variance)
150 | df[column] = preprocessing.scale(df[column])
151 | return df
152 |
153 | def get_fnn(self, i, h, o):
154 | """
155 |         Build the layers
156 |         input layer: i units
157 |         hidden layer: h units
158 |         output layer: o units
159 | """
160 | fnn = FeedForwardNetwork()
161 |
162 | inLayer = LinearLayer(i, name='inLayer')
163 | hiddenLayer0 = SigmoidLayer(h, name='hiddenLayer0')
164 | outLayer = LinearLayer(o, name='outLayer')
165 |
166 | fnn.addInputModule(inLayer)
167 | fnn.addModule(hiddenLayer0)
168 | fnn.addOutputModule(outLayer)
169 |
170 | in_to_hidden0 = FullConnection(inLayer, hiddenLayer0)
171 | hidden0_to_out = FullConnection(hiddenLayer0, outLayer)
172 |
173 | fnn.addConnection(in_to_hidden0)
174 | fnn.addConnection(hidden0_to_out)
175 |
176 | fnn.sortModules()
177 | return fnn
178 |
179 | def get_train_data(self, input_layer, output_layer):
180 | """
181 |         The input is a DataFrame: the first input_layer columns are inputs, the last output_layer columns are outputs
182 | """
183 | DS = SupervisedDataSet(input_layer, output_layer)
184 |
185 | for i in range(self.df.shape[0] - 1):
186 | DS.addSample(self.df.iloc[i, :input_layer].values, self.df.iloc[i+1, input_layer:].values)
187 |
188 |         # shuffle and split: 80% training, 20% testing
189 | # self.dataTrain, self.dataTest = DS.splitWithProportion(0.8)
190 |
191 | def split_by_part(DS, proportion=0.9):
192 |             # no random sampling: train on the leading proportion of samples, test on the rest (preserves time order)
193 | leftIndices = range(int(len(DS) * proportion))
194 | leftDs = DS.copy()
195 | leftDs.clear()
196 | rightDs = leftDs.copy()
197 | index = 0
198 | for sp in DS:
199 | if index in leftIndices:
200 | leftDs.addSample(*sp)
201 | else:
202 | rightDs.addSample(*sp)
203 | index += 1
204 | return leftDs, rightDs
205 |
206 | self.dataTrain, self.dataTest = split_by_part(DS, 0.99)
207 |
208 | def train(self, times = 1000):
209 | trainer = BackpropTrainer(self.fnn, self.dataTrain, verbose=True, learningrate=0.01)
210 | trainer.trainUntilConvergence(maxEpochs=times)
211 |
212 | def predict(self):
213 | xTest, yTest = self.dataTest['input'], self.dataTest['target']
214 |         predict_result = []
215 |         for i in np.arange(len(xTest)):
216 |             predict_result.append(self.fnn.activate(xTest[i]))
217 |         print(predict_result)
218 |
219 |         plt.figure()
220 |         plt.plot(np.arange(0, len(xTest)), predict_result, 'ro--', label='predict number')
221 | plt.plot(np.arange(0, len(xTest)), yTest, 'ko-', label='true number')
222 | plt.legend()
223 | plt.xlabel("x")
224 | plt.ylabel("y")
225 | plt.show()
226 |
227 | def single_predict(self, x_array):
228 | return self.fnn.activate(x_array)
229 |
230 | def Wind2Df(wind_data):
231 | df = pd.DataFrame(wind_data.Data).T
232 | df.columns = wind_data.Fields
233 | df.index = wind_data.Times
234 | return df
235 |
236 |
237 | if __name__ == '__main__':
238 | # # fnn_begin()
239 | # df = pd.read_csv('dataset/auto.csv')
240 | # df = df.loc[:,[u'mpg', u'rep78', u'headroom', u'trunk', u'weight', u'length', u'turn', u'displacement', u'gear_ratio', u'price']]
241 | # nn = NeuralNetwork(9, 10, 1, df)
242 | # nn.train()
243 | # nn.predict()
244 | # print nn.single_predict(nn.df.ix[0].values[:9])
245 | # print nn.df.ix[0].values[-1]
246 | from WindPy import *
247 | import datetime
248 | w.start()
249 | df = Wind2Df(w.wst("IC1709.CFE",
250 | "volume,amt,oi,bsize1,asize1,ask2,bid2,bsize2,asize2,bid3,ask3,bsize3,asize3,ask1,bid1,last",
251 | "2017-08-22 09:00:00", "2017-08-22 14:45:05", ""))
252 | nn = NeuralNetwork(15, 15, 1, df)
253 | nn.train(100)
254 | nn.predict()
255 |
256 |
257 |
258 |
259 |
260 |
--------------------------------------------------------------------------------