├── .idea
│   ├── Modeling_Preparation.iml
│   ├── markdown-exported-files.xml
│   ├── markdown-navigator.xml
│   ├── markdown-navigator
│   │   └── profiles_settings.xml
│   ├── misc.xml
│   ├── modules.xml
│   ├── vcs.xml
│   └── workspace.xml
├── .vscode
│   └── tags
├── README.md
├── dataset
│   ├── SZIndex.csv
│   ├── SZIndex.desc
│   ├── abalone.txt
│   ├── ablone.names
│   ├── auto.csv
│   ├── auto.mat
│   ├── auto_1.csv
│   └── international-airline-passengers.csv
├── plot
│   ├── cluster_plot.R
│   ├── datafile
│   │   ├── beijing.png
│   │   ├── beijingDots.png
│   │   ├── c_dijishi.dta
│   │   ├── c_seven.dta
│   │   ├── c_sheng1.dta
│   │   ├── d_dijishi.dta
│   │   ├── d_seven.dta
│   │   ├── d_sheng1.dta
│   │   └── word_vector.txt
│   ├── google_map_api.py
│   ├── plot.R
│   └── spatial.do
├── 优化模型
│   ├── PSO.py
│   ├── genetic_algorithm.py
│   ├── sa_tsp_example.py
│   ├── simulated_annealing.py
│   └── simulated_annealing.pyc
├── 小工具
│   ├── Association_rules.py
│   ├── data_clean.py
│   ├── due_date_calculate.py
│   ├── lasso_regression.m
│   ├── ridgeRegression_func1.m
│   ├── ridge_regression.m
│   ├── trade_account.py
│   └── 二分法期权计算器.cs
├── 评价模型
│   ├── EntropyWeight.m
│   ├── PPE.asv
│   ├── PPE.m
│   ├── SOM.py
│   ├── cluster.py
│   ├── constraint.m
│   ├── get_Q.m
│   ├── optimal_tools.png
│   ├── pso_optimal.asv
│   ├── pso_optimal.m
│   └── som_data.txt
├── 赛题整理
│   └── 赛题整理.md
└── 预测模型
    ├── GM1_1.m
    ├── HMM.py
    ├── LSTM_predict.py
    ├── ML_classify_model.py
    ├── PLSR.m
    ├── SVR.py
    ├── decision_tree.py
    ├── evaluate.py
    ├── neural_network.m
    └── neural_network.py

--------------------------------------------------------------------------------
/.vscode/tags:
--------------------------------------------------------------------------------
1 | !_TAG_FILE_FORMAT 2 /extended format; --format=1 will not append ;" to lines/
2 | !_TAG_FILE_SORTED 1 /0=unsorted, 1=sorted, 2=foldcase/
3 | !_TAG_PROGRAM_AUTHOR Darren Hiebert /dhiebert@users.sourceforge.net/
4 | !_TAG_PROGRAM_NAME Exuberant Ctags //
5 | !_TAG_PROGRAM_URL http://ctags.sourceforge.net /official site/
6 | !_TAG_PROGRAM_VERSION 5.9~svn20110310 //
7 | Association_rules.py ../小工具/Association_rules.py 1;"
kind:file line:1 8 | City ../优化模型/sa_tsp_example.py /^class City():$/;" kind:class line:9 9 | Cluster ../评价模型/cluster.py /^class Cluster:$/;" kind:class line:8 10 | DIRECTION_BUY ../小工具/trade_account.py /^DIRECTION_BUY = 0$/;" kind:variable line:11 11 | DIRECTION_SELL ../小工具/trade_account.py /^DIRECTION_SELL = 1$/;" kind:variable line:12 12 | Evaluate ../预测模型/evaluate.py /^class Evaluate:$/;" kind:class line:20 13 | GA ../优化模型/genetic_algorithm.py /^ GA = GeneticAlgorithm()$/;" kind:variable line:129 14 | Gene ../优化模型/genetic_algorithm.py /^class Gene():$/;" kind:class line:16 15 | GeneticAlgorithm ../优化模型/genetic_algorithm.py /^class GeneticAlgorithm:$/;" kind:class line:62 16 | Gmap ../plot/google_map_api.py /^def Gmap(centerLat,centerLon,zoomS,pixelS,size,dark,saveAddress):$/;" kind:function line:10 17 | Graph ../优化模型/sa_tsp_example.py /^class Graph:$/;" kind:class line:16 18 | HMM.py ../预测模型/HMM.py 1;" kind:file line:1 19 | INSTRUMENT_FUTURE ../小工具/trade_account.py /^INSTRUMENT_FUTURE = 1$/;" kind:variable line:9 20 | INSTRUMENT_OPTION ../小工具/trade_account.py /^INSTRUMENT_OPTION = 0$/;" kind:variable line:8 21 | K_means ../评价模型/cluster.py /^ def K_means(self, K, axis=0):$/;" kind:member line:16 22 | LSTM_predict.py ../预测模型/LSTM_predict.py 1;" kind:file line:1 23 | M ../plot/google_map_api.py /^ M = {}$/;" kind:variable line:57 24 | ML_classify_model.py ../预测模型/ML_classify_model.py 1;" kind:file line:1 25 | MyGaussianHMM ../预测模型/HMM.py /^def MyGaussianHMM():$/;" kind:function line:106 26 | MyMultinomialHMM ../预测模型/HMM.py /^def MyMultinomialHMM():$/;" kind:function line:46 27 | MySOM ../评价模型/SOM.py /^class MySOM:$/;" kind:class line:25 28 | NeuralNetwork ../预测模型/neural_network.py /^class NeuralNetwork:$/;" kind:class line:130 29 | Node ../预测模型/decision_tree.py /^class Node:$/;" kind:class line:6 30 | OPENTYPE_CLOSE ../小工具/trade_account.py /^OPENTYPE_CLOSE = 1 $/;" kind:variable line:15 31 | OPENTYPE_OPEN ../小工具/trade_account.py /^OPENTYPE_OPEN = 0$/;" kind:variable line:14 32 | PSO ../优化模型/PSO.py /^class PSO():$/;" kind:class line:39 33 | PSO.py ../优化模型/PSO.py 1;" kind:file line:1 34 | PositionQueue ../小工具/trade_account.py /^class PositionQueue:$/;" kind:class line:27 35 | SOM.py ../评价模型/SOM.py 1;" kind:file line:1 36 | SVR.py ../预测模型/SVR.py 1;" kind:file line:1 37 | SimulatedAnnealing ../优化模型/simulated_annealing.py /^class SimulatedAnnealing:$/;" kind:class line:49 38 | TYPE_CONTINUE ../预测模型/evaluate.py /^TYPE_CONTINUE = 2 # 实际值与预测值均为连续$/;" kind:variable line:17 39 | TYPE_DISCRETE ../预测模型/evaluate.py /^TYPE_DISCRETE = 0 # 实际值与预测值均为离散$/;" kind:variable line:15 40 | TYPE_DISCRETE_2 ../预测模型/evaluate.py /^TYPE_DISCRETE_2 =1 # 实际值为离散,预测值为连续 logistic$/;" kind:variable line:16 41 | TradeAccount ../小工具/trade_account.py /^class TradeAccount:$/;" kind:class line:65 42 | Tree ../预测模型/decision_tree.py /^class Tree:$/;" kind:class line:17 43 | ValueCalculate ../小工具/trade_account.py /^class ValueCalculate():$/;" kind:class line:168 44 | Wind2Df ../小工具/due_date_calculate.py /^def Wind2Df(wind_data):$/;" kind:function line:8 45 | Wind2Df ../预测模型/neural_network.py /^def Wind2Df(wind_data):$/;" kind:function line:230 46 | X ../预测模型/SVR.py /^X = boston.data$/;" kind:variable line:9 47 | X_test ../预测模型/SVR.py /^X_test = ss_X.transform(X_test)$/;" kind:variable line:24 48 | X_train ../预测模型/SVR.py /^X_train = ss_X.fit_transform(X_train)$/;" kind:variable line:23 49 | __doc__ ../优化模型/genetic_algorithm.py /^ __doc__ = "个体基因类,存储单个基因"$/;" kind:variable line:18 50 | __init__ ../优化模型/PSO.py /^ def __init__(self, 
particle_number=10, variable_number=1):$/;" kind:member line:41 51 | __init__ ../优化模型/genetic_algorithm.py /^ def __init__(self):$/;" kind:member line:64 52 | __init__ ../优化模型/genetic_algorithm.py /^ def __init__(self, gene_length=10, float_length=4):$/;" kind:member line:20 53 | __init__ ../优化模型/sa_tsp_example.py /^ def __init__(self):$/;" kind:member line:18 54 | __init__ ../优化模型/sa_tsp_example.py /^ def __init__(self, x, y):$/;" kind:member line:11 55 | __init__ ../优化模型/simulated_annealing.py /^ def __init__(self, func):$/;" kind:member line:51 56 | __init__ ../小工具/trade_account.py /^ def __init__(self):$/;" kind:member line:29 57 | __init__ ../小工具/trade_account.py /^ def __init__(self, capital_list, init_capital):$/;" kind:member line:170 58 | __init__ ../小工具/trade_account.py /^ def __init__(self, init_capital):$/;" kind:member line:67 59 | __init__ ../评价模型/SOM.py /^ def __init__(self, df, mapsize, initialization = 'random'):$/;" kind:member line:26 60 | __init__ ../评价模型/cluster.py /^ def __init__(self, df):$/;" kind:member line:10 61 | __init__ ../预测模型/ML_classify_model.py /^ def __init__(self, train_x, train_y):$/;" kind:member line:7 62 | __init__ ../预测模型/decision_tree.py /^ def __init__(self, df):$/;" kind:member line:23 63 | __init__ ../预测模型/decision_tree.py /^ def __init__(self, feature, df):$/;" kind:member line:8 64 | __init__ ../预测模型/evaluate.py /^ def __init__(self, true_array, predict_array, pred_type = TYPE_DISCRETE):$/;" kind:member line:22 65 | __init__ ../预测模型/neural_network.py /^ def __init__(self, input_layer, hide_layer, output_layer, df):$/;" kind:member line:132 66 | __slots__ ../优化模型/sa_tsp_example.py /^ __slots__ = ("X", "Y")$/;" kind:variable line:10 67 | _generate_data ../预测模型/neural_network.py /^def _generate_data():$/;" kind:function line:19 68 | _pca ../评价模型/cluster.py /^ def _pca(self):$/;" kind:member line:65 69 | accuracy ../预测模型/evaluate.py /^ def accuracy(self):$/;" kind:member line:28 70 | add ../小工具/trade_account.py /^ def add(self, instrument, direction, price):$/;" kind:member line:34 71 | add_city ../优化模型/sa_tsp_example.py /^ def add_city(self, city):$/;" kind:member line:26 72 | auto_cluster ../评价模型/cluster.py /^ def auto_cluster(self):$/;" kind:member line:71 73 | ax ../plot/google_map_api.py /^ ax = plt.subplot(111)$/;" kind:variable line:65 74 | ax ../小工具/data_clean.py /^ ax = plt.subplot(111)$/;" kind:variable line:179 75 | ax1 ../小工具/due_date_calculate.py /^ax1 = fig.add_subplot(211)$/;" kind:variable line:61 76 | ax2 ../小工具/due_date_calculate.py /^ax2 = fig.add_subplot(212)$/;" kind:variable line:62 77 | begin ../优化模型/genetic_algorithm.py /^ def begin(self):$/;" kind:member line:103 78 | begin ../优化模型/simulated_annealing.py /^ def begin(self):$/;" kind:member line:66 79 | bin2dec ../优化模型/genetic_algorithm.py /^ def bin2dec(self):$/;" kind:member line:34 80 | boston ../预测模型/SVR.py /^boston = load_boston()$/;" kind:variable line:3 81 | chinese_province_list ../优化模型/sa_tsp_example.py /^ chinese_province_list = [$/;" kind:variable line:146 82 | choose_gene ../优化模型/genetic_algorithm.py /^ def choose_gene(self, rand):$/;" kind:member line:81 83 | city_a ../优化模型/sa_tsp_example.py /^ city_a = City(0, 0)$/;" kind:variable line:101 84 | city_b ../优化模型/sa_tsp_example.py /^ city_b = City(0, 1)$/;" kind:variable line:102 85 | city_c ../优化模型/sa_tsp_example.py /^ city_c = City(1, 0)$/;" kind:variable line:103 86 | city_d ../优化模型/sa_tsp_example.py /^ city_d = City(1, 1)$/;" kind:variable line:104 87 | city_list ../优化模型/sa_tsp_example.py /^ city_list = [$/;" 
kind:variable line:106 88 | classify_report ../预测模型/evaluate.py /^ def classify_report(self):$/;" kind:member line:67 89 | clu ../评价模型/cluster.py /^ clu = Cluster(df)$/;" kind:variable line:82 90 | cluster ../评价模型/SOM.py /^ def cluster(self, n):$/;" kind:member line:65 91 | cluster.py ../评价模型/cluster.py 1;" kind:file line:1 92 | cluster_plot ../评价模型/cluster.py /^ def cluster_plot(self, label):$/;" kind:member line:55 93 | confusion_matrix ../预测模型/evaluate.py /^ def confusion_matrix(self):$/;" kind:member line:43 94 | confusion_matrix_plot ../预测模型/evaluate.py /^ def confusion_matrix_plot(self, cmap=plt.cm.Blues):$/;" kind:member line:46 95 | create_dataset ../预测模型/LSTM_predict.py /^def create_dataset(dataset, look_back=1):$/;" kind:function line:23 96 | cross ../优化模型/genetic_algorithm.py /^ def cross(gene1, gene2):$/;" kind:member line:44 97 | data ../plot/google_map_api.py /^ data = {'23.157105_113.256031': 5,$/;" kind:variable line:53 98 | data ../评价模型/SOM.py /^ data = fetch_california_housing()$/;" kind:variable line:96 99 | data ../评价模型/SOM.py /^ data = np.column_stack([data.data, data.target])$/;" kind:variable line:99 100 | data_clean.py ../小工具/data_clean.py 1;" kind:file line:1 101 | data_pre_handle ../预测模型/neural_network.py /^ def data_pre_handle(self, df):$/;" kind:member line:137 102 | data_set ../预测模型/decision_tree.py /^ data_set = [$/;" kind:variable line:72 103 | dataframe ../预测模型/LSTM_predict.py /^dataframe = read_csv('..\/dataset\/international-airline-passengers.csv', usecols=[1], engine='python', skipfooter=3)$/;" kind:variable line:40 104 | dataset ../预测模型/LSTM_predict.py /^dataset = dataframe.values$/;" kind:variable line:41 105 | dataset ../预测模型/LSTM_predict.py /^dataset = dataset.astype('float32')$/;" kind:variable line:42 106 | dataset ../预测模型/LSTM_predict.py /^dataset = scaler.fit_transform(dataset)$/;" kind:variable line:45 107 | decision_tree ../预测模型/ML_classify_model.py /^ def decision_tree(self):$/;" kind:member line:26 108 | decision_tree.py ../预测模型/decision_tree.py 1;" kind:file line:1 109 | descr ../评价模型/SOM.py /^ descr = data.DESCR$/;" kind:variable line:97 110 | df ../小工具/data_clean.py /^ df = fill_na(df)$/;" kind:variable line:161 111 | df ../小工具/data_clean.py /^ df = label_encode(df, ['make', 'foreign'])$/;" kind:variable line:155 112 | df ../小工具/data_clean.py /^ df = pd.read_csv("\/home\/ray\/Documents\/suibe\/2017\/建模\/Modeling_Preparation\/dataset\/auto.csv")$/;" kind:variable line:151 113 | df ../小工具/data_clean.py /^ df = standardize(df, ['make','foreign']) # 这两列是分类变量,不需要标准化$/;" kind:variable line:164 114 | df ../评价模型/SOM.py /^ df = pd.DataFrame(data)$/;" kind:variable line:100 115 | df ../评价模型/cluster.py /^ df = df.dropna(axis=0)$/;" kind:variable line:81 116 | df ../评价模型/cluster.py /^ df = pd.read_csv("\/home\/ray\/Documents\/suibe\/2017\/建模\/Modeling_Preparation\/dataset\/auto_1.csv")$/;" kind:variable line:80 117 | df ../预测模型/ML_classify_model.py /^ df = df.dropna(axis=0)$/;" kind:variable line:64 118 | df ../预测模型/ML_classify_model.py /^ df = pd.read_csv("..\/dataset\/auto_1.csv")$/;" kind:variable line:63 119 | df ../预测模型/decision_tree.py /^ df = pd.DataFrame(data_set)$/;" kind:variable line:84 120 | df ../预测模型/neural_network.py /^ df = Wind2Df(w.wst("IC1709.CFE",$/;" kind:variable line:249 121 | df2 ../小工具/data_clean.py /^ df2 = winsorize(df,1,99)$/;" kind:variable line:177 122 | df_all ../小工具/due_date_calculate.py /^df_all = pd.merge(df_if00, df_if01, left_index=True, right_index=True)$/;" kind:variable line:46 123 | df_all1 
../小工具/due_date_calculate.py /^df_all1 = df_all.copy()$/;" kind:variable line:54 124 | df_columns ../小工具/data_clean.py /^ df_columns = df.columns$/;" kind:variable line:152 125 | df_if00 ../小工具/due_date_calculate.py /^df_if00 = Wind2Df(w.wsi("IF00.CFE", "close, volume", "2016-02-01 09:30:00", "2017-08-16 13:48:43", "periodstart=09:30:00;periodend=15:00:00"))$/;" kind:variable line:41 126 | df_if01 ../小工具/due_date_calculate.py /^df_if01 = Wind2Df(w.wsi("IF01.CFE", "close, volume", "2016-02-01 09:30:00", "2017-08-16 13:48:43", "periodstart=09:30:00;periodend=15:00:00"))$/;" kind:variable line:42 127 | display ../小工具/trade_account.py /^ def display(self):$/;" kind:member line:266 128 | display ../小工具/trade_account.py /^ def display(self):$/;" kind:member line:61 129 | display ../预测模型/evaluate.py /^ def display(self):$/;" kind:member line:123 130 | display_node ../预测模型/decision_tree.py /^ def display_node(self, node, depth):$/;" kind:member line:61 131 | draw_cluster_map ../评价模型/SOM.py /^ def draw_cluster_map(self):$/;" kind:member line:59 132 | draw_hit_map ../评价模型/SOM.py /^ def draw_hit_map(self):$/;" kind:member line:53 133 | draw_input_weights ../评价模型/SOM.py /^ def draw_input_weights(self):$/;" kind:member line:47 134 | drop_duplicate ../小工具/data_clean.py /^def drop_duplicate(df, columns=[]):$/;" kind:function line:129 135 | due_date_calculate.py ../小工具/due_date_calculate.py 1;" kind:file line:1 136 | end_trade ../小工具/trade_account.py /^ def end_trade(self):$/;" kind:member line:163 137 | eva_0 ../预测模型/evaluate.py /^ eva_0 = Evaluate(true_y_0, pred_y_0, TYPE_DISCRETE)$/;" kind:variable line:157 138 | eva_1 ../预测模型/evaluate.py /^ eva_1 = Evaluate(true_y_1, pred_y_1, TYPE_DISCRETE_2)$/;" kind:variable line:158 139 | eva_2 ../预测模型/evaluate.py /^ eva_2 = Evaluate(true_y_2, pred_y_2, TYPE_CONTINUE)$/;" kind:variable line:159 140 | evaluate.py ../预测模型/evaluate.py 1;" kind:file line:1 141 | exeTime ../优化模型/simulated_annealing.py /^def exeTime(func):$/;" kind:function line:35 142 | explained_variance ../预测模型/evaluate.py /^ def explained_variance(self):$/;" kind:member line:104 143 | f1 ../预测模型/evaluate.py /^ def f1(self):$/;" kind:member line:38 144 | fig ../小工具/due_date_calculate.py /^fig = plt.figure()$/;" kind:variable line:60 145 | filename ../plot/google_map_api.py /^ filename = ".\/datafile\/beijing.png"$/;" kind:variable line:48 146 | fill_na ../小工具/data_clean.py /^def fill_na(df, excep_columns=[], how='mean'):$/;" kind:function line:27 147 | fnn_begin ../预测模型/neural_network.py /^def fnn_begin():$/;" kind:function line:125 148 | func ../优化模型/PSO.py /^ def func(self, array):$/;" kind:member line:56 149 | future_parameter ../小工具/trade_account.py /^future_parameter = {$/;" kind:variable line:17 150 | gen_due_date ../小工具/due_date_calculate.py /^def gen_due_date(year, month):$/;" kind:function line:20 151 | gen_new_sequence ../优化模型/sa_tsp_example.py /^def gen_new_sequence(sequence):$/;" kind:function line:60 152 | gen_new_x ../优化模型/simulated_annealing.py /^ def gen_new_x(self, x_before, T):$/;" kind:member line:59 153 | gen_tree ../预测模型/decision_tree.py /^ def gen_tree(self, node):$/;" kind:member line:42 154 | gene_pop ../优化模型/genetic_algorithm.py /^ def gene_pop(self):$/;" kind:member line:94 155 | genetic_algorithm.py ../优化模型/genetic_algorithm.py 1;" kind:file line:1 156 | get_annual_return ../小工具/trade_account.py /^ def get_annual_return(self):$/;" kind:member line:190 157 | get_average_return ../小工具/trade_account.py /^ def get_average_return(self):$/;" kind:member line:193 158 | 
get_best_gene ../优化模型/genetic_algorithm.py /^ def get_best_gene(self):$/;" kind:member line:77 159 | get_cluster_label ../评价模型/SOM.py /^ def get_cluster_label(self):$/;" kind:member line:68 160 | get_continue_lose_times ../小工具/trade_account.py /^ def get_continue_lose_times(self):$/;" kind:member line:240 161 | get_continue_win_times ../小工具/trade_account.py /^ def get_continue_win_times(self):$/;" kind:member line:229 162 | get_distance ../优化模型/sa_tsp_example.py /^ def get_distance(city1, city2):$/;" kind:member line:23 163 | get_due_date ../小工具/due_date_calculate.py /^def get_due_date(date):$/;" kind:function line:28 164 | get_feature ../预测模型/decision_tree.py /^ def get_feature(self, df):$/;" kind:member line:28 165 | get_fit_value ../优化模型/genetic_algorithm.py /^ def get_fit_value(self):$/;" kind:member line:74 166 | get_fit_value ../优化模型/genetic_algorithm.py /^ def get_fit_value(self, func):$/;" kind:member line:58 167 | get_fnn ../预测模型/neural_network.py /^ def get_fnn(self, i, h, o):$/;" kind:member line:153 168 | get_fnn ../预测模型/neural_network.py /^def get_fnn():$/;" kind:function line:42 169 | get_label ../评价模型/SOM.py /^ def get_label(self):$/;" kind:member line:79 170 | get_lose_times ../小工具/trade_account.py /^ def get_lose_times(self):$/;" kind:member line:211 171 | get_max_drawdown ../小工具/trade_account.py /^ def get_max_drawdown(self):$/;" kind:member line:251 172 | get_max_lose ../小工具/trade_account.py /^ def get_max_lose(self):$/;" kind:member line:226 173 | get_max_win ../小工具/trade_account.py /^ def get_max_win(self):$/;" kind:member line:223 174 | get_neurons ../评价模型/SOM.py /^ def get_neurons(self):$/;" kind:member line:72 175 | get_return_list ../小工具/trade_account.py /^ def get_return_list(self):$/;" kind:member line:177 176 | get_return_volatility ../小工具/trade_account.py /^ def get_return_volatility(self):$/;" kind:member line:199 177 | get_sharp_ratio ../小工具/trade_account.py /^ def get_sharp_ratio(self):$/;" kind:member line:261 178 | get_shortest_distance ../优化模型/sa_tsp_example.py /^def get_shortest_distance(graph):$/;" kind:function line:67 179 | get_total_distance ../优化模型/sa_tsp_example.py /^ def get_total_distance(self, sequence = None):$/;" kind:member line:37 180 | get_total_return ../小工具/trade_account.py /^ def get_total_return(self):$/;" kind:member line:187 181 | get_total_trade_times ../小工具/trade_account.py /^ def get_total_trade_times(self):$/;" kind:member line:196 182 | get_train_data ../预测模型/neural_network.py /^ def get_train_data(self, input_layer, output_layer):$/;" kind:member line:179 183 | get_train_data ../预测模型/neural_network.py /^def get_train_data():$/;" kind:function line:70 184 | get_win_lose_ratio ../小工具/trade_account.py /^ def get_win_lose_ratio(self):$/;" kind:member line:218 185 | get_win_ratio ../小工具/trade_account.py /^ def get_win_ratio(self):$/;" kind:member line:215 186 | get_win_times ../小工具/trade_account.py /^ def get_win_times(self):$/;" kind:member line:207 187 | google_map_api.py ../plot/google_map_api.py 1;" kind:file line:1 188 | graph ../优化模型/sa_tsp_example.py /^ graph = Graph()$/;" kind:variable line:180 189 | hamming_distance ../预测模型/evaluate.py /^ def hamming_distance(self):$/;" kind:member line:96 190 | have_position ../小工具/trade_account.py /^ def have_position(self, instrument, direction):$/;" kind:member line:49 191 | hierarchial_plot ../评价模型/cluster.py /^ def hierarchial_plot(self, Z):$/;" kind:member line:41 192 | hierarchical ../评价模型/cluster.py /^ def hierarchical(self):$/;" kind:member line:30 193 | im 
../plot/google_map_api.py /^ im = Image.open(filename)#np.flipud(plt.imread(filename))$/;" kind:variable line:64 194 | initial_gene ../优化模型/genetic_algorithm.py /^ def initial_gene(self):$/;" kind:member line:25 195 | initial_particle ../优化模型/PSO.py /^ def initial_particle(self):$/;" kind:member line:65 196 | interpolate_na ../小工具/data_clean.py /^def interpolate_na(df, excep_columns=[], how='lagrange'):$/;" kind:function line:46 197 | is_due_date ../小工具/due_date_calculate.py /^def is_due_date(date):$/;" kind:function line:14 198 | jaccard_distance ../预测模型/evaluate.py /^ def jaccard_distance(self):$/;" kind:member line:100 199 | kappa_score ../预测模型/evaluate.py /^ def kappa_score(self):$/;" kind:member line:71 200 | knn ../预测模型/ML_classify_model.py /^ def knn(self, k=3):$/;" kind:member line:12 201 | label ../评价模型/cluster.py /^ label = clu.K_means(4)$/;" kind:variable line:84 202 | label2 ../评价模型/cluster.py /^ label2 = clu.hierarchical()$/;" kind:variable line:87 203 | label_encode ../小工具/data_clean.py /^def label_encode(df, encode_column=[]):$/;" kind:function line:114 204 | latLonToPixelXY ../plot/google_map_api.py /^def latLonToPixelXY(lat,lon,zoomS):$/;" kind:function line:28 205 | linear_svr ../预测模型/SVR.py /^linear_svr = SVR(kernel = 'linear')$/;" kind:variable line:30 206 | linear_svr_y_predict ../预测模型/SVR.py /^linear_svr_y_predict = linear_svr.predict(X_test)$/;" kind:variable line:34 207 | logistic ../预测模型/ML_classify_model.py /^ def logistic(self):$/;" kind:member line:19 208 | look_back ../预测模型/LSTM_predict.py /^look_back = 2$/;" kind:variable line:51 209 | mc ../预测模型/ML_classify_model.py /^ mc = myclassify(df.iloc[:, 0:10], df.iloc[:,-1])$/;" kind:variable line:65 210 | mean_absolute_error ../预测模型/evaluate.py /^ def mean_absolute_error(self):$/;" kind:member line:112 211 | mean_squared_error ../预测模型/evaluate.py /^ def mean_squared_error(self):$/;" kind:member line:108 212 | median_absolute_error ../预测模型/evaluate.py /^ def median_absolute_error(self):$/;" kind:member line:116 213 | model ../预测模型/LSTM_predict.py /^model = Sequential()$/;" kind:variable line:59 214 | mutation ../优化模型/genetic_algorithm.py /^ def mutation(self):$/;" kind:member line:51 215 | my_som ../评价模型/SOM.py /^ my_som = MySOM(df, (20,20))$/;" kind:variable line:103 216 | myclassify ../预测模型/ML_classify_model.py /^class myclassify():$/;" kind:class line:6 217 | naive_bayes ../预测模型/ML_classify_model.py /^ def naive_bayes(self):$/;" kind:member line:33 218 | names ../评价模型/SOM.py /^ names = data.feature_names+["HouseValue"]$/;" kind:variable line:98 219 | neural_network.py ../预测模型/neural_network.py 1;" kind:file line:1 220 | newFunc ../优化模型/simulated_annealing.py /^ def newFunc(*args, **args2):$/;" kind:function line:36 221 | nn ../预测模型/neural_network.py /^ nn = NeuralNetwork(15, 15, 1, df)$/;" kind:variable line:252 222 | normalize ../小工具/data_clean.py /^def normalize(df, excep_columns=[]):$/;" kind:function line:100 223 | option_parameter ../小工具/trade_account.py /^option_parameter = {$/;" kind:variable line:23 224 | order_future ../小工具/trade_account.py /^ def order_future(self, price, instrument, direction, open_type, amount=1):$/;" kind:member line:74 225 | order_option ../小工具/trade_account.py /^ def order_option(self, price, instrument, direction, open_type, amount):$/;" kind:member line:118 226 | ployinterp_column ../小工具/data_clean.py /^ def ployinterp_column(s, n, k=5):$/;" kind:function line:59 227 | poly_svr ../预测模型/SVR.py /^poly_svr = SVR(kernel = 'poly')$/;" kind:variable line:36 228 | poly_svr_y_predict 
../预测模型/SVR.py /^poly_svr_y_predict = poly_svr.predict(X_test)$/;" kind:variable line:38 229 | pop ../小工具/trade_account.py /^ def pop(self, instrument, direction):$/;" kind:member line:42 230 | pred_y_0 ../预测模型/evaluate.py /^ pred_y_0 = [1,0,1,1,0,1,0,1]$/;" kind:variable line:149 231 | pred_y_1 ../预测模型/evaluate.py /^ pred_y_1 = [1, 0.8, 0.2, 1.2, 0, 1.0, 0, 1.7, 2.1, 3.1]$/;" kind:variable line:152 232 | pred_y_2 ../预测模型/evaluate.py /^ pred_y_2 = [1, 0, 1, 1.2, 0, 1, 0, 1]$/;" kind:variable line:155 233 | predict ../评价模型/SOM.py /^ def predict(self, x):$/;" kind:member line:87 234 | predict ../预测模型/neural_network.py /^ def predict(self):$/;" kind:member line:212 235 | predict_num ../预测模型/ML_classify_model.py /^ predict_num = -3$/;" kind:variable line:69 236 | print_error ../评价模型/SOM.py /^ def print_error(self):$/;" kind:member line:42 237 | pso ../优化模型/PSO.py /^ pso = PSO()$/;" kind:variable line:103 238 | pso_begin ../优化模型/PSO.py /^ def pso_begin(self):$/;" kind:member line:96 239 | r_square ../预测模型/evaluate.py /^ def r_square(self):$/;" kind:member line:120 240 | rbf_svr ../预测模型/SVR.py /^rbf_svr = SVR(kernel = 'rbf')$/;" kind:variable line:40 241 | rbf_svr_y_predict ../预测模型/SVR.py /^rbf_svr_y_predict = rbf_svr.predict(X_test)$/;" kind:variable line:42 242 | recall ../预测模型/evaluate.py /^ def recall(self):$/;" kind:member line:33 243 | replace_outlier ../小工具/data_clean.py /^def replace_outlier(df):$/;" kind:function line:133 244 | reset_distance ../优化模型/sa_tsp_example.py /^ def reset_distance(self):$/;" kind:member line:34 245 | result ../优化模型/sa_tsp_example.py /^ result = get_shortest_distance(graph)$/;" kind:variable line:183 246 | result_list ../小工具/Association_rules.py /^result_list = []$/;" kind:variable line:32 247 | roc_plot ../预测模型/evaluate.py /^ def roc_plot(self, title='Receiver operating characteristic plot'):$/;" kind:member line:79 248 | roc_score ../预测模型/evaluate.py /^ def roc_score(self):$/;" kind:member line:76 249 | sa ../优化模型/simulated_annealing.py /^ sa = SimulatedAnnealing('')$/;" kind:variable line:93 250 | sa_tsp_example.py ../优化模型/sa_tsp_example.py 1;" kind:file line:1 251 | sample ../plot/google_map_api.py /^def sample(lis,amount):$/;" kind:function line:37 252 | scaler ../预测模型/LSTM_predict.py /^scaler = MinMaxScaler(feature_range=(0, 1))$/;" kind:variable line:44 253 | simulated_annealing.py ../优化模型/simulated_annealing.py 1;" kind:file line:1 254 | single_predict ../预测模型/neural_network.py /^ def single_predict(self, x_array):$/;" kind:member line:227 255 | split_by_part ../预测模型/neural_network.py /^ def split_by_part(DS, proportion=0.9):$/;" kind:function line:191 256 | ss_X ../预测模型/SVR.py /^ss_X = StandardScaler()$/;" kind:variable line:20 257 | ss_y ../预测模型/SVR.py /^ss_y = StandardScaler()$/;" kind:variable line:21 258 | standardize ../小工具/data_clean.py /^def standardize(df, excep_columns=[]):$/;" kind:function line:88 259 | svm ../预测模型/ML_classify_model.py /^ def svm(self):$/;" kind:member line:40 260 | svm_cv ../预测模型/ML_classify_model.py /^ def svm_cv(self):$/;" kind:member line:47 261 | temp ../小工具/Association_rules.py /^ temp = '-'.join([k for k in i[0]]) + ',' + '-'.join([k for k in i[1]]) + ','$/;" kind:variable line:34 262 | temp ../小工具/data_clean.py /^ temp = df['price'].values$/;" kind:variable line:172 263 | testPredict ../预测模型/LSTM_predict.py /^testPredict = model.predict(testX)$/;" kind:variable line:66 264 | testPredict ../预测模型/LSTM_predict.py /^testPredict = scaler.inverse_transform(testPredict)$/;" kind:variable line:70 265 | testPredictPlot 
../预测模型/LSTM_predict.py /^testPredictPlot = numpy.empty_like(dataset)$/;" kind:variable line:82
266 | testScore ../预测模型/LSTM_predict.py /^testScore = math.sqrt(mean_squared_error(testY[0], testPredict[:,0]))$/;" kind:variable line:75
267 | testX ../预测模型/LSTM_predict.py /^testX = numpy.reshape(testX, (testX.shape[0], 1, testX.shape[1]))$/;" kind:variable line:56
268 | testY ../预测模型/LSTM_predict.py /^testY = scaler.inverse_transform([testY])$/;" kind:variable line:71
269 | test_case ../小工具/Association_rules.py /^test_case = [$/;" kind:variable line:13
270 | test_size ../预测模型/LSTM_predict.py /^test_size = len(dataset) - train_size$/;" kind:variable line:48
271 | trade_account.py ../小工具/trade_account.py 1;" kind:file line:1
272 | train ../评价模型/SOM.py /^ def train(self):$/;" kind:member line:39
273 | train ../预测模型/neural_network.py /^ def train(self, times = 1000):$/;" kind:member line:208
274 | trainPredict ../预测模型/LSTM_predict.py /^trainPredict = model.predict(trainX)$/;" kind:variable line:65
275 | trainPredict ../预测模型/LSTM_predict.py /^trainPredict = scaler.inverse_transform(trainPredict)$/;" kind:variable line:68
276 | trainPredictPlot ../预测模型/LSTM_predict.py /^trainPredictPlot = numpy.empty_like(dataset)$/;" kind:variable line:78
277 | trainScore ../预测模型/LSTM_predict.py /^trainScore = math.sqrt(mean_squared_error(trainY[0], trainPredict[:,0]))$/;" kind:variable line:73
278 | trainX ../预测模型/LSTM_predict.py /^trainX = numpy.reshape(trainX, (trainX.shape[0], 1, trainX.shape[1]))$/;" kind:variable line:55
279 | trainY ../预测模型/LSTM_predict.py /^trainY = scaler.inverse_transform([trainY])$/;" kind:variable line:69
280 | train_and_predict ../预测模型/neural_network.py /^def train_and_predict(fnn, dataTrain, dataTest):$/;" kind:function line:88
281 | train_size ../预测模型/LSTM_predict.py /^train_size = int(len(dataset) * 0.67)$/;" kind:variable line:47
282 | tree ../预测模型/decision_tree.py /^ tree = Tree(df)$/;" kind:variable line:86
283 | true_y_0 ../预测模型/evaluate.py /^ true_y_0 = [1,1,0,1,0,1,1,1]$/;" kind:variable line:148
284 | true_y_1 ../预测模型/evaluate.py /^ true_y_1 = [1, 1, 0, 1, 0, 1, 1, 0, 1, 1]$/;" kind:variable line:151
285 | true_y_2 ../预测模型/evaluate.py /^ true_y_2 = [1, 1, 0.9, 1.1, 0.1, 1, 1, 0]$/;" kind:variable line:154
286 | update_particle ../优化模型/PSO.py /^ def update_particle(self):$/;" kind:member line:79
287 | vote ../预测模型/decision_tree.py /^ def vote(df, columns_name, value):$/;" kind:member line:38
288 | winsorize ../小工具/data_clean.py /^def winsorize(df, low_q=1, up_q=99):$/;" kind:function line:138
289 | x ../优化模型/simulated_annealing.py /^ x = sa.begin()$/;" kind:variable line:94
290 | y ../预测模型/SVR.py /^y = boston.target$/;" kind:variable line:10
291 | y_test ../预测模型/SVR.py /^y_test = ss_y.transform(y_test)$/;" kind:variable line:26
292 | y_train ../预测模型/SVR.py /^y_train = ss_y.fit_transform(y_train)$/;" kind:variable line:25
293 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Modeling_Preparation
2 | Math-modeling preparation: one algorithm a day, plus assorted small reusable functions
3 |
4 | > /dataset
5 |
6 | - **abalone.txt** abalone dataset, data
7 | - **ablone.names** abalone dataset, variable descriptions
8 | - **auto.csv** the classic 1978 automobile dataset, original version
9 | - **auto_1.csv** automobile dataset, recoded version (categorical columns replaced with numeric codes)
10 | - **auto.mat** automobile dataset, MATLAB format
11 | - **SZIndex.csv** Shanghai Composite Index dataset
12 | - **SZIndex.desc** Shanghai Composite Index dataset description
13 | - **international-airline-passengers.csv** International airline passengers: monthly totals in thousands, Jan 49 – Dec 60 (see the loading sketch below)
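A minimal loading sketch for these files (a sketch, not repository code: pandas is assumed available, paths are assumed relative to the repo root, and abalone.txt is assumed to follow the standard UCI comma-separated layout with no header; the column list is taken from ablone.names):

```python
# -*- coding: utf-8 -*-
import pandas as pd

# auto.csv: 74 cars, 12 columns; rep78 has a few blank cells, read as NaN
auto = pd.read_csv("dataset/auto.csv")
print(auto.shape)
print(auto["rep78"].isnull().sum())  # how many missing repair records

# abalone.txt: no header row; names follow the attribute list in ablone.names
abalone_cols = ["sex", "length", "diameter", "height", "whole_weight",
                "shucked_weight", "viscera_weight", "shell_weight", "rings"]
abalone = pd.read_csv("dataset/abalone.txt", header=None, names=abalone_cols)

# international-airline-passengers.csv: keep only the passenger column and
# skip the trailing footnote lines, exactly as LSTM_predict.py does
passengers = pd.read_csv("dataset/international-airline-passengers.csv",
                         usecols=[1], engine="python", skipfooter=3)
```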
14 |
15 | > /优化模型 (optimization models)
16 |
17 | - **genetic_algorithm.py** genetic algorithm
18 | - **PSO.py** particle swarm optimization
19 | - **simulated_annealing.py** simulated annealing
20 | - **sa_tsp_example.py** simulated annealing applied to the TSP
21 |
22 | > /小工具 (utilities)
23 |
24 | - **due_date_calculate.py** Wind API calls + computation of option/futures expiration dates
25 | - **lasso_regression.m** lasso regression
26 | - **ridge_regression.m** ridge regression, main program
27 | - **ridgeRegression_func1.m** ridge regression, helper function 1
28 | - **trade_account.py**
29 |     - position queue class
30 |     - simulated trading account class, supporting simulated option and futures trades
31 |     - NAV metrics class: takes a net-value series, returns Sharpe ratio, annualized return, etc.
32 | - **二分法期权计算器.cs** uses the Black-Scholes formula and bisection to solve for a chosen option quantity (premium / strike / risk-free rate / expiry / implied volatility / underlying price)
33 | - **Association_rules.py** Apriori algorithm for association rules, including FP-growth for finding frequent itemsets
34 | - **data_clean.py** data preprocessing
35 |
36 | > /评价模型 (evaluation models)
37 |
38 | - **PPE.m** projection pursuit, main program; determines the weights
39 | - **get_Q.m** projection pursuit, objective-function value
40 | - **constraint.m** projection pursuit constraints, for input to the optimization toolbox
41 | - **pso_optimal.m** particle swarm solver for the projection pursuit problem
42 | - **optimal_tools.png** how to use the optimization toolbox: parameter input
43 | - **EntropyWeight.m** entropy weight method for determining weights
44 | - **SOM.py** neural-network clustering (self-organizing map)
45 | - **cluster.py** K-means and hierarchical clustering, including PCA dimensionality reduction
46 |
47 | > /预测模型 (prediction models)
48 |
49 | - **decision_tree.py** decision tree, written from scratch
50 | - **ML_classify_model.py** machine-learning classification models collected in one place, via sklearn
51 | - **neural_network.py** BP neural network, for continuous-value prediction
52 | - **neural_network.m** BP neural network, using the MATLAB neural network toolbox
53 | - **SVR.py** support vector regression, for continuous-value prediction
54 | - **HMM.py** hidden Markov models
55 | - **evaluate.py** evaluation of prediction results
56 | - **GM1_1.m** grey prediction, GM(1,1)
57 | - **LSTM_predict.py** long short-term memory network prediction model
58 | - **PLSR.m** canonical correlation analysis and partial least squares; studies relations between variable sets, especially many-to-many, and makes predictions
59 |
60 | > Some Notes
61 | - hierarchical (two-level) optimization
62 |     - Example 1: two-level fleet scheduling: first, a fleet-usage-minimization model yields the minimal set of flight strings covering each day's flights; second, a maintenance-opportunity-maximization model yields one-week aircraft routes covering all flight strings, followed by simulation
63 |     - Example 2: route planning over all Chinese 5A scenic spots, two levels: first, optimize the route within each province (or within clusters of spots) as a TSP; second, optimize across provinces
64 | - canonical correlation
65 |     - "A study of factors influencing farmers' income based on Granger causality tests and canonical correlation"
66 |     - "A survey of canonical correlation analysis"
67 |     - "A speech emotion recognition model based on kernel canonical correlation analysis and support vector machines"
68 |
69 |
70 | > TODO LIST:
71 |
72 | - Granger causality test, time-series methods (handled in Stata)
73 | - dual-population genetic algorithm (read up on it)
74 | - multi-vehicle routing problem (http://blog.csdn.net/wangqiuyun/article/details/7664995)
75 | - minimum spanning tree (http://blog.csdn.net/heisediwei/article/details/50326847)
76 | - MTSP, multiple traveling salesman problem (plenty of literature)
77 | - system dynamics (read up on it)
78 | - membership functions (read up on it)
79 | - continuous interval ordered weighted averaging (C-OWA) operator (not yet studied)
80 | - interpolation and fitting in MATLAB (curve-fitting toolbox and the interp1 function)
81 | - LINGO (abandoned)
82 | - grey relational analysis for variable screening (mind its pitfalls in variable selection)
83 | - LaTeX: compilation and content fill-in
84 | - spatial plotting: review
85 | - R ggplot2, the various plot types: review
--------------------------------------------------------------------------------
/dataset/SZIndex.desc:
--------------------------------------------------------------------------------
1 | Shanghai Composite Index data, 2010-04-09 through 2017-08-31
2 |
3 | columns:
4 | one-day log-return difference
5 | five-day log-return difference
6 | daily log high-low spread
7 | daily trading volume
8 | log margin-balance difference
9 | trade date
10 | closing price
11 |
12 | The first five columns are standardized with sklearn's preprocessing.scale (i.e. from sklearn.preprocessing import scale)
13 |
14 |
--------------------------------------------------------------------------------
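The normalization step described above, spelled out as a small sketch (assumptions: the CSV carries a header row and the seven columns in the order listed; adjust the slice if not):

```python
import pandas as pd
from sklearn.preprocessing import scale  # the correct import path for preprocessing.scale

sz = pd.read_csv("dataset/SZIndex.csv")
# standardize the first five (feature) columns to zero mean, unit variance;
# the trade-date and closing-price columns are left untouched
sz.iloc[:, 0:5] = scale(sz.iloc[:, 0:5])
```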
/dataset/ablone.names:
--------------------------------------------------------------------------------
1 | 1. Title of Database: Abalone data
2 |
3 | 2. Sources:
4 |
5 |    (a) Original owners of database:
6 |        Marine Resources Division
7 |        Marine Research Laboratories - Taroona
8 |        Department of Primary Industry and Fisheries, Tasmania
9 |        GPO Box 619F, Hobart, Tasmania 7001, Australia
10 |        (contact: Warwick Nash +61 02 277277, wnash@dpi.tas.gov.au)
11 |
12 |    (b) Donor of database:
13 |        Sam Waugh (Sam.Waugh@cs.utas.edu.au)
14 |        Department of Computer Science, University of Tasmania
15 |        GPO Box 252C, Hobart, Tasmania 7001, Australia
16 |
17 |    (c) Date received: December 1995
18 |
19 |
20 | 3. Past Usage:
21 |
22 |    Sam Waugh (1995) "Extending and benchmarking Cascade-Correlation", PhD
23 |    thesis, Computer Science Department, University of Tasmania.
24 |
25 |    -- Test set performance (final 1044 examples, first 3133 used for training):
26 |        24.86% Cascade-Correlation (no hidden nodes)
27 |        26.25% Cascade-Correlation (5 hidden nodes)
28 |        21.5% C4.5
29 |        0.0% Linear Discriminant Analysis
30 |        3.57% k=5 Nearest Neighbour
31 |    (Problem encoded as a classification task)
32 |
33 |    -- Data set samples are highly overlapped. Further information is required
34 |    to separate completely using affine combinations. Other restrictions
35 |    to data set examined.
36 |
37 |    David Clark, Zoltan Schreter, Anthony Adams "A Quantitative Comparison of
38 |    Dystal and Backpropagation", submitted to the Australian Conference on
39 |    Neural Networks (ACNN'96). Data set treated as a 3-category classification
40 |    problem (grouping ring classes 1-8, 9 and 10, and 11 on).
41 |
42 |    -- Test set performance (3133 training, 1044 testing as above):
43 |        64% Backprop
44 |        55% Dystal
45 |    -- Previous work (Waugh, 1995) on same data set:
46 |        61.40% Cascade-Correlation (no hidden nodes)
47 |        65.61% Cascade-Correlation (5 hidden nodes)
48 |        59.2% C4.5
49 |        32.57% Linear Discriminant Analysis
50 |        62.46% k=5 Nearest Neighbour
51 |
52 |
53 | 4. Relevant Information Paragraph:
54 |
55 |    Predicting the age of abalone from physical measurements. The age of
56 |    abalone is determined by cutting the shell through the cone, staining it,
57 |    and counting the number of rings through a microscope -- a boring and
58 |    time-consuming task. Other measurements, which are easier to obtain, are
59 |    used to predict the age. Further information, such as weather patterns
60 |    and location (hence food availability) may be required to solve the problem.
61 |
62 |    From the original data examples with missing values were removed (the
63 |    majority having the predicted value missing), and the ranges of the
64 |    continuous values have been scaled for use with an ANN (by dividing by 200).
65 |
66 |    Data comes from an original (non-machine-learning) study:
67 |
68 |    Warwick J Nash, Tracy L Sellers, Simon R Talbot, Andrew J Cawthorn and
69 |    Wes B Ford (1994) "The Population Biology of Abalone (_Haliotis_
70 |    species) in Tasmania. I. Blacklip Abalone (_H. rubra_) from the North
71 |    Coast and Islands of Bass Strait", Sea Fisheries Division, Technical
72 |    Report No. 48 (ISSN 1034-3288)
73 |
74 |
75 | 5. Number of Instances: 4177
76 |
77 |
78 | 6. Number of Attributes: 8
79 |
80 |
81 | 7. Attribute information:
82 |
83 |    Given is the attribute name, attribute type, the measurement unit and a
84 |    brief description. The number of rings is the value to predict: either
85 |    as a continuous value or as a classification problem.
86 |
87 |    Name Data Type Meas.
Description 88 | ---- --------- ----- ----------- 89 | Sex nominal M, F, and I (infant) 90 | Length continuous mm Longest shell measurement 91 | Diameter continuous mm perpendicular to length 92 | Height continuous mm with meat in shell 93 | Whole weight continuous grams whole abalone 94 | Shucked weight continuous grams weight of meat 95 | Viscera weight continuous grams gut weight (after bleeding) 96 | Shell weight continuous grams after being dried 97 | Rings integer +1.5 gives the age in years 98 | 99 | Statistics for numeric domains: 100 | 101 | Length Diam Height Whole Shucked Viscera Shell Rings 102 | Min 0.075 0.055 0.000 0.002 0.001 0.001 0.002 1 103 | Max 0.815 0.650 1.130 2.826 1.488 0.760 1.005 29 104 | Mean 0.524 0.408 0.140 0.829 0.359 0.181 0.239 9.934 105 | SD 0.120 0.099 0.042 0.490 0.222 0.110 0.139 3.224 106 | Correl 0.557 0.575 0.557 0.540 0.421 0.504 0.628 1.0 107 | 108 | 109 | 8. Missing Attribute Values: None 110 | 111 | 112 | 9. Class Distribution: 113 | 114 | Class Examples 115 | ----- -------- 116 | 1 1 117 | 2 1 118 | 3 15 119 | 4 57 120 | 5 115 121 | 6 259 122 | 7 391 123 | 8 568 124 | 9 689 125 | 10 634 126 | 11 487 127 | 12 267 128 | 13 203 129 | 14 126 130 | 15 103 131 | 16 67 132 | 17 58 133 | 18 42 134 | 19 32 135 | 20 26 136 | 21 14 137 | 22 6 138 | 23 9 139 | 24 2 140 | 25 1 141 | 26 1 142 | 27 2 143 | 29 1 144 | ----- ---- 145 | Total 4177 146 | -------------------------------------------------------------------------------- /dataset/auto.csv: -------------------------------------------------------------------------------- 1 | make,price,mpg,rep78,headroom,trunk,weight,length,turn,displacement,gear_ratio,foreign 2 | "AMC Concord",4099,22,3,2.5,11,2930,186,40,121,3.58,"Domestic" 3 | "AMC Pacer",4749,17,3,3.0,11,3350,173,40,258,2.53,"Domestic" 4 | "AMC Spirit",3799,22,,3.0,12,2640,168,35,121,3.08,"Domestic" 5 | "Buick Century",4816,20,3,4.5,16,3250,196,40,196,2.93,"Domestic" 6 | "Buick Electra",7827,15,4,4.0,20,4080,222,43,350,2.41,"Domestic" 7 | "Buick LeSabre",5788,18,3,4.0,21,3670,218,43,231,2.73,"Domestic" 8 | "Buick Opel",4453,26,,3.0,10,2230,170,34,304,2.87,"Domestic" 9 | "Buick Regal",5189,20,3,2.0,16,3280,200,42,196,2.93,"Domestic" 10 | "Buick Riviera",10372,16,3,3.5,17,3880,207,43,231,2.93,"Domestic" 11 | "Buick Skylark",4082,19,3,3.5,13,3400,200,42,231,3.08,"Domestic" 12 | "Cad. Deville",11385,14,3,4.0,20,4330,221,44,425,2.28,"Domestic" 13 | "Cad. Eldorado",14500,14,2,3.5,16,3900,204,43,350,2.19,"Domestic" 14 | "Cad. Seville",15906,21,3,3.0,13,4290,204,45,350,2.24,"Domestic" 15 | "Chev. Chevette",3299,29,3,2.5,9,2110,163,34,231,2.93,"Domestic" 16 | "Chev. Impala",5705,16,4,4.0,20,3690,212,43,250,2.56,"Domestic" 17 | "Chev. Malibu",4504,22,3,3.5,17,3180,193,31,200,2.73,"Domestic" 18 | "Chev. Monte Carlo",5104,22,2,2.0,16,3220,200,41,200,2.73,"Domestic" 19 | "Chev. Monza",3667,24,2,2.0,7,2750,179,40,151,2.73,"Domestic" 20 | "Chev. Nova",3955,19,3,3.5,13,3430,197,43,250,2.56,"Domestic" 21 | "Dodge Colt",3984,30,5,2.0,8,2120,163,35,98,3.54,"Domestic" 22 | "Dodge Diplomat",4010,18,2,4.0,17,3600,206,46,318,2.47,"Domestic" 23 | "Dodge Magnum",5886,16,2,4.0,17,3600,206,46,318,2.47,"Domestic" 24 | "Dodge St. Regis",6342,17,2,4.5,21,3740,220,46,225,2.94,"Domestic" 25 | "Ford Fiesta",4389,28,4,1.5,9,1800,147,33,98,3.15,"Domestic" 26 | "Ford Mustang",4187,21,3,2.0,10,2650,179,43,140,3.08,"Domestic" 27 | "Linc. Continental",11497,12,3,3.5,22,4840,233,51,400,2.47,"Domestic" 28 | "Linc. Mark V",13594,12,3,2.5,18,4720,230,48,400,2.47,"Domestic" 29 | "Linc. 
Versailles",13466,14,3,3.5,15,3830,201,41,302,2.47,"Domestic" 30 | "Merc. Bobcat",3829,22,4,3.0,9,2580,169,39,140,2.73,"Domestic" 31 | "Merc. Cougar",5379,14,4,3.5,16,4060,221,48,302,2.75,"Domestic" 32 | "Merc. Marquis",6165,15,3,3.5,23,3720,212,44,302,2.26,"Domestic" 33 | "Merc. Monarch",4516,18,3,3.0,15,3370,198,41,250,2.43,"Domestic" 34 | "Merc. XR-7",6303,14,4,3.0,16,4130,217,45,302,2.75,"Domestic" 35 | "Merc. Zephyr",3291,20,3,3.5,17,2830,195,43,140,3.08,"Domestic" 36 | "Olds 98",8814,21,4,4.0,20,4060,220,43,350,2.41,"Domestic" 37 | "Olds Cutl Supr",5172,19,3,2.0,16,3310,198,42,231,2.93,"Domestic" 38 | "Olds Cutlass",4733,19,3,4.5,16,3300,198,42,231,2.93,"Domestic" 39 | "Olds Delta 88",4890,18,4,4.0,20,3690,218,42,231,2.73,"Domestic" 40 | "Olds Omega",4181,19,3,4.5,14,3370,200,43,231,3.08,"Domestic" 41 | "Olds Starfire",4195,24,1,2.0,10,2730,180,40,151,2.73,"Domestic" 42 | "Olds Toronado",10371,16,3,3.5,17,4030,206,43,350,2.41,"Domestic" 43 | "Plym. Arrow",4647,28,3,2.0,11,3260,170,37,156,3.05,"Domestic" 44 | "Plym. Champ",4425,34,5,2.5,11,1800,157,37,86,2.97,"Domestic" 45 | "Plym. Horizon",4482,25,3,4.0,17,2200,165,36,105,3.37,"Domestic" 46 | "Plym. Sapporo",6486,26,,1.5,8,2520,182,38,119,3.54,"Domestic" 47 | "Plym. Volare",4060,18,2,5.0,16,3330,201,44,225,3.23,"Domestic" 48 | "Pont. Catalina",5798,18,4,4.0,20,3700,214,42,231,2.73,"Domestic" 49 | "Pont. Firebird",4934,18,1,1.5,7,3470,198,42,231,3.08,"Domestic" 50 | "Pont. Grand Prix",5222,19,3,2.0,16,3210,201,45,231,2.93,"Domestic" 51 | "Pont. Le Mans",4723,19,3,3.5,17,3200,199,40,231,2.93,"Domestic" 52 | "Pont. Phoenix",4424,19,,3.5,13,3420,203,43,231,3.08,"Domestic" 53 | "Pont. Sunbird",4172,24,2,2.0,7,2690,179,41,151,2.73,"Domestic" 54 | "Audi 5000",9690,17,5,3.0,15,2830,189,37,131,3.20,"Foreign" 55 | "Audi Fox",6295,23,3,2.5,11,2070,174,36,97,3.70,"Foreign" 56 | "BMW 320i",9735,25,4,2.5,12,2650,177,34,121,3.64,"Foreign" 57 | "Datsun 200",6229,23,4,1.5,6,2370,170,35,119,3.89,"Foreign" 58 | "Datsun 210",4589,35,5,2.0,8,2020,165,32,85,3.70,"Foreign" 59 | "Datsun 510",5079,24,4,2.5,8,2280,170,34,119,3.54,"Foreign" 60 | "Datsun 810",8129,21,4,2.5,8,2750,184,38,146,3.55,"Foreign" 61 | "Fiat Strada",4296,21,3,2.5,16,2130,161,36,105,3.37,"Foreign" 62 | "Honda Accord",5799,25,5,3.0,10,2240,172,36,107,3.05,"Foreign" 63 | "Honda Civic",4499,28,4,2.5,5,1760,149,34,91,3.30,"Foreign" 64 | "Mazda GLC",3995,30,4,3.5,11,1980,154,33,86,3.73,"Foreign" 65 | "Peugeot 604",12990,14,,3.5,14,3420,192,38,163,3.58,"Foreign" 66 | "Renault Le Car",3895,26,3,3.0,10,1830,142,34,79,3.72,"Foreign" 67 | "Subaru",3798,35,5,2.5,11,2050,164,36,97,3.81,"Foreign" 68 | "Toyota Celica",5899,18,5,2.5,14,2410,174,36,134,3.06,"Foreign" 69 | "Toyota Corolla",3748,31,5,3.0,9,2200,165,35,97,3.21,"Foreign" 70 | "Toyota Corona",5719,18,5,2.0,11,2670,175,36,134,3.05,"Foreign" 71 | "VW Dasher",7140,23,4,2.5,12,2160,172,36,97,3.74,"Foreign" 72 | "VW Diesel",5397,41,5,3.0,15,2040,155,35,90,3.78,"Foreign" 73 | "VW Rabbit",4697,25,4,3.0,15,1930,155,35,89,3.78,"Foreign" 74 | "VW Scirocco",6850,25,4,2.0,16,1990,156,36,97,3.78,"Foreign" 75 | "Volvo 260",11995,17,5,2.5,14,3170,193,37,163,2.98,"Foreign" 76 | -------------------------------------------------------------------------------- /dataset/auto.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Yangruipis/ModelingPreparation/1663c8153fb97049ae716e2f0dab27d457bcbc9c/dataset/auto.mat -------------------------------------------------------------------------------- 
/dataset/auto_1.csv: -------------------------------------------------------------------------------- 1 | price,mpg,rep78,headroom,trunk,weight,length,turn,displacement,gear_ratio,foreign 2 | 4099,22,3,2.5,11,2930,186,40,121,3.58,1 3 | 4749,17,3,3,11,3350,173,40,258,2.53,1 4 | 3799,22,,3,12,2640,168,35,121,3.08,1 5 | 4816,20,3,4.5,16,3250,196,40,196,2.93,1 6 | 7827,15,4,4,20,4080,222,43,350,2.41,1 7 | 5788,18,3,4,21,3670,218,43,231,2.73,1 8 | 4453,26,,3,10,2230,170,34,304,2.87,1 9 | 5189,20,3,2,16,3280,200,42,196,2.93,1 10 | 10372,16,3,3.5,17,3880,207,43,231,2.93,1 11 | 4082,19,3,3.5,13,3400,200,42,231,3.08,1 12 | 11385,14,3,4,20,4330,221,44,425,2.28,1 13 | 14500,14,2,3.5,16,3900,204,43,350,2.19,1 14 | 15906,21,3,3,13,4290,204,45,350,2.24,1 15 | 3299,29,3,2.5,9,2110,163,34,231,2.93,1 16 | 5705,16,4,4,20,3690,212,43,250,2.56,1 17 | 4504,22,3,3.5,17,3180,193,31,200,2.73,1 18 | 5104,22,2,2,16,3220,200,41,200,2.73,1 19 | 3667,24,2,2,7,2750,179,40,151,2.73,1 20 | 3955,19,3,3.5,13,3430,197,43,250,2.56,1 21 | 3984,30,5,2,8,2120,163,35,98,3.54,1 22 | 4010,18,2,4,17,3600,206,46,318,2.47,1 23 | 5886,16,2,4,17,3600,206,46,318,2.47,1 24 | 6342,17,2,4.5,21,3740,220,46,225,2.94,1 25 | 4389,28,4,1.5,9,1800,147,33,98,3.15,1 26 | 4187,21,3,2,10,2650,179,43,140,3.08,1 27 | 11497,12,3,3.5,22,4840,233,51,400,2.47,1 28 | 13594,12,3,2.5,18,4720,230,48,400,2.47,1 29 | 13466,14,3,3.5,15,3830,201,41,302,2.47,1 30 | 3829,22,4,3,9,2580,169,39,140,2.73,1 31 | 5379,14,4,3.5,16,4060,221,48,302,2.75,1 32 | 6165,15,3,3.5,23,3720,212,44,302,2.26,1 33 | 4516,18,3,3,15,3370,198,41,250,2.43,1 34 | 6303,14,4,3,16,4130,217,45,302,2.75,1 35 | 3291,20,3,3.5,17,2830,195,43,140,3.08,1 36 | 8814,21,4,4,20,4060,220,43,350,2.41,1 37 | 5172,19,3,2,16,3310,198,42,231,2.93,1 38 | 4733,19,3,4.5,16,3300,198,42,231,2.93,1 39 | 4890,18,4,4,20,3690,218,42,231,2.73,1 40 | 4181,19,3,4.5,14,3370,200,43,231,3.08,1 41 | 4195,24,1,2,10,2730,180,40,151,2.73,1 42 | 10371,16,3,3.5,17,4030,206,43,350,2.41,1 43 | 4647,28,3,2,11,3260,170,37,156,3.05,1 44 | 4425,34,5,2.5,11,1800,157,37,86,2.97,1 45 | 4482,25,3,4,17,2200,165,36,105,3.37,1 46 | 6486,26,,1.5,8,2520,182,38,119,3.54,1 47 | 4060,18,2,5,16,3330,201,44,225,3.23,1 48 | 5798,18,4,4,20,3700,214,42,231,2.73,1 49 | 4934,18,1,1.5,7,3470,198,42,231,3.08,1 50 | 5222,19,3,2,16,3210,201,45,231,2.93,1 51 | 4723,19,3,3.5,17,3200,199,40,231,2.93,1 52 | 4424,19,,3.5,13,3420,203,43,231,3.08,1 53 | 4172,24,2,2,7,2690,179,41,151,2.73,1 54 | 9690,17,5,3,15,2830,189,37,131,3.2,0 55 | 6295,23,3,2.5,11,2070,174,36,97,3.7,0 56 | 9735,25,4,2.5,12,2650,177,34,121,3.64,0 57 | 6229,23,4,1.5,6,2370,170,35,119,3.89,0 58 | 4589,35,5,2,8,2020,165,32,85,3.7,0 59 | 5079,24,4,2.5,8,2280,170,34,119,3.54,0 60 | 8129,21,4,2.5,8,2750,184,38,146,3.55,0 61 | 4296,21,3,2.5,16,2130,161,36,105,3.37,0 62 | 5799,25,5,3,10,2240,172,36,107,3.05,0 63 | 4499,28,4,2.5,5,1760,149,34,91,3.3,0 64 | 3995,30,4,3.5,11,1980,154,33,86,3.73,0 65 | 12990,14,,3.5,14,3420,192,38,163,3.58,0 66 | 3895,26,3,3,10,1830,142,34,79,3.72,0 67 | 3798,35,5,2.5,11,2050,164,36,97,3.81,0 68 | 5899,18,5,2.5,14,2410,174,36,134,3.06,0 69 | 3748,31,5,3,9,2200,165,35,97,3.21,0 70 | 5719,18,5,2,11,2670,175,36,134,3.05,0 71 | 7140,23,4,2.5,12,2160,172,36,97,3.74,0 72 | 5397,41,5,3,15,2040,155,35,90,3.78,0 73 | 4697,25,4,3,15,1930,155,35,89,3.78,0 74 | 6850,25,4,2,16,1990,156,36,97,3.78,0 75 | 11995,17,5,2.5,14,3170,193,37,163,2.98,0 76 | -------------------------------------------------------------------------------- /dataset/international-airline-passengers.csv: 
-------------------------------------------------------------------------------- 1 | "Month","International airline passengers: monthly totals in thousands. Jan 49 ? Dec 60" 2 | "1949-01",112 3 | "1949-02",118 4 | "1949-03",132 5 | "1949-04",129 6 | "1949-05",121 7 | "1949-06",135 8 | "1949-07",148 9 | "1949-08",148 10 | "1949-09",136 11 | "1949-10",119 12 | "1949-11",104 13 | "1949-12",118 14 | "1950-01",115 15 | "1950-02",126 16 | "1950-03",141 17 | "1950-04",135 18 | "1950-05",125 19 | "1950-06",149 20 | "1950-07",170 21 | "1950-08",170 22 | "1950-09",158 23 | "1950-10",133 24 | "1950-11",114 25 | "1950-12",140 26 | "1951-01",145 27 | "1951-02",150 28 | "1951-03",178 29 | "1951-04",163 30 | "1951-05",172 31 | "1951-06",178 32 | "1951-07",199 33 | "1951-08",199 34 | "1951-09",184 35 | "1951-10",162 36 | "1951-11",146 37 | "1951-12",166 38 | "1952-01",171 39 | "1952-02",180 40 | "1952-03",193 41 | "1952-04",181 42 | "1952-05",183 43 | "1952-06",218 44 | "1952-07",230 45 | "1952-08",242 46 | "1952-09",209 47 | "1952-10",191 48 | "1952-11",172 49 | "1952-12",194 50 | "1953-01",196 51 | "1953-02",196 52 | "1953-03",236 53 | "1953-04",235 54 | "1953-05",229 55 | "1953-06",243 56 | "1953-07",264 57 | "1953-08",272 58 | "1953-09",237 59 | "1953-10",211 60 | "1953-11",180 61 | "1953-12",201 62 | "1954-01",204 63 | "1954-02",188 64 | "1954-03",235 65 | "1954-04",227 66 | "1954-05",234 67 | "1954-06",264 68 | "1954-07",302 69 | "1954-08",293 70 | "1954-09",259 71 | "1954-10",229 72 | "1954-11",203 73 | "1954-12",229 74 | "1955-01",242 75 | "1955-02",233 76 | "1955-03",267 77 | "1955-04",269 78 | "1955-05",270 79 | "1955-06",315 80 | "1955-07",364 81 | "1955-08",347 82 | "1955-09",312 83 | "1955-10",274 84 | "1955-11",237 85 | "1955-12",278 86 | "1956-01",284 87 | "1956-02",277 88 | "1956-03",317 89 | "1956-04",313 90 | "1956-05",318 91 | "1956-06",374 92 | "1956-07",413 93 | "1956-08",405 94 | "1956-09",355 95 | "1956-10",306 96 | "1956-11",271 97 | "1956-12",306 98 | "1957-01",315 99 | "1957-02",301 100 | "1957-03",356 101 | "1957-04",348 102 | "1957-05",355 103 | "1957-06",422 104 | "1957-07",465 105 | "1957-08",467 106 | "1957-09",404 107 | "1957-10",347 108 | "1957-11",305 109 | "1957-12",336 110 | "1958-01",340 111 | "1958-02",318 112 | "1958-03",362 113 | "1958-04",348 114 | "1958-05",363 115 | "1958-06",435 116 | "1958-07",491 117 | "1958-08",505 118 | "1958-09",404 119 | "1958-10",359 120 | "1958-11",310 121 | "1958-12",337 122 | "1959-01",360 123 | "1959-02",342 124 | "1959-03",406 125 | "1959-04",396 126 | "1959-05",420 127 | "1959-06",472 128 | "1959-07",548 129 | "1959-08",559 130 | "1959-09",463 131 | "1959-10",407 132 | "1959-11",362 133 | "1959-12",405 134 | "1960-01",417 135 | "1960-02",391 136 | "1960-03",419 137 | "1960-04",461 138 | "1960-05",472 139 | "1960-06",535 140 | "1960-07",622 141 | "1960-08",606 142 | "1960-09",508 143 | "1960-10",461 144 | "1960-11",390 145 | "1960-12",432 146 | 147 | International airline passengers: monthly totals in thousands. Jan 49 ? 
Dec 60
148 |
149 |
--------------------------------------------------------------------------------
/plot/cluster_plot.R:
--------------------------------------------------------------------------------
1 | library(devtools)
2 | library(wordVectors)
3 | library(showtext)
4 | library(ggplot2)
5 |
6 |
7 | df <- read.csv("./datafile/word_vector.txt",header = TRUE)
8 |
9 | a <- cosineDist(df[1],df[2])
10 | a <- cov(df,df)
11 | r<- matrix(1-a,nrow=40,dimnames=list(colnames(df),colnames(df)))
12 | hc <- hclust(as.dist(r),method = "complete")
13 |
14 | pdf(file = 'fujia_7.pdf', width = 12, height = 8)
15 | plot(hc)
16 | dev.off()
17 | ## dist
18 | # euclidean: Euclidean distance (square the differences, sum, take the root)
19 | # maximum: Chebyshev distance
20 | # manhattan: absolute-value (city-block) distance
21 | # canberra: Lance (Canberra) distance
22 | # minkowski: Minkowski distance; a p value must be supplied
23 | # binary: distance for qualitative (binary) variables
24 |
25 | ## method
26 | # single: single linkage (minimum distance)
27 | # complete: complete linkage (maximum distance)
28 | # median: median linkage
29 | # mcquitty: McQuitty's method
30 | # average: average linkage
31 | # centroid: centroid linkage
32 | # ward: Ward's minimum-variance method
33 |
34 | install.packages("ape",repos = 'http://mirrors.ustc.edu.cn/CRAN/')
35 | library(ape)
36 | plot(as.phylo(hc), type = "fan")
37 | plot(as.phylo(hc), type = "fan", tip.color = hsv(runif(15, 0.65,
38 | 0.95), 1, 1, 0.7), edge.color = hsv(runif(10, 0.65, 0.75), 1, 1, 0.7), edge.width = runif(20,
39 | 0.5, 3), use.edge.length = TRUE, col = "gray80")
40 |
--------------------------------------------------------------------------------
/plot/datafile/beijing.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Yangruipis/ModelingPreparation/1663c8153fb97049ae716e2f0dab27d457bcbc9c/plot/datafile/beijing.png
--------------------------------------------------------------------------------
/plot/datafile/beijingDots.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Yangruipis/ModelingPreparation/1663c8153fb97049ae716e2f0dab27d457bcbc9c/plot/datafile/beijingDots.png
--------------------------------------------------------------------------------
/plot/datafile/c_dijishi.dta:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Yangruipis/ModelingPreparation/1663c8153fb97049ae716e2f0dab27d457bcbc9c/plot/datafile/c_dijishi.dta
--------------------------------------------------------------------------------
/plot/datafile/c_seven.dta:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Yangruipis/ModelingPreparation/1663c8153fb97049ae716e2f0dab27d457bcbc9c/plot/datafile/c_seven.dta
--------------------------------------------------------------------------------
/plot/datafile/c_sheng1.dta:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Yangruipis/ModelingPreparation/1663c8153fb97049ae716e2f0dab27d457bcbc9c/plot/datafile/c_sheng1.dta
--------------------------------------------------------------------------------
/plot/datafile/d_dijishi.dta:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Yangruipis/ModelingPreparation/1663c8153fb97049ae716e2f0dab27d457bcbc9c/plot/datafile/d_dijishi.dta
--------------------------------------------------------------------------------
/plot/datafile/d_seven.dta:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Yangruipis/ModelingPreparation/1663c8153fb97049ae716e2f0dab27d457bcbc9c/plot/datafile/d_seven.dta
--------------------------------------------------------------------------------
/plot/datafile/d_sheng1.dta:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Yangruipis/ModelingPreparation/1663c8153fb97049ae716e2f0dab27d457bcbc9c/plot/datafile/d_sheng1.dta
--------------------------------------------------------------------------------
/plot/google_map_api.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 |
3 | import Image
4 | import urllib
5 | import numpy as np
6 | from cStringIO import StringIO
7 | import matplotlib.pyplot as plt
8 | from PIL import Image
9 |
10 | def Gmap(centerLat,centerLon,zoomS,pixelS,size,dark,saveAddress):
11 |     # get the map .png of your interesting area
12 |     url = 'http://maps.googleapis.com/maps/api/staticmap?sensor=false'\
13 |         +'&size='+str(size)+'x'+str(size)+'&center='+str(centerLat)+','\
14 |         +str(centerLon)+'&zoom='+str(zoomS)+'&scale='+str(pixelS)\
15 |         +'&maptype=terrain' # use 'satellite' for satellite imagery
16 |     if dark==True:
17 |         url = url+'&style=feature:all|element:all|saturation:-10|lightness:20'
18 |     print url
19 |     # no API key available, so open the printed URL and save the image manually
20 |
21 |     # buffer = StringIO(urllib.urlopen(url).read())
22 |     # image = Image.open(buffer)
23 |     # if saveAddress:
24 |     #     image.save(saveAddress)
25 |     # else:
26 |     #     image.show()
27 |
28 | def latLonToPixelXY(lat,lon,zoomS):
29 |     mapW = 256*2**zoomS+0.0
30 |     mapH = 256*2**zoomS+0.0
31 |     x = (lon+180)*(mapW/360)# get x value
32 |     latRad = lat*np.pi/180# convert from degrees to radians
33 |     mercN = np.log(np.tan((np.pi/4)+(latRad/2)))# get y value
34 |     y = (mapH/2)-(mapW*mercN/(2*np.pi))
35 |     return x,y
36 |
37 | def sample(lis,amount):
38 |     # subsample when there are too many points to plot
39 |     import random
40 |     num_set = set()
41 |     while(len(num_set) < amount):
--------------------------------------------------------------------------------
/优化模型/genetic_algorithm.py:
--------------------------------------------------------------------------------
...
91 |             ... > rand:
92 |                 return i
93 |
94 |     def gene_pop(self):
95 |         """
96 |         eliminate the weakest individual from the population
97 |         :return:
98 |         """
99 |         min_index = self.fit_value.index(min(self.fit_value))
100 |         self.genes.pop(min_index)
101 |         self.fit_value.pop(min_index)
102 |
103 |     def begin(self):
104 |         for i in range(1000):
105 |             index1 = self.choose_gene(random.random())
106 |             index2 = self.choose_gene(random.random())
107 |             while index1 == index2:
108 |                 index2 = self.choose_gene(random.random())
109 |
110 |             if random.random() < self.mutation_prob:
111 |                 self.genes[index1].mutation()
112 |                 self.genes[index2].mutation()
113 |
114 |             if random.random() < self.cross_prob:
115 |                 Gene.cross(self.genes[index1], self.genes[index2])
116 |
117 |             self.get_fit_value()
118 |             # self.gene_pop()
119 |
120 |         result = self.get_best_gene()
121 |         print len(self.genes), result[0].bin2dec(), result[1]
122 |
123 |
124 |
125 | if __name__ == '__main__':
126 |     # gene1 = Gene()
127 |     # print gene1.bin_value
128 |     # print gene1.bin2dec()
129 |     GA = GeneticAlgorithm()
130 |     GA.begin()
--------------------------------------------------------------------------------
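choose_gene(rand) above implements fitness-proportional (roulette-wheel) selection, but only its tail is visible here. The following standalone sketch illustrates the idea and is not the repository's code; the helper name roulette_select is hypothetical:

```python
import random

def roulette_select(fit_values, rand=None):
    # pick an index with probability proportional to its fitness value
    if rand is None:
        rand = random.random()
    total = float(sum(fit_values))
    accumulated = 0.0
    for i, fit in enumerate(fit_values):
        accumulated += fit / total
        if accumulated > rand:
            return i
    return len(fit_values) - 1  # guard against floating-point round-off

print(roulette_select([1.0, 3.0, 6.0]))  # index 2 is chosen ~60% of the time
```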
self.total_distance = 0 21 | 22 | @staticmethod 23 | def get_distance(city1, city2): 24 | return np.sqrt((city1.X - city2.X) ** 2 + (city1.Y - city2.Y) ** 2) 25 | 26 | def add_city(self, city): 27 | if isinstance(city, City): 28 | self.city_list.append(city) 29 | elif isinstance(city, list): 30 | self.city_list += city 31 | else: 32 | print 'Add City Wrong' 33 | 34 | def reset_distance(self): 35 | self.total_distance = 0 36 | 37 | def get_total_distance(self, sequence = None): 38 | if self.city_list == [] or len(self.city_list) < 2: 39 | print "请添加城市!" 40 | else: 41 | distance = 0 42 | if sequence == None: 43 | for i,city in enumerate(self.city_list[:-1]): 44 | distance += self.get_distance(city, self.city_list[i+1]) 45 | 46 | distance += self.get_distance(self.city_list[0], self.city_list[-1]) 47 | 48 | elif sorted(sequence) == range(len(self.city_list)): 49 | self.reset_distance() 50 | for i,j in enumerate(sequence[:-1]): 51 | distance += self.get_distance(self.city_list[j], self.city_list[sequence[i+1]]) 52 | 53 | distance += self.get_distance(self.city_list[sequence[0]], self.city_list[sequence[-1]]) 54 | 55 | else: 56 | print 'Wrong Sequence' 57 | return distance 58 | 59 | 60 | def gen_new_sequence(sequence): 61 | sequence1 = copy.copy(sequence) 62 | swap_number1, swap_number2 = random.sample(sequence1, 2) 63 | sequence1[swap_number1], sequence1[swap_number2] = sequence1[swap_number2], sequence1[swap_number1] 64 | return copy.copy(sequence1) 65 | 66 | #@exeTime 67 | def get_shortest_distance(graph): 68 | T0 = 1000 69 | T_min = 1e-5 70 | delta = 0.9 71 | K = 10 72 | sequence = range(len(graph.city_list)) 73 | distance = graph.get_total_distance(sequence) 74 | distance_list = [] 75 | T = T0 76 | while T > T_min: 77 | for i in range(K): 78 | distance_list.append(distance) 79 | new_sequence = gen_new_sequence(sequence) 80 | 81 | new_distance = graph.get_total_distance(new_sequence) 82 | 83 | delta_E = new_distance - distance 84 | if delta_E < 0: 85 | distance = new_distance 86 | sequence = new_sequence 87 | break 88 | else: 89 | p_k = np.exp(- delta_E / T) 90 | if random.random() < p_k: 91 | distance = new_distance 92 | sequence = new_sequence 93 | break 94 | T *= delta 95 | return sequence, distance, distance_list 96 | 97 | 98 | 99 | if __name__ == '__main__': 100 | 101 | city_a = City(0, 0) 102 | city_b = City(0, 1) 103 | city_c = City(1, 0) 104 | city_d = City(1, 1) 105 | 106 | city_list = [ 107 | City(0, 0), 108 | City(1, 0), 109 | City(2, 0), 110 | City(3, 0), 111 | City(4, 0), 112 | City(5, 2), 113 | City(0, 3), 114 | City(0, 4), 115 | City(0, 5), 116 | City(0, 6), 117 | City(1, 2), 118 | City(4, 3), 119 | City(50, 6), 120 | City(2, 3), 121 | City(1, 4), 122 | City(3, 16), 123 | City(3, 12), 124 | City(1, 12), 125 | City(12, 21), 126 | City(7, 8), 127 | City(5, 0), 128 | City(1, 9), 129 | City(2, 7), 130 | City(3, 7), 131 | City(10, 11), 132 | City(11, 1), 133 | City(17, 3), 134 | City(15, 3), 135 | City(22, 16), 136 | City(15, 1), 137 | City(8, 5), 138 | City(3, 1), 139 | City(2, 9), 140 | City(1, 9), 141 | City(9, 3), 142 | City(14, 1), 143 | City(12, 12), 144 | ] 145 | # 中国31省数据,最优值为15500以下 146 | chinese_province_list = [ 147 | City(1304,2312), 148 | City(3639,1315), 149 | City(4177,2244), 150 | City(3712,1399), 151 | City(3488,1535), 152 | City(3326,1556), 153 | City(3238,1229), 154 | City(4196,1004), 155 | City(4312,790), 156 | City(4386,570), 157 | City(3007,1970), 158 | City(2562,1756), 159 | City(2788,1491), 160 | City(2381,1676), 161 | City(1332,695), 162 | City(3715,1678), 
163 | City(3918,2179), 164 | City(4061,2370), 165 | City(3780,2212), 166 | City(3676,2578), 167 | City(4029,2838), 168 | City(4263,2931), 169 | City(3429,1908), 170 | City(3507,2367), 171 | City(3394,2643), 172 | City(3439,3201), 173 | City(2935,3240), 174 | City(3140,3550), 175 | City(2545,2357), 176 | City(2778,2826), 177 | City(2370,2975) 178 | ] 179 | 180 | graph = Graph() 181 | #graph.add_city([city_a, city_b, city_c, city_d]) 182 | graph.add_city(chinese_province_list) 183 | result = get_shortest_distance(graph) 184 | plt.plot(result[2]) 185 | plt.show() 186 | 187 | 188 | 189 | 190 | # result_list = [] 191 | # for i in range(100): 192 | # result = get_shortest_distance(graph) 193 | # result_list.append(result[1]) 194 | # print result[1] 195 | # result_list.sort() 196 | # print result_list 197 | 198 | -------------------------------------------------------------------------------- /优化模型/simulated_annealing.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf8 -*- 2 | import numpy as np 3 | import time 4 | import random 5 | 6 | """ 7 | 伪代码: 8 | ================================================= 9 | 随机生成初始解 x,对应目标函数为f(x) 10 | 开始温度 T0 1e10 11 | 停止搜索温度 T_min 1e-8 12 | 温度下降速度 delta 0.9 13 | 每次迭代次数 K 100 14 | 15 | T = T0 16 | while T > T_min: 17 | for i in range(K): 18 | x' = gen(x) 19 | if f(x') < f(x): 20 | x = x' 21 | else: 22 | delta_E = f(x') - f(x) 23 | P_k = e^{- delta_E / T} 24 | rand = random.random() 25 | if rand < P_k: 26 | x = x' 27 | else: 28 | pass 29 | T *= delta 30 | 31 | ================================================= 32 | """ 33 | 34 | 35 | def exeTime(func): 36 | def newFunc(*args, **args2): 37 | t0 = time.time() 38 | print "%s, {%s} start" % (time.strftime("%X", time.localtime()), func.__name__) 39 | print '------------------- begin ------------------------' 40 | back = func(*args, **args2) 41 | print '-------------------- end -------------------------' 42 | print "%s, {%s} end" % (time.strftime("%X", time.localtime()), func.__name__) 43 | print "%.8fs taken for {%s}" % (time.time() - t0, func.__name__) 44 | return back 45 | 46 | return newFunc 47 | 48 | 49 | class SimulatedAnnealing: 50 | 51 | def __init__(self, func): # NOTE: func is currently ignored; the objective below is hard-coded 52 | self.T0 = 1000 53 | self.T_min = 1e-8 54 | self.delta = 0.99 55 | self.K = 10000 56 | self.x_range = (0, 100) 57 | self.func = lambda x:(x-20)**2 if x <= 50 else (x-80)**2 +30 #-(1.0 * x**4 - x**3 + x**2 - x) 58 | 59 | def gen_new_x(self, x_before, T): 60 | while 1: 61 | x_after = x_before + (random.random() * 2 - 1) * T 62 | if self.x_range[0] <= x_after <= self.x_range[1]: 63 | return x_after 64 | 65 | @exeTime 66 | def begin(self): 67 | x = random.randint(self.x_range[0], self.x_range[1]) 68 | f = self.func(x) 69 | T = self.T0 70 | while T > self.T_min: 71 | for i in range(self.K): 72 | new_x = self.gen_new_x(x, T) 73 | f_x = self.func(new_x) 74 | delta_E = f_x - f 75 | # Metropolis acceptance criterion 76 | if delta_E < 0: 77 | f = f_x 78 | x = new_x 79 | break 80 | else: 81 | #p_k = 1.0 / (1 + np.exp(delta_E / T)) # logistic (Glauber) acceptance, an alternative 82 | p_k = np.exp(- delta_E / T) 83 | if random.random() < p_k: 84 | f = f_x 85 | x = new_x 86 | break 87 | T *= self.delta 88 | 89 | return x 90 | 91 | 92 | if __name__ == '__main__': 93 | sa = SimulatedAnnealing('') 94 | x = sa.begin() 95 | print x, sa.func(x) 96 | -------------------------------------------------------------------------------- /优化模型/simulated_annealing.pyc: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/Yangruipis/ModelingPreparation/1663c8153fb97049ae716e2f0dab27d457bcbc9c/优化模型/simulated_annealing.pyc -------------------------------------------------------------------------------- /小工具/Association_rules.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | """ 4 | 通过树状结构,更高效的挖掘频繁项集 5 | 6 | git: 7 | http://github.com/enaeseth/python-fp-growth/ 8 | """ 9 | 10 | from fp_growth import find_frequent_itemsets 11 | from apriori import * 12 | 13 | test_case = [ 14 | ['a','b'], 15 | ['b','c','d'], 16 | ['a','b','d','e'], 17 | ['a','d','e'], 18 | ['a','b','c'], 19 | ['a','b','c','d'], 20 | ['a'], 21 | ['a','b','c'], 22 | ['a','b','d'], 23 | ['b','c','e'], 24 | ] 25 | 26 | # ================ Approach 1: how to get a faster frequent items ==================== 27 | for item, support in find_frequent_itemsets(test_case, 2, True): 28 | print item, support 29 | 30 | 31 | # ================ Approach 2: gen a min support rate ================= 32 | result_list = [] 33 | for i in my_apriori(test_case): 34 | temp = '-'.join([k for k in i[0]]) + ',' + '-'.join([k for k in i[1]]) + ',' 35 | result_list.append((temp, i[2])) 36 | 37 | result_list.sort(key = lambda x: x[1], reverse= False) 38 | -------------------------------------------------------------------------------- /小工具/data_clean.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | """ 4 | 数据清洗,包括 5 | - 缺失值补全 6 | - 均值、中位数补全 Done 7 | - 插值补全 Done 8 | - 异常值处理: winsor处理 9 | - 归一化 Done 10 | - 标准化 Done 11 | - 二值化 No Need 12 | - 分类变量编码 13 | - 有序 No Need 14 | - 无序 Done 15 | - 正则化检验(针对纯文本) 16 | - 去重 17 | - 去无效值 No Need 18 | - 关联性验证 No Need 19 | 20 | """ 21 | 22 | import sklearn.preprocessing as sp 23 | import pandas as pd 24 | import numpy as np 25 | import matplotlib.pyplot as plt 26 | 27 | def fill_na(df, excep_columns=[], how='mean'): 28 | """ 29 | 补全缺失值 30 | :param how: 31 | = 'mean' 32 | = 'median' 33 | = 'most_frequent' 34 | """ 35 | select_columns = [i for i in df.columns if i not in excep_columns] 36 | df_temp = df.loc[:, select_columns] 37 | 38 | imp = sp.Imputer(missing_values='NaN', strategy=how, axis=0) 39 | imp.fit(df_temp) 40 | result = imp.transform(df_temp) 41 | for i in range(result.shape[1]): 42 | df[select_columns[i]] = result[:, i] 43 | 44 | return df 45 | 46 | def interpolate_na(df, excep_columns=[], how='lagrange'): 47 | """ 48 | 49 | :param df: 50 | :param how: 51 | lagrange 拉格朗日插值 52 | spline 样条插值 53 | :return: 54 | """ 55 | select_columns = [i for i in df.columns if i not in excep_columns] 56 | 57 | if how == 'lagrange': 58 | from scipy.interpolate import lagrange 59 | def ployinterp_column(s, n, k=5): 60 | set1 = set(range(len(s))) 61 | set2 = set(list(range(n - k, n)) + list(range(n + 1, n + 1 + k))) 62 | x = list(set1 & set2) 63 | y = s[x] # 取数 64 | x = np.array(x)[pd.notnull(y)] 65 | y = y[pd.notnull(y)] # 剔除空值 66 | lagrange_result =lagrange(x, y) 67 | return lagrange_result(n) # 插值并返回插值结果 68 | for column in select_columns: 69 | ds = df.loc[:,column].values 70 | if isinstance(ds[0], int) or isinstance(ds[0], float): 71 | for j in range(len(ds)): 72 | if pd.isnull(ds[j]): 73 | ds[j] = ployinterp_column(ds,j) 74 | df[column] = ds 75 | return df 76 | elif how == 'spline': 77 | from scipy.interpolate import spline 78 | for column in select_columns: 79 | ds = df.loc[:,column].values 80 | if isinstance(ds[0], int) or isinstance(ds[0], float): 81 | target_index= 
np.arange(len(ds)) 82 | index = target_index[pd.notnull(ds)] 83 | ds_notnull = ds[pd.notnull(ds)] 84 | new_ds = spline(index, ds_notnull, target_index) 85 | df[column] = new_ds 86 | return df 87 | 88 | def standardize(df, excep_columns=[]): 89 | """ 90 | 标准化,假设服从正态分布 91 | """ 92 | select_columns = [i for i in df.columns if i not in excep_columns] 93 | df_temp = df.loc[:, select_columns] 94 | scaler = sp.StandardScaler().fit(df_temp) 95 | result = scaler.transform(df_temp) 96 | for i in range(result.shape[1]): 97 | df[select_columns[i]] = result[:, i] 98 | return df 99 | 100 | def normalize(df, excep_columns=[]): 101 | """ 102 | 极值归一化,根据最大最小值使其在[0,1]之间 103 | """ 104 | select_columns = [i for i in df.columns if i not in excep_columns] 105 | df_temp = df.loc[:, select_columns] 106 | min_max_scaler = sp.MinMaxScaler() 107 | min_max_scaler.fit_transform(df_temp) 108 | result = min_max_scaler.transform(df_temp) 109 | for i in range(result.shape[1]): 110 | df[select_columns[i]] = result[:, i] 111 | return df 112 | 113 | 114 | def label_encode(df, encode_column=[]): 115 | """ 116 | 将分类标签进行编码,注意:只针对无序标签 117 | :param df: 数据框 118 | :param encode_column: 列名列表 119 | :return: 数据框 120 | """ 121 | le = sp.LabelEncoder() 122 | for column in encode_column: 123 | # 非数值型转化为数值型 124 | ds = df.loc[:, column].values 125 | le.fit(ds) 126 | df[column] = le.transform(ds) # array([2, 2, 1]) 127 | return df 128 | 129 | def drop_duplicate(df, columns=[]): 130 | return df.drop_duplicates(subset=columns) 131 | 132 | 133 | def replace_outlier(df): 134 | # 有问题,未调试,用winsor 135 | result = sp.robust_scale(df, with_scaling=False, with_centering=False) 136 | return pd.DataFrame(result) 137 | 138 | def winsorize(df, low_q=1, up_q=99): 139 | temp_df = df.copy() 140 | for column in temp_df.columns: 141 | ds = temp_df[column].values 142 | if isinstance(ds[0], int) or isinstance(ds[0], float): 143 | lower_bound = np.percentile(ds, low_q) 144 | upper_bound = np.percentile(ds, up_q) 145 | ds = map(lambda x: lower_bound if x < lower_bound else upper_bound if x > upper_bound else x, ds) 146 | temp_df[column] = ds 147 | return temp_df 148 | 149 | 150 | if __name__ == '__main__': 151 | df = pd.read_csv("/home/ray/Documents/suibe/2017/建模/Modeling_Preparation/dataset/auto.csv") 152 | df_columns = df.columns 153 | 154 | # 分类变量编码 155 | df = label_encode(df, ['make', 'foreign']) 156 | 157 | # 由于该数据非时序数据,因此无法线性插值,我们用样本均值填补 158 | # 样条插值补全缺失值 159 | # df = interpolate_na(df, ['rep78'], how='spline') 160 | # 均值补全缺失值 161 | df = fill_na(df) 162 | 163 | # 标准化 164 | df = standardize(df, ['make','foreign']) # 这两列是分类变量,不需要标准化 165 | 166 | # 归一化 167 | # df = normalize(df, ['make','foreign']) # 这两列是分类变量,不需要归一化 168 | 169 | # 去重 170 | # df = drop_duplicate(df, ['foreign', 'rep78']) 171 | 172 | temp = df['price'].values 173 | temp[0] = 5 174 | df['price'] =temp 175 | 176 | # 异常值 177 | df2 = winsorize(df,1,99) 178 | 179 | ax = plt.subplot(111) 180 | ax.scatter(df.index, df.price.values, color='r', label='1') 181 | ax.plot(df2.index, df2.price, color='b', label='2') 182 | ax.legend(['1','2']) 183 | plt.show() 184 | 185 | -------------------------------------------------------------------------------- /小工具/due_date_calculate.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from WindPy import * 3 | import matplotlib.pyplot as plt 4 | import datetime 5 | import numpy as np 6 | 7 | 8 | def Wind2Df(wind_data): 9 | df = pd.DataFrame(wind_data.Data).T 10 | df.columns = wind_data.Fields 11 | df.index = 
wind_data.Times 12 | return df 13 | 14 | def is_due_date(date): 15 | if 15 <= date.day <= 21: 16 | if date.weekday() == 4: 17 | return True 18 | return False 19 | 20 | def gen_due_date(year, month): 21 | date0 = datetime.date(year,month, 1) 22 | for i in range(31): 23 | date0 = date0 + datetime.timedelta(1) 24 | if is_due_date(date0): 25 | return date0 26 | return None 27 | 28 | def get_due_date(date): 29 | due_date_this_month = gen_due_date(date.year, date.month) 30 | if date.month != 12: 31 | due_date_next_month = gen_due_date(date.year, date.month + 1) 32 | else: 33 | due_date_next_month = gen_due_date(date.year + 1, 1) # December rolls over to January of the NEXT year 34 | if date > due_date_this_month: 35 | return due_date_next_month 36 | else: 37 | return due_date_this_month 38 | 39 | w.start() 40 | 41 | df_if00 = Wind2Df(w.wsi("IF00.CFE", "close, volume", "2016-02-01 09:30:00", "2017-08-16 13:48:43", "periodstart=09:30:00;periodend=15:00:00")) 42 | df_if01 = Wind2Df(w.wsi("IF01.CFE", "close, volume", "2016-02-01 09:30:00", "2017-08-16 13:48:43", "periodstart=09:30:00;periodend=15:00:00")) 43 | df_if00.columns = ['close0', 'volume0'] 44 | df_if01.columns = ['close1', 'volume1'] 45 | 46 | df_all = pd.merge(df_if00, df_if01, left_index=True, right_index=True) 47 | 48 | df_all['diff'] = df_all.close0 - df_all.close1 49 | df_all['date'] = map(lambda x: x.date(), df_all.index) 50 | df_all['due_time'] = map(lambda x: get_due_date(x.date()), df_all.index) 51 | df_all['t'] = map(lambda x,y: (y - x.date() ).days, df_all.index, df_all.due_time) 52 | 53 | 54 | df_all1 = df_all.copy() 55 | df_all1.index = range(df_all1.shape[0]) 56 | df_all1['0day_diff'] = map(lambda y,x: y if x == 0 else np.nan, df_all1['diff'], df_all1['t']) 57 | df_all1['1day_diff'] = map(lambda y,x: y if x == 1 else np.nan, df_all1['diff'], df_all1['t']) 58 | df_all1['2day_diff'] = map(lambda y,x: y if x == 2 else np.nan, df_all1['diff'], df_all1['t']) 59 | 60 | fig = plt.figure() 61 | ax1 = fig.add_subplot(211) 62 | ax2 = fig.add_subplot(212) 63 | df_all1[['volume0', 'volume1']].plot(ax = ax1) 64 | df_all1[['diff','0day_diff','1day_diff','2day_diff' ]].plot(ax = ax2) 65 | plt.show() 66 | 67 | 68 | 69 | 70 | -------------------------------------------------------------------------------- /小工具/lasso_regression.m: -------------------------------------------------------------------------------- 1 | % Lasso 回归 2 | % 3 | % # lasso (L1正则项最小二乘) 4 | % lasso用于变量筛选 5 | % 6 | % http://blog.csdn.net/sinat_26917383/article/details/52092040 # 正则项,lasso和ridge区别 7 | % matlab实现: http://cn.mathworks.com/help/stats/lasso.html 8 | % 9 | 10 | % lasso 11 | data = csvread('auto_1.csv',1,0); 12 | X = data(:,1:10); 13 | Y = data(:, 11); 14 | weight = lasso(X,Y); 15 | 16 | % plot 17 | hold on 18 | axis([0 100 -0.6 0.1]); 19 | xlabel log(lam); 20 | ylabel weights; 21 | y = zeros(1,100); 22 | for i = 1:10 23 | x = 0:99; 24 | y(1,:) = weight(i,:); 25 | plot(x,y); 26 | %legend(int2str( i)); 27 | end 28 | legend('1','2','3','4','5','6','7','8','9','10'); 29 | hold off 30 | 31 | % 结果分析: 32 | % 默认输出lambda=0:99时对应的参数,返回一个100行n列(n为自变量数)的矩阵 33 | % lasso结果用以筛选变量,无关变量的系数会趋近于0,相关变量则不会 34 | 35 | % 含参数的lasso 36 | % 1.
lasso & ridge 37 | % 采用ridge回归, lasso占比1%, 输出lambda = 0:19 38 | lasso(data(:,1:10), data(:,11), 'Alpha', 0.01, 'NumLambda', 20) 39 | 40 | -------------------------------------------------------------------------------- /小工具/ridgeRegression_func1.m: -------------------------------------------------------------------------------- 1 | function [ w ] = ridgeRegression_func1( x, y, lam ) 2 | xTx = x'*x; 3 | [m,n] = size(xTx); 4 | temp = xTx + eye(m,n)*lam; 5 | if det(temp) == 0 6 | disp('This matrix is singular, cannot do inverse'); 7 | end 8 | w = temp^(-1)*x'*y; 9 | end -------------------------------------------------------------------------------- /小工具/ridge_regression.m: -------------------------------------------------------------------------------- 1 | %% 岭回归(Ridge Regression) 2 | 3 | % # 岭回归 (L2正则项最小二乘) 4 | % - 有偏估计,但是在保证RSS足够小的情况下,使得参数更稳定 5 | % - 在原先的最小二乘估计中加入扰动项(二阶正则项),使问题稳定有解 6 | % - 岭回归针对样本没有办法提供给你足够的有效的信息的情况,此时OLS唯一存在的条件不满足, 7 | % 以损失部分信息、降低精度为代价获得回归系数更为符合实际、更可靠的回归方法,对病态数据的拟合要强于OLS 8 | % 9 | % http://blog.csdn.net/google19890102/article/details/27228279 10 | % http://f.dataguru.cn/thread-598486-1-1.html 11 | 12 | %导入数据 13 | data = csvread('auto_1.csv', 1,0); 14 | [m,n] = size(data); 15 | 16 | dataX = data(:,1:10);%特征 17 | dataY = data(:,11);%标签 18 | 19 | %标准化 20 | yMeans = mean(dataY); 21 | for i = 1:m 22 | yMat(i,:) = dataY(i,:)-yMeans; 23 | end 24 | 25 | xMeans = mean(dataX); 26 | xVars = var(dataX); 27 | for i = 1:m 28 | xMat(i,:) = (dataX(i,:) - xMeans)./xVars; 29 | end 30 | 31 | % 运算30次 32 | testNum = 30; 33 | weights = zeros(testNum, n-1); 34 | for i = 1:testNum 35 | w = ridgeRegression_func1(xMat, yMat, exp(i-10)); 36 | weights(i,:) = w'; 37 | end 38 | 39 | % 画出随着参数lam 岭迹图 40 | % λ的选择:一般通过观察,选取喇叭口附近的值,此时各β值已趋于稳定,但总的RSS又不是很大。 41 | % 选择变量:删除那些β取值一直趋于0的变量。 42 | hold on 43 | axis([-9 20 -1.0 2.5]); 44 | xlabel log(lam); 45 | ylabel weights; 46 | for i = 1:n-1 47 | x = -9:20; 48 | y(1,:) = weights(:,i)'; 49 | plot(x,y); 50 | end 51 | 52 | % 怎么看结果: 53 | % 每一行对应一个lambda值,以及该lambda值下每个自变量的参数beta_i(lambda) -------------------------------------------------------------------------------- /小工具/trade_account.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | from WindPy import * 3 | import pandas as pd 4 | import numpy as np 5 | from collections import defaultdict 6 | import pickle 7 | 8 | INSTRUMENT_OPTION = 0 9 | INSTRUMENT_FUTURE = 1 10 | 11 | DIRECTION_BUY = 0 12 | DIRECTION_SELL = 1 13 | 14 | OPENTYPE_OPEN = 0 15 | OPENTYPE_CLOSE = 1 16 | 17 | future_parameter = { 18 | 'interest_rate' : 0.0000231, 19 | 'deposit_rate' : 0.2, 20 | 'point_value' : 300 21 | } 22 | 23 | option_parameter = { 24 | 'interest' : 2.5 25 | } 26 | 27 | class PositionQueue: 28 | 29 | def __init__(self): 30 | self.buy_queue_dict = defaultdict(list) 31 | self.sell_queue_dict = defaultdict(list) 32 | self.positions = defaultdict(int) 33 | 34 | def add(self, instrument, direction, price): 35 | self.positions[(instrument, direction)] += 1 36 | 37 | if direction == DIRECTION_BUY: 38 | self.buy_queue_dict[instrument].append(price) 39 | else: 40 | self.sell_queue_dict[instrument].append(price) 41 | 42 | def pop(self, instrument, direction): 43 | self.positions[(instrument, direction)] -= 1 44 | if direction == DIRECTION_SELL: 45 | self.buy_queue_dict[instrument].pop(0) 46 | else: 47 | self.sell_queue_dict[instrument].pop(0) 48 | 49 | def have_position(self, instrument, direction): 50 | if direction == DIRECTION_SELL: 51 | if instrument in 
self.buy_queue_dict: 52 | return True 53 | else: 54 | return False 55 | else: 56 | if instrument in self.sell_queue_dict: 57 | return True 58 | else: 59 | return False 60 | 61 | def display(self): 62 | pass 63 | 64 | 65 | class TradeAccount: 66 | 67 | def __init__(self, init_capital): 68 | self.capital = init_capital 69 | self.init_capital = init_capital 70 | self.deposit = 0 71 | self.cash = init_capital 72 | self.position = PositionQueue() 73 | 74 | def order_future(self, price, instrument, direction, open_type, amount=1): 75 | if open_type == OPENTYPE_OPEN: 76 | deposit = amount * price * future_parameter['point_value'] * future_parameter['deposit_rate'] 77 | interest = amount * price * future_parameter['point_value'] * future_parameter['interest_rate'] 78 | self.deposit += deposit 79 | self.cash -= deposit + interest 80 | self.capital -= interest 81 | for i in range(amount): 82 | self.position.add(instrument, direction, price) 83 | 84 | elif open_type == OPENTYPE_CLOSE: 85 | if direction == DIRECTION_BUY: 86 | interest = price * future_parameter['point_value'] * future_parameter['interest_rate'] 87 | for i in range(amount): 88 | to_be_offset_list = self.position.sell_queue_dict[instrument] 89 | if to_be_offset_list == []: 90 | print '期货空头合约仓位不足, 剩余 %s 个请求无法平仓' % (amount-i) 91 | break 92 | else: 93 | deposit = to_be_offset_list[0] * future_parameter['point_value'] * future_parameter['deposit_rate'] 94 | earn = (to_be_offset_list[0] - price) * future_parameter['point_value'] 95 | self.deposit -= deposit 96 | self.cash += deposit + earn - interest 97 | self.capital += earn - interest 98 | self.position.pop(instrument, direction) 99 | 100 | elif direction == DIRECTION_SELL: 101 | interest = price * future_parameter['point_value'] * future_parameter['interest_rate'] 102 | for i in range(amount): 103 | to_be_offset_list = self.position.buy_queue_dict[instrument] 104 | if to_be_offset_list == []: 105 | print '期货多头合约仓位不足, 剩余 %s 个请求无法平仓' % (amount - i) 106 | break 107 | else: 108 | deposit = to_be_offset_list[0] * future_parameter['point_value'] * future_parameter[ 109 | 'deposit_rate'] 110 | earn = (price - to_be_offset_list[0]) * future_parameter['point_value'] 111 | self.deposit -= deposit 112 | self.cash += deposit + earn - interest 113 | self.capital += earn - interest 114 | self.position.pop(instrument, direction) 115 | else: 116 | raise Exception('No Such Open Type') 117 | 118 | def order_option(self, price, instrument, direction, open_type, amount): 119 | # 期权卖开无手续费 120 | if open_type == OPENTYPE_OPEN: 121 | if direction == DIRECTION_BUY: 122 | interest = option_parameter['interest'] * amount 123 | cost = price * 10000 * amount 124 | self.cash += - cost - interest 125 | self.capital += - interest 126 | for i in range(amount): 127 | self.position.buy_queue_dict[instrument].append(price) 128 | 129 | elif direction == DIRECTION_SELL: 130 | interest = 0 131 | get = price * 10000 * amount 132 | self.cash += get - interest 133 | self.capital += - interest 134 | for i in range(amount): 135 | self.position.sell_queue_dict[instrument].append(price) 136 | 137 | elif open_type == OPENTYPE_CLOSE: 138 | interest = option_parameter['interest'] 139 | if direction == DIRECTION_BUY: 140 | for i in range(amount): 141 | to_be_offset_list = self.position.sell_queue_dict[instrument] 142 | if to_be_offset_list == []: 143 | print '期权空头合约仓位不足, 剩余 %s 个请求无法平仓' % (amount - i) 144 | break 145 | else: 146 | earn = (to_be_offset_list[0] - price) * 10000 147 | self.cash += - price * 10000 - interest 148 | 
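# buy-to-close of a short option position: pay the current premium (price * 10000)
# plus the fee; the realized P&L (entry premium minus exit premium) is booked into
# capital below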
self.capital += earn - interest 149 | self.position.pop(instrument, direction) 150 | 151 | elif direction == DIRECTION_SELL: 152 | for i in range(amount): 153 | to_be_offset_list = self.position.buy_queue_dict[instrument] 154 | if to_be_offset_list == []: 155 | print '期权多头合约仓位不足, 剩余 %s 个请求无法平仓' % (amount - i) 156 | break 157 | else: 158 | earn = (price - to_be_offset_list[0]) * 10000 159 | self.cash += price * 10000 - interest 160 | self.capital += earn - interest 161 | self.position.pop(instrument, direction) 162 | 163 | def end_trade(self): 164 | with open('account_record', 'wb') as f: 165 | pickle.dump(self, f) 166 | 167 | 168 | class ValueCalculate(): 169 | 170 | def __init__(self, capital_list, init_capital): 171 | self._init_capital = init_capital 172 | self.capital_list = capital_list 173 | self.return_list = [] 174 | self.profit_list = [] 175 | self.get_return_list() 176 | 177 | def get_return_list(self): 178 | for i, capital in enumerate(self.capital_list): 179 | if i == 0: 180 | self.return_list.append((capital - self._init_capital) / self._init_capital) 181 | self.profit_list.append(capital - self._init_capital) 182 | else: 183 | self.return_list.append((capital - self.capital_list[i-1]) / self.capital_list[i-1]) 184 | self.profit_list.append(capital - self.capital_list[i-1]) 185 | 186 | 187 | def get_total_return(self): 188 | return (self.capital_list[-1] - self._init_capital) / self._init_capital 189 | 190 | def get_annual_return(self): 191 | return self.get_total_return() / len(self.capital_list) * 250.0 192 | 193 | def get_average_return(self): 194 | return np.mean(self.return_list) 195 | 196 | def get_total_trade_times(self): 197 | return "%s / %s" % ((self.get_win_times() + self.get_lose_times()), len(self.capital_list)) 198 | 199 | def get_return_volatility(self): 200 | rit_bar = self.get_average_return() 201 | sum_temp = 0 202 | for i in self.return_list: 203 | sum_temp += np.square(i - rit_bar) 204 | volatility = np.sqrt((250.0 / (len(self.capital_list) - 1)) * sum_temp) 205 | return volatility 206 | 207 | def get_win_times(self): 208 | win_list = [i for i in self.return_list if i > 0] 209 | return len(win_list) 210 | 211 | def get_lose_times(self): 212 | lose_list = [i for i in self.return_list if i < 0] 213 | return len(lose_list) 214 | 215 | def get_win_ratio(self): 216 | return self.get_win_times() * 1.0 / (self.get_win_times() + self.get_lose_times()) 217 | 218 | def get_win_lose_ratio(self): 219 | win_sum = np.sum([i for i in self.profit_list if i > 0]) 220 | lose_sum = np.sum([i for i in self.profit_list if i < 0]) 221 | return - win_sum * 1.0 / lose_sum 222 | 223 | def get_max_win(self): 224 | return max([i for i in self.profit_list if i > 0]) / self._init_capital 225 | 226 | def get_max_lose(self): 227 | return -min([i for i in self.profit_list if i < 0]) / self._init_capital 228 | 229 | def get_continue_win_times(self): 230 | time_count_list = [] 231 | temp = 0 232 | for i, returns in enumerate(self.return_list): 233 | if returns > 0: 234 | temp += 1 235 | else: 236 | time_count_list.append(temp) 237 | temp = 0 238 | return max(time_count_list) 239 | 240 | def get_continue_lose_times(self): 241 | time_count_list = [] 242 | temp = 0 243 | for i, returns in enumerate(self.return_list): 244 | if returns < 0: 245 | temp += 1 246 | else: 247 | time_count_list.append(temp) 248 | temp = 0 249 | return max(time_count_list) 250 | 251 | def get_max_drawdown(self): 252 | drawdown_list = [] 253 | for i, capital in enumerate(self.capital_list): 254 | new_capital_list = 
self.capital_list[:i] 255 | if len(new_capital_list) > 0: 256 | max_capital_past = max(new_capital_list) 257 | drawdown = (1 - capital / max_capital_past) 258 | drawdown_list.append(drawdown) 259 | return max(drawdown_list) 260 | 261 | def get_sharp_ratio(self): 262 | volatility = self.get_return_volatility() 263 | sharp = ((self.capital_list[-1] - self._init_capital) / self._init_capital - 0.03) / volatility 264 | return sharp 265 | 266 | def display(self): 267 | print "总收益率: ", self.get_total_return() 268 | print "年化收益率: ", self.get_annual_return() 269 | print "日均收益率: ", self.get_average_return() 270 | print "总交易次数: ", self.get_total_trade_times() 271 | print "收益波动率: ", self.get_return_volatility() 272 | print "获胜次数: ", self.get_win_times() 273 | print "失败次数: ", self.get_lose_times() 274 | print "胜率: ", self.get_win_ratio() 275 | print "盈亏比: ", self.get_win_lose_ratio() 276 | print "单次最大盈利: ", self.get_max_win() 277 | print "单次最大亏损: ", self.get_max_lose() 278 | print "最大连胜次数: ", self.get_continue_win_times() 279 | print "最大连负次数: ", self.get_continue_lose_times() 280 | print "最大回撤: ", self.get_max_drawdown() 281 | print "夏普比率: ", self.get_sharp_ratio() 282 | 283 | if __name__ == "__main__": 284 | pass 285 | -------------------------------------------------------------------------------- /小工具/二分法期权计算器.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using System.ComponentModel; 4 | using System.Data; 5 | using System.Drawing; 6 | using System.Linq; 7 | using System.Text; 8 | using System.Threading.Tasks; 9 | using System.Windows.Forms; 10 | 11 | namespace 期权计算器 12 | { 13 | public partial class Form1 : Form 14 | { 15 | public Form1() 16 | { 17 | InitializeComponent(); 18 | } 19 | 20 | private static double CumDensity(double z) 21 | { 22 | double p = 0.3275911; 23 | double a1 = 0.254829592; 24 | double a2 = -0.284496736; 25 | double a3 = 1.421413741; 26 | double a4 = -1.453152027; 27 | double a5 = 1.061405429; 28 | 29 | int sign; 30 | if (z < 0.0) 31 | sign = -1; 32 | else 33 | sign = 1; 34 | 35 | double x = Math.Abs(z) / Math.Sqrt(2.0); 36 | double t = 1.0 / (1.0 + p * x); 37 | double erf = 1.0 - (((((a5 * t + a4) * t) + a3) 38 | * t + a2) * t + a1) * t * Math.Exp(-x * x); 39 | return 0.5 * (1.0 + sign * erf); 40 | } 41 | 42 | private double get_value(double[] double_array ) 43 | { 44 | double underlying_price = double_array[0]; 45 | double strike_price = double_array[1]; 46 | double due_time = double_array[2]; 47 | double rate = double_array[3]; 48 | double vol = double_array[4]; 49 | 50 | double d1 = (Math.Log(underlying_price / strike_price) + (rate + Math.Pow(vol, 2) / 2) * due_time) / (vol * Math.Sqrt(due_time)); 51 | double d2 = d1 - vol * Math.Sqrt(due_time); 52 | return underlying_price * CumDensity(d1) - strike_price * Math.Exp(-rate * due_time) * CumDensity(d2); 53 | } 54 | 55 | private List<int> has_one_null(string[] str) 56 | { 57 | List<int> null_index = new List<int>(); 58 | for (int i = 0; i < str.Length; i++) 59 | { 60 | if (str[i].Trim() == string.Empty) 61 | { 62 | null_index.Add(i); 63 | } 64 | } 65 | return null_index; 66 | } 67 | 68 | private double dichotomy_cal(int index, double upper, double lower, double[] param, bool positive, double price) 69 | { 70 | param[index] = (upper + lower) / 2.0; 71 | double pre; 72 | int count = 0; 73 | while (true) { 74 | count++; 75 | double price1 = get_value(param); 76 | pre = param[index]; 77 | if (positive) 78 | { 79 | if (price1 < price) 80 | { 81 |
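// the BS call value rises with this parameter ("positive" direction) and the
// model price is still below the target market price, so the root lies in the
// upper half of the bracket: raise the lower bound and bisect again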
lower = param[index]; 82 | param[index] = (lower + upper) / 2.0; 83 | } 84 | else if (price1 > price) 85 | { 86 | upper = param[index]; 87 | param[index] = (lower + upper) / 2.0; 88 | } 89 | else 90 | return param[index]; 91 | if (Math.Abs(param[index] - pre) < 1e-5) 92 | return param[index]; 93 | if(count > 10000) 94 | return 9999; 95 | } 96 | else 97 | { 98 | if (price1 > price) 99 | { 100 | lower = param[index]; 101 | param[index] = (lower + upper) / 2.0; 102 | } 103 | else if (price1 < price) 104 | { 105 | upper = param[index]; 106 | param[index] = (lower + upper) / 2.0; 107 | } 108 | else 109 | return param[index]; 110 | if (Math.Abs(param[index] - pre) < 1e-5) 111 | return param[index]; 112 | if (count > 10000) 113 | return 9999; 114 | } 115 | 116 | } 117 | } 118 | 119 | 120 | private double up, sp, t, r, vol, p; 121 | 122 | 123 | 124 | private void button1_Click(object sender, EventArgs e) 125 | { 126 | textBox_result.Text = ""; 127 | 128 | string underlying_price = textBox_up.Text; 129 | string strike_price = textBox_sp.Text; 130 | string due_time = textBox_t.Text; 131 | string rate = textBox_r.Text; 132 | string volatity = textBox_v.Text; 133 | string price = textBox_price.Text; 134 | 135 | up = sp = t = r = vol = p = 9999; 136 | 137 | if (underlying_price != String.Empty) 138 | up = Convert.ToDouble(underlying_price); 139 | if (strike_price != String.Empty) 140 | sp = Convert.ToDouble(strike_price); 141 | if (due_time != String.Empty) 142 | t = Convert.ToDouble(due_time) / 365; 143 | if (rate != String.Empty) 144 | r = Convert.ToDouble(rate); 145 | if (volatity != String.Empty) 146 | vol = Convert.ToDouble(volatity); 147 | if (price != String.Empty) 148 | p = Convert.ToDouble(price); 149 | 150 | 151 | string[] string_array = new string[6] { underlying_price, strike_price, due_time, rate, volatity, price }; 152 | var null_index = has_one_null(string_array); 153 | 154 | if (null_index.Count != 1) 155 | { 156 | MessageBox.Show("只能空一个待求参数!"); 157 | } 158 | else 159 | { 160 | switch (null_index[0]) 161 | { 162 | case 5: 163 | var result5 = get_value(new double[] { up, sp, t, r, vol }); 164 | textBox_result.Text = Math.Round(result5, 4).ToString(); 165 | break; 166 | case 4: 167 | // vol 168 | var result4 = dichotomy_cal(4, 100, 0, new double[] { up, sp, t, r, vol }, true, p); 169 | if (result4 != 9999) 170 | textBox_result.Text = Math.Round(result4, 4).ToString(); 171 | else 172 | MessageBox.Show("深度实值认购期权隐波无解!"); 173 | break; 174 | case 3: 175 | // rate 176 | var result3 = dichotomy_cal(3, 1.0, 0, new double[] { up, sp, t, r, vol }, true, p); 177 | textBox_result.Text = Math.Round(result3, 4).ToString(); 178 | break; 179 | case 2: 180 | // due_time 181 | var result2 = dichotomy_cal(2, 100, 0, new double[] { up, sp, t, r, vol }, true, p); 182 | textBox_result.Text = Math.Round(result2 * 365).ToString(); 183 | break; 184 | case 1: 185 | //strike price 186 | var result1 = dichotomy_cal(1, 10, 0, new double[] { up, sp, t, r, vol }, false, p); 187 | textBox_result.Text = Math.Round(result1, 4).ToString(); 188 | break; 189 | case 0: 190 | //underlying price 191 | var result0 = dichotomy_cal(0, 10, 0, new double[] { up, sp, t, r, vol }, true, p); 192 | textBox_result.Text = Math.Round(result0, 4).ToString(); 193 | break; 194 | } 195 | } 196 | 197 | 198 | 199 | } 200 | } 201 | } 202 | -------------------------------------------------------------------------------- /评价模型/EntropyWeight.m: -------------------------------------------------------------------------------- 1 | function weights = 
EntropyWeight(R) 2 | %% 熵权法求指标权重,R为输入矩阵,返回权重向量weights 3 | 4 | [rows,cols]=size(R); % 输入矩阵的大小,rows为对象个数,cols为指标个数 5 | k=1/log(rows); % 求k 6 | 7 | f=zeros(rows,cols); % 初始化fij 8 | sumBycols=sum(R,1); % 输入矩阵的每一列之和(结果为一个1*cols的行向量) 9 | % 计算fij 10 | for i=1:rows 11 | for j=1:cols 12 | f(i,j)=R(i,j)./sumBycols(1,j); 13 | end 14 | end 15 | 16 | lnfij=zeros(rows,cols); % 初始化lnfij 17 | % 计算lnfij 18 | for i=1:rows 19 | for j=1:cols 20 | if f(i,j)==0 21 | lnfij(i,j)=0; 22 | else 23 | lnfij(i,j)=log(f(i,j)); 24 | end 25 | end 26 | end 27 | 28 | Hj=-k*(sum(f.*lnfij,1)); % 计算熵值Hj 29 | weights=(1-Hj)/(cols-sum(Hj)); 30 | end 31 | %------------------------------------------------------分割线--------------------------------------------------------------- 32 | -------------------------------------------------------------------------------- /评价模型/PPE.asv: -------------------------------------------------------------------------------- 1 | % this program use PPE method to judge the quality of a second-hand car 2 | 3 | load('auto.mat') 4 | this_size = size(auto_dataset); 5 | n = this_size(1); 6 | p = this_size(2); 7 | 8 | 9 | standard_data = zeros(74,11); 10 | for j = 1:p 11 | for i = 1:n 12 | if(ismember(j, [1,2,4,5,6,7]) == 1) 13 | standard_data(i,j) = (auto_dataset(i,j) - min(auto_dataset(:,j))) / ... 14 | (max(auto_dataset(:,j)) - min(auto_dataset(:,j))); 15 | else 16 | standard_data(i,j) = (max(auto_dataset(:,j)) - auto_dataset(i,j)) / ... 17 | (max(auto_dataset(:,j)) - min(auto_dataset(:,j))); 18 | end 19 | end 20 | end 21 | 22 | alpha = zeros(1,p); 23 | 24 | 25 | function = get_z(alpha, n, p, data, R) 26 | z = zeros(n,1); 27 | for i = 1:n 28 | sum = 0; 29 | for j = 1:p 30 | sum = sum + alpha(j) * data(i,j); 31 | end 32 | z(i) = sum; 33 | end 34 | S_alpha = std(z); 35 | sum_d = 0; 36 | for i = 1:n 37 | for j = 1:p 38 | u = 0; 39 | temp = R - abs(z(i) - z(j)); 40 | if(temp>=0) 41 | u = 1 42 | end 43 | sum_d = sum_d + temp * u; 44 | end 45 | end 46 | result = S_alpha * sum_d; 47 | end -------------------------------------------------------------------------------- /评价模型/PPE.m: -------------------------------------------------------------------------------- 1 | % this program use PPE method to judge the quality of a second-hand car 2 | 3 | data= csvread('question4.csv',1,3); 4 | this_size = size(data); 5 | global n p standard_data; 6 | p = this_size(2); 7 | n = this_size(1); 8 | standard_data = zeros(23,3); 9 | for j = 1:p 10 | for i = 1:n 11 | if(ismember(j, [2,3]) == 1) 12 | standard_data(i,j) = (data(i,j) - min(data(:,j))) / ... 13 | (max(data(:,j)) - min(data(:,j))); 14 | else 15 | standard_data(i,j) = (max(data(:,j)) - data(i,j)) / ... 16 | (max(data(:,j)) - min(data(:,j))); 17 | end 18 | end 19 | end 20 | 21 | alpha = zeros(1,p); 22 | for j = 1:p 23 | alpha(j) = 1/p; 24 | end 25 | 26 | %[a] = get_Q(alpha); 27 | [value_list,best_a,b] = pso_optimal(100,3); 28 | 29 | % load('auto.mat') 30 | % this_size = size(auto_dataset); 31 | % global n p standard_data; 32 | % n = this_size(1); 33 | % p = this_size(2); 34 | % standard_data = zeros(74,11); 35 | % 36 | % % premnmx() 归一化 37 | % for j = 1:p 38 | % for i = 1:n 39 | % if(ismember(j, [1,2,4,5,6,7]) == 1) 40 | % standard_data(i,j) = (auto_dataset(i,j) - min(auto_dataset(:,j))) / ... 41 | % (max(auto_dataset(:,j)) - min(auto_dataset(:,j))); 42 | % else 43 | % standard_data(i,j) = (max(auto_dataset(:,j)) - auto_dataset(i,j)) / ... 
44 | % (max(auto_dataset(:,j)) - min(auto_dataset(:,j))); 45 | % end 46 | % end 47 | % end 48 | % 49 | % % 初始化起点 50 | % alpha = zeros(1,p); 51 | % for j = 1:p 52 | % alpha(j) = 1/p; 53 | % end 54 | % 55 | % %[a] = get_Q(alpha); 56 | % [best_a,b] = pso_optimal(100); 57 | % 58 | % Z=zeros(n,1); 59 | % for i=1:n 60 | % Z(i)=abs(sum(best_a.*standard_data(i,:))); 61 | % end 62 | % Z=abs(Z); 63 | % 64 | % figure%投影散布图 65 | % plot(abs(Z),'bd','LineWidth',1,'MarkerEdgeColor','k','MarkerFaceColor','b','MarkerSize',5); 66 | % %axis([1,12,0,2.5]);%图形边界根据需要显示 67 | % grid on 68 | % xlabel(' ','FontName','TimesNewRoman','FontSize',12); 69 | % ylabel('Projective Value','FontName','Times New Roman','Fontsize',12); 70 | % figure 71 | % [newZ,I]=sort(Z); 72 | % plot(abs(newZ),'bd','LineWidth',1,'MarkerEdgeColor','k','MarkerFaceColor','b','MarkerSize',5); 73 | % %axis([1,12,0,2.5]);%图形边界根据需要显示 74 | % grid on 75 | % xlabel(' ','FontName','TimesNewRoman','FontSize',12); 76 | % ylabel('Projective Value','FontName','Times New Roman','Fontsize',12); 77 | % 78 | % disp('最佳投影向量为') 79 | % disp(best_a); -------------------------------------------------------------------------------- /评价模型/SOM.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Self Organizing Maps for clustering 3 | """ 4 | 相关文档 5 | theory: 6 | - http://www.cnblogs.com/sylvanas2012/p/5117056.html 7 | - http://www.68dl.com/research/2014/0922/9129.html 8 | matlab: 9 | - http://blog.sina.com.cn/s/blog_906d892d0102vxfv.html 10 | - http://blog.csdn.net/bwangk/article/details/53300622 11 | - https://cn.mathworks.com/help/nnet/ug/cluster-with-self-organizing-map-neural-network.html 12 | python: 13 | - http://blog.csdn.net/chenge_j/article/details/72537568 个人实现 14 | - https://github.com/sevamoo/SOMPY 官方包 15 | """ 16 | 17 | import numpy as np 18 | from matplotlib import pyplot as plt 19 | from sompy.sompy import SOMFactory 20 | from sklearn.datasets import fetch_california_housing 21 | import pandas as pd 22 | from collections import Counter 23 | 24 | 25 | class MySOM: 26 | def __init__(self, df, mapsize, initialization = 'random'): 27 | """ 28 | 29 | :param df: 数据框 30 | :param mapsize: 输出层维度,一般为二维,输入(20,20)的形式 31 | :param initialization: "PCA" 或 "random",初始化权重的方法 32 | - PCA是以变量的主成分值作为权重,见sompy.codebool.pca_linear_initialization 33 | - random是以随机数进行初始化 34 | """ 35 | self.data = np.array(df) 36 | self.sm = SOMFactory().build(self.data, mapsize=mapsize, initialization=initialization, component_names=df.columns) 37 | self.train() 38 | 39 | def train(self): 40 | self.sm.train(n_job=1,verbose=False, train_rough_len=2, train_finetune_len=5) 41 | 42 | def print_error(self): 43 | topographic_error = self.sm.calculate_topographic_error() 44 | quantization_error = np.mean(self.sm._bmu[1]) 45 | print ("Topographic error = %s; Quantization error = %s" % (topographic_error, quantization_error)) 46 | 47 | def draw_input_weights(self): 48 | from sompy.visualization.mapview import View2D 49 | view2D = View2D(10, 10, "rand data", text_size=10) 50 | view2D.show(self.sm, col_sz=4, which_dim="all", desnormalize=True) 51 | plt.show() 52 | 53 | def draw_hit_map(self): 54 | from sompy.visualization.bmuhits import BmuHitsView 55 | vhts = BmuHitsView(4, 4, "Hits Map", text_size=12) 56 | vhts.show(self.sm, anotate=True, onlyzeros=False, labelsize=12, cmap="Greys", logaritmic=False) 57 | plt.show() 58 | 59 | def draw_cluster_map(self): 60 | from sompy.visualization.hitmap import HitMapView 61 | hits = HitMapView(20, 20, 
"Clustering", text_size=12) 62 | hits.show(self.sm) 63 | plt.show() 64 | 65 | def cluster(self, n): 66 | self.sm.cluster(n) 67 | 68 | def get_cluster_label(self): 69 | # 长度等于mapsize[0] * mapsize[1] 70 | return self.sm.cluster_labels 71 | 72 | def get_neurons(self): 73 | """ 74 | 获取原数据的每个样本对应的神经元,原包并未提供此方法,所以自己动手 75 | :return: array, length = self.df.shape[0] 76 | """ 77 | return self.sm._bmu[0] 78 | 79 | def get_label(self): 80 | """ 81 | 获取原数据的每个样本对应的分类标签,原包并未提供此方法,所以自己动手 82 | :return: array, length = self.df.shape[0] 83 | """ 84 | neurons_label_dict = {i:j for i,j in enumerate(self.sm.cluster_labels)} 85 | return np.array([neurons_label_dict[i] for i in self.sm._bmu[0]]) 86 | 87 | def predict(self, x): 88 | """ 89 | 以label作为y,采取各种机器学习算法 90 | :param x: 91 | :return: 92 | """ 93 | pass 94 | 95 | if __name__ == '__main__': 96 | data = fetch_california_housing() 97 | descr = data.DESCR 98 | names = data.feature_names+["HouseValue"] 99 | data = np.column_stack([data.data, data.target]) 100 | df = pd.DataFrame(data) 101 | df.columns = names 102 | 103 | my_som = MySOM(df, (20,20)) 104 | my_som.draw_input_weights() 105 | my_som.draw_hit_map() 106 | 107 | my_som.cluster(5) 108 | my_som.draw_cluster_map() 109 | print my_som.get_label()[:10] 110 | print Counter(my_som.get_label()) 111 | 112 | my_som.predict(np.array(df.iloc[0])) -------------------------------------------------------------------------------- /评价模型/cluster.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | import pandas as pd 6 | from sklearn.decomposition import PCA 7 | 8 | class Cluster: 9 | 10 | def __init__(self, df): 11 | from scipy.cluster.vq import whiten 12 | self.df = df 13 | self.data = whiten(df) 14 | self.sample_names = np.array(df.index) 15 | 16 | def K_means(self, K, axis=0): 17 | from scipy.cluster.vq import kmeans, vq 18 | # k-means最后输出的结果其实是两维的,第一维是聚类中心,第二维是损失distortion 19 | if axis == 0: 20 | # 此时对样本聚类 21 | centroid, distortion = kmeans(self.data, K) 22 | # 使用vq函数根据聚类中心对所有数据进行分类,vq的输出也是两维的,[0]表示的是所有数据的label 23 | label, distortion = vq(self.data, centroid) 24 | else: 25 | # 此时对变量聚类 26 | centroid, distortion = kmeans(self.data.T, K) 27 | label, distortion = vq(self.data.T, centroid) 28 | return label 29 | 30 | def hierarchical(self): 31 | import scipy.cluster.hierarchy as sch 32 | # 生成点与点之间的距离矩阵,这里用的欧氏距离: 33 | disMat = sch.distance.pdist(self.data, 'euclidean') 34 | # 进行层次聚类: 35 | Z = sch.linkage(disMat, method='average') 36 | self.hierarchial_plot(Z) 37 | # 根据linkage matrix Z得到聚类结果: 38 | cluster = sch.fcluster(Z, 1, 'inconsistent') 39 | return cluster 40 | 41 | def hierarchial_plot(self, Z): 42 | import scipy.cluster.hierarchy as sch 43 | # 将层级聚类结果以树状图表示出来,其中labels为每个样本的名称数组,应该为self.sample_names 44 | sch.dendrogram(Z, labels=self.sample_names, orientation='right') 45 | plt.tick_params( 46 | axis='x', # 使用 x 坐标轴 47 | which='both', # 同时使用主刻度标签(major ticks)和次刻度标签(minor ticks) 48 | bottom='off', # 取消底部边缘(bottom edge)标签 49 | top='off', # 取消顶部边缘(top edge)标签 50 | labelbottom='off') 51 | plt.tight_layout() # 展示紧凑的绘图布局 52 | plt.show() 53 | # plt.savefig('plot_dendrogram.png') 54 | 55 | def cluster_plot(self, label): 56 | # 聚类结果适合在二维数据中进行可视化,而面对多维情况,采取主成分分析进行降唯 57 | pca_result = self._pca() 58 | color = ['r', 'y', 'k', 'g', 'm'] * 10 59 | for i in range(max(label)+1): 60 | idx = np.where(label==i) 61 | plt.scatter(pca_result[idx, 0], pca_result[idx, 1], marker='o',label = str(i), color=color[i]) 62 | 
plt.legend([u"Class: "+ str(i) for i in range(max(label) + 1)]) 63 | plt.show() 64 | 65 | def _pca(self): 66 | pca = PCA(n_components=2) # ='mle' 时自动判断需要保留几个主成分,在这里因为需要做图,所以保留前两个 67 | pca.fit(self.data) 68 | print "variance_ratio:", pca.explained_variance_ratio_ 69 | return pca.transform(self.data) 70 | 71 | def auto_cluster(self): 72 | # 先层次聚类,获取分类数,再根据类别进行K均值聚类 73 | hierarchical_cluster = self.hierarchical() 74 | K = max(hierarchical_cluster) 75 | labels = self.K_means(K) 76 | self.cluster_plot(labels) 77 | 78 | 79 | if __name__ == '__main__': 80 | df = pd.read_csv("/home/ray/Documents/suibe/2017/建模/Modeling_Preparation/dataset/auto_1.csv") 81 | df = df.dropna(axis=0) 82 | clu = Cluster(df) 83 | 84 | label = clu.K_means(4) 85 | clu.cluster_plot(label) 86 | 87 | label2 = clu.hierarchical() 88 | clu.cluster_plot(label2) 89 | -------------------------------------------------------------------------------- /评价模型/constraint.m: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Yangruipis/ModelingPreparation/1663c8153fb97049ae716e2f0dab27d457bcbc9c/评价模型/constraint.m -------------------------------------------------------------------------------- /评价模型/get_Q.m: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Yangruipis/ModelingPreparation/1663c8153fb97049ae716e2f0dab27d457bcbc9c/评价模型/get_Q.m -------------------------------------------------------------------------------- /评价模型/optimal_tools.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Yangruipis/ModelingPreparation/1663c8153fb97049ae716e2f0dab27d457bcbc9c/评价模型/optimal_tools.png -------------------------------------------------------------------------------- /评价模型/pso_optimal.asv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Yangruipis/ModelingPreparation/1663c8153fb97049ae716e2f0dab27d457bcbc9c/评价模型/pso_optimal.asv -------------------------------------------------------------------------------- /评价模型/pso_optimal.m: -------------------------------------------------------------------------------- 1 | function [value_list, optimal_position, optimal_value] = pso_optimal(itertimes, variable_number) 2 | w = 0.8; 3 | c1 = 2; c2 = 2; 4 | r1 = 0.25; 5 | r2 = 0.75; 6 | particle_number = 1000; % number of particles 7 | %variable_number = variable_number; % number of variables (passed in as an argument) 8 | X = zeros(particle_number, variable_number); 9 | V = zeros(particle_number, variable_number); 10 | particle_optimal_position = zeros(particle_number, variable_number); 11 | optimal_position = zeros(1, variable_number); 12 | optimal_value = 1e10; 13 | x_range = [0,1]; 14 | value_list = []; 15 | % initialize the information of each particle 16 | 17 | for i = 1:particle_number 18 | for j = 1:variable_number 19 | X(i,j) = x_range(1) + (x_range(2) - x_range(1)) * rand(); 20 | V(i,j) = 0; 21 | end 22 | X(i,:) = X(i,:) / norm(X(i,:)); 23 | particle_optimal_position(i,:) = X(i,:); 24 | temp_value = get_Q(X(i,:)); 25 | if(temp_value < optimal_value) 26 | optimal_position = X(i,:); 27 | optimal_value = temp_value; 28 | end 29 | end 30 | count = 0; 31 | % update the particle 32 | for iter = 1:itertimes 33 | for i = 1:particle_number 34 | V(i,:) = w * V(i,:) + c1 * r1 * (particle_optimal_position(i,:) - ...
35 | X(i,:)) + c2 * r2 * (optimal_position - X(i,:)); 36 | X(i,:) = X(i,:) + V(i,:); 37 | X(i,:) = X(i,:) / norm(X(i,:)); 38 | if ( x_range(1) <= min(X(i,:)) && x_range(2) >= max(X(i,:))) 39 | value_before = get_Q(particle_optimal_position(i,:)); 40 | value_now = get_Q(X(i,:)); 41 | if(value_now < value_before) 42 | particle_optimal_position(i,:) = X(i,:); 43 | end 44 | 45 | if (value_now < optimal_value) 46 | optimal_position =X(i,:); 47 | optimal_value = value_now; 48 | count = count + 1; 49 | value_list(count) = optimal_value; 50 | end 51 | end 52 | end 53 | end 54 | 55 | 56 | end -------------------------------------------------------------------------------- /评价模型/som_data.txt: -------------------------------------------------------------------------------- 1 | 0.697,0.46 2 | 0.774,0.376 3 | 0.634,0.264 4 | 0.608,0.318 5 | 0.556,0.215 6 | 0.403,0.237 7 | 0.481,0.149 8 | 0.437,0.211 9 | 0.666,0.091 10 | 0.243,0.267 11 | 0.245,0.057 12 | 0.343,0.099 13 | 0.639,0.161 14 | 0.657,0.198 15 | 0.36,0.37 16 | 0.593,0.042 17 | 0.719,0.103 18 | 0.359,0.188 19 | 0.339,0.241 20 | 0.282,0.257 21 | 0.748,0.232 22 | 0.714,0.346 23 | 0.483,0.312 24 | 0.478,0.437 25 | 0.525,0.369 26 | 0.751,0.489 27 | 0.532,0.472 28 | 0.473,0.376 29 | 0.725,0.445 30 | 0.446,0.459 -------------------------------------------------------------------------------- /赛题整理/赛题整理.md: -------------------------------------------------------------------------------- 1 | # 2016 E题 2 | 3 | 1. 影响粮食种植面积的因素比较多,它们之间的关系错综复杂而且可能存在着粮食品种和区域差异。请你们建立影响粮食种植面积的**指标体系**和关于粮食种植面积的数学模型,讨论、评价指标体系的合理性,研究他们之间的关系,并对得出的相应结果的可信度和可靠性给出检验和分析。 4 | - Spearman 相关检验, 主成分回归模型 5 | - 非参数 Spearman 相关检验法,偏最小二乘回归 6 | - 结构方程模型,证性因子分析和路径分析 7 | - Kolmogorov-Smirnov,相关性检验和主成分因子分析,Granger 因果关系检验,Granger 因果关系检验,似然比检验法,单位根检验和协整检验法 8 | - 多元线性回归模型,相关系数、残差和显著性水平 9 | 2. 对粮食最低收购价政策的作用,学者们褒贬不一。请你们建立粮食最低收购价**政策执行效果的评价模型**。并运用你们所建立的评价模型,结合粮食品种和区域差异,选择几个省份比较研究粮食主产区粮食最低收购价执行的效果。 10 | - / 11 | - 主成分分析,混合线性模型 12 | - 三角模糊数 13 | - 最低收购价政策执行效果综合评价指数模型,基于粒子群和投影寻踪算法的权重确定模型 14 | - 主成分分析法 15 | 3. 粮食市场收购价是粮食企业收购粮食的市场价格,是由粮食供需双方通过市场调节来决定。它与粮食最低收购价一起构成粮食价格体系,是宏观价格调控系统中有一定相对独立性的重要措施。请你们运用数据分析或建立数学模型探讨我国粮食价格所具有的**特殊规律性**。 16 | - 市场收购价理论和局部调整模型,包括 供应量模型、企业收购量模型和市场收购价格模型 17 | - 基于供需理论构建粮食供需及价格联动模型, ARIMA 模型 18 | - “蛛网”模型, ARCH 类模型 19 | - 局部均衡模型和正反馈系统 20 | - / 21 | 4. 结合前面的研究和国家制定粮食最低收购价政策的初衷,请你们建立粮食最低收购价的合理**定价模型**,进而对“十二五”期间国家发展与改革委员会公布的粮食最低收购价价格的合理性做出评价,并运用你们所建立的模型对2017年的粮食最低收购价的合理范围进行**预测** 22 | - 以粮食产量为目标模型,以价格波动、财政支出、库存和种植面积为约束条件建立了粮食最低收购价合理定价的线性规划模型 23 | - 优化模型 24 | - GARCH 模型、单变量二阶差分方程模型(DDE)、支持向量机预测模型( SVM 模型)以及马尔科夫链的时变权组合预测模型( HM-TWA) 25 | - 基于正态分布随机数遗传,多目标合理定价模型,基于有序加权平均( OWA) 算子 26 | - / 27 | 28 | # 2014年B题 29 | 30 | 1. 根据附录中1000个样本在某条有可能致病的染色体片段上的9445个位点的编码信息(见genotype.dat)和样本患有遗传疾病A的信息(见phenotype.txt文件)。设计或采用一个方法,找出某种疾病最有可能的一个或几个致病位点,并给出相关的理论依据。 31 | 32 | 2. 同上题中的样本患有遗传疾病A的信息(phenotype.txt文件)。现有300个基因,每个基因所包含的位点名称见文件夹gene_info中的300个dat文件,每个dat文件列出了对应基因所包含的位点(位点信息见文件genotype.dat)。由于可以把基因理解为若干个位点组成的集合,遗传疾病与基因的关联性可以由基因中 包含的位点的全集或其子集合表现出来请找出与疾病最有可能相关的一个或几个基因,并说明理由。 33 | 34 | 3. 已知9445个位点,其编码信息见genotype.dat文件。在实际的研究中,科研人员往往把相关的性状或疾病看成一个整体,然后来探寻与它们相关的位点或基因。试根据multi_phenos.txt文件给出的1000个样本的10个相关联性状的信息及其9445个位点的编码信息(见genotype.dat),找出与multi_phenos.txt中10个性状有关联的位点。 35 | 36 | # 2014年D题 37 | 1. 筛选出主要的水果和蔬菜品种,并尝试用多种方法建立数学模型对其消费量进行估计,研究其发展趋势 38 | 39 | 2. 评价中国居民目前矿物质、维生素、膳食纤维等营养的年摄入水平是否合理。按照水果和蔬菜近期的消费趋势,至2020年,中国居民的人体营养健康状况是趋于好转还是恶化?请给出支持你们结论的充分依据。 40 | 41 | 3. 为当今中国居民(可以分区域分季节)提供主要的水果和蔬菜产品的按年度合理人均消费量,使人们能够以较低的购买成本(假定各品种价格按照原有趋势合理变动)满足自身的营养健康需要 42 | 43 | 4. 
建立数学模型重新计算中国居民主要的水果和蔬菜产品的按年度合理人均消费量,并给出到2020年我国水果和蔬菜产品生产的调整战略。 44 | 45 | # 2013年F题 46 | 47 | 1. 中国城乡居民(含新农保)养老金收入、支出的宏观数学模型 48 | 49 | 2. 对养老金缺口的理解和对未来有关情况的合理估计,城乡居民养老保险收支矛盾最尖锐的情况发生在什么时间,严重程度如何?考虑到党的十八大提出的收入倍增计划,你们的数学模型哪些部分需要调整? 50 | 51 | 3. 利用仿真手段寻找替代率和缴费率的合理区间以保证我国养老保险体系的可持续性;在步入良性循环之前,在矛盾最尖锐到来前的过渡期内应该采取哪些政策措施实现平稳过渡并仿真预测相关政策的效果 -------------------------------------------------------------------------------- /预测模型/GM1_1.m: -------------------------------------------------------------------------------- 1 | %%%% GM(1,1)预测模型 2 | 3 | input = [132,92,118,130,187,207,213,284,301,333]; 4 | predict_times = 10; 5 | 6 | sum_input = cumsum(input); 7 | 8 | B = ones(length(input)-1,2); 9 | for i = 1:length(input)-1 10 | B(i,1) = -(sum_input(i) + sum_input(i+1)) / 2.0; 11 | end 12 | Y = input(2:end); 13 | 14 | a_hat = inv(B' * B) * B' * Y'; 15 | a = a_hat(1); 16 | u = a_hat(2); 17 | 18 | result_length = length(input) + predict_times; 19 | sum_result = zeros(result_length, 1); 20 | result = zeros(result_length, 1); 21 | for i = 1:result_length 22 | sum_result(i) = (input(1) - u / a) * exp(-a * (i - 1)) + u / a; 23 | if(i == 1) 24 | result(i) = sum_result(i); 25 | else 26 | result(i) = sum_result(i) - sum_result(i-1); 27 | end 28 | end 29 | x_1 = 1:length(input); 30 | x_2 = 1:result_length; 31 | 32 | plot(x_1, input, '.b', x_2, result, 'ro--') 33 | legend('Acture Value', 'Predict Value') 34 | title('GM(1,1) Predict Result') 35 | 36 | 37 | 38 | -------------------------------------------------------------------------------- /预测模型/HMM.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | """ 4 | Theory: 5 | 6 | 通过识别隐含状态,以及计算隐含状态到观测状态的概率,实现对未来隐含状态的预测( 7 | 方法1. 已知当期的隐状态,推断下一期隐状态概率,以及各自的观测分布情况,进行预测 8 | 方法2. 根据当期t_1的观测,寻找与当期最相似的时期t_2,类比t_2下一期的观测值金预测 9 | ) 10 | 11 | 模型参数:隐含状态转移概率矩阵、隐状态->观测转移概率(emission matrix, 混淆矩阵)、初始隐状态概率 12 | 13 | 根据观测分类: 14 | - MultinomialHMM 观测值离散的HMM 15 | - GaussianHMM 观测值连续的HMM,当观测为一维时,假定为正态分布;当观测为n维时,为n维联合正态分布 16 | - GMMHMM 同样为连续观测,运用混合正态分布 17 | 18 | 根据问题分类: 19 | 1. 已知整个模型(包括转移概率矩阵、混淆矩阵),根据观测值序列,计算该序列产生的概率如何 20 | 2. 已知整个模型(包括转移概率矩阵、混淆矩阵),根据观测值序列,推断这段时间的隐含状态 21 | 3. 模型未知,只知道观测值序列,求解整个模型,计算两个概率矩阵(或者是概率分布,连续情况),以及初始隐含状态概率(分布) 22 | 23 | 对应求解方法: 24 | 1. 前向、后向算法 25 | 2. Viterbi Algo,维特比算法 26 | 3. 
Baum-Welch Algo,鲍姆-韦尔奇算法 27 | ref: 28 | https://www.zhihu.com/question/20962240 29 | 30 | python: 31 | ref: 32 | http://www.cnblogs.com/pinard/p/7001397.html 33 | https://uqer.io/community/share/56ec30bf228e5b887be50b35 # 量化 34 | http://blog.csdn.net/baskbeast/article/details/51218777 # 量化 35 | 36 | """ 37 | 38 | import hmmlearn 39 | import pandas as pd 40 | import numpy as np 41 | import matplotlib.pyplot as plt 42 | import warnings 43 | warnings.filterwarnings("ignore") 44 | 45 | 46 | def MyMultinomialHMM(): 47 | from hmmlearn import hmm 48 | 49 | # 离散观测情况 50 | states = ["box 1", "box 2", "box3"] 51 | n_states = len(states) 52 | 53 | observations = ["red", "white"] 54 | n_observations = len(observations) 55 | 56 | start_probability = np.array([0.2, 0.4, 0.4]) 57 | 58 | transition_probability = np.array([ 59 | [0.5, 0.2, 0.3], 60 | [0.3, 0.5, 0.2], 61 | [0.2, 0.3, 0.5] 62 | ]) 63 | 64 | emission_probability = np.array([ 65 | [0.5, 0.5], 66 | [0.4, 0.6], 67 | [0.7, 0.3] 68 | ]) 69 | 70 | model = hmm.MultinomialHMM(n_components=n_states) 71 | model.startprob_ = start_probability 72 | model.transmat_ = transition_probability 73 | model.emissionprob_ = emission_probability 74 | 75 | # question 2 76 | seen = np.array([[0, 1, 0, 1, 0, 0, 1]]).T # 观测序列 77 | logprob, box = model.decode(seen, algorithm="viterbi") 78 | print "The ball picked:", ", ".join(map(lambda x: observations[x], seen.T.reshape(7))) 79 | print "The hidden box", ", ".join(map(lambda x: states[x], box)) 80 | 81 | box2 = model.predict(seen) 82 | print "The ball picked:", ", ".join(map(lambda x: observations[x], seen.T.reshape(7))) 83 | print "The hidden box", ", ".join(map(lambda x: states[x], box2)) 84 | 85 | # question 1 86 | print np.exp(model.score(seen)) 87 | 88 | # question 3 89 | 90 | # states = ["box 1", "box 2", "box3"] 91 | n_states = 3 # 参数 1 92 | X2 = np.array([[0, 1, 0, 1], [0, 0, 0, 1], [1, 0, 1, 1]]) # 参数 2 93 | 94 | model2 = hmm.MultinomialHMM(n_components=n_states, n_iter=20, tol=0.01) 95 | model2.fit(X2) 96 | for i in range(10): 97 | # 由于鲍姆-韦尔奇算法是基于EM算法的近似算法,所以我们需要多跑几次,选择X2概率最大的作为模型估计结果 98 | model2.fit(X2) 99 | print model2.startprob_ 100 | print model2.transmat_ 101 | print model2.emissionprob_ 102 | print np.exp(model2.score(X2)) 103 | print model2.sample(10) 104 | print model2.predict(X2.reshape([3, 4, 1])[1]) 105 | 106 | def MyGaussianHMM(): 107 | from hmmlearn.hmm import GaussianHMM 108 | df = pd.read_csv("/home/ray/Documents/suibe/2017/建模/Modeling_Preparation/dataset/SZIndex.csv", header=-1) 109 | df.head() 110 | X = np.array(df.iloc[:, 0:5]) 111 | 112 | # 一、未知模型情况下,解决问题3 113 | model = GaussianHMM(n_components=6, covariance_type="diag", n_iter=1000) # 方差矩阵为对角阵 114 | """ 115 | 参数解释: 116 | covariance_type: 117 | "spherical" :主对角元素均为1,其余元素为0,独立同分布 (数据不足时,难以进行参数估计) 118 | "diag" :主对角元素不为0,其余为0 (一般情况,折中) 119 | "full" :所有元素均不为0 (数据足够进行参数估计时) 120 | """ 121 | model.fit(X) 122 | print "隐含状态为: ", model.predict(X) # 列出每一天的隐含状态 123 | print "特征数目 %s" % model.n_features 124 | print "隐状态数目 %s" % model.n_components 125 | print "起始概率 :", model.startprob_ 126 | print "隐状态转移矩阵", model.transmat_ 127 | ## 每个隐含层对应的特征概率空间假设为正态分布,则可以得到一个model.n_components行model.n_features列的均值矩阵 128 | print "混淆矩阵:均值部分", model.means_ 129 | print "混淆矩阵:方差部分", model.covars_ 130 | 131 | ## 绘图 132 | hidden_states = model.predict(X) 133 | tradeDate = df.iloc[:, 5].values 134 | closeIndex = df.iloc[:, 6].values 135 | plt.figure(figsize=(15, 8)) 136 | for i in range(model.n_components): 137 | idx = (hidden_states == i) 138 | 
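# draw the closing index only on the days decoded as hidden state i, so each
# regime appears as its own colored series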

def MyGaussianHMM():
    from hmmlearn.hmm import GaussianHMM
    df = pd.read_csv("../dataset/SZIndex.csv", header=None)
    df.head()
    X = np.array(df.iloc[:, 0:5])

    # 1. Model unknown: solve question 3
    model = GaussianHMM(n_components=6, covariance_type="diag", n_iter=1000)  # diagonal covariance matrix
    """
    Parameter notes:
    covariance_type:
        "spherical": identity-like diagonal, components i.i.d. (when data are
                     too scarce to estimate more parameters)
        "diag"     : non-zero main diagonal, zeros elsewhere (the usual compromise)
        "full"     : all entries free (when data suffice for estimation)
    """
    model.fit(X)
    print "hidden states: ", model.predict(X)  # hidden state on each day
    print "number of features: %s" % model.n_features
    print "number of hidden states: %s" % model.n_components
    print "initial probabilities:", model.startprob_
    print "hidden-state transition matrix:", model.transmat_
    ## Each hidden state's observation distribution is assumed Gaussian, so the
    ## means form a matrix with model.n_components rows and model.n_features columns
    print "emission means:", model.means_
    print "emission covariances:", model.covars_

    ## plotting
    hidden_states = model.predict(X)
    tradeDate = df.iloc[:, 5].values
    closeIndex = df.iloc[:, 6].values
    plt.figure(figsize=(15, 8))
    for i in range(model.n_components):
        idx = (hidden_states == i)
        plt.plot_date(pd.to_datetime(tradeDate[idx]), closeIndex[idx], '.', label='%dth hidden state' % i, lw=1)
    plt.legend()
    plt.grid(1)
    plt.show()

    # 2. Model known: solve questions 1 and 2

    ## reuse the model fitted above
    ### question 1
    print "probability (density) of the first day's observation: %s" % np.exp(model.score(X[0:1]))
    ### question 2
    log_prob, state = model.decode(X[:10], algorithm="viterbi")
    print "most likely hidden-state sequence for the first ten days:", state

    ## specify the model parameters by hand
    ### a case with 2 features and 4 hidden states
    startprob = np.array([0.6, 0.3, 0.1, 0.0])
    # The transition matrix, note that there are no transitions possible
    # between component 1 and 3
    transmat = np.array([[0.7, 0.2, 0.0, 0.1],
                         [0.3, 0.5, 0.2, 0.0],
                         [0.0, 0.3, 0.5, 0.2],
                         [0.2, 0.0, 0.2, 0.6]])
    # The means of each component
    means = np.array([[0.0, 0.0],
                      [0.0, 11.0],
                      [9.0, 10.0],
                      [11.0, -1.0]])
    # The covariance of each component
    covars = .5 * np.tile(np.identity(2), (4, 1, 1))
    model2 = GaussianHMM(n_components=4, covariance_type="full", n_iter=1000)
    model2.startprob_ = startprob
    model2.transmat_ = transmat
    model2.means_ = means
    model2.covars_ = covars
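
    # Added sketch: model2 above is fully specified but never exercised in the
    # original; with hand-set parameters (the pattern used in the hmmlearn
    # docs), sampling, scoring and decoding are immediate:
    X_demo, Z_demo = model2.sample(500)
    print "log-likelihood of the sampled path: %s" % model2.score(X_demo)
    print "decoded states (first 10):", model2.predict(X_demo)[:10]
    print "true states    (first 10):", Z_demo[:10]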

if __name__ == '__main__':
    MyGaussianHMM()
    pass

-------------------------------------------------------------------------------- /预测模型/LSTM_predict.py: --------------------------------------------------------------------------------

# -*- coding:utf-8 -*-
"""
ref:
    # predicting the next value of a series from its previous values
    https://machinelearningmastery.com/time-series-prediction-lstm-recurrent-neural-networks-python-keras/
    # theory
    https://deeplearning4j.org/lstm.html#long

"""
# LSTM for international airline passengers problem with regression framing
import numpy
import matplotlib.pyplot as plt
from pandas import read_csv
import math
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error


# convert an array of values into a dataset matrix
def create_dataset(dataset, look_back=1):
    dataX, dataY = [], []
    for i in range(len(dataset)-look_back-1):
        a = dataset[i:(i+look_back), 0]
        dataX.append(a)
        dataY.append(dataset[i + look_back, 0])
    return numpy.array(dataX), numpy.array(dataY)
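
# Worked example (added note): with look_back=2 and
# dataset = [[10], [20], [30], [40], [50]], create_dataset returns
#     X = [[10, 20],        # the two values preceding each target
#          [20, 30]]
#     y = [30, 40]          # the value that follows each window
# (the last row is left unused because of the -1 in the range above).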

# fix random seed for reproducibility
numpy.random.seed(7)
# load the dataset

#df = read_csv('../dataset/SZIndex.csv', header=None)
#dataset = df[6].values
#dataset = dataset.reshape(dataset.shape[0], 1)

dataframe = read_csv('../dataset/international-airline-passengers.csv', usecols=[1], engine='python', skipfooter=3)
dataset = dataframe.values
dataset = dataset.astype('float32')
# normalize the dataset
scaler = MinMaxScaler(feature_range=(0, 1))
dataset = scaler.fit_transform(dataset)
# split into train and test sets
train_size = int(len(dataset) * 0.67)
test_size = len(dataset) - train_size
train, test = dataset[0:train_size, :], dataset[train_size:len(dataset), :]
# reshape into X=t and Y=t+1
look_back = 2
trainX, trainY = create_dataset(train, look_back)
testX, testY = create_dataset(test, look_back)
# reshape input to be [samples, time steps, features]
trainX = numpy.reshape(trainX, (trainX.shape[0], 1, trainX.shape[1]))
testX = numpy.reshape(testX, (testX.shape[0], 1, testX.shape[1]))

# create and fit the LSTM network
model = Sequential()
model.add(LSTM(4, input_shape=(1, look_back)))
model.add(Dense(1))
model.compile(loss='mean_squared_error', optimizer='adam')
model.fit(trainX, trainY, epochs=100, batch_size=1, verbose=2)
# make predictions
trainPredict = model.predict(trainX)
testPredict = model.predict(testX)
# invert predictions
trainPredict = scaler.inverse_transform(trainPredict)
trainY = scaler.inverse_transform([trainY])
testPredict = scaler.inverse_transform(testPredict)
testY = scaler.inverse_transform([testY])
# calculate root mean squared error
trainScore = math.sqrt(mean_squared_error(trainY[0], trainPredict[:, 0]))
print('Train Score: %.2f RMSE' % (trainScore))
testScore = math.sqrt(mean_squared_error(testY[0], testPredict[:, 0]))
print('Test Score: %.2f RMSE' % (testScore))
# shift train predictions for plotting
trainPredictPlot = numpy.empty_like(dataset)
trainPredictPlot[:, :] = numpy.nan
trainPredictPlot[look_back:len(trainPredict)+look_back, :] = trainPredict
# shift test predictions for plotting
testPredictPlot = numpy.empty_like(dataset)
testPredictPlot[:, :] = numpy.nan
testPredictPlot[len(trainPredict)+(look_back*2)+1:len(dataset)-1, :] = testPredict
# plot baseline and predictions
plt.plot(scaler.inverse_transform(dataset))
plt.plot(trainPredictPlot)
plt.plot(testPredictPlot)
plt.show()

-------------------------------------------------------------------------------- /预测模型/ML_classify_model.py: --------------------------------------------------------------------------------

# -*- coding:utf-8 -*-
from sklearn import cross_validation
import numpy as np
import pandas as pd


class myclassify():

    def __init__(self, train_x, train_y):
        self.x = train_x
        self.y = train_y
        self.cv_time = 10  # number of cross-validation folds

    def knn(self, k=3):
        from sklearn import neighbors
        knn_model = neighbors.KNeighborsClassifier(n_neighbors=k)
        scores = cross_validation.cross_val_score(knn_model, self.x, self.y, cv=self.cv_time)
        knn_model.fit(self.x, self.y)
        return np.mean(scores), knn_model

    def logistic(self):
        from sklearn.linear_model import LogisticRegression
        logit_model = LogisticRegression()
        scores = cross_validation.cross_val_score(logit_model, self.x, self.y, cv=self.cv_time)
        logit_model.fit(self.x, self.y)
        return np.mean(scores), logit_model

    def decision_tree(self):
        from sklearn import tree
        dt_model = tree.DecisionTreeClassifier(criterion='entropy')
        scores = cross_validation.cross_val_score(dt_model, self.x, self.y, cv=self.cv_time)
        dt_model.fit(self.x, self.y)
        return np.mean(scores), dt_model

    def naive_bayes(self):
        from sklearn.naive_bayes import MultinomialNB
        nb_model = MultinomialNB()
        scores = cross_validation.cross_val_score(nb_model, self.x, self.y, cv=self.cv_time)
        nb_model.fit(self.x, self.y)
        return np.mean(scores), nb_model

    def svm(self):
        from sklearn.svm import SVC
        model = SVC(kernel='rbf', probability=True)
        scores = cross_validation.cross_val_score(model, self.x, self.y, cv=self.cv_time)
        model.fit(self.x, self.y)
        return np.mean(scores), model

    def svm_cv(self):
        from sklearn.grid_search import GridSearchCV
        from sklearn.svm import SVC
        model = SVC(kernel='rbf', probability=True)
        param_grid = {'C': [1e-3, 1e-2, 1e-1, 1, 10, 100, 1000], 'gamma': [0.001, 0.0001]}
        grid_search = GridSearchCV(model, param_grid, n_jobs=1, verbose=1)
        grid_search.fit(self.x, self.y)
        best_parameters = grid_search.best_estimator_.get_params()
        for para, val in list(best_parameters.items()):
            print(para, val)
        model = SVC(kernel='rbf', C=best_parameters['C'], gamma=best_parameters['gamma'], probability=True)
        scores = cross_validation.cross_val_score(model, self.x, self.y, cv=self.cv_time)
        model.fit(self.x, self.y)
        return scores, model
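
    def random_forest(self, n_trees=100):
        # added sketch, same pattern as the methods above: an ensemble of
        # decision trees is usually a stronger tabular baseline than one tree
        from sklearn.ensemble import RandomForestClassifier
        rf_model = RandomForestClassifier(n_estimators=n_trees)
        scores = cross_validation.cross_val_score(rf_model, self.x, self.y, cv=self.cv_time)
        rf_model.fit(self.x, self.y)
        return np.mean(scores), rf_model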

if __name__ == '__main__':
    df = pd.read_csv("../dataset/auto_1.csv")
    df = df.dropna(axis=0)
    mc = myclassify(df.iloc[:, 0:10], df.iloc[:, -1])
    #scores, model = mc.knn(3)
    #scores, model = mc.svm()
    scores, model = mc.svm_cv()
    predict_num = -3
    print scores, model.predict(df.iloc[predict_num, 0:10].values.reshape(1, -1))[0], df.iloc[predict_num, -1]

-------------------------------------------------------------------------------- /预测模型/PLSR.m: --------------------------------------------------------------------------------

clc;clear
Y=[ 0.1 0.5 0.7
    0.2 0.6 0.4
    0.3 0.7 0.5
    0.4 0.6 0.3
    0.5 0.8 0.2
    0.6 0.3 0.5
    0.4 0.7 0.6
    0.3 0.5 0.7];
X=[0.2876 0.6173 0.9647 1.1936 1.0636 0.7332 0.5441 0.6247 0.7421 0.7052
   0.2653 0.5167 0.8403 1.0435 1.008 0.7396 0.5344 0.5675 0.6312 0.5368
   0.3833 0.7089 1.0544 1.2805 1.2524 0.8886 0.6596 0.6815 0.75 0.6671
   0.3957 0.6853 0.9204 1.0648 1.0486 0.7999 0.5579 0.5381 0.5698 0.469
   0.472 0.7413 1.0124 1.2202 1.2297 0.9699 0.6646 0.635 0.6254 0.4978
   0.6268 0.9851 1.1633 1.1629 1.0128 0.7123 0.5161 0.482 0.5194 0.4909
   0.4921 0.8723 1.2407 1.4583 1.3631 1.0073 0.7341 0.7032 0.8171 0.7228
   0.4308 0.8232 1.146 1.309 1.1767 0.8207 0.5852 0.6604 0.7677 0.7237];
%X0=[0.4089 0.6996 0.8712 1.0159 0.9638 0.7115 0.5112 0.4722 0.5059 0.4343];

[A,B,r,U,V,stats] = canoncorr(X,Y);
% A: (#X variables x #canonical variables); column i holds the coefficients of
%    the X variables in the i-th canonical variable of X -- the larger the
%    coefficient, the larger the influence
% B: (#Y variables x #canonical variables); column i holds the coefficients of
%    the Y variables in the i-th canonical variable of Y
% r: vector of canonical correlations; r(i) is the (maximal) correlation
%    between the i-th canonical variables of X and Y
% var( X * A(:,1)) = var( X * A(:,2)) = var( Y * B(:,1)) = 1
% U = (X - repmat(mean(X), size(X,1), 1)) * A
% V = (Y - repmat(mean(Y), size(Y,1), 1)) * B
% stats: test statistics, see https://cn.mathworks.com/help/stats/canoncorr.html


[XL,YL,XS,YS,BETA,PCTVAR,MSE] = plsregress(X,Y,7);
% X is 8x10, Y is 8x3
% Here 7 components are extracted (by default #samples-1 are available; choose
% the number from the cumulative explained-variance plot). XL is 10x7:
%   column i holds each X variable's loading on the i-th component
% XS is 8x7: column i holds each sample's score on the i-th component
% YL, YS: likewise for Y
% BETA holds the regression coefficients of each X variable on Y, including the
% intercept, so to compute yhat you must prepend a column of ones to X
% PCTVAR holds each component's explained variance: row 1 for X, row 2 for Y
% MSE holds the mean squared errors: row 1 for X, row 2 for Y
% https://cn.mathworks.com/help/stats/plsregress.html

plot(1:size(PCTVAR, 2),cumsum(100*PCTVAR(1,:)),'-bo');
xlabel('Number of PLS components');
ylabel('Percent Variance Explained in x');

plot(1:size(PCTVAR, 2),cumsum(100*PCTVAR(2,:)),'-bo');
xlabel('Number of PLS components');
ylabel('Percent Variance Explained in y');

Ytest = [ones(size(X,1),1) X]*BETA;
residuals = Y-Ytest;
stem(residuals)
xlabel('Observation');
ylabel('Residual');

-------------------------------------------------------------------------------- /预测模型/SVR.py: --------------------------------------------------------------------------------

from sklearn.datasets import load_boston

boston = load_boston()

from sklearn.cross_validation import train_test_split

import numpy as np

X = boston.data
y = boston.target

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=33, test_size=0.25)

print 'The max target value is: ', np.max(boston.target)
print 'The min target value is: ', np.min(boston.target)
print 'The average target value is: ', np.mean(boston.target)

from sklearn.preprocessing import StandardScaler

ss_X = StandardScaler()
ss_y = StandardScaler()

X_train = ss_X.fit_transform(X_train)
X_test = ss_X.transform(X_test)
y_train = ss_y.fit_transform(y_train)
y_test = ss_y.transform(y_test)

from sklearn.svm import SVR

linear_svr = SVR(kernel='linear')
linear_svr.fit(X_train, y_train)
linear_svr_y_predict = linear_svr.predict(X_test)

poly_svr = SVR(kernel='poly')
poly_svr.fit(X_train, y_train)
poly_svr_y_predict = poly_svr.predict(X_test)

rbf_svr = SVR(kernel='rbf')
rbf_svr.fit(X_train, y_train)
rbf_svr_y_predict = rbf_svr.predict(X_test)

from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

print 'R-squared value of linear SVR is: ', linear_svr.score(X_test, y_test)
print 'The mean squared error of linear SVR is: ', mean_squared_error(ss_y.inverse_transform(y_test), ss_y.inverse_transform(linear_svr_y_predict))
print 'The mean absolute error of linear SVR is: ', mean_absolute_error(ss_y.inverse_transform(y_test), ss_y.inverse_transform(linear_svr_y_predict))

print 'R-squared of poly SVR is: ', poly_svr.score(X_test, y_test)
print 'The mean squared error of poly SVR is: ', mean_squared_error(ss_y.inverse_transform(y_test), ss_y.inverse_transform(poly_svr_y_predict))
print 'The mean absolute error of poly SVR is: ', mean_absolute_error(ss_y.inverse_transform(y_test), ss_y.inverse_transform(poly_svr_y_predict))

print 'R-squared of rbf SVR is: ', rbf_svr.score(X_test, y_test)
print 'The mean squared error of rbf SVR is: ', mean_squared_error(ss_y.inverse_transform(y_test), ss_y.inverse_transform(rbf_svr_y_predict))
print 'The mean absolute error of rbf SVR is: ', mean_absolute_error(ss_y.inverse_transform(y_test), ss_y.inverse_transform(rbf_svr_y_predict))

-------------------------------------------------------------------------------- /预测模型/decision_tree.py: --------------------------------------------------------------------------------

# -*- coding:utf8 -*-
import pandas as pd
from collections import Counter


class Node:

    def __init__(self, feature, df):
        self.feature = feature
        self.df = df
        self.left = None
        self.right = None
        self.feature_value = None
        self.label_value = None


class Tree:

    """
    The label column of the input DataFrame must be named 'label'.
    """

    def __init__(self, df):
        self.df = df
        feature_name = self.get_feature(self.df)
        self.init_node = Node(feature_name, self.df)

    def get_feature(self, df):
        # pick the feature whose value distribution has the largest Gini impurity
        gini = {}
        for i in df.columns:
            if i != 'label':
                value_count_dict = df[i].value_counts()
                sums = value_count_dict.values.sum()
                gini[i] = 1 - sum([(j * 1.0 / sums)**2 for j in value_count_dict.values])
        return max(gini, key=gini.get)
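
    # Worked example (added note): for a column with values [1, 0, 1, 1] the
    # value counts are {1: 3, 0: 1}, so
    #     gini = 1 - (3/4)**2 - (1/4)**2 = 0.375
    # get_feature therefore splits on the most evenly distributed column -- a
    # simplification of CART, which measures Gini impurity on the labels.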

    @staticmethod
    def vote(df, columns_name, value):
        # majority vote over the labels of the rows where column == value
        label_data = df.loc[df[columns_name] == value, 'label'].values
        return Counter(label_data).most_common()[0][0]

    def gen_tree(self, node):
        df = node.df
        feature_name = self.get_feature(df)
        feature_value_set = list(set(df[feature_name].values))
        if len(feature_value_set) > 2:
            raise ValueError("only binary features are supported")
        elif len(feature_value_set) == 1:
            node.label_value = self.vote(df, feature_name, feature_value_set[0])
            return
        elif len(feature_value_set) == 2:
            left_node = Node(feature_name, df.loc[df[feature_name] == feature_value_set[0]])
            left_node.feature_value = feature_value_set[0]
            right_node = Node(feature_name, df.loc[df[feature_name] == feature_value_set[1]])
            right_node.feature_value = feature_value_set[1]
            node.left = left_node
            node.right = right_node
            self.gen_tree(left_node)
            self.gen_tree(right_node)

    def display_node(self, node, depth):
        if node.left == None:
            print "%slabel:%s" % ((depth - 1) * '\t|---' + '', node.label_value)
        else:
            print "%sfeature: %s, value: %s" % (depth * '\t' + '|---', node.left.feature, node.left.feature_value)
            self.display_node(node.left, depth + 1)
            print "%sfeature: %s, value: %s" % (depth * '\t' + '|---', node.right.feature, node.right.feature_value)
            self.display_node(node.right, depth + 1)


if __name__ == '__main__':
    data_set = [
        [1, 0, 1, 1],
        [0, 1, 1, 1],
        [0, 0, 0, 0],
        [1, 1, 1, 1],
        [0, 0, 0, 0],
        [0, 1, 0, 1],
        [1, 0, 1, 1],
        [0, 0, 0, 0],
        [0, 1, 0, 0],
        [0, 0, 0, 0]
    ]
    df = pd.DataFrame(data_set)
    df.columns = ['house', 'marriage', 'wage', 'label']
    tree = Tree(df)
    tree.gen_tree(tree.init_node)
    tree.display_node(tree.init_node, 0)

-------------------------------------------------------------------------------- /预测模型/evaluate.py: --------------------------------------------------------------------------------

# -*- coding:utf-8 -*-

"""
Evaluate the prediction quality of all the forecasting models.
ref:
    http://blog.csdn.net/sinat_26917383/article/details/75199996?locationNum=3&fps=1
    http://scikit-learn.org/stable/auto_examples/model_selection/plot_roc.html
    https://www.zhihu.com/question/30643044
"""

from sklearn import metrics
import matplotlib.pyplot as plt
import numpy as np

TYPE_DISCRETE = 0    # both the true and the predicted values are discrete
TYPE_DISCRETE_2 = 1  # true values discrete, predictions continuous (e.g. logistic scores)
TYPE_CONTINUE = 2    # both the true and the predicted values are continuous


class Evaluate:

    def __init__(self, true_array, predict_array, pred_type=TYPE_DISCRETE):
        self.type = pred_type
        self.true_array = np.array(true_array)
        self.pred_array = np.array(predict_array)

    @property
    def accuracy(self):
        # note: this actually computes the precision score;
        # macro averaging here, other options are None, 'micro', 'macro',
        # 'weighted', 'samples'
        return metrics.precision_score(self.true_array, self.pred_array, average='macro')
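
    # Worked note (added): 'macro' averages per-class scores, 'micro' pools the
    # counts. With true = [0, 0, 1] and pred = [0, 1, 1]:
    #     precision(class 0) = 1/1 = 1.0, precision(class 1) = 1/2 = 0.5
    #     macro precision = (1.0 + 0.5) / 2 = 0.75
    #     micro precision = 2 correct / 3 predictions = 0.667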

    @property
    def recall(self):
        # recall score
        return metrics.recall_score(self.true_array, self.pred_array, average='macro')

    @property
    def f1(self):
        # F1, the harmonic mean of precision and recall
        return metrics.f1_score(self.true_array, self.pred_array, average='weighted')

    @property
    def confusion_matrix(self):
        return metrics.confusion_matrix(self.true_array, self.pred_array)

    def confusion_matrix_plot(self, cmap=plt.cm.Blues):
        """Plot the confusion matrix with matplotlib.
        parameters
        ----------
        y_truth: true y values, 1d array
        y_predict: predicted y values, 1d array
        cmap: color scheme of the plot; cm.Blues here, see the matplotlib
              documentation for more styles
        """
        cm = metrics.confusion_matrix(self.true_array, self.pred_array)
        plt.matshow(cm, cmap=cmap)  # confusion matrix image
        plt.colorbar()  # color bar

        for x in range(len(cm)):  # annotate each cell with its count
            for y in range(len(cm)):
                plt.annotate(cm[x, y], xy=(x, y), horizontalalignment='center', verticalalignment='center')

        plt.ylabel('True label')
        plt.xlabel('Predicted label')
        plt.show()

    @property
    def classify_report(self):
        return metrics.classification_report(self.true_array, self.pred_array)

    @property
    def kappa_score(self):
        # kappa lies in (-1, 1); above 0.8 indicates a good classifier,
        # around 0 or below a poor one
        return metrics.cohen_kappa_score(self.true_array, self.pred_array)

    @property
    def roc_score(self):
        return metrics.roc_auc_score(self.true_array, self.pred_array)

    def roc_plot(self, title='Receiver operating characteristic plot'):
        # binary problems only; for a multi-class problem, reduce it to several
        # one-vs-rest binary problems (class 1 vs not-1, class 2 vs not-2, ...)
        # and plot each curve separately (see the note below)
        fpr, tpr, _ = metrics.roc_curve(self.true_array, self.pred_array)
        plt.figure()
        # lw: line width
        plt.plot(fpr, tpr, color='darkorange',
                 lw=2, label='ROC curve (area = %0.2f)' % self.roc_score)
        plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
        plt.xlim([0.0, 1.0])
        plt.ylim([0.0, 1.05])
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title(title)
        plt.legend(loc="lower right")
        plt.show()
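
    # One-vs-rest sketch (added note): binarize the labels, then score each
    # class against the rest, e.g. for three classes:
    #     from sklearn.preprocessing import label_binarize
    #     y_bin = label_binarize(y_true, classes=[0, 1, 2])  # shape (n, 3)
    #     fpr0, tpr0, _ = metrics.roc_curve(y_bin[:, 0], y_score[:, 0])
    # where y_score holds per-class scores such as predict_proba output.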

    @property
    def hamming_distance(self):
        return metrics.hamming_loss(self.true_array, self.pred_array)

    @property
    def jaccard_distance(self):
        return metrics.jaccard_similarity_score(self.true_array, self.pred_array)

    @property
    def explained_variance(self):
        return metrics.explained_variance_score(self.true_array, self.pred_array)

    @property
    def mean_squared_error(self):
        return metrics.mean_squared_error(self.true_array, self.pred_array)

    @property
    def mean_absolute_error(self):
        return metrics.mean_absolute_error(self.true_array, self.pred_array)

    @property
    def median_absolute_error(self):
        return metrics.median_absolute_error(self.true_array, self.pred_array)

    @property
    def r_square(self):
        return metrics.r2_score(self.true_array, self.pred_array)

    def display(self):
        if self.type == TYPE_DISCRETE:
            print "accuracy : %s" % self.accuracy
            print "recall : %s" % self.recall
            print "F1 : %s" % self.f1
            print "confusion_matrix : \n %s" % self.confusion_matrix
            print "kappa : %s" % self.kappa_score
            print "ROC score : %s" % self.roc_score
            print "report : \n %s" % self.classify_report
            print "hamming loss : %s" % self.hamming_distance
            print "jaccard distance : %s" % self.jaccard_distance
            self.confusion_matrix_plot()
            self.roc_plot()
        elif self.type == TYPE_DISCRETE_2:
            print "ROC score : %s" % self.roc_score
            self.roc_plot()

        print "mean_squared_error : %s" % self.mean_squared_error
        print "mean_absolute_error : %s" % self.mean_absolute_error
        print "median_absolute_error : %s" % self.median_absolute_error
        print "explained_variance : %s" % self.explained_variance
        print "r_square : %s" % self.r_square


if __name__ == '__main__':
    true_y_0 = [1, 1, 0, 1, 0, 1, 1, 1]
    pred_y_0 = [1, 0, 1, 1, 0, 1, 0, 1]

    true_y_1 = [1, 1, 0, 1, 0, 1, 1, 0, 1, 1]
    pred_y_1 = [1, 0.8, 0.2, 1.2, 0, 1.0, 0, 1.7, 2.1, 3.1]

    true_y_2 = [1, 1, 0.9, 1.1, 0.1, 1, 1, 0]
    pred_y_2 = [1, 0, 1, 1.2, 0, 1, 0, 1]

    eva_0 = Evaluate(true_y_0, pred_y_0, TYPE_DISCRETE)
    eva_1 = Evaluate(true_y_1, pred_y_1, TYPE_DISCRETE_2)
    eva_2 = Evaluate(true_y_2, pred_y_2, TYPE_CONTINUE)

    eva_0.display()
    eva_1.display()
    eva_2.display()

-------------------------------------------------------------------------------- /预测模型/neural_network.m: --------------------------------------------------------------------------------

% ------------------------- EXAMPLE 1 -----------------------
P=[3.2 3.2 3 3.2 3.2 3.4 3.2 3 3.2 3.2 3.2 3.9 3.1 3.2;
   9.6 10.3 9 10.3 10.1 10 9.6 9 9.6 9.2 9.5 9 9.5 9.7;
   3.45 3.75 3.5 3.65 3.5 3.4 3.55 3.5 3.55 3.5 3.4 3.1 3.6 3.45;
   2.15 2.2 2.2 2.2 2 2.15 2.14 2.1 2.1 2.1 2.15 2 2.1 2.15;
   140 120 140 150 80 130 130 100 130 140 115 80 90 130;
   2.8 3.4 3.5 2.8 1.5 3.2 3.5 1.8 3.5 2.5 2.8 2.2 2.7 4.6;
   11 10.9 11.4 10.8 11.3 11.5 11.8 11.3 11.8 11 11.9 13 11.1 10.85;
   50 70 50 80 50 60 65 40 65 50 50 50 70 70];
T=[2.24 2.33 2.24 2.32 2.2 2.27 2.2 2.26 2.2 2.24 2.24 2.2 2.2 2.35];
[p1,minp,maxp,t1,mint,maxt]=premnmx(P,T); % normalize inputs and targets to [-1, 1]


net=newff(minmax(P),[8,6,1],{'tansig','tansig','purelin'},'trainlm');
net.trainParam.epochs = 5000;
net.trainParam.goal=0.0000001;
[net,tr]=train(net,p1,t1);


a=[3.0;9.3;3.3;2.05;100;2.8;11.2;50];
a=tramnmx(a,minp,maxp);  % apply the training normalization; premnmx(a) alone would rescale by a's own range
% feed the new sample through the network
b=sim(net,a);
c=postmnmx(b,mint,maxt);
disp(c)

% ------------------------- EXAMPLE 2 -----------------------
clear;
clc;

P=[110 0.807 240 0.2 15 1 18 2 1.5;
   110 2.865 240 0.1 15 2 12 1 2;
   110 2.59 240 0.1 12 4 24 1 1.5;
   220 0.6 240 0.3 12 3 18 2 1;
   220 3 240 0.3 25 3 21 1 1.5;
   110 1.562 240 0.3 15 3 18 1 1.5;
   110 0.547 240 0.3 15 1 9 2 1.5;
   0 1.318 300 0.1 15 2 18 1 2];

T=[54248 162787 168380 314797;
   28614 63958 69637 82898;
   86002 402710 644415 328084;
   230802 445102 362823 335913;
   60257 127892 76753 73541;
   34615 93532 80762 110049;
   56783 172907 164548 144040;
   907 117437 120368 130179];
m=max(max(P));
n=max(max(T));
P=P'/m;
T=T'/n;
%-------------------------------------------------------------------------%
pr(1:9,1)=0;  % range matrix of the input vectors
pr(1:9,2)=1;
bpnet=newff(pr,[12 4],{'logsig', 'logsig'}, 'traingdx', 'learngdm');
% build a BP network with 12 hidden neurons and 4 output neurons
% transferFcn 'logsig': sigmoid transfer function in the hidden layer
% transferFcn 'logsig': sigmoid transfer function in the output layer
% trainFcn 'traingdx': gradient-descent backpropagation with adaptive learning rate and momentum
% learnFcn 'learngdm': gradient-descent learning with momentum
bpnet.trainParam.epochs=1000;  % at most 1000 training epochs (set on bpnet; the original assigned these to an unused 'net')
bpnet.trainParam.goal=0.001;   % target training error 0.001
bpnet.trainParam.show=10;      % display progress every 10 steps
bpnet.trainParam.lr=0.05;      % learning rate 0.05
bpnet=train(bpnet,P,T);
%-------------------------------------------------------------------------
p=[110 1.318 300 0.1 15 2 18 1 2];
p=p'/m;

r=sim(bpnet,p);
R=r'*n;
display(R);

-------------------------------------------------------------------------------- /预测模型/neural_network.py: --------------------------------------------------------------------------------

# -*- coding:utf-8 -*-
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn import preprocessing  # data standardization

from pybrain.structure import *
from pybrain.datasets import SupervisedDataSet
from pybrain.supervised.trainers import BackpropTrainer

"""
Neural-network prediction of continuous values.
Install pybrain first: > pip install pybrain
ref:
    http://blog.csdn.net/u010900574/article/details/51290855
"""


def _generate_data():
    """
    Generate the dataset.
    The inputs are u(k-1) and y(k-1); the output is y(k).
    """
    # u = np.random.uniform(-1,1,200)
    # y=[]
    # former_y_value = 0
    # for i in np.arange(0,200):
    #     y.append(former_y_value)
    #     next_y_value = (29.0 / 40) * np.sin(
    #         (16.0 * u[i] + 8 * former_y_value) / (3.0 + 4.0 * (u[i] ** 2) + 4 * (former_y_value ** 2))) \
    #         + (2.0 / 10) * u[i] + (2.0 / 10) * former_y_value
    #     former_y_value = next_y_value
    # return u,y
    u1 = np.random.uniform(-np.pi, np.pi, 200)
    u2 = np.random.uniform(-1, 1, 200)
    y = np.zeros(200)
    for i in range(200):
        value = np.sin(u1[i]) + u2[i]
        y[i] = value
    return u1, u2, y


def get_fnn():
    """
    Build the layers:
        input layer: 2 units
        hidden layer: 10 units
        output layer: 1 unit
    """
    # create a neural network
    fnn = FeedForwardNetwork()
    # declare the layers
    inLayer = LinearLayer(2, name='inLayer')
    hiddenLayer0 = SigmoidLayer(10, name='hiddenLayer0')
    outLayer = LinearLayer(1, name='outLayer')
    # add the three layers to the neural network
    fnn.addInputModule(inLayer)
    fnn.addModule(hiddenLayer0)
    fnn.addOutputModule(outLayer)
    # link the three layers
    in_to_hidden0 = FullConnection(inLayer, hiddenLayer0)
    hidden0_to_out = FullConnection(hiddenLayer0, outLayer)
    # add the links to the neural network
    fnn.addConnection(in_to_hidden0)
    fnn.addConnection(hidden0_to_out)
    # make the neural network take effect
    fnn.sortModules()

    return fnn


def get_train_data():
    # define the dataset as two inputs, one output
    DS = SupervisedDataSet(2, 1)

    u1, u2, y = _generate_data()
    # add the samples to the dataset
    for i in np.arange(199):
        DS.addSample([u1[i], u2[i]], [y[i + 1]])

    # you can get your input/output this way
    # X = DS['input']
    # Y = DS['target']

    # split the dataset into train dataset and test dataset
    dataTrain, dataTest = DS.splitWithProportion(0.8)

    return dataTrain, dataTest


def train_and_predict(fnn, dataTrain, dataTest):
    # train the NN with the BP algorithm
    # verbose=True prints the total error
    trainer = BackpropTrainer(fnn, dataTrain, verbose=True, learningrate=0.01)
    # cap the number of epochs to make the NN fit
    trainer.trainUntilConvergence(maxEpochs=1000)

    xTest, yTest = dataTest['input'], dataTest['target']
    predict_result = []
    for i in np.arange(len(xTest)):
        predict_result.append(fnn.activate(xTest[i])[0])
    print(predict_result)

    plt.figure()
    plt.plot(np.arange(0, len(xTest)), predict_result, 'ro--', label='predict number')
    plt.plot(np.arange(0, len(xTest)), yTest, 'ko-', label='true number')
    plt.legend()
    plt.xlabel("x")
    plt.ylabel("y")
    plt.show()

    # for mod in fnn.modules:
    #     print ("Module:", mod.name)
    #     if mod.paramdim > 0:
    #         print ("--parameters:", mod.params)
    #     for conn in fnn.connections[mod]:
    #         print ("-connection to", conn.outmod.name)
    #         if conn.paramdim > 0:
    #             print ("- parameters", conn.params)
    # if hasattr(fnn, "recurrentConns"):
    #     print ("Recurrent connections")
    #     for conn in fnn.recurrentConns:
    #         print ("-", conn.inmod.name, " to", conn.outmod.name)
    #         if conn.paramdim > 0:
    #             print ("- parameters", conn.params)


def fnn_begin():
    fnn = get_fnn()
    dataTrain, dataTest = get_train_data()
    train_and_predict(fnn, dataTrain, dataTest)


class NeuralNetwork:

    def __init__(self, input_layer, hide_layer, output_layer, df):
        self.fnn = self.get_fnn(input_layer, hide_layer, output_layer)
        self.df = self.data_pre_handle(df)
        self.get_train_data(input_layer, output_layer)

    def data_pre_handle(self, df):
        """
        1. drop columns with no analytical value
        2. fill missing values
        3. remove invalid values
        4. encode categorical variables
        5. standardize all variables
        """
        # df['类别'] = df['类别'].astype('category')  # saves memory
        df = df.dropna(axis=0)
        for column in df.columns:
            # standardize
            df[column] = preprocessing.scale(df[column])
        return df
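
    # Added note: steps 1, 3 and 4 of the docstring are not implemented above;
    # a categorical column such as the commented-out '类别' could be encoded
    # before scaling, e.g.:
    #     df['类别'] = pd.Categorical(df['类别']).codes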
parameters", conn.params) 118 | # if hasattr(fnn, "recurrentConns"): 119 | # print ("Recurrent connections") 120 | # for conn in fnn.recurrentConns: 121 | # print ("-", conn.inmod.name, " to", conn.outmod.name) 122 | # if conn.paramdim > 0: 123 | # print ("- parameters", conn.params) 124 | 125 | def fnn_begin(): 126 | fnn = get_fnn() 127 | dataTrain, dataTest = get_train_data() 128 | train_and_predict(fnn, dataTrain, dataTest) 129 | 130 | class NeuralNetwork: 131 | 132 | def __init__(self, input_layer, hide_layer, output_layer, df): 133 | self.fnn = self.get_fnn(input_layer, hide_layer, output_layer) 134 | self.df = self.data_pre_handle(df) 135 | self.get_train_data(input_layer, output_layer) 136 | 137 | def data_pre_handle(self, df): 138 | """ 139 | 1. 剔除无分析价值列 140 | 2. 缺失值补全 141 | 3. 无效值剔除 142 | 4. 分类变量编码 143 | 5. 所有变量归一化 144 | 145 | """ 146 | #df['类别'] = df['类别'].astype('category') # 节省内存开支 147 | df = df.dropna(axis=0) 148 | for column in df.columns: 149 | # 归一化 150 | df[column] = preprocessing.scale(df[column]) 151 | return df 152 | 153 | def get_fnn(self, i, h, o): 154 | """ 155 | 创建层 156 | 输入层: i units 157 | 隐含层: h units 158 | 输出层: o units 159 | """ 160 | fnn = FeedForwardNetwork() 161 | 162 | inLayer = LinearLayer(i, name='inLayer') 163 | hiddenLayer0 = SigmoidLayer(h, name='hiddenLayer0') 164 | outLayer = LinearLayer(o, name='outLayer') 165 | 166 | fnn.addInputModule(inLayer) 167 | fnn.addModule(hiddenLayer0) 168 | fnn.addOutputModule(outLayer) 169 | 170 | in_to_hidden0 = FullConnection(inLayer, hiddenLayer0) 171 | hidden0_to_out = FullConnection(hiddenLayer0, outLayer) 172 | 173 | fnn.addConnection(in_to_hidden0) 174 | fnn.addConnection(hidden0_to_out) 175 | 176 | fnn.sortModules() 177 | return fnn 178 | 179 | def get_train_data(self, input_layer, output_layer): 180 | """ 181 | 输入数据为数据框,前input_layer列为输入数据,后output_layer列为输出数据 182 | """ 183 | DS = SupervisedDataSet(input_layer, output_layer) 184 | 185 | for i in range(self.df.shape[0] - 1): 186 | DS.addSample(self.df.iloc[i, :input_layer].values, self.df.iloc[i+1, input_layer:].values) 187 | 188 | # 打乱顺序,取80%训练,20%测试 189 | # self.dataTrain, self.dataTest = DS.splitWithProportion(0.8) 190 | 191 | def split_by_part(DS, proportion=0.9): 192 | # 不随机抽取,而是取前80%的样本训练,后20%测试 193 | leftIndices = range(int(len(DS) * proportion)) 194 | leftDs = DS.copy() 195 | leftDs.clear() 196 | rightDs = leftDs.copy() 197 | index = 0 198 | for sp in DS: 199 | if index in leftIndices: 200 | leftDs.addSample(*sp) 201 | else: 202 | rightDs.addSample(*sp) 203 | index += 1 204 | return leftDs, rightDs 205 | 206 | self.dataTrain, self.dataTest = split_by_part(DS, 0.99) 207 | 208 | def train(self, times = 1000): 209 | trainer = BackpropTrainer(self.fnn, self.dataTrain, verbose=True, learningrate=0.01) 210 | trainer.trainUntilConvergence(maxEpochs=times) 211 | 212 | def predict(self): 213 | xTest, yTest = self.dataTest['input'], self.dataTest['target'] 214 | predict_resut = [] 215 | for i in np.arange(len(xTest)): 216 | predict_resut.append(self.fnn.activate(xTest[i])) 217 | print(predict_resut) 218 | 219 | plt.figure() 220 | plt.plot(np.arange(0, len(xTest)), predict_resut, 'ro--', label='predict number') 221 | plt.plot(np.arange(0, len(xTest)), yTest, 'ko-', label='true number') 222 | plt.legend() 223 | plt.xlabel("x") 224 | plt.ylabel("y") 225 | plt.show() 226 | 227 | def single_predict(self, x_array): 228 | return self.fnn.activate(x_array) 229 | 230 | def Wind2Df(wind_data): 231 | df = pd.DataFrame(wind_data.Data).T 232 | df.columns = wind_data.Fields 233 | 

def Wind2Df(wind_data):
    df = pd.DataFrame(wind_data.Data).T
    df.columns = wind_data.Fields
    df.index = wind_data.Times
    return df


if __name__ == '__main__':
    # # fnn_begin()
    # df = pd.read_csv('dataset/auto.csv')
    # df = df.loc[:, [u'mpg', u'rep78', u'headroom', u'trunk', u'weight', u'length', u'turn', u'displacement', u'gear_ratio', u'price']]
    # nn = NeuralNetwork(9, 10, 1, df)
    # nn.train()
    # nn.predict()
    # print nn.single_predict(nn.df.ix[0].values[:9])
    # print nn.df.ix[0].values[-1]

    # the live example below requires a running Wind terminal (WindPy)
    from WindPy import *
    import datetime
    w.start()
    df = Wind2Df(w.wst("IC1709.CFE",
                       "volume,amt,oi,bsize1,asize1,ask2,bid2,bsize2,asize2,bid3,ask3,bsize3,asize3,ask1,bid1,last",
                       "2017-08-22 09:00:00", "2017-08-22 14:45:05", ""))
    nn = NeuralNetwork(15, 15, 1, df)
    nn.train(100)
    nn.predict()

--------------------------------------------------------------------------------