├── .gitignore ├── DL ├── __pycache__ │ └── run.cpython-37.pyc ├── backward.py ├── improve.py ├── learning.py ├── nn.py ├── perceptron.py ├── run.py └── sample_weight.pkl ├── DP.py ├── EDA.py ├── FE.py ├── LSTM.py ├── LSTM_work.py ├── MyFrame.py ├── NNDL ├── minst.py ├── mnist.pkl.gz └── run.py ├── README ├── X.npy ├── Y.npy ├── __pycache__ ├── run.cpython-37.pyc ├── run.cpython-38.pyc └── tools.cpython-38.pyc ├── copy_jsmp.py ├── hello.py ├── hidegpu ├── FE.py ├── nohup.out ├── optuna_test.py └── tools.py ├── janestreet ├── __init__.py ├── __pycache__ │ ├── __init__.cpython-37.pyc │ └── __init__.cpython-38.pyc └── competition.cpython-37m-x86_64-linux-gnu.so ├── jsmp.py ├── jsmp_local.py ├── myxgboost.py ├── nn.py ├── optuna_DP.py ├── optuna_test.py ├── pic ├── 00.jpg ├── 01.jpg ├── 02.jpg ├── 03.jpg ├── 04.jpg ├── 05.jpg ├── 06.jpg ├── 07.jpg ├── 08.jpg ├── 09.jpg └── 10.jpg ├── preprocess.py ├── py_nn.py ├── py_nn_back.py ├── py_nn_use.py ├── pytorch_work.py ├── run.py ├── tc ├── FE.py ├── optuna_DP.py ├── run.py └── tools.py ├── test_dt.py ├── test_pytorch.py ├── test_work.py ├── tools.py └── works.py /.gitignore: -------------------------------------------------------------------------------- 1 | output 2 | *.csv 3 | serverIP.txt 4 | -------------------------------------------------------------------------------- /DL/__pycache__/run.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zwdnet/JSMPwork/b7ae1380986abc3ee5b10bea7f9d57cbb805a8f0/DL/__pycache__/run.cpython-37.pyc -------------------------------------------------------------------------------- /DL/backward.py: -------------------------------------------------------------------------------- 1 | # coding:utf-8 2 | # kaggle Jane Street Market Prediction代码 3 | # 《深度学习入门:基于python的理论与实现》 4 | # 第五章 误差反向传播法 5 | 6 | 7 | import numpy as np 8 | import matplotlib.pyplot as plt 9 | import run 10 | import pandas as pd 11 | from PIL import Image 12 | import random 13 | import pickle 14 | from collections import OrderedDict 15 | 16 | 17 | # 乘法层的实现 18 | class MulLayer: 19 | def __init__(self): 20 | self.x = None 21 | self.y = None 22 | 23 | def forward(self, x, y): 24 | self.x = x 25 | self.y = y 26 | out = x*y 27 | return out 28 | 29 | def backward(self, dout): 30 | dx = dout*self.y # 翻转x和y 31 | dy = dout*self.x 32 | 33 | return dx, dy 34 | 35 | 36 | def testMul(): 37 | apple = 100 38 | apple_num = 2 39 | tax = 1.1 40 | 41 | mul_apple_layer = MulLayer() 42 | mul_tax_layer = MulLayer() 43 | 44 | # 前向传播 45 | apple_price = mul_apple_layer.forward(apple, apple_num) 46 | price = mul_tax_layer.forward(apple_price, tax) 47 | print(price) 48 | 49 | # 反向传播 50 | dprice = 1 51 | dapple_price, dtax = mul_tax_layer.backward(dprice) 52 | dapple, dapple_num = mul_apple_layer.backward(dapple_price) 53 | print(dapple_price, dtax, dapple, dapple_num) 54 | 55 | 56 | # 加法层实现 57 | class AddLayer: 58 | def __init__(self): 59 | pass 60 | 61 | def forward(self, x, y): 62 | out = x+y 63 | return out 64 | 65 | def backward(self, dout): 66 | dx = dout*1 67 | dy = dout*1 68 | return dx, dy 69 | 70 | 71 | def testAdd(): 72 | apple = 100 73 | apple_num = 2 74 | orange = 150 75 | orange_num = 3 76 | tax = 1.1 77 | 78 | mul_apple_layer = MulLayer() 79 | mul_orange_layer = MulLayer() 80 | add_apple_orange_layer = AddLayer() 81 | mul_tax_layer = MulLayer() 82 | 83 | # 前向传播 84 | apple_price = mul_apple_layer.forward(apple, apple_num) 85 | orange_price = mul_orange_layer.forward(orange, orange_num) 86 | all_price = add_apple_orange_layer.forward(apple_price, orange_price) 87 | price = mul_tax_layer.forward(all_price, tax) 88 | print(price) 89 | 90 | # 反向传播 91 | dprice = 1 92 | dall_price, dtax = mul_tax_layer.backward(dprice) 93 | dapple_price, dorange_price = add_apple_orange_layer.backward(dall_price) 94 | dorange, dorange_num = mul_orange_layer.backward(dorange_price) 95 | dapple, dapple_num = mul_apple_layer.backward(dapple_price) 96 | print(dapple_num, dapple, dorange, dorange_num, dtax) 97 | 98 | 99 | # ReLU激活函数层 100 | class ReLU: 101 | def __init__(self): 102 | self.mask = None 103 | 104 | def forward(self, x): 105 | self.mask = (x <= 0) 106 | out = x.copy() 107 | out[self.mask] = 0 108 | 109 | return out 110 | 111 | def backward(self, dout): 112 | dout[self.mask] = 0 113 | dx = dout 114 | 115 | return dx 116 | 117 | 118 | def testReLU(): 119 | x = np.array([[1.0, -0.5], [-2.0, 3.0]]) 120 | print(x) 121 | mask = (x<0) 122 | print(mask) 123 | relu = ReLU() 124 | out = relu.forward(x) 125 | dout = relu.backward(out) 126 | print(out, dout) 127 | 128 | 129 | # Sigmoid激活函数层 130 | class Sigmoid: 131 | def __init__(self): 132 | self.out = None 133 | 134 | def forward(self, x): 135 | out = 1/(1+np.exp(-x)) 136 | self.out = out 137 | 138 | return out 139 | 140 | def backward(self, dout): 141 | dx = dout*(1.0-self.out)*self.out 142 | 143 | return dx 144 | 145 | 146 | def testSigmoid(): 147 | x = np.array([[1.0, -0.5], [-2.0, 3.0]]) 148 | print(x) 149 | sigmoid = Sigmoid() 150 | out = sigmoid.forward(x) 151 | dout = sigmoid.backward(out) 152 | print(out, dout) 153 | 154 | 155 | def testSum(): 156 | print("求和") 157 | x = np.array([[1, 2], [3, 4]]) 158 | s1 = np.sum(x, axis = 0) 159 | s2 = np.sum(x, axis = 1) 160 | s3 = np.sum(x) 161 | print(x, s1, s2, s3) 162 | 163 | 164 | # Affine层 165 | class Affine: 166 | def __init__(self, W, b): 167 | self.W = W 168 | self.b = b 169 | self.x = None 170 | self.dW = None 171 | self.db = None 172 | 173 | def forward(self, x): 174 | self.x = x 175 | out = np.dot(x, self.W) + self.b 176 | return out 177 | 178 | def backward(self, dout): 179 | dx = np.dot(dout, self.W.T) 180 | self.dW = np.dot(self.x.T, dout) 181 | self.db = np.sum(dout, axis = 0) 182 | 183 | return dx 184 | 185 | 186 | def softmax(x): 187 | if x.ndim == 2: 188 | x = x.T 189 | x = x - np.max(x, axis=0) 190 | y = np.exp(x) / np.sum(np.exp(x), axis=0) 191 | return y.T 192 | 193 | x = x - np.max(x) # 溢出对策 194 | return np.exp(x) / np.sum(np.exp(x)) 195 | 196 | 197 | def cross_entropy_error(y, t): 198 | if y.ndim == 1: 199 | t = t.reshape(1, t.size) 200 | y = y.reshape(1, y.size) 201 | 202 | # 监督数据是one-hot-vector的情况下,转换为正确解标签的索引 203 | if t.size == y.size: 204 | t = t.argmax(axis=1) 205 | 206 | batch_size = y.shape[0] 207 | return -np.sum(np.log(y[np.arange(batch_size), t] + 1e-7)) / batch_size 208 | 209 | 210 | # softmax和loss函数结合层 211 | class SoftmaxWithLoss: 212 | def __init__(self): 213 | self.loss = None 214 | self.y = None 215 | self.t = None 216 | 217 | def forward(self, x, t): 218 | self.t = t 219 | self.y = softmax(x) 220 | self.loss = cross_entropy_error(self.y, self.t) 221 | 222 | return self.loss 223 | 224 | def backward(self, dout = 1): 225 | batch_size = self.t.shape[0] 226 | dx = (self.y - self.t)/batch_size 227 | return dx 228 | 229 | 230 | # 数值微分 231 | def numerical_gradient(f, x): 232 | h = 1e-4 # 0.0001 233 | grad = np.zeros_like(x) 234 | 235 | it = np.nditer(x, flags=['multi_index'], op_flags=['readwrite']) 236 | while not it.finished: 237 | idx = it.multi_index 238 | tmp_val = x[idx] 239 | x[idx] = float(tmp_val) + h 240 | fxh1 = f(x) # f(x+h) 241 | 242 | x[idx] = tmp_val - h 243 | fxh2 = f(x) # f(x-h) 244 | grad[idx] = (fxh1 - fxh2) / (2*h) 245 | 246 | x[idx] = tmp_val # 还原值 247 | it.iternext() 248 | 249 | return grad 250 | 251 | 252 | # 用上面这些构建神经网络 253 | class TwoLayerNet: 254 | def __init__(self, input_size, hidden_size, output_size, weight_init_std = 0.01): 255 | # 初始化权重 256 | self.params = {} 257 | self.params["W1"] = weight_init_std*np.random.randn(input_size, hidden_size) 258 | self.params["b1"] = np.zeros(hidden_size) 259 | self.params["W2"] = weight_init_std*np.random.randn(hidden_size, output_size) 260 | self.params["b2"] = np.zeros(output_size) 261 | 262 | # 生成层 263 | self.layers = OrderedDict() 264 | self.layers["Affine1"] = Affine(self.params["W1"], self.params["b1"]) 265 | self.layers["Relu1"] = ReLU() 266 | self.layers["Affine2"] = Affine(self.params["W2"], self.params["b2"]) 267 | self.lastLayer = SoftmaxWithLoss() 268 | 269 | def predict(self, x): 270 | for layer in self.layers.values(): 271 | x = layer.forward(x) 272 | return x 273 | 274 | def loss(self, x, t): 275 | y = self.predict(x) 276 | return self.lastLayer.forward(y, t) 277 | 278 | def accuracy(self, x, t): 279 | y = self.predict(x) 280 | y = np.argmax(y, axis = 1) 281 | if t.ndim != 1: 282 | t = np.argmax(t, axis = 1) 283 | accuracy = np.sum(y == t) / float(x.shape[0]) 284 | return accuracy 285 | 286 | def numerical_gradient(self, x, t): 287 | loss_W = lambda W : self.loss(x, t) 288 | 289 | grads = {} 290 | grads["W1"] = numerical_gradient(loss_W, self.params["W1"] ) 291 | grads["b1"] = numerical_gradient(loss_W, self.params["b1"] ) 292 | grads["W2"] = numerical_gradient(loss_W, self.params["W2"] ) 293 | grads["b2"] = numerical_gradient(loss_W, self.params["b2"] ) 294 | 295 | return grads 296 | 297 | 298 | # 更快的求梯度的方法 299 | def gradient(self, x, t): 300 | # forward 301 | self.loss(x, t) 302 | # backward 303 | dout = 1 304 | dout = self.lastLayer.backward(dout) 305 | 306 | layers = list(self.layers.values()) 307 | layers.reverse() 308 | for layer in layers: 309 | dout = layer.backward(dout) 310 | 311 | grads = {} 312 | grads["W1"] = self.layers["Affine1"].dW 313 | grads["b1"] = self.layers["Affine1"].db 314 | grads["W2"] = self.layers["Affine2"].dW 315 | grads["b2"] = self.layers["Affine2"].db 316 | 317 | return grads 318 | 319 | 320 | # 手写数字识别 321 | # 加载数据 322 | @run.change_dir 323 | def loadData(): 324 | training_data_file = open("mnist_train.csv", 'r') 325 | training_data_list = training_data_file.readlines() 326 | training_data_file.close() 327 | 328 | testing_data_file = open("mnist_test.csv", 'r') 329 | testing_data_list = testing_data_file.readlines() 330 | testing_data_file.close() 331 | 332 | x_train, t_train = [], [] 333 | for record in training_data_list: 334 | # 通过','将数分段 335 | all_values = record.split(',') 336 | # 将所有的像素点的值转换为0.01-1.00 337 | inputs = (np.asfarray(all_values[1:]) / 255.0 * 0.99 + 0.01) 338 | # 创建标签输出值 339 | target = int(all_values[0]) 340 | x_train.append(inputs) 341 | t_train.append(target) 342 | x_test, t_test = [], [] 343 | for record in testing_data_list: 344 | # 通过','将数分段 345 | all_values = record.split(',') 346 | # 将所有的像素点的值转换为0.01-1.00 347 | inputs = (np.asfarray(all_values[1:]) / 255.0 * 0.99 + 0.01) 348 | # 创建标签输出值 349 | target = int(all_values[0]) 350 | x_test.append(inputs) 351 | t_test.append(target) 352 | x_train = np.array(x_train) 353 | t_train = np.array(t_train) 354 | x_test = np.array(x_test) 355 | t_test = np.array(t_test) 356 | t_train = one_hot(t_train) 357 | t_test = one_hot(t_test) 358 | return x_train, t_train, x_test, t_test 359 | 360 | 361 | # one_hot过程 362 | def one_hot(t): 363 | tmp = np.zeros((t.shape[0], 10)) 364 | for i in range(t.shape[0]): 365 | tmp[i][t[i]] = 1 366 | t = tmp 367 | return t 368 | 369 | 370 | # 梯度确认 371 | def gradcheck(): 372 | print("梯度确认") 373 | n = 10 374 | x_train, t_train, x_test, t_test = loadData() 375 | network = TwoLayerNet(input_size = 784, hidden_size = 50, output_size = 10) 376 | x_batch = x_train[:n] 377 | t_batch = t_train[:n] 378 | 379 | grad_numerical = network.numerical_gradient(x_batch, t_batch) 380 | grad_backprop = network.gradient(x_batch, t_batch) 381 | 382 | for key in grad_numerical.keys(): 383 | diff = np.average(np.abs(grad_backprop[key] - grad_numerical[key])) 384 | print(key + ":" + str(diff)) 385 | 386 | 387 | # 实际解决手写输入识别问题 388 | @run.change_dir 389 | @run.timethis 390 | def minst(): 391 | print("实际解题") 392 | x_train, t_train, x_test, t_test = loadData() 393 | network = TwoLayerNet(input_size = 784, hidden_size = 50, output_size = 10) 394 | 395 | iters_num = 10000 396 | train_size = x_train.shape[0] 397 | batch_size = 100 398 | learning_rate = 0.1 399 | train_loss_list = [] 400 | train_acc_list = [] 401 | test_acc_list = [] 402 | 403 | iter_per_epoch = max(train_size/batch_size, 1) 404 | for i in range(iters_num): 405 | batch_mask = np.random.choice(train_size, batch_size) 406 | x_batch = x_train[batch_mask] 407 | t_batch = t_train[batch_mask] 408 | 409 | # 反向传播求梯度 410 | grad = network.gradient(x_batch, t_batch) 411 | 412 | # 更新参数 413 | for key in ["W1", "b1", "W2", "b2"]: 414 | network.params[key] -= learning_rate*grad[key] 415 | 416 | loss = network.loss(x_batch, t_batch) 417 | train_loss_list.append(loss) 418 | 419 | # 计算每个epoch的识别精度 420 | if i % iter_per_epoch == 0: 421 | train_acc = network.accuracy(x_train, t_train) 422 | test_acc = network.accuracy(x_test, t_test) 423 | train_acc_list.append(train_acc) 424 | test_acc_list.append(test_acc) 425 | print("训练集准确率{},测试集准确率{}".format(train_acc, test_acc)) 426 | 427 | # 画图 428 | plt.figure() 429 | plt.plot(train_loss_list) 430 | plt.savefig("./output/loss.png") 431 | plt.close() 432 | plt.figure() 433 | plt.plot(train_acc_list) 434 | plt.plot(test_acc_list) 435 | plt.savefig("./output/accuracy.png") 436 | plt.close() 437 | 438 | 439 | 440 | if __name__ == "__main__": 441 | testMul() 442 | testAdd() 443 | testReLU() 444 | testSigmoid() 445 | testSum() 446 | gradcheck() 447 | minst() 448 | -------------------------------------------------------------------------------- /DL/learning.py: -------------------------------------------------------------------------------- 1 | # coding:utf-8 2 | # kaggle Jane Street Market Prediction代码 3 | # 《深度学习入门:基于python的理论与实现》 4 | # 第四章 神经网络的学习 5 | 6 | 7 | import numpy as np 8 | import matplotlib.pyplot as plt 9 | import run 10 | import pandas as pd 11 | from PIL import Image 12 | import random 13 | import pickle 14 | 15 | 16 | # 阶跃函数 17 | def step_function(x): 18 | """ 19 | if x > 0: 20 | return 1 21 | else: 22 | return 0 23 | """ 24 | # 用支持numpy的形式 25 | y = x>0 26 | return y.astype(np.int) 27 | 28 | 29 | # 画图 30 | @run.change_dir 31 | def draw_step(): 32 | x = np.arange(-5.0, 5.0, 0.1) 33 | y = step_function(x) 34 | plt.plot(x, y) 35 | plt.ylim(-0.1, 1.1) 36 | plt.savefig("./output/step_function.png") 37 | plt.close() 38 | 39 | 40 | # sigmoid函数 41 | def sigmoid(x): 42 | return 1/(1+np.exp(-x)) 43 | 44 | 45 | # 画图 46 | @run.change_dir 47 | def draw_sigmoid(): 48 | x = np.arange(-5.0, 5.0, 0.1) 49 | y = sigmoid(x) 50 | plt.plot(x, y) 51 | plt.ylim(-0.1, 1.1) 52 | plt.savefig("./output/sigmoid_function.png") 53 | plt.close() 54 | 55 | 56 | # ReLU函数 57 | def ReLU(x): 58 | return np.maximum(0, x) 59 | 60 | 61 | # 画图 62 | @run.change_dir 63 | def draw_ReLU(): 64 | x = np.arange(-5.0, 5.0, 0.1) 65 | y = ReLU(x) 66 | plt.plot(x, y) 67 | plt.savefig("./output/ReLU_function.png") 68 | plt.close() 69 | 70 | 71 | # 恒等函数 72 | def identity_function(x): 73 | return x 74 | 75 | 76 | # softmax函数 77 | def softmax(a): 78 | c = np.max(a) 79 | exp_a = np.exp(a-c) #防止数值太大,溢出 80 | sum_exp_a = np.sum(exp_a) 81 | y = exp_a/sum_exp_a 82 | return y 83 | 84 | 85 | # 手写数字识别 86 | # 加载数据 87 | @run.change_dir 88 | def loadData(): 89 | training_data_file = open("mnist_train.csv", 'r') 90 | training_data_list = training_data_file.readlines() 91 | training_data_file.close() 92 | 93 | testing_data_file = open("mnist_test.csv", 'r') 94 | testing_data_list = testing_data_file.readlines() 95 | testing_data_file.close() 96 | 97 | x_train, t_train = [], [] 98 | for record in training_data_list: 99 | # 通过','将数分段 100 | all_values = record.split(',') 101 | # 将所有的像素点的值转换为0.01-1.00 102 | inputs = (np.asfarray(all_values[1:]) / 255.0 * 0.99 + 0.01) 103 | # 创建标签输出值 104 | target = int(all_values[0]) 105 | x_train.append(inputs) 106 | t_train.append(target) 107 | x_test, t_test = [], [] 108 | for record in testing_data_list: 109 | # 通过','将数分段 110 | all_values = record.split(',') 111 | # 将所有的像素点的值转换为0.01-1.00 112 | inputs = (np.asfarray(all_values[1:]) / 255.0 * 0.99 + 0.01) 113 | # 创建标签输出值 114 | target = int(all_values[0]) 115 | x_test.append(inputs) 116 | t_test.append(target) 117 | x_train = np.array(x_train) 118 | t_train = np.array(t_train) 119 | x_test = np.array(x_test) 120 | t_test = np.array(t_test) 121 | # print(x_train.shape) 122 | # print(t_train.shape) 123 | # print(x_test.shape) 124 | # print(t_test.shape) 125 | return x_train, t_train, x_test, t_test 126 | 127 | 128 | # 均方误差函数 129 | def mse(y, t): 130 | return 0.5*np.sum((y-t)**2) 131 | 132 | 133 | # 交叉熵误差 134 | def cee(y, t): 135 | delta = 1e-7 136 | return -np.sum(np.dot(t, np.log(y+delta))) 137 | 138 | 139 | # mini-batch选取样本 140 | def mini_batch(x_train, t_train, batch_size): 141 | train_size = x_train.shape[0] 142 | assert(train_size >= batch_size) 143 | batch_mask = np.random.choice(train_size, batch_size) 144 | x_batch = x_train[batch_mask] 145 | t_batch = t_train[batch_mask] 146 | return x_batch, t_batch 147 | 148 | 149 | # mini_batch版交叉熵误差 150 | def mb_cee(y, t, one_hot = False): 151 | delta = 1e-7 152 | if y.ndim == 1: 153 | t = t.reshape(1, t.size) 154 | y = y.reshape(1, y.size) 155 | 156 | batch_size = y.shape[0] 157 | if one_hot: 158 | return -np.sum(t*np.log(y+delta))/batch_size 159 | else: 160 | return -np.sum(np.log(y[np.arange(batch_size), t]+delta))/batch_size 161 | 162 | 163 | # 计算f在x处的导数 164 | def numerical_diff(f, x): 165 | h = 1e-4 166 | return (f(x+h) - f(x-h))/(2*h) 167 | 168 | 169 | # 定义求导的函数 170 | def function_1(x): 171 | return 0.01*x**2 + 0.1*x 172 | 173 | 174 | def function_2(x): 175 | return x[0]**2 + x[1]**2 176 | 177 | 178 | # 测试数值微分 179 | @run.change_dir 180 | def test_diff(): 181 | # 画图 182 | x = np.arange(0.0, 20.0, 0.1) 183 | y = function_1(x) 184 | plt.plot(x, y) 185 | plt.savefig("./output/num_diff.png") 186 | plt.close() 187 | 188 | print(numerical_diff(function_1, 5)) 189 | print(numerical_diff(function_1, 10)) 190 | 191 | 192 | # (3, 4)时对x0的偏导函数 193 | def function_tmp1(x0): 194 | return x0**2+4.0**2 195 | 196 | 197 | # (3, 4)时对x1的偏导函数 198 | def function_tmp2(x1): 199 | return 3.0**2+x1**2 200 | 201 | 202 | # 测试偏导数 203 | @run.change_dir 204 | def test_pdiff(): 205 | # 画图 206 | fig = plt.figure() 207 | ax1 = plt.axes(projection='3d') 208 | xx = np.arange(-5.0, 5.0, 0.5) 209 | yy = np.arange(-5.0, 5.0, 0.5) 210 | X, Y = np.meshgrid(xx, yy) 211 | Z = X**2 + Y**2 212 | ax1.plot_surface(X, Y, Z) 213 | plt.savefig("./output/num_pdiff.png") 214 | plt.close() 215 | print(numerical_diff(function_tmp1, 3.0)) 216 | print(numerical_diff(function_tmp2, 4.0)) 217 | 218 | 219 | """ 220 | # 求数值梯度 221 | def numerical_grad(f, x): 222 | h = 1e-4 223 | grad = np.zeros_like(x) 224 | print(x.shape, x.size) 225 | for idx in range(x.size): 226 | print(idx) 227 | tmp_val = x[idx] 228 | # f(x+h) 229 | x[idx] = tmp_val + h 230 | fx1 = f(x) 231 | # f(x-h) 232 | x[idx] = tmp_val - h 233 | fx2 = f(x) 234 | 235 | grad[idx] = (fx1 - fx2)/(2*h) 236 | x[idx] = tmp_val 237 | 238 | return grad 239 | """ 240 | 241 | def _numerical_gradient_no_batch(f, x): 242 | h = 1e-4 # 0.0001 243 | grad = np.zeros_like(x) 244 | 245 | for idx in range(x.size): 246 | tmp_val = x[idx] 247 | x[idx] = float(tmp_val) + h 248 | fxh1 = f(x) # f(x+h) 249 | 250 | x[idx] = tmp_val - h 251 | fxh2 = f(x) # f(x-h) 252 | grad[idx] = (fxh1 - fxh2) / (2*h) 253 | 254 | x[idx] = tmp_val # 还原值 255 | 256 | return grad 257 | 258 | 259 | def numerical_gradient(f, X): 260 | if X.ndim == 1: 261 | return _numerical_gradient_no_batch(f, X) 262 | else: 263 | grad = np.zeros_like(X) 264 | 265 | for idx, x in enumerate(X): 266 | grad[idx] = _numerical_gradient_no_batch(f, x) 267 | 268 | return grad 269 | 270 | 271 | def test_grad(): 272 | print(numerical_gradient(function_2, np.array([3.0, 4.0]))) 273 | print(numerical_gradient(function_2, np.array([0.0, 2.0]))) 274 | print(numerical_gradient(function_2, np.array([3.0, 0.0]))) 275 | 276 | 277 | # 梯度下降法 278 | def gradient_descent(f, init_x, lr = 0.01, step_num = 100): 279 | x = init_x 280 | print("学习率{}".format(lr)) 281 | for i in range(step_num): 282 | grad = numerical_gradient(f, x) 283 | x -= lr*grad 284 | 285 | return x 286 | 287 | 288 | # 测试梯度下降法 289 | def test_gd(): 290 | print("测试梯度下降") 291 | init_x = np.array([-3.0, 4.0]) 292 | print(gradient_descent(function_2, init_x)) 293 | init_x = np.array([-3.0, 4.0]) 294 | print(gradient_descent(function_2, init_x, lr = 10.0)) 295 | init_x = np.array([-3.0, 4.0]) 296 | print(gradient_descent(function_2, init_x, lr = 1e-10)) 297 | 298 | 299 | # 定义简单的神经网络 300 | class simpleNet: 301 | def __init__(self): 302 | # 用高斯分布进行初始化 303 | self.W = np.random.randn(2, 3) 304 | 305 | def predict(self, x): 306 | return np.dot(x, self.W) 307 | 308 | def loss(self, x, t): 309 | z = self.predict(x) 310 | y = softmax(z) 311 | loss = cee(y, t) 312 | return loss 313 | 314 | 315 | # 测试神经网络 316 | def test_nn(): 317 | print("测试神经网络") 318 | net = simpleNet() 319 | print(net.W) 320 | x = np.array([0.6, 0.9]) 321 | p = net.predict(x) 322 | print(p) 323 | print(np.argmax(p)) 324 | t = np.array([0, 0, 1]) 325 | print(net.loss(x, t)) 326 | def f(W): 327 | return net.loss(x, t) 328 | dW = numerical_gradient(f, net.W) 329 | print(dW) 330 | 331 | 332 | def sigmoid_grad(x): 333 | return (1.0 - sigmoid(x)) * sigmoid(x) 334 | 335 | 336 | # 两层神经网络 337 | class TwoLayerNet: 338 | def __init__(self, input_size, hidden_size, output_size, weight_init_std = 0.01): 339 | # 初始化权重 340 | self.params = {} 341 | self.params["W1"] = weight_init_std*np.random.randn(input_size, hidden_size) 342 | self.params["b1"] = np.zeros(hidden_size) 343 | self.params["W2"] = weight_init_std*np.random.randn(hidden_size, output_size) 344 | self.params["b2"] = np.zeros(output_size) 345 | 346 | def predict(self, x): 347 | W1, W2 = self.params["W1"], self.params["W2"] 348 | b1, b2 = self.params["b1"], self.params["b2"] 349 | 350 | a1 = np.dot(x, W1) + b1 351 | z1 = sigmoid(a1) 352 | a2 = np.dot(z1, W2) + b2 353 | y = softmax(a2) 354 | 355 | return y 356 | 357 | def loss(self, x, t): 358 | y = self.predict(x) 359 | return cee(y, t) 360 | 361 | def accuracy(self, x, t): 362 | tmp = np.zeros((t.shape[0], 10))+0.01 363 | for i in range(t.shape[0]): 364 | tmp[i][t[i]] = 0.99 365 | t = tmp 366 | y = self.predict(x) 367 | y = np.argmax(y, axis = 1) 368 | t = np.argmax(t, axis = 1) 369 | 370 | accuracy = np.sum(y == t) / float(x.shape[0]) 371 | return accuracy 372 | 373 | def numerical_gradient(self, x, t): 374 | loss_W = lambda W : self.loss(x, t) 375 | 376 | grads = {} 377 | grads["W1"] = numerical_gradient(loss_W, self.params["W1"] ) 378 | grads["b1"] = numerical_gradient(loss_W, self.params["b1"] ) 379 | grads["W2"] = numerical_gradient(loss_W, self.params["W2"] ) 380 | grads["b2"] = numerical_gradient(loss_W, self.params["b2"] ) 381 | 382 | return grads 383 | 384 | # 更快的求梯度的方法 385 | def gradient(self, x, t): 386 | W1, W2 = self.params['W1'], self.params['W2'] 387 | b1, b2 = self.params['b1'], self.params['b2'] 388 | grads = {} 389 | 390 | batch_num = x.shape[0] 391 | 392 | # forward 393 | a1 = np.dot(x, W1) + b1 394 | z1 = sigmoid(a1) 395 | a2 = np.dot(z1, W2) + b2 396 | y = softmax(a2) 397 | 398 | # backward 399 | # print(type(y), type(t)) 400 | # print(y.shape, t.shape) 401 | # print(y[0], t[0]) 402 | tmp = np.zeros((t.shape[0], 10))+0.01 403 | for i in range(t.shape[0]): 404 | tmp[i][t[i]] = 0.99 405 | # print(tmp.shape) 406 | dy = (y - tmp) / batch_num 407 | grads['W2'] = np.dot(z1.T, dy) 408 | grads['b2'] = np.sum(dy, axis=0) 409 | 410 | da1 = np.dot(dy, W2.T) 411 | dz1 = sigmoid_grad(a1) * da1 412 | grads['W1'] = np.dot(x.T, dz1) 413 | grads['b1'] = np.sum(dz1, axis=0) 414 | 415 | return grads 416 | 417 | 418 | # 测试两层神经网络 419 | @run.change_dir 420 | @run.timethis 421 | def test_2_nn(): 422 | print("测试两层神经网络") 423 | # net = TwoLayerNet(input_size = 784, hidden_size = 100, output_size = 10) 424 | # print(net.params["W1"].shape) 425 | # print(net.params["b1"].shape) 426 | # print(net.params["W2"].shape) 427 | # print(net.params["b2"].shape) 428 | # x = np.random.rand(100, 784) 429 | # y = net.predict(x) 430 | # # print(y) 431 | # t = np.random.rand(100, 10) 432 | # grads = net.numerical_gradient(x, t) 433 | # print(grads["W1"].shape) 434 | # print(grads["b1"].shape) 435 | # print(grads["W2"].shape) 436 | # print(grads["b2"].shape) 437 | 438 | # 加载数据 439 | x_train, t_train, x_test, t_test = loadData() 440 | 441 | # 训练 442 | train_loss_list = [] 443 | train_acc_list = [] 444 | test_acc_list = [] 445 | # 超参数 446 | iters_num = 10000 447 | train_size = x_train.shape[0] 448 | batch_size = 100 449 | learning_rate = 0.1 450 | # 平均每个epoch的重复次数 451 | iter_per_epoch = max(train_size/batch_size, 1) 452 | 453 | network = TwoLayerNet(input_size = 784, hidden_size = 100, output_size = 10) 454 | 455 | for i in range(iters_num): 456 | # 获取mini_batch 457 | batch_mask = np.random.choice(train_size, batch_size) 458 | x_batch = x_train[batch_mask] 459 | t_batch = t_train[batch_mask] 460 | # 计算梯度 461 | #grad = network.numerical_gradient(x_batch, t_batch) 462 | grad = network.gradient(x_batch, t_batch) 463 | # 更新参数 464 | for key in ["W1", "b1", "W2", "b2"]: 465 | network.params[key] -= learning_rate*grad[key] 466 | 467 | # 记录学习过程 468 | loss = network.loss(x_batch, t_batch) 469 | train_loss_list.append(loss) 470 | # print(i, loss) 471 | # 计算每个epoch的识别精度 472 | if i % iter_per_epoch == 0: 473 | train_acc = network.accuracy(x_train, t_train) 474 | test_acc = network.accuracy(x_test, t_test) 475 | train_acc_list.append(train_acc) 476 | test_acc_list.append(test_acc) 477 | print("训练集准确率{},测试集准确率{}".format(train_acc, test_acc)) 478 | 479 | # 画图 480 | plt.figure() 481 | plt.plot(train_loss_list) 482 | plt.savefig("./output/loss.png") 483 | plt.close() 484 | plt.figure() 485 | plt.plot(train_acc_list) 486 | plt.plot(test_acc_list) 487 | plt.savefig("./output/accuracy.png") 488 | plt.close() 489 | 490 | 491 | if __name__ == "__main__": 492 | # 测试mse 493 | t = np.zeros(10) 494 | t[2] = 1 495 | y = np.array([0.1, 0.05, 0.6, 0.0, 0.05, 0.1, 0.0, 0.1, 0.0, 0.0]) 496 | print(t, y) 497 | print(mse(y, t)) 498 | print(cee(y, t)) 499 | 500 | # 测试mini_batch 501 | x_train, t_train, x_test, t_test = loadData() 502 | x_batch, t_batch = mini_batch(x_train, t_train, 10) 503 | print(x_batch) 504 | print(t_batch) 505 | 506 | # 数值微分 507 | test_diff() 508 | test_pdiff() 509 | test_grad() 510 | test_gd() 511 | 512 | # 测试神经网络 513 | test_nn() 514 | # 测试两层神经网络 515 | test_2_nn() 516 | 517 | -------------------------------------------------------------------------------- /DL/nn.py: -------------------------------------------------------------------------------- 1 | # coding:utf-8 2 | # kaggle Jane Street Market Prediction代码 3 | # 《深度学习入门:基于python的理论与实现》 4 | # 第三章 神经网络 5 | 6 | 7 | import numpy as np 8 | import matplotlib.pyplot as plt 9 | import run 10 | import pandas as pd 11 | from PIL import Image 12 | import random 13 | import pickle 14 | 15 | 16 | # 阶跃函数 17 | def step_function(x): 18 | """ 19 | if x > 0: 20 | return 1 21 | else: 22 | return 0 23 | """ 24 | # 用支持numpy的形式 25 | y = x>0 26 | return y.astype(np.int) 27 | 28 | 29 | # 画图 30 | @run.change_dir 31 | def draw_step(): 32 | x = np.arange(-5.0, 5.0, 0.1) 33 | y = step_function(x) 34 | plt.plot(x, y) 35 | plt.ylim(-0.1, 1.1) 36 | plt.savefig("./output/step_function.png") 37 | plt.close() 38 | 39 | 40 | # sigmoid函数 41 | def sigmoid(x): 42 | return 1/(1+np.exp(-x)) 43 | 44 | 45 | # 画图 46 | @run.change_dir 47 | def draw_sigmoid(): 48 | x = np.arange(-5.0, 5.0, 0.1) 49 | y = sigmoid(x) 50 | plt.plot(x, y) 51 | plt.ylim(-0.1, 1.1) 52 | plt.savefig("./output/sigmoid_function.png") 53 | plt.close() 54 | 55 | 56 | # ReLU函数 57 | def ReLU(x): 58 | return np.maximum(0, x) 59 | 60 | 61 | # 画图 62 | @run.change_dir 63 | def draw_ReLU(): 64 | x = np.arange(-5.0, 5.0, 0.1) 65 | y = ReLU(x) 66 | plt.plot(x, y) 67 | plt.savefig("./output/ReLU_function.png") 68 | plt.close() 69 | 70 | 71 | # 初始化神经网络 72 | def init_network(): 73 | network = {} 74 | network["W1"] = np.array([[0.1, 0.3, 0.5], [0.2, 0.4, 0.6]]) 75 | network["b1"] = np.array([0.1, 0.2, 0.3]) 76 | network["W2"] = np.array([[0.1, 0.4], [0.2, 0.5], [0.3, 0.6]]) 77 | network["b2"] = np.array([0.1, 0.2]) 78 | network["W3"] = np.array([[0.1, 0.3], [0.2, 0.4]]) 79 | network["b3"] = np.array([0.1, 0.2]) 80 | 81 | return network 82 | 83 | 84 | # 恒等函数 85 | def identity_function(x): 86 | return x 87 | 88 | 89 | # softmax函数 90 | def softmax(a): 91 | c = np.max(a) 92 | exp_a = np.exp(a-c) #防止数值太大,溢出 93 | sum_exp_a = np.sum(exp_a) 94 | y = exp_a/sum_exp_a 95 | return y 96 | 97 | 98 | # 前向传播过程 99 | def forward(network, x): 100 | W1, W2, W3 = network["W1"], network["W2"], network["W3"] 101 | b1, b2, b3 = network["b1"], network["b2"], network["b3"] 102 | 103 | a1 = np.dot(x, W1) + b1 104 | z1 = sigmoid(a1) 105 | a2 = np.dot(z1, W2) + b2 106 | z2 = sigmoid(a2) 107 | a3 = np.dot(z2, W3) + b3 108 | 109 | y = identity_function(a3) 110 | 111 | return y 112 | 113 | 114 | # 手写数字识别 115 | # 加载数据 116 | @run.change_dir 117 | def loadData(): 118 | training_data_file = open("mnist_train.csv", 'r') 119 | training_data_list = training_data_file.readlines() 120 | training_data_file.close() 121 | 122 | testing_data_file = open("mnist_test.csv", 'r') 123 | testing_data_list = testing_data_file.readlines() 124 | testing_data_file.close() 125 | 126 | x_train, t_train = [], [] 127 | for record in training_data_list: 128 | # 通过','将数分段 129 | all_values = record.split(',') 130 | # 将所有的像素点的值转换为0.01-1.00 131 | inputs = (np.asfarray(all_values[1:]) / 255.0 * 0.99 + 0.01) 132 | # 创建标签输出值 133 | target = int(all_values[0]) 134 | x_train.append(inputs) 135 | t_train.append(target) 136 | x_test, t_test = [], [] 137 | for record in testing_data_list: 138 | # 通过','将数分段 139 | all_values = record.split(',') 140 | # 将所有的像素点的值转换为0.01-1.00 141 | inputs = (np.asfarray(all_values[1:]) / 255.0 * 0.99 + 0.01) 142 | # 创建标签输出值 143 | target = int(all_values[0]) 144 | x_test.append(inputs) 145 | t_test.append(target) 146 | x_train = np.array(x_train) 147 | t_train = np.array(t_train) 148 | x_test = np.array(x_test) 149 | t_test = np.array(t_test) 150 | # print(x_train.shape) 151 | # print(t_train.shape) 152 | # print(x_test.shape) 153 | # print(t_test.shape) 154 | return x_train, t_train, x_test, t_test 155 | 156 | 157 | # 绘制数据 158 | @run.change_dir 159 | def drawNum(data, target): 160 | i = random.randint(0, data.shape[0]-1) 161 | print(i) 162 | img = data[i] 163 | label = target[i] 164 | print(label) 165 | print(img.shape) 166 | img = img.reshape(28, 28) 167 | print(img.shape) 168 | pil_img = Image.fromarray(np.uint8(img*255)) 169 | pil_img.save("./output/number.png", "png") 170 | 171 | 172 | # 测试minst 173 | @run.change_dir 174 | def testMinst(): 175 | x_train, t_train, x_test, t_test = loadData() 176 | drawNum(x_train, t_train) 177 | 178 | # 加载训练好的模型 179 | with open("sample_weight.pkl", "rb") as f: 180 | network = pickle.load(f) 181 | 182 | batch_size = 100 183 | accuracy_cnt = 0 184 | for i in range(0, len(x_test), batch_size): 185 | x_batch = x_test[i:i+batch_size] 186 | y_batch = forward(network, x_batch) 187 | p = np.argmax(y_batch, axis = 1) 188 | accuracy_cnt += np.sum(p == t_test[i:i+batch_size]) 189 | 190 | print("预测准确率:{}/{}={}".format(accuracy_cnt, len(x_test), accuracy_cnt/len(x_test))) 191 | 192 | 193 | 194 | if __name__ == "__main__": 195 | draw_step() 196 | draw_sigmoid() 197 | draw_ReLU() 198 | 199 | # 实现神经网络 200 | network = init_network() 201 | x = np.array([1.0, 0.5]) 202 | y = forward(network, x) 203 | print(y) 204 | 205 | # 测试softmax函数 206 | a = np.array([1010, 1000, 990]) 207 | y = softmax(a) 208 | print(y) 209 | print(np.sum(y)) 210 | 211 | # minst测试 212 | testMinst() 213 | -------------------------------------------------------------------------------- /DL/perceptron.py: -------------------------------------------------------------------------------- 1 | # coding:utf-8 2 | # kaggle Jane Street Market Prediction代码 3 | # 《深度学习入门:基于python的理论与实现》 4 | # 第二章 感知机 5 | 6 | 7 | import numpy as np 8 | 9 | 10 | # 用感知机来实现逻辑门 11 | # 与门 12 | def AND(x1, x2): 13 | w1, w2, theta = 0.5, 0.5, 0.7 14 | tmp = w1*x1 + w2*x2 15 | if tmp <= theta: 16 | return 0 17 | else: 18 | return 1 19 | 20 | 21 | # 另一种形式实现与门 b = -theta 22 | def AND2(x1, x2): 23 | x = np.array([x1, x2]) 24 | w = np.array([0.5, 0.5]) 25 | b = -0.7 26 | tmp = np.sum(w*x)+b 27 | if tmp <= 0: 28 | return 0 29 | else: 30 | return 1 31 | 32 | 33 | # 与非门 34 | def NAND(x1, x2): 35 | x = np.array([x1, x2]) 36 | w = np.array([-0.5, -0.5]) 37 | b = 0.7 38 | tmp = np.sum(w*x)+b 39 | if tmp <= 0: 40 | return 0 41 | else: 42 | return 1 43 | 44 | 45 | # 或门 46 | def OR(x1, x2): 47 | x = np.array([x1, x2]) 48 | w = np.array([0.5, 0.5]) 49 | b = -0.2 50 | tmp = np.sum(w*x)+b 51 | if tmp <= 0: 52 | return 0 53 | else: 54 | return 1 55 | 56 | 57 | # 用感知机组合实现异或门 58 | def XOR(x1, x2): 59 | s1 = NAND(x1, x2) 60 | s2 = OR(x1, x2) 61 | y = AND2(s1, s2) 62 | return y 63 | 64 | 65 | if __name__ == "__main__": 66 | print(AND(0, 0), AND(0, 1), AND(1, 0), AND(1, 1)) 67 | print(AND2(0, 0), AND2(0, 1), AND2(1, 0), AND2(1, 1)) 68 | print(NAND(0, 0), NAND(0, 1), NAND(1, 0), NAND(1, 1)) 69 | print(OR(0, 0), OR(0, 1), OR(1, 0), OR(1, 1)) 70 | print(XOR(0, 0), XOR(0, 1), XOR(1, 0), XOR(1, 1)) 71 | -------------------------------------------------------------------------------- /DL/run.py: -------------------------------------------------------------------------------- 1 | # coding:utf-8 2 | # 将程序上传到服务器上执行 3 | import os 4 | import sys 5 | from functools import wraps 6 | import time 7 | 8 | 9 | # 上传代码至服务器并运行 10 | def run(gpus, server): 11 | # 上传本目录所有文件再执行指定文件 12 | if gpus == "all": 13 | # 清除服务器代码目录里所有源文件以及输出目录中的文件 14 | s = "ssh ubuntu@" + server + " \"sudo rm -rf ~/code/*.py\"" 15 | os.system(s) 16 | s = "ssh ubuntu@" + server + " \"sudo rm -rf ~/code/output/*\"" 17 | os.system(s) 18 | # 将本地目录所有文件上传至容器 19 | s = "scp -r ./*.py ubuntu@" + server + ":~/code" 20 | os.system(s) 21 | # 运行指定代码 22 | s = "ssh root@" + server + " -p 2222 \"python /home/code/" + sys.argv[2] + "\"" 23 | print("正在运行代码……\n") 24 | os.system(s) 25 | # 将代码目录里所有输出文件传回 26 | s = "scp -r ubuntu@" + server + ":~/code/output/* ./output/" 27 | os.system(s) 28 | # 将所有结果文件传回 29 | elif gpus == "copy": 30 | s = "scp -r ubuntu@" + server + ":~/code/output/* ./output/" 31 | os.system(s) 32 | # 上传指定文件并执行 33 | else: 34 | ## 清除服务器代码目录里所有源文件以及输出目录中的文件 35 | s = "ssh ubuntu@" + server + " \"sudo rm -rf ~/code/*.py\"" 36 | os.system(s) 37 | s = "ssh ubuntu@" + server + " \"sudo rm -rf ~/code/output/*\"" 38 | os.system(s) 39 | # 将本地目录指定文件上传至容器 40 | s = "scp " + sys.argv[1] + " ubuntu@" + server + ":~/code" 41 | os.system(s) 42 | # 运行指定代码 43 | s = "ssh root@" + server + " -p 2222 \"python /home/code/" + sys.argv[1] + "\"" 44 | os.system(s) 45 | # 将代码目录里所有文件传回 46 | s = "scp -r ubuntu@" + server + ":~/code/output/* ./output/" 47 | os.system(s) 48 | 49 | 50 | if __name__ == "__main__": 51 | gpus = sys.argv[1] 52 | # 读取服务器IP地址,自己编辑serverIP.txt去 53 | with open("serverIP.txt", "rt") as f: 54 | server = f.read() 55 | run(gpus, server) 56 | 57 | 58 | # 工具函数,在上传到服务器上运行时改变当前目录 59 | def change_dir(func): 60 | @wraps(func) 61 | def change(*args, **kwargs): 62 | oldpath = os.getcwd() 63 | newpath = "/home/code/" 64 | os.chdir(newpath) 65 | r = func(*args, **kwargs) 66 | os.chdir(oldpath) 67 | return r 68 | return change 69 | 70 | 71 | # 工具函数,计算函数运行时间 72 | def timethis(func): 73 | @wraps(func) 74 | def wrapper(*args, **kwargs): 75 | start = time.perf_counter() 76 | r = func(*args, **kwargs) 77 | end = time.perf_counter() 78 | print('{}.{}的运行时间为 : {}秒'.format(func.__module__, func.__name__, end - start)) 79 | return r 80 | return wrapper 81 | -------------------------------------------------------------------------------- /DL/sample_weight.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zwdnet/JSMPwork/b7ae1380986abc3ee5b10bea7f9d57cbb805a8f0/DL/sample_weight.pkl -------------------------------------------------------------------------------- /FE.py: -------------------------------------------------------------------------------- 1 | # coding:utf-8 2 | # kaggle竞赛Jane Street Market Prediction 3 | # 特征工程代码 4 | 5 | 6 | from run import * 7 | from tools import * 8 | import pandas as pd 9 | import matplotlib.pyplot as plt 10 | 11 | 12 | """ 13 | # 特征工程 14 | @change_dir 15 | def featureEngineer(data): 16 | tages = pd.DataFrame() 17 | tagename = feature.columns 18 | for i in range(29): 19 | # tagename = "tag_" + str(i) 20 | # tages[tagename[i+1]] = feature[(feature[tagename[i+1]] == True)].iloc[:, i+1] 21 | #print(tages[i]) 22 | temp = feature["feature"][feature[tagename[i+1]] == True] 23 | temp.name = tagename[i+1] 24 | print(temp) 25 | #print(tages) 26 | # 填充空值 27 | print(data.isnull().sum()) 28 | for col in data.columns: 29 | mean_val = data[col].mean() 30 | data[col].fillna(mean_val, inplace=True) 31 | print(data.isnull().sum()) 32 | # 处理feature_0 33 | feature_0 = data["feature_0"].cumsum() 34 | plt.plot(feature_0) 35 | plt.savefig("./output/cumf_0.png") 36 | plt.close() 37 | data["feature_0"] = feature_0 38 | # print(feature_0) 39 | return data 40 | """ 41 | # 特征工程 42 | def featureEngineer(data): 43 | # data = data[data['weight'] != 0] 44 | data = data.fillna(0.0) 45 | weight = data['weight'].values 46 | resp = data['resp'].values 47 | data['action'] = ((weight * resp) > 0).astype('int') 48 | return data 49 | 50 | 51 | 52 | 53 | if __name__ == "__main__": 54 | train, feature = loadData() 55 | # feature = feature[feature == True] 56 | print(feature) 57 | train = featureEngineer(train) 58 | -------------------------------------------------------------------------------- /LSTM.py: -------------------------------------------------------------------------------- 1 | # coding:utf-8 2 | # kaggle Jane Street Market Prediction代码 3 | """ 4 | 用pytorch实现LSTM模型 5 | 参考:https://zhuanlan.zhihu.com/p/104475016 6 | """ 7 | 8 | 9 | import numpy as np 10 | import torch 11 | from torch import nn 12 | import matplotlib.pyplot as plt 13 | from run import * 14 | 15 | 16 | class LstmRNN(nn.Module): 17 | def __init__(self, input_size, hidden_size = 1, output_size = 1, num_layers = 1): 18 | super().__init__() 19 | self.lstm = nn.LSTM(input_size, hidden_size, num_layers) 20 | self.forwardCalculation = nn.Linear(hidden_size, output_size) 21 | 22 | def forward(self, _x): 23 | x, _ = self.lstm(_x) 24 | s, b, h = x.shape # seq_len, batch, hidden_size 25 | x = x.view(s*b, h) 26 | x = self.forwardCalculation(x) 27 | x = x.view(s, b, -1) 28 | return x 29 | 30 | 31 | @change_dir 32 | def LSTM(): 33 | # 建立数据 34 | data_len = 200 35 | t = np.linspace(0, 12*np.pi, data_len) 36 | sin_t = np.sin(t) 37 | cos_t = np.cos(t) 38 | 39 | dataset = np.zeros((data_len, 2)) 40 | dataset[:, 0] = sin_t 41 | dataset[:, 1] = cos_t 42 | dataset = dataset.astype("float32") 43 | 44 | # 划分数据 45 | train_data_ratio = 0.5 46 | train_data_len = int(data_len*train_data_ratio) 47 | train_x = dataset[:train_data_len, 0] 48 | train_y = dataset[:train_data_len, 1] 49 | INPUT_FEATURES_NUM = 1 50 | OUTPUT_FEATURES_NUM = 1 51 | t_for_training = t[:train_data_len] 52 | 53 | test_x = dataset[train_data_len:, 0] 54 | test_y = dataset[train_data_len:, 1] 55 | t_for_testing = t[train_data_len:] 56 | 57 | # 训练 58 | train_x_tensor = train_x.reshape(-1, 5, INPUT_FEATURES_NUM) # 分5批 59 | train_y_tensor = train_y.reshape(-1, 5, OUTPUT_FEATURES_NUM) # 分5批 60 | train_x_tensor = torch.from_numpy(train_x_tensor) 61 | train_y_tensor = torch.from_numpy(train_y_tensor) 62 | 63 | lstm_model = LstmRNN(INPUT_FEATURES_NUM, 16, output_size = OUTPUT_FEATURES_NUM, num_layers = 1) 64 | print('LSTM model:', lstm_model) 65 | print('model.parameters:', lstm_model.parameters) 66 | 67 | loss_fn = nn.MSELoss() 68 | lr = 1e-2 69 | optimizer = torch.optim.Adam(lstm_model.parameters(), lr) 70 | 71 | max_epochs = 10000 72 | for epoch in range(max_epochs): 73 | output = lstm_model(train_x_tensor) 74 | loss = loss_fn(output, train_y_tensor) 75 | loss.backward() 76 | optimizer.step() 77 | optimizer.zero_grad() 78 | 79 | if loss.item() < 1e-4: 80 | print('Epoch [{}/{}], Loss: {:.5f}'.format(epoch+1, max_epochs, loss.item())) 81 | print("The loss value is reached") 82 | break 83 | elif (epoch+1) % 100 == 0: 84 | print('Epoch [{}/{}], Loss: {:.5f}'.format(epoch+1, max_epochs, loss.item())) 85 | 86 | # 用模型预测 87 | # 训练集上 88 | predictive_y_for_training = lstm_model(train_x_tensor) 89 | predictive_y_for_training = predictive_y_for_training.view(-1, OUTPUT_FEATURES_NUM).data.numpy() 90 | 91 | # 切换为测试状态 92 | lstm_model = lstm_model.eval() 93 | # 用测试集预测 94 | test_x_tensor = test_x.reshape(-1, 5, INPUT_FEATURES_NUM) 95 | test_x_tensor = torch.from_numpy(test_x_tensor) 96 | predictive_y_for_testing = lstm_model(test_x_tensor) 97 | predictive_y_for_testing = predictive_y_for_testing.view(-1, OUTPUT_FEATURES_NUM).data.numpy() 98 | 99 | # 画图 100 | plt.figure() 101 | plt.plot(t_for_training, train_x, 'g', label='sin_trn') 102 | plt.plot(t_for_training, train_y, 'b', label='ref_cos_trn') 103 | plt.plot(t_for_training, predictive_y_for_training, 'y--', label='pre_cos_trn') 104 | 105 | plt.plot(t_for_testing, test_x, 'c', label='sin_tst') 106 | plt.plot(t_for_testing, test_y, 'k', label='ref_cos_tst') 107 | plt.plot(t_for_testing, predictive_y_for_testing, 'm--', label='pre_cos_tst') 108 | 109 | plt.plot([t[train_data_len], t[train_data_len]], [-1.2, 4.0], 'r--', label='separation line') # separation line 110 | 111 | plt.xlabel('t') 112 | plt.ylabel('sin(t) and cos(t)') 113 | plt.xlim(t[0], t[-1]) 114 | plt.ylim(-1.2, 4) 115 | plt.legend(loc='upper right') 116 | plt.text(14, 2, "train", size = 15, alpha = 1.0) 117 | plt.text(20, 2, "test", size = 15, alpha = 1.0) 118 | 119 | plt.savefig("./output/LSTM.png") 120 | 121 | 122 | if __name__ == "__main__": 123 | LSTM() -------------------------------------------------------------------------------- /LSTM_work.py: -------------------------------------------------------------------------------- 1 | # coding:utf-8 2 | # kaggle Jane Street Market Prediction代码 3 | # 实际自己工作的代码 4 | # LSTM模型 5 | 6 | 7 | import numpy as np 8 | import pandas as pd 9 | pd.set_option('display.max_columns', None) 10 | import janestreet 11 | 12 | import matplotlib.pyplot as plt 13 | from sklearn.model_selection import train_test_split 14 | from sklearn import metrics 15 | from sklearn.metrics import accuracy_score 16 | import torch 17 | import torch.nn as nn 18 | import torch.optim as optim 19 | 20 | import os 21 | 22 | from FE import featureEngineer 23 | from tools import * 24 | 25 | 26 | 27 | # 建模前处理数据 28 | def preprocessing(train): 29 | X_train = train.loc[:, train.columns.str.contains('feature')] 30 | # y_train = train.loc[:, 'resp'] 31 | y_train = train.loc[:, 'action'] 32 | 33 | # X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, random_state=666, test_size=0.2) 34 | 35 | return X_train, y_train 36 | 37 | 38 | # 评分函数 39 | def Score(model, data): 40 | # test_df = pd.read_csv("/kaggle/input/jane-street-market-prediction/train.csv") 41 | data = data.fillna(-999) 42 | X_test = data.loc[:, data.columns.str.contains('feature')] 43 | resp = model.predict(X_test) 44 | date = data["date"].values 45 | weight = data["weight"].values 46 | action = (resp > 0).astype("int") 47 | 48 | count_i = len(np.unique(date)) 49 | Pi = np.zeros(count_i) 50 | # 用循环太慢 51 | #for i, day in enumerate(np.unique(date)): 52 | # Pi[i] = np.sum(weight[date == day] * resp[date == day] * action[date == day]) 53 | # 用下面这行代替 54 | Pi = np.bincount(date, weight * resp * action) 55 | t = np.sum(Pi) / np.sqrt(np.sum(Pi ** 2)) * np.sqrt(250 / count_i) 56 | u = np.clip(t, 0, 6) * np.sum(Pi) 57 | return u 58 | 59 | 60 | # 进行预测,生成提交文件,分类版 61 | def predict_clf(model): 62 | env = janestreet.make_env() 63 | iter_test = env.iter_test() 64 | for (test_df, sample_prediction_df) in iter_test: 65 | if test_df['weight'].item() > 0: 66 | # test_df = featureEngineer(test_df) 67 | X_test = test_df.loc[:, test_df.columns.str.contains('feature')] 68 | X_test = X_test.fillna(0.0) 69 | y_preds = model.predict(X_test)[0] 70 | else: 71 | y_preds = 0 72 | # print(y_preds) 73 | sample_prediction_df.action = y_preds 74 | env.predict(sample_prediction_df) 75 | 76 | 77 | class LstmRNN(nn.Module): 78 | def __init__(self, input_size, hidden_size = 10, output_size = 1, num_layers = 1): 79 | super().__init__() 80 | self.linear = nn.Linear(input_size, hidden_size) 81 | self.lstm = nn.LSTM(input_size, hidden_size, num_layers) 82 | self.forwardCalculation = nn.Linear(hidden_size, output_size) 83 | self.sigmoid = nn.Sigmoid() 84 | 85 | def forward(self, _x): 86 | # s, b, h = _x.shape 87 | # x = _x.view(s*b, h) 88 | # x = self.linear(x) 89 | # x = x.view(s, b, h) 90 | x, _ = self.lstm(_x) 91 | s, b, h = x.shape # seq_len, batch, hidden_size 92 | x = x.view(s*b, h) 93 | x = self.forwardCalculation(x) 94 | x = self.sigmoid(x) 95 | x = x.view(s, b, -1) 96 | return x 97 | 98 | 99 | if __name__ == "__main__": 100 | newpath = "/home/code" 101 | os.chdir(newpath) 102 | 103 | # data_explore() 104 | 105 | # 真正开始干活 106 | p = 0.001 107 | train = loadData(p = p) 108 | train = featureEngineer(train) 109 | print(train.info()) 110 | # print(train.head()) 111 | 112 | # 计算模型评分 113 | # score = Score(model, train) 114 | # print("模型评分:%.2f" % score) 115 | test = loadData(p = p) 116 | test = featureEngineer(test) 117 | 118 | #训练数据预处理 119 | x_train, y_train = preprocessing(train) 120 | x_test, y_test = preprocessing(test) 121 | 122 | # 深度学习 123 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 124 | # x_train.values.reshape(-1, 1, 130) 125 | # y_train.values.reshape(-1, 1, 1) 126 | x_tensor = torch.from_numpy(x_train.values.reshape(-1, 1, 130)).float().to(device) 127 | y_tensor = torch.from_numpy(y_train.values.reshape(-1, 1, 1)).float().to(device) 128 | 129 | 130 | Model = LstmRNN(130, 5).to(device) 131 | 132 | # model = Model(x_tensor).to(device) 133 | # print(model.state_dict()) 134 | # 设置超参数 135 | lr = 0.000678 136 | n_epochs = 110 137 | 138 | # loss_fn = nn.BCELoss(reduction='sum') 139 | loss_fn = nn.MSELoss(reduction = "mean") 140 | optimizer = optim.Adam(Model.parameters(), lr = lr) 141 | # 创建训练器 142 | train_step = make_train_step(Model, loss_fn, optimizer) 143 | losses = [] 144 | 145 | print("开始训练") 146 | # 进行训练 147 | for epoch in range(n_epochs): 148 | # y_tensor = y_tensor.detach() 149 | loss = train_step(x_tensor, y_tensor) 150 | losses.append(loss) 151 | 152 | # print(model.state_dict()) 153 | print(losses) 154 | plt.figure() 155 | plt.plot(losses) 156 | plt.savefig("./output/loss.png") 157 | # 验证模型 158 | # x_test.reshape(-1, 1, 130) 159 | # y_test.reshape(-1, 1, 1) 160 | x_test_tensor = torch.from_numpy(x_test.values.reshape(-1, 1, 130)).float().to(device) 161 | y_test_tensor = torch.from_numpy(y_test.values.reshape(-1, 1, 1)).float().to(device) 162 | result = [] 163 | preds = [] 164 | # dph = 0.0 165 | for x in Model(x_test_tensor): 166 | preds.append(x.detach().cpu().numpy()[0][0]) 167 | # dph = np.min(preds) + (np.max(preds) - np.min(preds))/2.0 168 | # print(dph) 169 | if x >= 0.5: 170 | result.append(1) 171 | else: 172 | result.append(0) 173 | y_test = y_test_tensor.numpy() 174 | # print(len(y_test)) 175 | # print(result) 176 | plt.figure() 177 | plt.hist(preds) 178 | plt.savefig("./output/predicts.png") 179 | plt.close() 180 | print("预测结果均值:{}".format(np.mean(preds))) 181 | print("预测结果中位数:{}".format(np.median(preds))) 182 | print("预测结果极值之差:{}-{}={}".format(np.max(preds), np.min(preds), np.max(preds) - np.min(preds))) 183 | count = 0 184 | for i in range(len(result)): 185 | if y_test[i] == result[i]: 186 | count += 1 187 | print(count) 188 | print("预测正确率:%f" % (count/len(y_test))) 189 | # 进行预测 190 | # predict_clf(model) 191 | -------------------------------------------------------------------------------- /MyFrame.py: -------------------------------------------------------------------------------- 1 | # coding:utf-8 2 | # 开课吧《创造你的第一个深度学习框架》实操 3 | 4 | 5 | import numpy as np 6 | import pandas as pd 7 | import random 8 | from run import * 9 | from sklearn.datasets import load_boston 10 | from sklearn.utils import shuffle, resample 11 | from collections import defaultdict 12 | from sklearn.model_selection import train_test_split 13 | import matplotlib.pyplot as plt 14 | import torch 15 | from torch import nn 16 | 17 | 18 | # 神经节类 19 | class Node: 20 | def __init__(self, inputs = []): 21 | self.inputs = inputs 22 | self.outputs = [] 23 | for n in self.inputs: 24 | n.outputs.append(self) 25 | 26 | self.value = None 27 | self.gradients = {} 28 | 29 | def forward(self): 30 | raise NotImplemented 31 | 32 | def backward(self): 33 | raise NotImplemented 34 | 35 | 36 | # 占位节点,没有输入的节点,其值要指定 37 | class Placeholder(Node): 38 | def __init__(self): 39 | Node.__init__(self) 40 | 41 | def forward(self, value = None): 42 | if value is not None: 43 | self.value = value 44 | 45 | def backward(self): 46 | self.gradients = {self:0} 47 | for n in self.outputs: 48 | grad_cost = n.gradients[self] 49 | self.gradients[self] = grad_cost*1 50 | 51 | 52 | # 线性节点 53 | class Linear(Node): 54 | def __init__(self, nodes, weights, bias): 55 | Node.__init__(self, inputs = [nodes, weights, bias]) 56 | 57 | def forward(self): 58 | inputs = self.inputs[0].value 59 | weights = self.inputs[1].value 60 | bias = self.inputs[2].value 61 | # print("测试", type(inputs), type(weights)) 62 | self.value = np.dot(inputs, weights) + bias 63 | 64 | def backward(self): 65 | self.gradients = {n: np.zeros_like(n.value) for n in self.inputs} 66 | for n in self.outputs: 67 | grad_cost = n.gradients[self] 68 | self.gradients[self.inputs[0]] = np.dot(grad_cost, self.inputs[1].value.T) 69 | self.gradients[self.inputs[1]] = np.dot(self.inputs[0].value.T, grad_cost) 70 | self.gradients[self.inputs[2]] = np.sum(grad_cost, axis=0, keepdims=False) 71 | 72 | 73 | # Sigmoid激活节点 74 | class Sigmoid(Node): 75 | def __init__(self, node): 76 | Node.__init__(self, inputs = [node]) 77 | 78 | def _sigmoid(self, x): 79 | return 1./(1 + np.exp(-1 * x)) 80 | 81 | def forward(self): 82 | self.x = self.inputs[0].value 83 | self.value = self._sigmoid(self.x) 84 | 85 | def backward(self): 86 | self.partial = self._sigmoid(self.x) * (1 - self._sigmoid(self.x)) 87 | self.gradients = {n: np.zeros_like(n.value) for n in self.inputs} 88 | for n in self.outputs: 89 | grad_cost = n.gradients[self] 90 | self.gradients[self.inputs[0]] = grad_cost * self.partial 91 | 92 | 93 | # 损失函数 94 | class MSE(Node): 95 | def __init__(self, y, yhat): 96 | Node.__init__(self, inputs = [y, yhat]) 97 | 98 | def forward(self): 99 | y = self.inputs[0].value.reshape(-1, 1) 100 | yhat = self.inputs[1].value.reshape(-1, 1) 101 | assert(y.shape == yhat.shape) 102 | 103 | self.m = self.inputs[0].value.shape[0] 104 | self.diff = y - yhat 105 | self.value = np.mean(self.diff**2) 106 | 107 | def backward(self): 108 | self.gradients[self.inputs[0]] = (2/self.m)*self.diff 109 | self.gradients[self.inputs[1]] = (-2/self.m)*self.diff 110 | 111 | 112 | # sigmoid函数 113 | def sigmoid(x): 114 | return 1./(1 + np.exp(-1 * x)) 115 | 116 | 117 | # 加载数据 118 | def loaddata(): 119 | data = load_boston() 120 | X_ = data["data"] 121 | Y_ = data["target"] 122 | # 将数据归一化 123 | X_ = (X_ - np.mean(X_, axis=0)) / np.std(X_, axis=0) 124 | x_train, x_test, y_train, y_test = train_test_split(X_, Y_, test_size = 0.2, random_state = 666) 125 | return (x_train, x_test, y_train, y_test) 126 | 127 | 128 | # 将节点数据转换为图 129 | def convert_feed_dict_to_graph(feed_dict): 130 | computing_graph = defaultdict(list) 131 | nodes = [n for n in feed_dict] 132 | while nodes: 133 | n = nodes.pop() 134 | if isinstance(n, Placeholder): 135 | n.value = feed_dict[n] 136 | 137 | if n in computing_graph: 138 | continue 139 | 140 | for m in n.outputs: 141 | computing_graph[n].append(m) 142 | nodes.append(m) 143 | 144 | return computing_graph 145 | 146 | 147 | # 将图进行拓扑排序,生成计算图 148 | def toplogic(graph): 149 | sorted_nodes = [] 150 | while len(graph) > 0: 151 | all_inputs = [] 152 | all_outputs = [] 153 | 154 | for n in graph: 155 | all_inputs += graph[n] 156 | all_outputs.append(n) 157 | 158 | all_inputs = set(all_inputs) 159 | all_outputs = set(all_outputs) 160 | 161 | need_remove = all_outputs - all_inputs 162 | 163 | if len(need_remove) > 0: 164 | node = random.choice(list(need_remove)) 165 | need_to_visited = [node] 166 | if len(graph) == 1: 167 | need_to_visited += graph[node] 168 | graph.pop(node) 169 | sorted_nodes += need_to_visited 170 | 171 | for _, links in graph.items(): 172 | if node in links: 173 | links.remove(node) 174 | else: 175 | break 176 | 177 | return sorted_nodes 178 | 179 | 180 | # 生成计算图 181 | def topological_sort_feed_dict(feed_dict): 182 | graph = convert_feed_dict_to_graph(feed_dict) 183 | return toplogic(graph) 184 | 185 | 186 | # 前向传播 187 | def forward(graph): 188 | for n in graph: 189 | n.forward() 190 | 191 | 192 | # 后向传播 193 | def backward(graph): 194 | for n in graph[::-1]: 195 | n.backward() 196 | 197 | 198 | # 更新参数 199 | def optimizer(trainables, learning_rate=1e-2): 200 | for t in trainables: 201 | t.value += -1 * learning_rate * t.gradients[t] 202 | 203 | 204 | # 进行预测 205 | @change_dir 206 | def MyFramePredict(w1_, b1_, w2_, b2_, losses, X_test, Y_test): 207 | y1 = np.dot(x_test, w1_.value) + b1_.value 208 | s = sigmoid(y1) 209 | y2 = np.dot(s, w2_.value) + b2_.value 210 | # print(y2) 211 | # 用误差平方和评价 212 | sse = ((y2-y_test)**2).sum() 213 | print("框架评分:{}".format(sse)) 214 | 215 | # 画图 216 | plt.figure() 217 | plt.plot(losses) 218 | plt.title("cost of model") 219 | plt.savefig("./output/FrameCost.png") 220 | plt.close() 221 | plt.figure() 222 | y2 = y2.flatten() 223 | delta = y2-y_test 224 | # print(y2.shape, y_test.shape, delta.shape, x.shape) 225 | plt.plot(delta, color = "green") 226 | # plt.scatter(x, y_test, color = "red") 227 | plt.savefig("./output/FrameResult.png") 228 | plt.close() 229 | 230 | 231 | # pytorch版预测 232 | @change_dir 233 | def PytorchPredict(model, losses, x_test, y_test): 234 | # 准备数据 235 | x_test = x_test.astype(np.float32) 236 | y_test = y_test.astype(np.float32) 237 | x_tensor = torch.from_numpy(x_test) 238 | y_tensor = torch.from_numpy(y_test) 239 | # 用模型进行预测 240 | y_pred = model(x_tensor).detach().numpy() 241 | sse = ((y_pred-y_test)**2).sum() 242 | print("pytorch评分:{}".format(sse)) 243 | 244 | # 画图 245 | plt.figure() 246 | plt.plot(losses) 247 | plt.title("cost of pytorch") 248 | plt.savefig("./output/PytorchCost.png") 249 | plt.close() 250 | plt.figure() 251 | y_pred = y_pred.flatten() 252 | delta = y_pred-y_test 253 | # print(y2.shape, y_test.shape, delta.shape, x.shape) 254 | plt.plot(delta, color = "green") 255 | # plt.scatter(x, y_test, color = "red") 256 | plt.savefig("./output/PytorchResult.png") 257 | plt.close() 258 | 259 | 260 | # 测试我的框架 261 | @change_dir 262 | @timethis 263 | def testMyFrame(X_, Y_): 264 | # 初始化参数 265 | n_features = X_.shape[1] 266 | n_hidden = 10 267 | W1_ = np.random.randn(n_features, n_hidden) 268 | b1_ = np.zeros(n_hidden) 269 | W2_ = np.random.randn(n_hidden, 1) 270 | b2_ = np.zeros(1) 271 | # 定义神经节 272 | X, y = Placeholder(), Placeholder() 273 | W1, b1 = Placeholder(), Placeholder() 274 | W2, b2 = Placeholder(), Placeholder() 275 | 276 | # 定义模型 277 | l1 = Linear(X, W1, b1) 278 | s1 = Sigmoid(l1) 279 | l2 = Linear(s1, W2, b2) 280 | cost = MSE(y, l2) 281 | 282 | # 定义初始值 283 | feed_dict = { 284 | X: X_, 285 | y: Y_, 286 | W1: W1_, 287 | b1: b1_, 288 | W2: W2_, 289 | b2: b2_ 290 | } 291 | 292 | # 定义超参数 293 | epochs = 5000 294 | m = X_.shape[0] 295 | batch_size = 16 296 | steps_per_epoch = m // batch_size 297 | 298 | # 生成计算图 299 | graph = topological_sort_feed_dict(feed_dict) 300 | trainables = [W1, b1, W2, b2] 301 | 302 | print("样本总数{}".format(m)) 303 | 304 | # 训练过程 305 | losses = [] 306 | 307 | for i in range(epochs): 308 | loss = 0 309 | for j in range(steps_per_epoch): 310 | # 步骤①,对样本随机采样 311 | X_batch, y_batch = resample(X_, Y_, n_samples=batch_size) 312 | # 重置X, Y的输入值 313 | X.value = X_batch 314 | y.value = y_batch 315 | # 步骤②,前向和后向传播 316 | # _ = None 317 | forward(graph) 318 | backward(graph) 319 | # 步骤③ 更新参数 320 | rate = 1e-2 321 | optimizer(trainables, rate) 322 | loss += graph[-1].value 323 | 324 | # 输出 325 | if i % 100 == 0: 326 | # print("Epoch: {}, Loss: {:.3f}".format(i+1, loss/steps_per_epoch)) 327 | losses.append(loss/steps_per_epoch) 328 | 329 | return (W1, b1, W2, b2, losses) 330 | 331 | 332 | # 测试pytorch 333 | @change_dir 334 | @timethis 335 | def testPytorch(x_train, y_train): 336 | n_features = x_train.shape[1] 337 | n_hidden = 10 338 | # 定义网络结构 339 | net = nn.Sequential( 340 | nn.Linear(n_features, n_hidden), 341 | nn.Sigmoid(), 342 | nn.Linear(n_hidden, 1) 343 | ) 344 | # 定义损失函数和优化器 345 | loss_fn = nn.MSELoss() 346 | optimizer = torch.optim.SGD(net.parameters(), lr = 1e-2) 347 | 348 | # 定义训练参数 349 | epochs = 5000 350 | m = n_features 351 | batch_size = 16 352 | steps_per_epoch = m // batch_size 353 | 354 | # 定义数据加载器 355 | x_train = x_train.astype(np.float32) 356 | y_train = y_train.astype(np.float32) 357 | x_tensor = torch.from_numpy(x_train) 358 | y_tensor = torch.from_numpy(y_train) 359 | train = torch.utils.data.TensorDataset(x_tensor, y_tensor) 360 | train_loader = torch.utils.data.DataLoader(train, batch_size = batch_size, shuffle = False) 361 | 362 | #训练 363 | losses = [] 364 | for i in range(epochs): 365 | for x, y in train_loader: 366 | y = y.view(-1, 1) 367 | # print(i, x.size(), y.size()) 368 | # 清除梯度 369 | optimizer.zero_grad() 370 | outputs = net(x) 371 | loss = loss_fn(outputs, y) 372 | loss.backward() 373 | optimizer.step() 374 | # 输出 375 | if i % 100 == 0: 376 | # print("Epoch: {}, Loss: {:.3f}".format(i+1, loss.data)) 377 | losses.append(loss.data) 378 | 379 | return (net, losses) 380 | 381 | 382 | if __name__ == "__main__": 383 | x_train, x_test, y_train, y_test = loaddata() 384 | x_train_, x_test_, y_train_, y_test_ = x_train.copy(), x_test.copy(), y_train.copy(), y_test.copy() 385 | params = testMyFrame(x_train_, y_train_) 386 | MyFramePredict(params[0], params[1], params[2], params[3], params[4], x_test_, y_test_) 387 | x_train_, x_test_, y_train_, y_test_ = x_train.copy(), x_test.copy(), y_train.copy(), y_test.copy() 388 | model, losses = testPytorch(x_train_, y_train_) 389 | PytorchPredict(model, losses, x_test_, y_test_) 390 | -------------------------------------------------------------------------------- /NNDL/minst.py: -------------------------------------------------------------------------------- 1 | # coding:utf-8 2 | # 《神经网络与深度学习》代码实现 3 | 4 | 5 | import numpy as np 6 | import run 7 | import random 8 | 9 | import pickle 10 | import gzip 11 | 12 | @run.change_dir 13 | def load_data(): 14 | f = gzip.open('./mnist.pkl.gz', 'rb') 15 | training_data, validation_data, test_data = pickle.load(f, encoding='bytes') 16 | f.close() 17 | return (training_data, validation_data, test_data) 18 | 19 | 20 | def load_data_wrapper(): 21 | tr_d, va_d, te_d = load_data() 22 | training_inputs = [np.reshape(x, (784, 1)) for x in tr_d[0]] 23 | training_results = [vectorized_result(y) for y in tr_d[1]] 24 | training_data = list(zip(training_inputs, training_results)) 25 | validation_inputs = [np.reshape(x, (784, 1)) for x in va_d[0]] 26 | validation_data = zip(validation_inputs, va_d[1]) 27 | test_inputs = [np.reshape(x, (784, 1)) for x in te_d[0]] 28 | test_data = list(zip(test_inputs, te_d[1])) 29 | return (training_data, validation_data, test_data) 30 | 31 | 32 | def vectorized_result(j): 33 | e = np.zeros((10, 1)) 34 | e[j] = 1.0 35 | return e 36 | 37 | 38 | # 定义神经网络 39 | class Network: 40 | def __init__(self, sizes): 41 | self.num_layers = len(sizes) 42 | self.sizes = sizes 43 | self.biases = [np.random.randn(y, 1) for y in sizes[1:]] 44 | self.weights = [np.random.randn(y, x) for x, y in zip(sizes[:-1], sizes[1:])] 45 | 46 | def feedforward(self, a): 47 | for b, w in zip(self.biases, self.weights): 48 | a = sigmoid(np.dot(w, a) + b) 49 | return a 50 | 51 | def SGD(self, training_data, epochs, mini_batch_size, eta, test_data = None): 52 | if test_data: 53 | n_test = len(test_data) 54 | n = len(training_data) 55 | for j in range(epochs): 56 | random.shuffle(training_data) 57 | mini_batches = [ 58 | training_data[k:k+mini_batch_size] for k in range(0, n, mini_batch_size)] 59 | for mini_batch in mini_batches: 60 | self.update_mini_batch(mini_batch, eta) 61 | if test_data: 62 | print("Epoch {}: {}/{}".format(j, self.evaluate(test_data), n_test)) 63 | else: 64 | print("Epoch {}完成".format(j)) 65 | 66 | def update_mini_batch(self, mini_batch, eta): 67 | nabla_b = [np.zeros(b.shape) for b in self.biases] 68 | nabla_w = [np.zeros(w.shape) for w in self.weights] 69 | for x, y in mini_batch: 70 | delta_nabla_b, delta_nabla_w = self.backprop(x, y) 71 | nabla_b = [nb+dnb for nb, dnb in zip(nabla_b, delta_nabla_b)] 72 | nabla_w = [nw+dnw for nw, dnw in zip(nabla_w, delta_nabla_w)] 73 | self.weights = [w-(eta/len(mini_batch))*nw for w, nw in zip(self.weights, nabla_w)] 74 | self.biases = [b-(eta/len(mini_batch))*nb for b, nb in zip(self.biases, nabla_b)] 75 | 76 | def backprop(self, x, y): 77 | nabla_b = [np.zeros(b.shape) for b in self.biases] 78 | nabla_w = [np.zeros(w.shape) for w in self.weights] 79 | activation = x 80 | activations = [x] 81 | zs = [] 82 | for b, w in zip(self.biases, self.weights): 83 | z = np.dot(w, activation) + b 84 | zs.append(z) 85 | activation = sigmoid(z) 86 | activations.append(activation) 87 | # 反向传播过程 88 | delta = self.cost_derivative(activations[-1], y)*sigmoid_prime(zs[-1]) 89 | nabla_b[-1] = delta 90 | nabla_w[-1] = np.dot(delta, activations[-2].transpose()) 91 | for l in range(2, self.num_layers): 92 | z = zs[-l] 93 | sp = sigmoid_prime(z) 94 | delta = np.dot(self.weights[-l+1].transpose(), delta)*sp 95 | nabla_b[-l] = delta 96 | nabla_w[-l] = np.dot(delta, activations[-l-1].transpose()) 97 | return (nabla_b, nabla_w) 98 | 99 | def evaluate(self, test_data): 100 | test_results = [(np.argmax(self.feedforward(x)), y) for (x, y) in test_data] 101 | return sum(int(x == y) for (x, y) in test_results) 102 | 103 | def cost_derivative(self, output_activations, y): 104 | return (output_activations-y) 105 | 106 | 107 | # 逻辑函数 108 | def sigmoid(z): 109 | return 1.0/(1.0+np.exp(-z)) 110 | 111 | 112 | def sigmoid_prime(z): 113 | return sigmoid(z)*(1-sigmoid(z)) 114 | 115 | 116 | if __name__ == "__main__": 117 | training_data, validation_data, test_data = load_data_wrapper() 118 | net = Network([784, 30, 10]) 119 | net.SGD(training_data, 30, 10, 3.0, test_data = test_data) 120 | 121 | -------------------------------------------------------------------------------- /NNDL/mnist.pkl.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zwdnet/JSMPwork/b7ae1380986abc3ee5b10bea7f9d57cbb805a8f0/NNDL/mnist.pkl.gz -------------------------------------------------------------------------------- /NNDL/run.py: -------------------------------------------------------------------------------- 1 | # coding:utf-8 2 | # 将程序上传到服务器上执行 3 | import os 4 | import sys 5 | from functools import wraps 6 | import time 7 | 8 | 9 | # 上传代码至服务器并运行 10 | def run(gpus, server): 11 | # 上传本目录所有文件再执行指定文件 12 | if gpus == "all": 13 | # 清除服务器代码目录里所有源文件以及输出目录中的文件 14 | s = "ssh ubuntu@" + server + " \"sudo rm -rf ~/code/*.py\"" 15 | os.system(s) 16 | s = "ssh ubuntu@" + server + " \"sudo rm -rf ~/code/output/*\"" 17 | os.system(s) 18 | # 将本地目录所有文件上传至容器 19 | s = "scp -r ./*.py ubuntu@" + server + ":~/code" 20 | os.system(s) 21 | # 运行指定代码 22 | s = "ssh root@" + server + " -p 2222 \"python /home/code/" + sys.argv[2] + "\"" 23 | print("正在运行代码……\n") 24 | os.system(s) 25 | # 将代码目录里所有输出文件传回 26 | s = "scp -r ubuntu@" + server + ":~/code/output/* ./output/" 27 | os.system(s) 28 | # 将所有结果文件传回 29 | elif gpus == "copy": 30 | s = "scp -r ubuntu@" + server + ":~/code/output/* ./output/" 31 | os.system(s) 32 | # 上传指定文件并执行 33 | else: 34 | ## 清除服务器代码目录里所有源文件以及输出目录中的文件 35 | s = "ssh ubuntu@" + server + " \"sudo rm -rf ~/code/*.py\"" 36 | os.system(s) 37 | s = "ssh ubuntu@" + server + " \"sudo rm -rf ~/code/output/*\"" 38 | os.system(s) 39 | # 将本地目录指定文件上传至容器 40 | s = "scp " + sys.argv[1] + " ubuntu@" + server + ":~/code" 41 | os.system(s) 42 | # 运行指定代码 43 | s = "ssh root@" + server + " -p 2222 \"python /home/code/" + sys.argv[1] + "\"" 44 | os.system(s) 45 | # 将代码目录里所有文件传回 46 | s = "scp -r ubuntu@" + server + ":~/code/output/* ./output/" 47 | os.system(s) 48 | 49 | 50 | if __name__ == "__main__": 51 | gpus = sys.argv[1] 52 | # 读取服务器IP地址,自己编辑serverIP.txt去 53 | with open("serverIP.txt", "rt") as f: 54 | server = f.read() 55 | run(gpus, server) 56 | 57 | 58 | # 工具函数,在上传到服务器上运行时改变当前目录 59 | def change_dir(func): 60 | @wraps(func) 61 | def change(*args, **kwargs): 62 | oldpath = os.getcwd() 63 | newpath = "/home/code/" 64 | os.chdir(newpath) 65 | r = func(*args, **kwargs) 66 | os.chdir(oldpath) 67 | return r 68 | return change 69 | 70 | 71 | # 工具函数,计算函数运行时间 72 | def timethis(func): 73 | @wraps(func) 74 | def wrapper(*args, **kwargs): 75 | start = time.perf_counter() 76 | r = func(*args, **kwargs) 77 | end = time.perf_counter() 78 | print('{}.{}的运行时间为 : {}秒'.format(func.__module__, func.__name__, end - start)) 79 | return r 80 | return wrapper 81 | -------------------------------------------------------------------------------- /README: -------------------------------------------------------------------------------- 1 | Kaggle竞赛Jane Street Market Prediction实操代码 2 | 竞赛地址: https://www.kaggle.com/c/jane-street-market-prediction/overview -------------------------------------------------------------------------------- /X.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zwdnet/JSMPwork/b7ae1380986abc3ee5b10bea7f9d57cbb805a8f0/X.npy -------------------------------------------------------------------------------- /Y.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zwdnet/JSMPwork/b7ae1380986abc3ee5b10bea7f9d57cbb805a8f0/Y.npy -------------------------------------------------------------------------------- /__pycache__/run.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zwdnet/JSMPwork/b7ae1380986abc3ee5b10bea7f9d57cbb805a8f0/__pycache__/run.cpython-37.pyc -------------------------------------------------------------------------------- /__pycache__/run.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zwdnet/JSMPwork/b7ae1380986abc3ee5b10bea7f9d57cbb805a8f0/__pycache__/run.cpython-38.pyc -------------------------------------------------------------------------------- /__pycache__/tools.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zwdnet/JSMPwork/b7ae1380986abc3ee5b10bea7f9d57cbb805a8f0/__pycache__/tools.cpython-38.pyc -------------------------------------------------------------------------------- /copy_jsmp.py: -------------------------------------------------------------------------------- 1 | # coding:utf-8 2 | # kaggle Jane Street Market Prediction代码 3 | # copy别人的代码:https://www.kaggle.com/c/jane-street-market-prediction/submissions 4 | 5 | 6 | import numpy as np 7 | import pandas as pd 8 | import janestreet 9 | 10 | import plotly.express as px 11 | from plotly.subplots import make_subplots 12 | import plotly.graph_objs as go 13 | import plotly.io as pio 14 | 15 | import matplotlib.pyplot as plt 16 | from xgboost import XGBClassifier 17 | from sklearn.model_selection import train_test_split 18 | from sklearn import metrics 19 | from sklearn.metrics import accuracy_score 20 | import optuna 21 | from optuna.samplers import TPESampler 22 | 23 | import os 24 | import time 25 | 26 | 27 | # 数据探索 28 | def data_explore(): 29 | # 读取数据 30 | train = pd.read_csv("./train.csv", nrows = 10000) 31 | print(train.head()) 32 | 33 | # 先画图看目标特征的分布 34 | # .plt.figure() 35 | plot_list = ['weight', 'resp_1', 'resp_2', 'resp_3', 'resp_4', 'resp'] 36 | fig = make_subplots(rows=3, cols=2) 37 | traces = [ 38 | go.Histogram( 39 | x = train[col], 40 | nbinsx = 100, 41 | name = col 42 | ) for col in plot_list 43 | ] 44 | 45 | for i in range(len(traces)): 46 | fig.append_trace( 47 | traces[i], 48 | (i // 2) + 1, 49 | (i % 2) + 1 50 | ) 51 | 52 | fig.update_layout( 53 | title_text='Target features distributions', 54 | height = 900, 55 | width = 800 56 | ) 57 | 58 | pio.write_image(fig, "./output/target_distribute.png") 59 | 60 | # 看特征值的分布 61 | features = train.columns 62 | features = features[7:] 63 | features = features[:130] 64 | fig = make_subplots( 65 | rows = 44, 66 | cols = 3 67 | ) 68 | traces = [ 69 | go.Histogram( 70 | x = train[col], 71 | nbinsx = 100, 72 | name = col 73 | ) for col in features 74 | ] 75 | 76 | for i in range(len(traces)): 77 | fig.append_trace( 78 | traces[i], 79 | (i // 3) + 1, 80 | (i % 3) + 1 81 | ) 82 | 83 | fig.update_layout( 84 | title_text='Train features distributions', 85 | height = 5000 86 | ) 87 | 88 | pio.write_image(fig, "./output/features_distribute.png") 89 | 90 | cols = features 91 | 92 | # 读取其它数据文件看看 93 | features = pd.read_csv("./features.csv") 94 | print(features) 95 | example_test = pd.read_csv("./example_test.csv") 96 | print(example_test) 97 | submission = pd.read_csv("./example_sample_submission.csv") 98 | print(submission) 99 | 100 | # 开始建模 101 | train = pd.read_csv("./small_train.csv") 102 | # 先找到高度相关的特征 103 | all_columns = [] 104 | for i in range(0, len(cols)): 105 | for j in range(i+1, len(cols)): 106 | if abs(train[cols[i]].corr(train[cols[j]])) > 0.95: 107 | all_columns = all_columns + [cols[i], cols[j]] 108 | 109 | all_columns = list(set(all_columns)) 110 | print('Number of columns:', len(all_columns)) 111 | # 画图 112 | data = train[all_columns] 113 | f = plt.figure( 114 | figsize = (22, 22) 115 | ) 116 | plt.matshow( 117 | data.corr(), 118 | fignum = f.number 119 | ) 120 | plt.xticks( 121 | range(data.shape[1]), 122 | data.columns, 123 | fontsize = 14, 124 | rotation = 90 125 | ) 126 | plt.yticks( 127 | range(data.shape[1]), 128 | data.columns, 129 | fontsize = 14 130 | ) 131 | cb = plt.colorbar() 132 | cb.ax.tick_params( 133 | labelsize = 14 134 | ) 135 | plt.savefig("./output/features_corr.png") 136 | 137 | # 目标值的相关度 138 | data = train[['weight', 'resp_1', 'resp_2', 'resp_3', 'resp_4', 'resp']] 139 | f = plt.figure( 140 | figsize = (12, 12) 141 | ) 142 | plt.matshow( 143 | data.corr(), 144 | fignum = f.number 145 | ) 146 | plt.xticks( 147 | range(data.shape[1]), 148 | data.columns, 149 | fontsize = 14, 150 | rotation = 90 151 | ) 152 | plt.yticks( 153 | range(data.shape[1]), 154 | data.columns, 155 | fontsize = 14 156 | ) 157 | cb = plt.colorbar() 158 | cb.ax.tick_params( 159 | labelsize = 14 160 | ) 161 | plt.savefig("./output/targets_corr.png") 162 | 163 | 164 | # 建模过程 165 | def modeling(): 166 | print("开始建模") 167 | # train = pd.read_csv("./small_train.csv") 168 | train = pd.read_csv("./train.csv", nrows = 10000) 169 | 170 | train = train[train['weight'] != 0] 171 | train['action'] = ((train['weight'].values * train['resp'].values) > 0).astype('int') 172 | 173 | X_train = train.loc[:, train.columns.str.contains('feature')] 174 | y_train = train.loc[:, 'action'] 175 | 176 | X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, random_state=666, test_size=0.2) 177 | 178 | del train 179 | 180 | X_train = X_train.fillna(-999) 181 | sampler = TPESampler(seed=666) 182 | tm = "auto" 183 | 184 | def create_model(trial): 185 | max_depth = trial.suggest_int("max_depth", 2, 12) 186 | n_estimators = trial.suggest_int("n_estimators", 2, 600) 187 | learning_rate = trial.suggest_uniform('learning_rate', 0.0001, 0.99) 188 | subsample = trial.suggest_uniform('subsample', 0.0001, 1.0) 189 | colsample_bytree = trial.suggest_uniform('colsample_bytree', 0.0000001, 1) 190 | model = XGBClassifier( 191 | n_estimators=n_estimators, 192 | max_depth=max_depth, 193 | learning_rate=learning_rate, 194 | subsample=subsample, 195 | colsample_bytree=colsample_bytree, 196 | random_state=666, 197 | tree_method=tm, 198 | silent = 1 199 | ) 200 | 201 | return model 202 | 203 | def objective(trial): 204 | model = create_model(trial) 205 | model.fit(X_train, y_train) 206 | score = accuracy_score( 207 | y_train, 208 | model.predict(X_train) 209 | ) 210 | return score 211 | 212 | params1 = { 213 | 'max_depth': 8, 214 | 'n_estimators': 500, 215 | 'learning_rate': 0.01, 216 | 'subsample': 0.9, 217 | 'tree_method': tm, 218 | 'random_state': 666 219 | } 220 | 221 | params3 = { 222 | 'max_depth': 10, 223 | 'n_estimators': 500, 224 | 'learning_rate': 0.03, 225 | 'subsample': 0.9, 226 | 'colsample_bytree': 0.7, 227 | 'tree_method': tm, 228 | 'random_state': 666 229 | } 230 | 231 | start_time = time.time() 232 | model1 = XGBClassifier(**params1) 233 | model1.fit(X_train, y_train, eval_metric='auc') 234 | model1.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_test, y_test)], eval_metric='auc',verbose=False) 235 | evals_result = model1.evals_result() 236 | print("模型1评分") 237 | y_true, y_pred = y_test, model1.predict(X_test) 238 | print("Accuracy : %.4g" % metrics.accuracy_score(y_true, y_pred)) 239 | 240 | model3 = XGBClassifier(**params3) 241 | model3.fit(X_train, y_train, eval_metric='auc') 242 | model3.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_test, y_test)], eval_metric='auc',verbose=False) 243 | evals_result = model3.evals_result() 244 | print("模型3评分") 245 | y_true, y_pred = y_test, model3.predict(X_test) 246 | print("Accuracy : %.4g" % metrics.accuracy_score(y_true, y_pred)) 247 | end_time = time.time() 248 | print("建模时间:%.2f秒" % (end_time - start_time)) 249 | 250 | return (model1, model3) 251 | 252 | 253 | if __name__ == "__main__": 254 | newpath = "/home/code" 255 | os.chdir(newpath) 256 | # pio.orca.config.use_xvfb = True 257 | # pio.orca.config.executable = "/opt/conda/envs/tensorflow/bin/orca" 258 | pd.set_option('display.max_columns', None) 259 | 260 | # data_explore() 261 | 262 | # 真正开始干活 263 | model1, model3 = modeling() 264 | 265 | # 进行预测 266 | env = janestreet.make_env() 267 | iter_test = env.iter_test() 268 | for (test_df, sample_prediction_df) in iter_test: 269 | if test_df['weight'].item() > 0: 270 | X_test = test_df.loc[:, test_df.columns.str.contains('feature')] 271 | X_test = X_test.fillna(-999) 272 | y_preds = model1.predict(X_test) + model3.predict(X_test) 273 | if y_preds == 2: 274 | y_preds = np.array([1]) 275 | else: 276 | y_preds = np.array([0]) 277 | else: 278 | y_preds = np.array([0]) 279 | sample_prediction_df.action = y_preds 280 | env.predict(sample_prediction_df) 281 | -------------------------------------------------------------------------------- /hello.py: -------------------------------------------------------------------------------- 1 | # coding:utf-8 2 | 3 | import numpy as np 4 | import pandas as pd 5 | import tools 6 | 7 | 8 | p = 0.02 9 | train = pd.read_csv("./small_train.csv", skiprows = lambda x: x>0 and np.random.rand() > p) 10 | train.to_csv("very_small.csv") 11 | -------------------------------------------------------------------------------- /hidegpu/FE.py: -------------------------------------------------------------------------------- 1 | # coding:utf-8 2 | # kaggle竞赛Jane Street Market Prediction 3 | # 特征工程代码 4 | 5 | 6 | import pandas as pd 7 | import matplotlib.pyplot as plt 8 | 9 | 10 | """ 11 | # 特征工程 12 | @change_dir 13 | def featureEngineer(data): 14 | tages = pd.DataFrame() 15 | tagename = feature.columns 16 | for i in range(29): 17 | # tagename = "tag_" + str(i) 18 | # tages[tagename[i+1]] = feature[(feature[tagename[i+1]] == True)].iloc[:, i+1] 19 | #print(tages[i]) 20 | temp = feature["feature"][feature[tagename[i+1]] == True] 21 | temp.name = tagename[i+1] 22 | print(temp) 23 | #print(tages) 24 | # 填充空值 25 | print(data.isnull().sum()) 26 | for col in data.columns: 27 | mean_val = data[col].mean() 28 | data[col].fillna(mean_val, inplace=True) 29 | print(data.isnull().sum()) 30 | # 处理feature_0 31 | feature_0 = data["feature_0"].cumsum() 32 | plt.plot(feature_0) 33 | plt.savefig("./output/cumf_0.png") 34 | plt.close() 35 | data["feature_0"] = feature_0 36 | # print(feature_0) 37 | return data 38 | """ 39 | # 特征工程 40 | def featureEngineer(data): 41 | # data = data[data['weight'] != 0] 42 | data = data.fillna(0.0) 43 | weight = data['weight'].values 44 | resp = data['resp'].values 45 | data['action'] = ((weight * resp) > 0).astype('int') 46 | return data 47 | 48 | 49 | 50 | 51 | if __name__ == "__main__": 52 | train, feature = loadData() 53 | # feature = feature[feature == True] 54 | print(feature) 55 | train = featureEngineer(train) 56 | 57 | -------------------------------------------------------------------------------- /hidegpu/optuna_test.py: -------------------------------------------------------------------------------- 1 | # coding:utf-8 2 | # kaggle Jane Street Market Prediction代码 3 | # optuna的测试代码 4 | 5 | import numpy as np 6 | import pandas as pd 7 | pd.set_option('display.max_columns', None) 8 | import os 9 | # from tools import * 10 | from FE import featureEngineer 11 | 12 | import matplotlib.pyplot as plt 13 | from sklearn.model_selection import train_test_split, cross_val_score 14 | from sklearn import metrics 15 | from sklearn.metrics import accuracy_score 16 | import optuna 17 | from optuna.samplers import TPESampler 18 | 19 | # XGBoost 20 | from xgboost import XGBClassifier 21 | 22 | 23 | def objective(trial): 24 | x = trial.suggest_uniform("x", -10, 10) 25 | return (x - 2)**2 26 | 27 | 28 | def objective2(trial, x, y): 29 | train_x, test_x, train_y, test_y = train_test_split(x, y, test_size = 0.3, random_state = 101) 30 | param = { 31 | "eval_metric":trial.suggest_categorical("eval_metric", ["logloss"]), 32 | "tree_method":trial.suggest_categorical("tree_method", ["gpu_hist"]), 33 | "n_estimators" : trial.suggest_int('n_estimators', 1, 100), 34 | 'max_depth':trial.suggest_int('max_depth', 2, 12), 35 | 'learning_rate':trial.suggest_loguniform('learning_rate',0.001,0.5), 36 | "subsample":trial.suggest_loguniform("subsample", 0.5, 1.0) 37 | } 38 | model = XGBClassifier(**param) 39 | model.fit(train_x, train_y) 40 | 41 | return cross_val_score(model,test_x,test_y).mean() 42 | 43 | 44 | # 建模前处理数据 45 | def preprocessing(train): 46 | X_train = train.loc[:, train.columns.str.contains('feature')] 47 | # y_train = train.loc[:, 'resp'] 48 | y_train = train.loc[:, 'action'] 49 | 50 | X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, random_state=666, test_size=0.2) 51 | 52 | return X_train, y_train 53 | 54 | 55 | if __name__ == "__main__": 56 | # study = optuna.create_study() 57 | # study.optimize(objective, n_trials = 100) 58 | # print("结果:", study.best_params) 59 | # print(study.best_value) 60 | # print(study.best_trial) 61 | # study.optimize(objective, n_trials = 100) 62 | # print("结果:", study.best_params) 63 | # print(study.best_value) 64 | # print(study.best_trial) 65 | 66 | 67 | # data_explore() 68 | 69 | # 真正开始干活 70 | p = 0.001 71 | train = pd.read_csv("small_train.csv") 72 | train = featureEngineer(train) 73 | # print(train.head()) 74 | 75 | # 计算模型评分 76 | # score = Score(model, train) 77 | # print("模型评分:%.2f" % score) 78 | 79 | #训练数据预处理 80 | X_train, y_train = preprocessing(train) 81 | 82 | # xgboost 83 | print("XGBoost") 84 | study = optuna.create_study(direction = "maximize", sampler = TPESampler()) 85 | study.optimize(lambda trial:objective2(trial, X_train, y_train), n_trials = 100) 86 | print("结果:", study.best_params) 87 | print(study.best_value) 88 | print(study.best_trial) 89 | 90 | -------------------------------------------------------------------------------- /hidegpu/tools.py: -------------------------------------------------------------------------------- 1 | # coding:utf-8 2 | # kaggle竞赛Jane Street Market Prediction 3 | # 工具函数 4 | 5 | from run import * 6 | import pandas as pd 7 | import matplotlib.pyplot as plt 8 | import numpy as np 9 | from sklearn.model_selection import cross_val_score, learning_curve 10 | from sklearn.metrics import classification_report, roc_curve, auc 11 | 12 | 13 | # 载入数据 14 | @change_dir 15 | def loadData(p = 0.01): 16 | # 抽样,读取1%数据 17 | # 参考https://mp.weixin.qq.com/s/2LSKnN9R-N-I2HcHePT9zA 18 | train = pd.read_csv("./train.csv", skiprows = lambda x: x>0 and np.random.rand() > p) 19 | # feature = pd.read_csv("./features.csv") 20 | return train 21 | 22 | 23 | # 对模型进行交叉验证 24 | def cross_val(model, X, Y, cv = 10): 25 | scores = cross_val_score(model, X, Y, cv=cv) 26 | score = scores.mean() 27 | return score 28 | 29 | 30 | # 模型评估 31 | def evalution(model, X, y_true): 32 | # X = test.loc[:, test.columns.str.contains("feature")].values 33 | # y_true = test.action.values 34 | y_pred = model.predict(X) 35 | target_names = ["1", "0"] 36 | result = classification_report(y_true, y_pred, target_names = target_names, output_dict = False ) 37 | return result 38 | 39 | 40 | # 对模型评分 41 | @timethis 42 | def score(model, test, modelName): 43 | if modelName == "XGBoost": 44 | X = test.loc[:, test.columns.str.contains("feature")] 45 | Y = test.action 46 | else: 47 | X = test.loc[:, test.columns.str.contains("feature")].values 48 | Y = test.action.values 49 | model_score = model.score(X, Y) 50 | cross_score = cross_val(model, X, Y) 51 | report = evalution(model, X, Y) 52 | print("模型评分:", model_score) 53 | print("交叉验证:", cross_score) 54 | print("模型评估:\n", report) 55 | Roc(model, X, Y, modelName) 56 | Lc(model, modelName, X, Y) 57 | 58 | 59 | # 画roc曲线 60 | @change_dir 61 | def Roc(model, X, Y, modelName): 62 | y_label = Y 63 | y_pred = model.predict(X) 64 | fpr, tpr, thersholds = roc_curve(y_label, y_pred) 65 | 66 | roc_auc = auc(fpr, tpr) 67 | 68 | plt.plot(fpr, tpr, 'k--', label = "ROC (area = {0:.2f})".format(roc_auc), lw = 2) 69 | plt.tick_params(axis='x', labelsize=15) 70 | plt.tick_params(axis='y', labelsize=15) 71 | plt.xlim([-0.05, 1.05]) 72 | plt.ylim([-0.05, 1.05]) 73 | plt.xlabel("False Positive Rate") 74 | plt.ylabel("True Positive Rate") 75 | plt.title(modelName + " ROC Curve") 76 | plt.legend(loc = "best") 77 | plt.savefig("./output/" + modelName + "_ROC.png") 78 | 79 | 80 | # 画学习曲线 81 | @change_dir 82 | def Lc(model, modelName, X, y, ylim = None, cv = None, n_jobs = 1, train_sizes = np.linspace(0.1, 1.0, 5), verbose = 0): 83 | plt.figure() 84 | plt.title(modelName+" Learning Curve") 85 | if ylim is not None: 86 | plt.ylim(*ylim) 87 | plt.xlabel("Training Samples") 88 | plt.ylabel("Score") 89 | train_sizes, train_scores, test_scores = learning_curve(model, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes) 90 | train_scores_mean = np.mean(train_scores, axis = 1) 91 | train_scores_std = np.std(train_scores, axis = 1) 92 | test_scores_mean = np.mean(test_scores, axis = 1) 93 | test_scores_std = np.std(test_scores, axis = 1) 94 | plt.grid() 95 | 96 | plt.fill_between(train_sizes, train_scores_mean - train_scores_std,train_scores_mean + train_scores_std, alpha=0.1, color="r") 97 | plt.fill_between(train_sizes,test_scores_mean - test_scores_std, test_scores_mean + test_scores_std, alpha=0.1, color="g") 98 | plt.plot(train_sizes, train_scores_mean, 'o-', color="r", label="Training score") 99 | plt.plot(train_sizes, test_scores_mean, 'o-', color="g", label="Cross-validation score") 100 | 101 | plt.legend(loc="best") 102 | plt.savefig("./output/" + modelName + "_Learning Curve.png") 103 | 104 | 105 | # 工具函数,返回神经网络训练的每一步 106 | def make_train_step(model, loss_fn, optimizer): 107 | # 执行在循环中训练过程 108 | def train_step(x, y): 109 | # 设置训练模式 110 | model.train() 111 | # 预测 112 | yhat = model(x) 113 | # 计算损失 114 | # print("测试") 115 | yhat = yhat.squeeze(-1) 116 | # print(yhat.shape, y.shape) 117 | loss = loss_fn(yhat, y) 118 | # 计算梯度 119 | loss.backward() 120 | # 更新参数,梯度置零 121 | optimizer.step() 122 | optimizer.zero_grad() 123 | # 返回损失值 124 | return loss.item() 125 | 126 | # 返回在训练循环中调用的函数 127 | return train_step 128 | 129 | 130 | -------------------------------------------------------------------------------- /janestreet/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | from .competition import make_env 3 | 4 | __all__ = ['make_env'] 5 | -------------------------------------------------------------------------------- /janestreet/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zwdnet/JSMPwork/b7ae1380986abc3ee5b10bea7f9d57cbb805a8f0/janestreet/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /janestreet/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zwdnet/JSMPwork/b7ae1380986abc3ee5b10bea7f9d57cbb805a8f0/janestreet/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /janestreet/competition.cpython-37m-x86_64-linux-gnu.so: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zwdnet/JSMPwork/b7ae1380986abc3ee5b10bea7f9d57cbb805a8f0/janestreet/competition.cpython-37m-x86_64-linux-gnu.so -------------------------------------------------------------------------------- /jsmp.py: -------------------------------------------------------------------------------- 1 | # coding:utf-8 2 | # kaggle Jane Street Market Prediction代码 3 | # 服务器版本 4 | 5 | 6 | import pandas as pd 7 | import matplotlib.pyplot as plt 8 | from sklearn.linear_model import LinearRegression 9 | from sklearn.model_selection import train_test_split 10 | from sklearn import metrics 11 | import numpy as np 12 | from sklearn.externals import joblib 13 | import pickle 14 | from run import * 15 | import socket 16 | import sys 17 | from sklearn.impute import SimpleImputer, MissingIndicator 18 | from sklearn.pipeline import FeatureUnion 19 | import janestreet 20 | 21 | 22 | # 获取转换器 23 | def getTransformer(X, y): 24 | transformer = FeatureUnion( 25 | transformer_list = [ 26 | ("features", SimpleImputer(strategy = "mean")), 27 | ("indicators", MissingIndicator()) 28 | ] 29 | ) 30 | transformer = transformer.fit(X, y) 31 | return transformer 32 | 33 | 34 | # 特征工程 35 | def fp(data): 36 | print(data.info(verbose = True, null_counts = True)) 37 | ds = data.describe() 38 | data = data[data["weight"] != 0] 39 | data["action"] =((data["weight"].values * data["resp"].values) > 0).astype("int") 40 | # 查看缺失值 41 | print(data.isnull().sum()) 42 | # 复制数据,进行操作 43 | newdata = data.copy() 44 | # 特征列名称 45 | features = [c for c in newdata.columns if 'feature' in c] + ["date"] 46 | # 处理缺失值 47 | 48 | X = newdata.loc[:, features] 49 | y = newdata.loc[:, "action"] 50 | transformer = getTransformer(X, y) 51 | # X = transformer.transform(X) 52 | X = X.fillna(-999) 53 | print("特征工程结束") 54 | return (X, y, features, transformer) 55 | 56 | 57 | # 形成提交文件 58 | def makeSubmittion(model, features, transformer): 59 | print("正在生成提交文件") 60 | env = janestreet.make_env() 61 | iter_test = env.iter_test() 62 | 63 | for (test_df, pred_df) in iter_test: 64 | X_test = test_df.loc[:, features] 65 | #X_test = transformer.transform(X_test) 66 | X_test = X_test.fillna(-999) 67 | preds = model.predict(X_test) 68 | action = ((test_df['weight'].values * preds) > 0).astype('int') 69 | pred_df.action = action 70 | env.predict(pred_df) 71 | 72 | 73 | # 线性回归模型 74 | def LR(X, y): 75 | train_set, test_set, train_action, test_action = train_test_split(X, y, test_size = 0.2) 76 | print(len(train_set)) 77 | # 训练 78 | linreg = LinearRegression() 79 | linreg.fit(train_set, train_action) 80 | # 预测 81 | train_pred = linreg.predict(train_set) 82 | test_pred = linreg.predict(test_set) 83 | # 模型评估 84 | print("train MSE:", metrics.mean_squared_error(train_action, train_pred)) 85 | print("test MSE:", metrics.mean_squared_error(test_action, test_pred)) 86 | print("train RMSE:", np.sqrt(metrics.mean_squared_error(train_action, train_pred))) 87 | print("test RMSE:", np.sqrt(metrics.mean_squared_error(test_action, test_pred))) 88 | # 保存模型到文件 89 | # joblib.dump(linreg, "LinesRegress.pkl") 90 | with open("/home/code/output/LinesRegress.pkl", "wb") as fw: 91 | pickle.dump(linreg, fw) 92 | print(test_pred) 93 | fig = plt.figure() 94 | plt.plot(test_pred[:100], "b.") 95 | plt.plot(test_action[:100], "rx") 96 | plt.savefig("/home/code/output/LR_result.png") 97 | return linreg 98 | 99 | 100 | if __name__ == "__main__": 101 | print(os.getcwd()) 102 | data = pd.read_csv("/home/code/small_train.csv", index_col = 0) 103 | X, y, features, transformer = fp(data) 104 | model = LR(X, y) 105 | makeSubmittion(model, features, transformer) 106 | -------------------------------------------------------------------------------- /jsmp_local.py: -------------------------------------------------------------------------------- 1 | # coding:utf-8 2 | # kaggle Jane Street Market Prediction代码 3 | # 本地运行版本 4 | 5 | 6 | import pandas as pd 7 | import matplotlib.pyplot as plt 8 | from sklearn.linear_model import LinearRegression 9 | from sklearn.model_selection import train_test_split 10 | from sklearn import metrics 11 | import numpy as np 12 | from sklearn.externals import joblib 13 | import pickle 14 | from run import * 15 | import socket 16 | import sys 17 | 18 | 19 | # 特征工程 20 | def fp(data): 21 | print(data.info(verbose = True, null_counts = True)) 22 | ds = data.describe() 23 | # 查看缺失值 24 | print(data.isnull().sum()) 25 | # 复制数据,进行操作 26 | newdata = data.copy() 27 | # 特征列名称 28 | features = [c for c in newdata.columns if 'feature' in c] 29 | # print(features) 30 | x_tt = newdata.loc[:, features].values 31 | # 填充空值 32 | if np.isnan(x_tt[:, :].sum()): 33 | x_tt[:, :] = np.nan_to_num(x_tt[:, :]) + np.isnan(x_tt[:, :])*10.0 34 | newdata.update(pd.DataFrame(x_tt, columns = features)) 35 | print(newdata.head()) 36 | # 够造训练集的行动变量 37 | print(data.weight.describe()) 38 | p = data[data["weight"] < 50].weight.hist().get_figure() 39 | p.savefig("./output/weight_hist.png") 40 | newdata["action"] = ((newdata["weight"].values) > 0.549).astype("int") 41 | print(newdata.action) 42 | print("特征工程结束") 43 | return newdata 44 | 45 | 46 | # 线性回归模型 47 | def LR(data): 48 | train_set, test_set, train_action, test_action = train_test_split(data.loc[:, "feature_0":"feature_129"], data.action, test_size = 0.2) 49 | print(len(train_set)) 50 | # 训练 51 | linreg = LinearRegression() 52 | linreg.fit(train_set, train_action) 53 | # 预测 54 | train_pred = linreg.predict(train_set) 55 | test_pred = linreg.predict(test_set) 56 | # 模型评估 57 | print("train MSE:", metrics.mean_squared_error(train_action, train_pred)) 58 | print("test MSE:", metrics.mean_squared_error(test_action, test_pred)) 59 | print("train RMSE:", np.sqrt(metrics.mean_squared_error(train_action, train_pred))) 60 | print("test RMSE:", np.sqrt(metrics.mean_squared_error(test_action, test_pred))) 61 | # 保存模型到文件 62 | # joblib.dump(linreg, "LinesRegress.pkl") 63 | with open("./output/LinesRegress.pkl", "wb") as fw: 64 | pickle.dump(linreg, fw) 65 | print(test_pred) 66 | fig = plt.figure() 67 | plt.hist(test_pred) 68 | plt.savefig("./output/LR_result.png") 69 | 70 | 71 | if __name__ == "__main__": 72 | print(os.getcwd()) 73 | data = pd.read_csv("small_train.csv", index_col = 0) 74 | newdata = fp(data) 75 | print(newdata.info(verbose = True, null_counts = True)) 76 | print(newdata.date) 77 | # 用多元线性回归模型训练 78 | LR(newdata) 79 | -------------------------------------------------------------------------------- /myxgboost.py: -------------------------------------------------------------------------------- 1 | # coding:utf-8 2 | # kaggle Jane Street Market Prediction代码 3 | # 实际自己工作的代码 4 | 5 | 6 | import numpy as np 7 | import pandas as pd 8 | pd.set_option('display.max_columns', None) 9 | import janestreet 10 | 11 | import matplotlib.pyplot as plt 12 | from sklearn.model_selection import train_test_split 13 | from sklearn import metrics 14 | from sklearn.metrics import accuracy_score 15 | import optuna 16 | # 逻辑回归 17 | from sklearn.linear_model import LinearRegression, LogisticRegression 18 | # 支持向量机 19 | from sklearn.svm import SVC, LinearSVC 20 | # 随机森林 21 | from sklearn.ensemble import RandomForestClassifier 22 | # KNN算法 23 | from sklearn.neighbors import KNeighborsClassifier 24 | # 朴素贝叶斯算法 25 | from sklearn.naive_bayes import GaussianNB 26 | # SGD算法 27 | from sklearn.linear_model import SGDClassifier 28 | # 决策树算法 29 | from sklearn.tree import DecisionTreeClassifier 30 | # XGBoost 31 | from xgboost import XGBClassifier 32 | from xgboost import plot_importance 33 | 34 | import os 35 | 36 | from EDA import data_explore 37 | from FE import featureEngineer 38 | from tools import * 39 | from run import * 40 | 41 | 42 | 43 | # 建模前处理数据 44 | def preprocessing(train): 45 | X_train = train.loc[:, train.columns.str.contains('feature')] 46 | # y_train = train.loc[:, 'resp'] 47 | y_train = train.loc[:, 'action'] 48 | 49 | X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, random_state=666, test_size=0.2) 50 | 51 | return X_train, y_train 52 | 53 | 54 | # 评分函数 55 | def Score(model, data): 56 | # test_df = pd.read_csv("/kaggle/input/jane-street-market-prediction/train.csv") 57 | data = data.fillna(-999) 58 | X_test = data.loc[:, data.columns.str.contains('feature')] 59 | resp = model.predict(X_test) 60 | date = data["date"].values 61 | weight = data["weight"].values 62 | action = (resp > 0).astype("int") 63 | 64 | count_i = len(np.unique(date)) 65 | Pi = np.zeros(count_i) 66 | # 用循环太慢 67 | #for i, day in enumerate(np.unique(date)): 68 | # Pi[i] = np.sum(weight[date == day] * resp[date == day] * action[date == day]) 69 | # 用下面这行代替 70 | Pi = np.bincount(date, weight * resp * action) 71 | t = np.sum(Pi) / np.sqrt(np.sum(Pi ** 2)) * np.sqrt(250 / count_i) 72 | u = np.clip(t, 0, 6) * np.sum(Pi) 73 | return u 74 | 75 | 76 | # 进行预测,生成提交文件,求值版 77 | def predict_value(model): 78 | env = janestreet.make_env() 79 | iter_test = env.iter_test() 80 | for (test_df, sample_prediction_df) in iter_test: 81 | if test_df['weight'].item() > 0: 82 | test_df = featureEngineer(test_df) 83 | X_test = test_df.loc[:, test_df.columns.str.contains('feature')] 84 | # X_test = X_test.fillna(-999) 85 | y_resp = model.predict(X_test)[0] 86 | y_preds = 0 if y_resp < 0 else 1 87 | else: 88 | y_preds = 0 89 | # print(y_preds) 90 | sample_prediction_df.action = y_preds 91 | env.predict(sample_prediction_df) 92 | 93 | 94 | # 进行预测,生成提交文件,分类版 95 | def predict_clf(model): 96 | env = janestreet.make_env() 97 | iter_test = env.iter_test() 98 | for (test_df, sample_prediction_df) in iter_test: 99 | if test_df['weight'].item() > 0: 100 | test_df = featureEngineer(test_df) 101 | X_test = test_df.loc[:, test_df.columns.str.contains('feature')] 102 | X_test = X_test.fillna(0.0) 103 | y_preds = model.predict(X_test)[0] 104 | else: 105 | y_preds = 0 106 | # print(y_preds) 107 | sample_prediction_df.action = y_preds 108 | env.predict(sample_prediction_df) 109 | 110 | 111 | from sklearn.model_selection import GridSearchCV 112 | from sklearn.model_selection import StratifiedKFold 113 | # 调参 114 | @change_dir 115 | @timethis 116 | def tc(X, Y, param_grid, param_name): 117 | model = XGBClassifier(use_label_encoder=False, eval_metric = "logloss") 118 | kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=7) 119 | grid_search = GridSearchCV(model, param_grid, scoring="neg_log_loss", n_jobs=-1, cv=kfold) 120 | grid_result = grid_search.fit(X, Y) 121 | print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_)) 122 | # 输出每个参数对应分数 123 | means = grid_result.cv_results_['mean_test_score'] 124 | stds = grid_result.cv_results_['std_test_score'] 125 | params = grid_result.cv_results_['params'] 126 | y = [] 127 | for mean, stdev, param in zip(means, stds, params): 128 | print("%f (%f) with: %r" % (mean, stdev, param)) 129 | y.append(mean) 130 | plt.plot(y) 131 | plt.savefig("./output/"+param_name+".png") 132 | 133 | 134 | if __name__ == "__main__": 135 | newpath = "/home/code" 136 | os.chdir(newpath) 137 | 138 | # data_explore() 139 | 140 | # 真正开始干活 141 | p = 0.0001 142 | train = loadData(p = p) 143 | train = featureEngineer(train) 144 | # print(train.head()) 145 | 146 | # 计算模型评分 147 | # score = Score(model, train) 148 | # print("模型评分:%.2f" % score) 149 | test = loadData(p = p) 150 | test = featureEngineer(test) 151 | 152 | #训练数据预处理 153 | X_train, y_train = preprocessing(train) 154 | 155 | # xgboost 156 | print("XGBoost") 157 | max_depth = [3, 4, 5] 158 | subsample = [0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9] 159 | param_grid = dict(learning_rate = [0.1], max_depth = max_depth, subsample = subsample) 160 | tc(X_train, y_train, param_grid, "subsample") 161 | """ 162 | model = XGBClassifier() 163 | eval_set = [(X_train, y_train)] 164 | model.fit(X_train, y_train, early_stopping_rounds = 10, eval_metric = "logloss", eval_set = eval_set, verbose = True) 165 | # X_test, y_test = preprocessing(test) 166 | # y_pred = model.predict(X_test) 167 | # print(y_pred[:10]) 168 | # predictions = [round(value) for value in y_pred] 169 | # print(predictions[:10]) 170 | score(model, test, "XGBoost") 171 | fig, ax = plt.subplots(nrows = 1, ncols = 1, figsize = (10, 200)) 172 | plot_importance(model, ax = ax) 173 | plt.savefig("./output/feature_importance.png") 174 | plt.close() 175 | """ 176 | 177 | # 进行预测 178 | # predict_clf(model) 179 | -------------------------------------------------------------------------------- /nn.py: -------------------------------------------------------------------------------- 1 | # coding:utf-8 2 | # kaggle Jane Street Market Prediction代码 3 | # 神经网络及深度学习练习 4 | 5 | 6 | # 先用手撸 7 | # 参考 https://b23.tv/srXty3 8 | from numpy import array, exp, random, dot 9 | 10 | 11 | # 正向传播 12 | def fp(X, weights): 13 | z = dot(X, weights) 14 | return 1/(1+exp(-z)) 15 | 16 | 17 | # 反向传播 18 | def bp(y, output): 19 | error = y - output 20 | return error * output * (1-output) 21 | 22 | 23 | # 手撸单层神经网络 24 | def nn(): 25 | # X = array([[0,0,1], [1,1,1], [1,0,1], [0,1,1]]) 26 | # y = array([[0,1,1,0]]).T 27 | X = array([[0,0,1], [0,1,1], [1,0,1], [1,1,1]]) 28 | y = array([[0,1,1,0]]).T 29 | random.seed(1) 30 | weights = 2*random.random((3,1)) - 1 31 | for it in range(10000): 32 | output = fp(X, weights) 33 | delta = bp(y, output) 34 | weights += dot(X.T, delta) 35 | print(weights) 36 | print(fp([0, 0, 1], weights)) 37 | 38 | 39 | # 多层正向传播 40 | def mfp(X, w0, w1): 41 | l1 = 1/(1+exp(-dot(X, w0))) 42 | l2 = 1/(1+exp(-dot(l1, w1))) 43 | return l1, l2 44 | 45 | 46 | # 反向传播 47 | def mbp(l1, l2, y, w1): 48 | error = y - l2 49 | slope = l2 * (1-l2) 50 | l1_delta = error*slope 51 | 52 | l0_error = l1_delta.dot(w1.T) 53 | l0_slope = l1 * (1-l1) 54 | l0_delta = l0_error*l0_slope 55 | return l0_delta, l1_delta 56 | 57 | 58 | # 手撸多层神经网络 59 | def mnn(): 60 | # X = array([[0,0,1], [1,1,1], [1,0,1], [0,1,1]]) 61 | # y = array([[0,1,1,0]]).T 62 | X = array([[0,0,1], [0,1,1], [1,0,1], [1,1,1]]) 63 | y = array([[0,1,1,0]]).T 64 | random.seed(1) 65 | # weights = 2*random.random((3,1)) - 1 66 | w0 = 2*random.random((3, 4)) - 1 67 | w1 = 2*random.random((4, 1)) - 1 68 | for it in range(10000): 69 | l0 = X 70 | l1, l2 = mfp(X, w0, w1) 71 | l0_delta, l1_delta = mbp(l1, l2, y, w1) 72 | w1 += dot(l1.T, l1_delta) 73 | w0 += dot(l0.T, l0_delta) 74 | # print(weights) 75 | print(mfp([0, 0, 0], w0, w1)[1]) 76 | 77 | 78 | # 再尝试pytorch 79 | import torch 80 | import torch.nn as nn 81 | import torch.nn.functional as F 82 | 83 | 84 | def testTorch(): 85 | # 张量操作 86 | print("张量操作") 87 | x = torch.empty(5, 3) 88 | print(x) 89 | x = torch.rand(5, 3) 90 | print(x) 91 | x = torch.zeros(5, 3, dtype = torch.long) 92 | print(x) 93 | x = torch.tensor([5.5, 3]) 94 | print(x) 95 | x = x.new_ones(5, 3, dtype = torch.double) 96 | print(x) 97 | x = torch.randn_like(x, dtype = torch.float) 98 | print(x) 99 | print(x.size()) 100 | y = torch.rand(5, 3) 101 | print(x+y) 102 | print(torch.add(x, y)) 103 | result = torch.empty(5, 3) 104 | torch.add(x, y, out = result) 105 | print(result) 106 | y.add_(x) 107 | print(y) 108 | print(x[:, 1]) 109 | x = torch.randn(4, 4) 110 | y = x.view(16) 111 | z = x.view(-1, 8) 112 | print(x.size(), y.size(), z.size()) 113 | x = torch.randn(1) 114 | print(x) 115 | print(x.item()) 116 | # 自动微分 117 | print("自动微分") 118 | x = torch.ones(2, 2, requires_grad = True) 119 | print(x) 120 | y = x+2 121 | print(y) 122 | print(y.grad_fn) 123 | z = y*y*3 124 | out = z.mean() 125 | print(z, out) 126 | a = torch.randn(2, 2) 127 | a = ((a*3) / (a-1)) 128 | print(a.requires_grad) 129 | a.requires_grad_(True) 130 | print(a.requires_grad) 131 | b = (a*a).sum() 132 | print(b.grad_fn) 133 | out.backward() 134 | print(x.grad) 135 | x = torch.randn(3, requires_grad = True) 136 | y = x*2 137 | while y.data.norm() < 1000: 138 | y = y*2 139 | print(y) 140 | v = torch.tensor([0.1, 1.0, 0.0001], dtype = torch.float) 141 | y.backward(v) 142 | print(x.grad) 143 | print(x.requires_grad) 144 | print((x**2).requires_grad) 145 | 146 | with torch.no_grad(): 147 | print((x**2).requires_grad) 148 | 149 | 150 | if __name__ == "__main__": 151 | nn() 152 | mnn() 153 | testTorch() 154 | -------------------------------------------------------------------------------- /optuna_DP.py: -------------------------------------------------------------------------------- 1 | # coding:utf-8 2 | # kaggle Jane Street Market Prediction代码 3 | # 实际自己工作的代码 4 | # 用optuna对深度学习模型调参 5 | 6 | 7 | import numpy as np 8 | import pandas as pd 9 | pd.set_option('display.max_columns', None) 10 | import janestreet 11 | 12 | import matplotlib.pyplot as plt 13 | from sklearn.model_selection import train_test_split 14 | from sklearn import metrics 15 | from sklearn.metrics import accuracy_score 16 | import torch 17 | import torch.nn as nn 18 | import torch.optim as optim 19 | import optuna 20 | 21 | import os 22 | 23 | from FE import featureEngineer 24 | from tools import * 25 | 26 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 27 | 28 | 29 | # 建模前处理数据 30 | def preprocessing(train): 31 | X = train.loc[:, train.columns.str.contains('feature')] 32 | # y_train = train.loc[:, 'resp'] 33 | Y = train.loc[:, 'action'] 34 | 35 | x_train, x_test, y_train, y_test = train_test_split(X, Y, random_state=666, test_size=0.2) 36 | 37 | return x_train, x_test, y_train, y_test 38 | 39 | 40 | # 评分函数 41 | def Score(model, data): 42 | # test_df = pd.read_csv("/kaggle/input/jane-street-market-prediction/train.csv") 43 | data = data.fillna(-999) 44 | X_test = data.loc[:, data.columns.str.contains('feature')] 45 | resp = model.predict(X_test) 46 | date = data["date"].values 47 | weight = data["weight"].values 48 | action = (resp > 0).astype("int") 49 | 50 | count_i = len(np.unique(date)) 51 | Pi = np.zeros(count_i) 52 | # 用循环太慢 53 | #for i, day in enumerate(np.unique(date)): 54 | # Pi[i] = np.sum(weight[date == day] * resp[date == day] * action[date == day]) 55 | # 用下面这行代替 56 | Pi = np.bincount(date, weight * resp * action) 57 | t = np.sum(Pi) / np.sqrt(np.sum(Pi ** 2)) * np.sqrt(250 / count_i) 58 | u = np.clip(t, 0, 6) * np.sum(Pi) 59 | return u 60 | 61 | 62 | # 进行预测,生成提交文件,分类版 63 | def predict_clf(model): 64 | env = janestreet.make_env() 65 | iter_test = env.iter_test() 66 | for (test_df, sample_prediction_df) in iter_test: 67 | if test_df['weight'].item() > 0: 68 | test_df = featureEngineer(test_df) 69 | X_test = test_df.loc[:, test_df.columns.str.contains('feature')] 70 | X_test = X_test.fillna(0.0) 71 | y_preds = model.predict(X_test)[0] 72 | else: 73 | y_preds = 0 74 | # print(y_preds) 75 | sample_prediction_df.action = y_preds 76 | env.predict(sample_prediction_df) 77 | 78 | 79 | # 进行预测,生成提交文件,神经网络模型版 80 | def predict_nn(model): 81 | env = janestreet.make_env() 82 | iter_test = env.iter_test() 83 | for (test_df, sample_prediction_df) in iter_test: 84 | if test_df['weight'].item() > 0: 85 | # test_df = featureEngineer(test_df) 86 | X_test = test_df.loc[:, test_df.columns.str.contains('feature')] 87 | X_test = X_test.fillna(0.0) 88 | X_test_tensor = torch.from_numpy(X_test.values).float().to(device) 89 | pred = model(X_test_tensor).detach().cpu().numpy() 90 | if pred >= 0.5: 91 | y_preds = 1 92 | else: 93 | y_preds = 0 94 | else: 95 | y_preds = 0 96 | # print(y_preds) 97 | sample_prediction_df.action = y_preds 98 | env.predict(sample_prediction_df) 99 | 100 | 101 | # 获取数据 102 | def getData(): 103 | p = 0.1 104 | data = loadData(p = p) 105 | data = featureEngineer(data) 106 | # print(data.info()) 107 | 108 | #训练数据预处理 109 | x_train, x_test, y_train, y_test = preprocessing(data) 110 | 111 | return x_train, y_train, x_test, y_test 112 | 113 | 114 | # 获取模型准确率 115 | def getAccuracyRate(Model): 116 | result = [] 117 | for x in Model(x_test_tensor): 118 | if x >= 0.5: 119 | result.append(1) 120 | else: 121 | result.append(0) 122 | y_test = y_test_tensor.numpy() 123 | # print(y_test[:10]) 124 | # print(result[:10]) 125 | count = 0 126 | for i in range(len(result)): 127 | if y_test[i] == result[i]: 128 | count += 1 129 | 130 | return count/len(y_test) 131 | 132 | 133 | # 定义模型 134 | def define_model(trial): 135 | input_dim = 130 136 | hide1_dim = trial.suggest_int("hide1_dim", 100, 200) 137 | hide2_dim = trial.suggest_int("hide2_dim", 10, 200) 138 | output_dim = 1 139 | Model = nn.Sequential( 140 | nn.Linear(input_dim, hide1_dim), 141 | nn.ReLU(), 142 | nn.Linear(hide1_dim, hide2_dim), 143 | nn.Sigmoid(), 144 | nn.Linear(hide2_dim, output_dim) 145 | ) 146 | return Model 147 | 148 | 149 | # 加载数据,为避免反复读取和数据一致,用全局变量 150 | x_train, y_train, x_test, y_test = getData() 151 | x_tensor = torch.from_numpy(x_train.values).float().to(device) 152 | y_tensor = torch.from_numpy(y_train.values).float().to(device) 153 | x_test_tensor = torch.from_numpy(x_test.values).float().to(device) 154 | y_test_tensor = torch.from_numpy(y_test.values).float().to(device) 155 | 156 | 157 | # 优化目标函数 158 | @timethis 159 | def objective(trial): 160 | Model = define_model(trial).to(device) 161 | optimizer_name = trial.suggest_categorical("optimizer", ["Adam", "RMSprop", "SGD"]) 162 | lr = trial.suggest_loguniform("lr", 1e-5, 1e-1) 163 | optimizer = getattr(optim, optimizer_name)(Model.parameters(), lr=lr) 164 | n_epochs = trial.suggest_int("epochs", 50, 200) 165 | loss_fn = nn.MSELoss(reduction = "mean") 166 | 167 | # 创建训练器 168 | train_step = make_train_step(Model, loss_fn, optimizer) 169 | # losses = [] 170 | 171 | # 进行训练 172 | for epoch in range(n_epochs): 173 | # y_tensor = y_tensor.detach() 174 | loss = train_step(x_tensor, y_tensor) 175 | # losses.append(loss) 176 | accuracy = getAccuracyRate(Model) 177 | 178 | return accuracy 179 | 180 | 181 | if __name__ == "__main__": 182 | newpath = "/home/code" 183 | os.chdir(newpath) 184 | 185 | # 用optuna进行调参 186 | study = optuna.create_study(direction="maximize") 187 | study.optimize(objective, n_trials=10) 188 | 189 | print("结果:", study.best_params) 190 | print(study.best_value) 191 | print(study.best_trial) 192 | 193 | # 进行预测 194 | # predict_clf(model) 195 | -------------------------------------------------------------------------------- /optuna_test.py: -------------------------------------------------------------------------------- 1 | # coding:utf-8 2 | # kaggle Jane Street Market Prediction代码 3 | # optuna的测试代码 4 | 5 | import numpy as np 6 | import pandas as pd 7 | pd.set_option('display.max_columns', None) 8 | import os 9 | from tools import * 10 | from FE import featureEngineer 11 | 12 | import matplotlib.pyplot as plt 13 | from sklearn.model_selection import train_test_split, cross_val_score 14 | from sklearn import metrics 15 | from sklearn.metrics import accuracy_score 16 | import optuna 17 | from optuna.samplers import TPESampler 18 | 19 | # XGBoost 20 | from xgboost import XGBClassifier 21 | 22 | 23 | def objective(trial): 24 | x = trial.suggest_uniform("x", -10, 10) 25 | return (x - 2)**2 26 | 27 | 28 | def objective2(trial, x, y): 29 | train_x, test_x, train_y, test_y = train_test_split(x, y, test_size = 0.3, random_state = 101) 30 | param = { 31 | "n_estimators" : trial.suggest_int('n_estimators', 0, 1000), 32 | 'max_depth':trial.suggest_int('max_depth', 2, 25), 33 | 'learning_rate':trial.suggest_loguniform('learning_rate',0.005,0.5) 34 | } 35 | model = XGBClassifier(**param) 36 | model.fit(train_x, train_y) 37 | 38 | return cross_val_score(model,test_x,test_y).mean() 39 | 40 | 41 | # 建模前处理数据 42 | def preprocessing(train): 43 | X_train = train.loc[:, train.columns.str.contains('feature')] 44 | # y_train = train.loc[:, 'resp'] 45 | y_train = train.loc[:, 'action'] 46 | 47 | X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, random_state=666, test_size=0.2) 48 | 49 | return X_train, y_train 50 | 51 | 52 | if __name__ == "__main__": 53 | # study = optuna.create_study() 54 | # study.optimize(objective, n_trials = 100) 55 | # print("结果:", study.best_params) 56 | # print(study.best_value) 57 | # print(study.best_trial) 58 | # study.optimize(objective, n_trials = 100) 59 | # print("结果:", study.best_params) 60 | # print(study.best_value) 61 | # print(study.best_trial) 62 | 63 | newpath = "/home/code" 64 | os.chdir(newpath) 65 | 66 | # data_explore() 67 | 68 | # 真正开始干活 69 | p = 0.001 70 | train = loadData(p = p) 71 | train = featureEngineer(train) 72 | # print(train.head()) 73 | 74 | # 计算模型评分 75 | # score = Score(model, train) 76 | # print("模型评分:%.2f" % score) 77 | test = loadData(p = p) 78 | test = featureEngineer(test) 79 | 80 | #训练数据预处理 81 | X_train, y_train = preprocessing(train) 82 | 83 | # xgboost 84 | print("XGBoost") 85 | study = optuna.create_study(direction = "maximize", sampler = TPESampler()) 86 | study.optimize(lambda trial:objective2(trial, X_train, y_train), n_trials = 50) 87 | print("结果:", study.best_params) 88 | print(study.best_value) 89 | print(study.best_trial) 90 | -------------------------------------------------------------------------------- /pic/00.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zwdnet/JSMPwork/b7ae1380986abc3ee5b10bea7f9d57cbb805a8f0/pic/00.jpg -------------------------------------------------------------------------------- /pic/01.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zwdnet/JSMPwork/b7ae1380986abc3ee5b10bea7f9d57cbb805a8f0/pic/01.jpg -------------------------------------------------------------------------------- /pic/02.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zwdnet/JSMPwork/b7ae1380986abc3ee5b10bea7f9d57cbb805a8f0/pic/02.jpg -------------------------------------------------------------------------------- /pic/03.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zwdnet/JSMPwork/b7ae1380986abc3ee5b10bea7f9d57cbb805a8f0/pic/03.jpg -------------------------------------------------------------------------------- /pic/04.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zwdnet/JSMPwork/b7ae1380986abc3ee5b10bea7f9d57cbb805a8f0/pic/04.jpg -------------------------------------------------------------------------------- /pic/05.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zwdnet/JSMPwork/b7ae1380986abc3ee5b10bea7f9d57cbb805a8f0/pic/05.jpg -------------------------------------------------------------------------------- /pic/06.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zwdnet/JSMPwork/b7ae1380986abc3ee5b10bea7f9d57cbb805a8f0/pic/06.jpg -------------------------------------------------------------------------------- /pic/07.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zwdnet/JSMPwork/b7ae1380986abc3ee5b10bea7f9d57cbb805a8f0/pic/07.jpg -------------------------------------------------------------------------------- /pic/08.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zwdnet/JSMPwork/b7ae1380986abc3ee5b10bea7f9d57cbb805a8f0/pic/08.jpg -------------------------------------------------------------------------------- /pic/09.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zwdnet/JSMPwork/b7ae1380986abc3ee5b10bea7f9d57cbb805a8f0/pic/09.jpg -------------------------------------------------------------------------------- /pic/10.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zwdnet/JSMPwork/b7ae1380986abc3ee5b10bea7f9d57cbb805a8f0/pic/10.jpg -------------------------------------------------------------------------------- /preprocess.py: -------------------------------------------------------------------------------- 1 | # coding:utf-8 2 | # kaggle Jane Street Market Prediction代码 3 | # 数据探索及预处理 4 | 5 | 6 | import pandas as pd 7 | from run import * 8 | import matplotlib.pyplot as plt 9 | import dask.dataframe as dd 10 | 11 | 12 | # 初步探索花了一天 13 | @change_dir 14 | def drawData(): 15 | # data = pd.read_csv("train.csv", usecols = [0,1]) 16 | n = 2390491 17 | row_read = int(n/100) 18 | # row_read = 5 19 | # data = pd.read_csv("./train.csv", nrows = row_read) 20 | data = dd.read_csv("./train.csv") 21 | # print(data.head()) 22 | # print(data.info()) 23 | print(data.info()) 24 | print(data.columns) 25 | 26 | fig = plt.figure() 27 | plt.plot(data["weight"].values.compute()) 28 | plt.savefig("./output/weight.png") 29 | 30 | s = "resp_" 31 | for i in range(1, 5): 32 | col = s+str(i) 33 | plt.close() 34 | fig = plt.figure() 35 | plt.plot(data[col].values.compute()) 36 | plt.savefig("./output/"+col+".png") 37 | 38 | plt.close() 39 | fig = plt.figure() 40 | plt.plot(data["resp"].values.compute()) 41 | plt.savefig("./output/"+"resp"+".png") 42 | 43 | s = "feature_" 44 | for i in range(0, 130): 45 | col = s+str(i) 46 | plt.close() 47 | fig = plt.figure() 48 | plt.plot(data[col].values.compute()) 49 | plt.savefig("./output/"+col+".png") 50 | 51 | return data 52 | 53 | 54 | # 读取数据,提取前1/10做研究 55 | @change_dir 56 | def smallData(): 57 | n = 2390491 58 | row_read = int(n/10) 59 | data = pd.read_csv("./train.csv", nrows = row_read) 60 | print(data.info()) 61 | # 画图 62 | fig = plt.figure() 63 | plt.plot(data["weight"].values) 64 | plt.savefig("./output/weight_small.png") 65 | 66 | s = "resp_" 67 | for i in range(1, 5): 68 | col = s+str(i) 69 | plt.close() 70 | fig = plt.figure() 71 | plt.plot(data[col].values) 72 | plt.savefig("./output/"+col+"_small.png") 73 | 74 | plt.close() 75 | fig = plt.figure() 76 | plt.plot(data["resp"].values) 77 | plt.savefig("./output/"+"resp"+"_small.png") 78 | 79 | s = "feature_" 80 | for i in range(0, 130): 81 | col = s+str(i) 82 | plt.close() 83 | fig = plt.figure() 84 | plt.plot(data[col].values) 85 | plt.savefig("./output/"+col+"_small.png") 86 | data.to_csv("./small_train.csv") 87 | 88 | 89 | if __name__ == "__main__": 90 | data = pd.read_csv("small_train.csv") 91 | print(data.info()) 92 | -------------------------------------------------------------------------------- /py_nn.py: -------------------------------------------------------------------------------- 1 | # coding:utf-8 2 | # 《python神经网络编程》实操代码 3 | 4 | 5 | import numpy as np 6 | import scipy.special 7 | import run 8 | import pandas as pd 9 | import matplotlib.pyplot as plt 10 | import optuna 11 | import optuna.visualization as pv 12 | 13 | 14 | # 神经网络类 15 | class NN: 16 | def __init__(self, inputnodes, hiddennodes, outputnodes, learningrate): 17 | # 设置输入、隐藏和输出层维度 18 | self.inodes = inputnodes 19 | self.hnodes = hiddennodes 20 | self.onodes = outputnodes 21 | 22 | 23 | # simple random number 24 | # self.wih = (np.random.rand(self.hnodes, self.inodes) - 0.5) 25 | # self.who = (np.random.rand(self.onodes, self.hnodes) - 0.5) 26 | 27 | # Normal distribution 28 | # average = 0 29 | # Standard deviation = 1/evolution of number of nodes passed in 30 | # 用正态分布随机数初始化权重 31 | self.wih = np.random.normal(0, pow(self.hnodes, -0.5), (self.hnodes, self.inodes)) 32 | self.who = np.random.normal(0, pow(self.onodes, -0.5), (self.onodes, self.hnodes)) 33 | 34 | # 学习率 35 | self.lr = learningrate 36 | 37 | # 用sigmoid函数做激活函数 38 | self.activation_function = lambda x: scipy.special.expit(x) 39 | 40 | 41 | # 训练神经网络 42 | def train(self, inputs_list, targets_list): 43 | # 将数据转换为二维数组 44 | inputs = np.array(inputs_list, ndmin=2).T 45 | targets = np.array(targets_list, ndmin=2).T 46 | 47 | # 利用传输矩阵wih,计算隐藏层输入 48 | hidden_inputs = np.dot(self.wih, inputs) 49 | # 计算隐藏层输出,激活函数 50 | hidden_outputs = self.activation_function(hidden_inputs) 51 | # 利用传输矩阵who,计算输出层输入 52 | final_inputs = np.dot(self.who, hidden_outputs) 53 | # 用激活函数计算输出信号 54 | final_outputs = self.activation_function(final_inputs) 55 | 56 | # 计算误差值 57 | output_errors = targets - final_outputs 58 | 59 | # 按权重分配误差 60 | hidden_errors = np.dot(self.who.T, output_errors) 61 | # update the weights for the links between the hidden and output layers 62 | # wj,k = learningrate * error * sigmoid(ok) * (1 - sigmoid(ok)) · oj^T 63 | # 更新隐藏层及输出层之间的权重值 64 | self.who += self.lr * np.dot( 65 | (output_errors * final_outputs * (1.0 - final_outputs)), 66 | np.transpose(hidden_outputs)) 67 | # update the weights for the links between the input and hidden layers 68 | # 更新输入层及隐藏层之间的权重值 69 | self.wih += self.lr * np.dot( 70 | (hidden_errors * hidden_outputs * (1.0 - hidden_outputs)), 71 | np.transpose(inputs)) 72 | 73 | 74 | # 前向传播 75 | def query(self, inputs_list): 76 | # 输入矩阵 77 | inputs = np.array(inputs_list, ndmin=2).T 78 | 79 | # calculate signals into hidden layer 80 | # 利用传输矩阵wih,计算隐藏层输入 81 | hidden_inputs = np.dot(self.wih, inputs) 82 | # calculate the signals emerging from hidden layer 83 | # 计算隐藏层输出,激活函数 84 | hidden_outputs = self.activation_function(hidden_inputs) 85 | # calculate signals into final output layer 86 | # 利用传输矩阵who,计算输出层输入 87 | final_inputs = np.dot(self.who, hidden_outputs) 88 | # calculate the signals emerging from final output layer 89 | final_outputs = self.activation_function(final_inputs) 90 | 91 | return final_outputs 92 | 93 | 94 | # 加载数据 95 | def loadData(): 96 | # load the mnist training data CSV file into a list 97 | training_data_file = open("mnist_train.csv", 'r') 98 | training_data_list = training_data_file.readlines() 99 | training_data_file.close() 100 | 101 | testing_data_file = open("mnist_test.csv", 'r') 102 | testing_data_list = testing_data_file.readlines() 103 | testing_data_file.close() 104 | 105 | return training_data_list, testing_data_list 106 | 107 | 108 | # 创建模型 109 | def init_model(input_nodes, hidden_nodes, output_nodes, learning_rate): 110 | # create instance of neural network 111 | n = NN(input_nodes, hidden_nodes, output_nodes, learning_rate) 112 | 113 | return n 114 | 115 | 116 | # 训练过程 117 | def train(n, epochs, training_data_list, output_nodes): 118 | # 对训练过程进行循环 119 | for e in range(epochs): 120 | for record in training_data_list: 121 | # split the record by the ',' commas 122 | # 通过','将数分段 123 | all_values = record.split(',') 124 | # scale and shift the inputs 125 | # 将所有的像素点的值转换为0.01-1.00 126 | inputs = (np.asfarray(all_values[1:]) / 255.0 * 0.99 + 0.01) 127 | # creat the target output values 128 | # 创建标签输出值 129 | targets = np.zeros(output_nodes) + 0.01 130 | # all_values[0] is the target label for this record 131 | # 10个输出值,对应的为0.99,其他为0.01 132 | targets[int(all_values[0])] = 0.99 133 | # 传入网络进行训练 134 | n.train(inputs, targets) 135 | return n 136 | 137 | 138 | # 获取预测准确率 139 | def getScores(n, testing_data_list): 140 | # 创建一个空白的计分卡 141 | scorecard = [] 142 | # 遍历测试数据 143 | for record in testing_data_list: 144 | all_values = record.split(',') 145 | # 提取正确的标签 146 | correct_label = int(all_values[0]) 147 | # print(correct_label, 'correct label') 148 | # 读取像素值并转换 149 | inputs = (np.asfarray(all_values[1:]) / 255.0 * 0.99 + 0.01) 150 | # 通过神经网络得出结果 151 | outputs = n.query(inputs) 152 | # 结果 153 | label = np.argmax(outputs) 154 | # print(label, "network's answer") 155 | # 标签相同,计分卡加一,否则加零 156 | if (label == correct_label): 157 | scorecard.append(1) 158 | else: 159 | scorecard.append(0) 160 | # 输出计分卡 161 | # print(scorecard) 162 | # 输出分数 163 | scorecard_array = np.asarray(scorecard) 164 | 165 | return scorecard_array 166 | 167 | 168 | # 解MINST手写数字识别问题 169 | @run.change_dir 170 | @run.timethis 171 | def minst(trial): 172 | input_nodes = 784 173 | hidden_nodes = trial.suggest_categorical("hidden_dim", [50, 100, 200, 300]) 174 | output_nodes = 10 175 | # 学习率 176 | learning_rate = trial.suggest_discrete_uniform("learning_rate", 0.01, 0.81, 0.1) 177 | n = init_model(input_nodes, hidden_nodes, output_nodes, learning_rate) 178 | training_data_list, testing_data_list = loadData() 179 | # 训练 180 | epochs = trial.suggest_int("epochs:", 1, 10) 181 | n = train(n, epochs, training_data_list, output_nodes) 182 | # 测试 183 | res = getScores(n, testing_data_list) 184 | return res.sum() / res.size 185 | 186 | 187 | # 画图 188 | @run.change_dir 189 | def draw_results(study): 190 | # 优化历史 191 | plt.figure() 192 | fig = pv.plot_optimization_history(study) 193 | fig.write_image("./output/opt_his.png") 194 | plt.close() 195 | # 等高线图 196 | plt.figure() 197 | fig = pv.plot_contour(study) 198 | fig.write_image("./output/opt_contour.png") 199 | plt.close() 200 | # 经验分布图 201 | plt.figure() 202 | fig = pv.plot_edf(study) 203 | fig.write_image("./output/opt_edf.png") 204 | plt.close() 205 | # 高维参数 206 | plt.figure() 207 | fig = pv.plot_parallel_coordinate(study) 208 | fig.write_image("./output/opt_coordinate.png") 209 | plt.close() 210 | 211 | 212 | if __name__ == "__main__": 213 | input_nodes = 3 214 | hidden_nodes = 3 215 | output_nodes = 3 216 | 217 | learning_rate = 0.3 218 | 219 | # n = NN(input_nodes, hidden_nodes, output_nodes, learning_rate) 220 | # print(n.query([1.0, 0.5, -0.5])) 221 | 222 | # minst() 223 | 224 | study = optuna.create_study(direction="maximize") 225 | study.optimize(minst, n_trials=100) 226 | print("结果:", study.best_params) 227 | print(study.best_value) 228 | print(study.best_trial) 229 | if pv.is_available: 230 | print("结果作图") 231 | draw_results(study) 232 | else: 233 | print("不能作图") 234 | -------------------------------------------------------------------------------- /py_nn_back.py: -------------------------------------------------------------------------------- 1 | # coding:utf-8 2 | # 《python神经网络编程》实操代码 3 | # 反向查询看看 4 | 5 | 6 | import numpy as np 7 | import scipy.special 8 | import run 9 | import pandas as pd 10 | import matplotlib.pyplot as plt 11 | import optuna 12 | import optuna.visualization as pv 13 | import cv2 14 | import glob 15 | 16 | 17 | # 神经网络类 18 | class NN: 19 | def __init__(self, inputnodes, hiddennodes, outputnodes, learningrate): 20 | # 设置输入、隐藏和输出层维度 21 | self.inodes = inputnodes 22 | self.hnodes = hiddennodes 23 | self.onodes = outputnodes 24 | 25 | 26 | # simple random number 27 | # self.wih = (np.random.rand(self.hnodes, self.inodes) - 0.5) 28 | # self.who = (np.random.rand(self.onodes, self.hnodes) - 0.5) 29 | 30 | # Normal distribution 31 | # average = 0 32 | # Standard deviation = 1/evolution of number of nodes passed in 33 | # 用正态分布随机数初始化权重 34 | self.wih = np.random.normal(0, pow(self.hnodes, -0.5), (self.hnodes, self.inodes)) 35 | self.who = np.random.normal(0, pow(self.onodes, -0.5), (self.onodes, self.hnodes)) 36 | 37 | # 学习率 38 | self.lr = learningrate 39 | 40 | # 用sigmoid函数做激活函数 41 | self.activation_function = lambda x: scipy.special.expit(x) 42 | # 激活函数的反函数 43 | self.inverse_activation_function = lambda x: scipy.special.logit(x) 44 | 45 | 46 | # 训练神经网络 47 | def train(self, inputs_list, targets_list): 48 | # 将数据转换为二维数组 49 | inputs = np.array(inputs_list, ndmin=2).T 50 | targets = np.array(targets_list, ndmin=2).T 51 | 52 | # 利用传输矩阵wih,计算隐藏层输入 53 | hidden_inputs = np.dot(self.wih, inputs) 54 | # 计算隐藏层输出,激活函数 55 | hidden_outputs = self.activation_function(hidden_inputs) 56 | # 利用传输矩阵who,计算输出层输入 57 | final_inputs = np.dot(self.who, hidden_outputs) 58 | # 用激活函数计算输出信号 59 | final_outputs = self.activation_function(final_inputs) 60 | 61 | # 计算误差值 62 | output_errors = targets - final_outputs 63 | 64 | # 按权重分配误差 65 | hidden_errors = np.dot(self.who.T, output_errors) 66 | # update the weights for the links between the hidden and output layers 67 | # wj,k = learningrate * error * sigmoid(ok) * (1 - sigmoid(ok)) · oj^T 68 | # 更新隐藏层及输出层之间的权重值 69 | self.who += self.lr * np.dot( 70 | (output_errors * final_outputs * (1.0 - final_outputs)), 71 | np.transpose(hidden_outputs)) 72 | # update the weights for the links between the input and hidden layers 73 | # 更新输入层及隐藏层之间的权重值 74 | self.wih += self.lr * np.dot( 75 | (hidden_errors * hidden_outputs * (1.0 - hidden_outputs)), 76 | np.transpose(inputs)) 77 | 78 | 79 | # 前向传播 80 | def query(self, inputs_list): 81 | # 输入矩阵 82 | inputs = np.array(inputs_list, ndmin=2).T 83 | 84 | # calculate signals into hidden layer 85 | # 利用传输矩阵wih,计算隐藏层输入 86 | hidden_inputs = np.dot(self.wih, inputs) 87 | # calculate the signals emerging from hidden layer 88 | # 计算隐藏层输出,激活函数 89 | hidden_outputs = self.activation_function(hidden_inputs) 90 | # calculate signals into final output layer 91 | # 利用传输矩阵who,计算输出层输入 92 | final_inputs = np.dot(self.who, hidden_outputs) 93 | # calculate the signals emerging from final output layer 94 | final_outputs = self.activation_function(final_inputs) 95 | 96 | return final_outputs 97 | 98 | # 反向查询,给定输出值,看输入会是啥 99 | def backquery(self, targets_list): 100 | # 转换为垂直向量 101 | final_outputs = np.array(targets_list, ndmin = 2).T 102 | # 计算最后的输入信号,用激活函数的反函数 103 | final_inputs = self.inverse_activation_function(final_outputs) 104 | # 计算隐藏层的输出 105 | hidden_outputs = np.dot(self.who.T, final_inputs) 106 | # 归一化 107 | hidden_outputs -= np.min(hidden_outputs) 108 | hidden_outputs /= np.max(hidden_outputs) 109 | hidden_outputs *= 0.98 110 | hidden_outputs += 0.01 111 | # 计算进入隐藏层的信号 112 | hidden_inputs = self.inverse_activation_function(hidden_outputs) 113 | 114 | # 计算输入层的输出信号 115 | inputs = np.dot(self.wih.T, hidden_inputs) 116 | # 归一化 117 | inputs -= np.min(inputs) 118 | inputs /= np.max(inputs) 119 | inputs *= 0.98 120 | inputs += 0.01 121 | 122 | return inputs 123 | 124 | 125 | # 加载数据 126 | @run.change_dir 127 | def loadData(): 128 | # load the mnist training data CSV file into a list 129 | training_data_file = open("mnist_train.csv", 'r') 130 | training_data_list = training_data_file.readlines() 131 | training_data_file.close() 132 | 133 | testing_data_file = open("mnist_test.csv", 'r') 134 | testing_data_list = testing_data_file.readlines() 135 | testing_data_file.close() 136 | 137 | return training_data_list, testing_data_list 138 | 139 | 140 | # 创建模型 141 | def init_model(input_nodes, hidden_nodes, output_nodes, learning_rate): 142 | # create instance of neural network 143 | n = NN(input_nodes, hidden_nodes, output_nodes, learning_rate) 144 | 145 | return n 146 | 147 | 148 | # 训练过程 149 | def train(n, epochs, training_data_list, output_nodes): 150 | # 对训练过程进行循环 151 | for e in range(epochs): 152 | print("第{}轮".format(e)) 153 | for record in training_data_list: 154 | # split the record by the ',' commas 155 | # 通过','将数分段 156 | all_values = record.split(',') 157 | # scale and shift the inputs 158 | # 将所有的像素点的值转换为0.01-1.00 159 | inputs = (np.asfarray(all_values[1:]) / 255.0 * 0.99 + 0.01) 160 | # creat the target output values 161 | # 创建标签输出值 162 | targets = np.zeros(output_nodes) + 0.01 163 | # all_values[0] is the target label for this record 164 | # 10个输出值,对应的为0.99,其他为0.01 165 | targets[int(all_values[0])] = 0.99 166 | # 传入网络进行训练 167 | n.train(inputs, targets) 168 | return n 169 | 170 | 171 | # 获取预测准确率 172 | def getScores(n, testing_data_list): 173 | # 创建一个空白的计分卡 174 | scorecard = [] 175 | # 遍历测试数据 176 | for record in testing_data_list: 177 | all_values = record.split(',') 178 | # 提取正确的标签 179 | correct_label = int(all_values[0]) 180 | # print(correct_label, 'correct label') 181 | # 读取像素值并转换 182 | inputs = (np.asfarray(all_values[1:]) / 255.0 * 0.99 + 0.01) 183 | # 通过神经网络得出结果 184 | outputs = n.query(inputs) 185 | # 结果 186 | label = np.argmax(outputs) 187 | # print(label, "network's answer") 188 | # 标签相同,计分卡加一,否则加零 189 | if (label == correct_label): 190 | scorecard.append(1) 191 | else: 192 | scorecard.append(0) 193 | # 输出计分卡 194 | # print(scorecard) 195 | # 输出分数 196 | scorecard_array = np.asarray(scorecard) 197 | 198 | return scorecard_array 199 | 200 | 201 | # 解MINST手写数字识别问题 202 | @run.change_dir 203 | @run.timethis 204 | def minst(trial): 205 | input_nodes = 784 206 | hidden_nodes = trial.suggest_categorical("hidden_dim", [50, 100, 200, 300]) 207 | output_nodes = 10 208 | # 学习率 209 | learning_rate = trial.suggest_discrete_uniform("learning_rate", 0.01, 0.81, 0.1) 210 | n = init_model(input_nodes, hidden_nodes, output_nodes, learning_rate) 211 | training_data_list, testing_data_list = loadData() 212 | # 训练 213 | epochs = trial.suggest_int("epochs:", 1, 10) 214 | n = train(n, epochs, training_data_list, output_nodes) 215 | # 测试 216 | res = getScores(n, testing_data_list) 217 | return res.sum() / res.size 218 | 219 | 220 | # 画图 221 | @run.change_dir 222 | def draw_results(study): 223 | # 优化历史 224 | plt.figure() 225 | fig = pv.plot_optimization_history(study) 226 | fig.write_image("./output/opt_his.png") 227 | plt.close() 228 | # 等高线图 229 | plt.figure() 230 | fig = pv.plot_contour(study) 231 | fig.write_image("./output/opt_contour.png") 232 | plt.close() 233 | # 经验分布图 234 | plt.figure() 235 | fig = pv.plot_edf(study) 236 | fig.write_image("./output/opt_edf.png") 237 | plt.close() 238 | # 高维参数 239 | plt.figure() 240 | fig = pv.plot_parallel_coordinate(study) 241 | fig.write_image("./output/opt_coordinate.png") 242 | plt.close() 243 | 244 | 245 | # 手写数字识别应用 246 | # 处理输入数据 247 | @run.change_dir 248 | def data_process(): 249 | targets = [] 250 | datas = [] 251 | for file in glob.glob(r"./pic/*.png"): 252 | targets.append(int(file.split("/")[2].split(".")[0])) 253 | img_array = cv2.imread(file) 254 | img_array = cv2.resize(img_array, (28, 28)) 255 | img_array = cv2.cvtColor(img_array, cv2.COLOR_BGR2GRAY) 256 | height,width = img_array.shape 257 | dst = np.zeros((height,width),np.uint8) 258 | for i in range(height): 259 | for j in range(width): 260 | dst[i,j] = 255 - img_array[i,j] 261 | img_array = dst.reshape(784) 262 | datas.append(img_array) 263 | return (targets, datas) 264 | 265 | 266 | # 训练模型 267 | @run.timethis 268 | def trainModel(): 269 | print("开始训练") 270 | input_nodes = 784 271 | hidden_nodes = 300 272 | output_nodes = 10 273 | learning_rate = 0.11 274 | epochs = 8 275 | 276 | model = NN(input_nodes, hidden_nodes, output_nodes, learning_rate) 277 | training_data_list, _ = loadData() 278 | 279 | # 对训练过程进行循环 280 | for e in range(epochs): 281 | for record in training_data_list: 282 | # 通过','将数分段 283 | all_values = record.split(',') 284 | # 将所有的像素点的值转换为0.01-1.00 285 | inputs = (np.asfarray(all_values[1:]) / 255.0 * 0.99 + 0.01) 286 | # 创建标签输出值 287 | targets = np.zeros(output_nodes) + 0.01 288 | # 10个输出值,对应的为0.99,其他为0.01 289 | targets[int(all_values[0])] = 0.99 290 | # 传入网络进行训练 291 | model.train(inputs, targets) 292 | 293 | return model 294 | 295 | 296 | # 用模型识别实际数据 297 | def testModel(model, test_datas, targets): 298 | n = len(test_datas) 299 | correct = 0 300 | for i in range(n): 301 | # 用模型得出预测值 302 | outputs = model.query(test_datas[i]) 303 | # 转换为结果 304 | label = np.argmax(outputs) 305 | print("预测结果{},实际结果{}".format(label, targets[i])) 306 | if label == targets[i]: 307 | correct += 1 308 | 309 | return correct/n 310 | 311 | 312 | # 反向查询给定输出的输入 313 | @run.change_dir 314 | def back(model): 315 | output_nodes = 10 316 | for i in range(10): 317 | label = i 318 | targets = np.zeros(output_nodes) + 0.01 319 | targets[label] = 0.99 320 | image_data = model.backquery(targets) 321 | filename = "./output/"+str(i)+".png" 322 | print(filename) 323 | plt.figure() 324 | plt.imshow(image_data.reshape(28,28), cmap='Greys', interpolation='None') 325 | plt.savefig(filename) 326 | plt.close() 327 | 328 | 329 | if __name__ == "__main__": 330 | """ 331 | input_nodes = 3 332 | hidden_nodes = 3 333 | output_nodes = 3 334 | 335 | learning_rate = 0.3 336 | 337 | # n = NN(input_nodes, hidden_nodes, output_nodes, learning_rate) 338 | # print(n.query([1.0, 0.5, -0.5])) 339 | 340 | # minst() 341 | 342 | study = optuna.create_study(direction="maximize") 343 | study.optimize(minst, n_trials=100) 344 | print("结果:", study.best_params) 345 | print(study.best_value) 346 | print(study.best_trial) 347 | if pv.is_available: 348 | print("结果作图") 349 | draw_results(study) 350 | else: 351 | print("不能作图") 352 | """ 353 | # 具体应用模型 354 | # 目前得到的最佳参数:{'hidden_dim': 300, 'learning_rate': 0.11, 'epochs:': 9} 355 | # targets, datas = data_process() 356 | model = trainModel() 357 | # score = testModel(model, datas, targets) 358 | # print("模型预测准确率:{}".format(score)) 359 | back(model) 360 | -------------------------------------------------------------------------------- /py_nn_use.py: -------------------------------------------------------------------------------- 1 | # coding:utf-8 2 | # 《python神经网络编程》实操代码 3 | # 具体应用 4 | 5 | 6 | import numpy as np 7 | import scipy.special 8 | import run 9 | import pandas as pd 10 | import matplotlib.pyplot as plt 11 | import optuna 12 | import optuna.visualization as pv 13 | import cv2 14 | import glob 15 | 16 | 17 | # 神经网络类 18 | class NN: 19 | def __init__(self, inputnodes, hiddennodes, outputnodes, learningrate): 20 | # 设置输入、隐藏和输出层维度 21 | self.inodes = inputnodes 22 | self.hnodes = hiddennodes 23 | self.onodes = outputnodes 24 | 25 | 26 | # simple random number 27 | # self.wih = (np.random.rand(self.hnodes, self.inodes) - 0.5) 28 | # self.who = (np.random.rand(self.onodes, self.hnodes) - 0.5) 29 | 30 | # Normal distribution 31 | # average = 0 32 | # Standard deviation = 1/evolution of number of nodes passed in 33 | # 用正态分布随机数初始化权重 34 | self.wih = np.random.normal(0, pow(self.hnodes, -0.5), (self.hnodes, self.inodes)) 35 | self.who = np.random.normal(0, pow(self.onodes, -0.5), (self.onodes, self.hnodes)) 36 | 37 | # 学习率 38 | self.lr = learningrate 39 | 40 | # 用sigmoid函数做激活函数 41 | self.activation_function = lambda x: scipy.special.expit(x) 42 | 43 | 44 | # 训练神经网络 45 | def train(self, inputs_list, targets_list): 46 | # 将数据转换为二维数组 47 | inputs = np.array(inputs_list, ndmin=2).T 48 | targets = np.array(targets_list, ndmin=2).T 49 | 50 | # 利用传输矩阵wih,计算隐藏层输入 51 | hidden_inputs = np.dot(self.wih, inputs) 52 | # 计算隐藏层输出,激活函数 53 | hidden_outputs = self.activation_function(hidden_inputs) 54 | # 利用传输矩阵who,计算输出层输入 55 | final_inputs = np.dot(self.who, hidden_outputs) 56 | # 用激活函数计算输出信号 57 | final_outputs = self.activation_function(final_inputs) 58 | 59 | # 计算误差值 60 | output_errors = targets - final_outputs 61 | 62 | # 按权重分配误差 63 | hidden_errors = np.dot(self.who.T, output_errors) 64 | # update the weights for the links between the hidden and output layers 65 | # wj,k = learningrate * error * sigmoid(ok) * (1 - sigmoid(ok)) · oj^T 66 | # 更新隐藏层及输出层之间的权重值 67 | self.who += self.lr * np.dot( 68 | (output_errors * final_outputs * (1.0 - final_outputs)), 69 | np.transpose(hidden_outputs)) 70 | # update the weights for the links between the input and hidden layers 71 | # 更新输入层及隐藏层之间的权重值 72 | self.wih += self.lr * np.dot( 73 | (hidden_errors * hidden_outputs * (1.0 - hidden_outputs)), 74 | np.transpose(inputs)) 75 | 76 | 77 | # 前向传播 78 | def query(self, inputs_list): 79 | # 输入矩阵 80 | inputs = np.array(inputs_list, ndmin=2).T 81 | 82 | # calculate signals into hidden layer 83 | # 利用传输矩阵wih,计算隐藏层输入 84 | hidden_inputs = np.dot(self.wih, inputs) 85 | # calculate the signals emerging from hidden layer 86 | # 计算隐藏层输出,激活函数 87 | hidden_outputs = self.activation_function(hidden_inputs) 88 | # calculate signals into final output layer 89 | # 利用传输矩阵who,计算输出层输入 90 | final_inputs = np.dot(self.who, hidden_outputs) 91 | # calculate the signals emerging from final output layer 92 | final_outputs = self.activation_function(final_inputs) 93 | 94 | return final_outputs 95 | 96 | 97 | # 加载数据 98 | @run.change_dir 99 | def loadData(): 100 | # load the mnist training data CSV file into a list 101 | training_data_file = open("mnist_train.csv", 'r') 102 | training_data_list = training_data_file.readlines() 103 | training_data_file.close() 104 | 105 | testing_data_file = open("mnist_test.csv", 'r') 106 | testing_data_list = testing_data_file.readlines() 107 | testing_data_file.close() 108 | 109 | return training_data_list, testing_data_list 110 | 111 | 112 | # 创建模型 113 | def init_model(input_nodes, hidden_nodes, output_nodes, learning_rate): 114 | # create instance of neural network 115 | n = NN(input_nodes, hidden_nodes, output_nodes, learning_rate) 116 | 117 | return n 118 | 119 | 120 | # 训练过程 121 | def train(n, epochs, training_data_list, output_nodes): 122 | # 对训练过程进行循环 123 | for e in range(epochs): 124 | print("第{}轮".format(e)) 125 | for record in training_data_list: 126 | # split the record by the ',' commas 127 | # 通过','将数分段 128 | all_values = record.split(',') 129 | # scale and shift the inputs 130 | # 将所有的像素点的值转换为0.01-1.00 131 | inputs = (np.asfarray(all_values[1:]) / 255.0 * 0.99 + 0.01) 132 | # creat the target output values 133 | # 创建标签输出值 134 | targets = np.zeros(output_nodes) + 0.01 135 | # all_values[0] is the target label for this record 136 | # 10个输出值,对应的为0.99,其他为0.01 137 | targets[int(all_values[0])] = 0.99 138 | # 传入网络进行训练 139 | n.train(inputs, targets) 140 | return n 141 | 142 | 143 | # 获取预测准确率 144 | def getScores(n, testing_data_list): 145 | # 创建一个空白的计分卡 146 | scorecard = [] 147 | # 遍历测试数据 148 | for record in testing_data_list: 149 | all_values = record.split(',') 150 | # 提取正确的标签 151 | correct_label = int(all_values[0]) 152 | # print(correct_label, 'correct label') 153 | # 读取像素值并转换 154 | inputs = (np.asfarray(all_values[1:]) / 255.0 * 0.99 + 0.01) 155 | # 通过神经网络得出结果 156 | outputs = n.query(inputs) 157 | # 结果 158 | label = np.argmax(outputs) 159 | # print(label, "network's answer") 160 | # 标签相同,计分卡加一,否则加零 161 | if (label == correct_label): 162 | scorecard.append(1) 163 | else: 164 | scorecard.append(0) 165 | # 输出计分卡 166 | # print(scorecard) 167 | # 输出分数 168 | scorecard_array = np.asarray(scorecard) 169 | 170 | return scorecard_array 171 | 172 | 173 | # 解MINST手写数字识别问题 174 | @run.change_dir 175 | @run.timethis 176 | def minst(trial): 177 | input_nodes = 784 178 | hidden_nodes = trial.suggest_categorical("hidden_dim", [50, 100, 200, 300]) 179 | output_nodes = 10 180 | # 学习率 181 | learning_rate = trial.suggest_discrete_uniform("learning_rate", 0.01, 0.81, 0.1) 182 | n = init_model(input_nodes, hidden_nodes, output_nodes, learning_rate) 183 | training_data_list, testing_data_list = loadData() 184 | # 训练 185 | epochs = trial.suggest_int("epochs:", 1, 10) 186 | n = train(n, epochs, training_data_list, output_nodes) 187 | # 测试 188 | res = getScores(n, testing_data_list) 189 | return res.sum() / res.size 190 | 191 | 192 | # 画图 193 | @run.change_dir 194 | def draw_results(study): 195 | # 优化历史 196 | plt.figure() 197 | fig = pv.plot_optimization_history(study) 198 | fig.write_image("./output/opt_his.png") 199 | plt.close() 200 | # 等高线图 201 | plt.figure() 202 | fig = pv.plot_contour(study) 203 | fig.write_image("./output/opt_contour.png") 204 | plt.close() 205 | # 经验分布图 206 | plt.figure() 207 | fig = pv.plot_edf(study) 208 | fig.write_image("./output/opt_edf.png") 209 | plt.close() 210 | # 高维参数 211 | plt.figure() 212 | fig = pv.plot_parallel_coordinate(study) 213 | fig.write_image("./output/opt_coordinate.png") 214 | plt.close() 215 | 216 | 217 | # 手写数字识别应用 218 | # 处理输入数据 219 | @run.change_dir 220 | def data_process(): 221 | targets = [] 222 | datas = [] 223 | for file in glob.glob(r"./pic/*.jpg"): 224 | targets.append(int(file.split("/")[2].split(".")[0])) 225 | if targets[-1] == 10: 226 | targets[-1] = 0 227 | img_array = cv2.imread(file) 228 | img_array = cv2.resize(img_array, (28, 28)) 229 | img_array = cv2.cvtColor(img_array, cv2.COLOR_BGR2GRAY) 230 | height,width = img_array.shape 231 | dst = np.zeros((height,width),np.uint8) 232 | for i in range(height): 233 | for j in range(width): 234 | dst[i,j] = 255 - img_array[i,j] 235 | img_array = dst.reshape(784) 236 | datas.append(img_array) 237 | return (targets, datas) 238 | 239 | 240 | # 训练模型 241 | @run.timethis 242 | def trainModel(): 243 | print("开始训练") 244 | input_nodes = 784 245 | hidden_nodes = 300 246 | output_nodes = 10 247 | learning_rate = 0.11 248 | epochs = 8 249 | 250 | model = NN(input_nodes, hidden_nodes, output_nodes, learning_rate) 251 | training_data_list, _ = loadData() 252 | 253 | # 对训练过程进行循环 254 | for e in range(epochs): 255 | for record in training_data_list: 256 | # 通过','将数分段 257 | all_values = record.split(',') 258 | # 将所有的像素点的值转换为0.01-1.00 259 | inputs = (np.asfarray(all_values[1:]) / 255.0 * 0.99 + 0.01) 260 | # 创建标签输出值 261 | targets = np.zeros(output_nodes) + 0.01 262 | # 10个输出值,对应的为0.99,其他为0.01 263 | targets[int(all_values[0])] = 0.99 264 | # 传入网络进行训练 265 | model.train(inputs, targets) 266 | 267 | return model 268 | 269 | 270 | # 用模型识别实际数据 271 | def testModel(model, test_datas, targets): 272 | n = len(test_datas) 273 | correct = 0 274 | for i in range(n): 275 | # 用模型得出预测值 276 | outputs = model.query(test_datas[i]) 277 | # 转换为结果 278 | label = np.argmax(outputs) 279 | print("预测结果{},实际结果{}".format(label, targets[i])) 280 | if label == targets[i]: 281 | correct += 1 282 | 283 | return correct/n 284 | 285 | 286 | if __name__ == "__main__": 287 | """ 288 | input_nodes = 3 289 | hidden_nodes = 3 290 | output_nodes = 3 291 | 292 | learning_rate = 0.3 293 | 294 | # n = NN(input_nodes, hidden_nodes, output_nodes, learning_rate) 295 | # print(n.query([1.0, 0.5, -0.5])) 296 | 297 | # minst() 298 | 299 | study = optuna.create_study(direction="maximize") 300 | study.optimize(minst, n_trials=100) 301 | print("结果:", study.best_params) 302 | print(study.best_value) 303 | print(study.best_trial) 304 | if pv.is_available: 305 | print("结果作图") 306 | draw_results(study) 307 | else: 308 | print("不能作图") 309 | """ 310 | # 具体应用模型 311 | # 目前得到的最佳参数:{'hidden_dim': 300, 'learning_rate': 0.11, 'epochs:': 9} 312 | targets, datas = data_process() 313 | model = trainModel() 314 | score = testModel(model, datas, targets) 315 | print("模型预测准确率:{}".format(score)) 316 | -------------------------------------------------------------------------------- /pytorch_work.py: -------------------------------------------------------------------------------- 1 | # coding:utf-8 2 | # kaggle Jane Street Market Prediction代码 3 | # 实际自己工作的代码 4 | # 用pytorch 5 | 6 | 7 | import numpy as np 8 | import pandas as pd 9 | pd.set_option('display.max_columns', None) 10 | import janestreet 11 | 12 | import matplotlib.pyplot as plt 13 | from sklearn.model_selection import train_test_split 14 | from sklearn import metrics 15 | from sklearn.metrics import accuracy_score 16 | import torch 17 | import torch.nn as nn 18 | import torch.optim as optim 19 | 20 | import os 21 | 22 | from FE import featureEngineer 23 | from tools import * 24 | 25 | 26 | 27 | # 建模前处理数据 28 | def preprocessing(train): 29 | X_train = train.loc[:, train.columns.str.contains('feature')] 30 | # y_train = train.loc[:, 'resp'] 31 | y_train = train.loc[:, 'action'] 32 | 33 | # X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, random_state=666, test_size=0.2) 34 | 35 | return X_train, y_train 36 | 37 | 38 | # 评分函数 39 | def Score(model, data): 40 | # test_df = pd.read_csv("/kaggle/input/jane-street-market-prediction/train.csv") 41 | data = data.fillna(-999) 42 | X_test = data.loc[:, data.columns.str.contains('feature')] 43 | resp = model.predict(X_test) 44 | date = data["date"].values 45 | weight = data["weight"].values 46 | action = (resp > 0).astype("int") 47 | 48 | count_i = len(np.unique(date)) 49 | Pi = np.zeros(count_i) 50 | # 用循环太慢 51 | #for i, day in enumerate(np.unique(date)): 52 | # Pi[i] = np.sum(weight[date == day] * resp[date == day] * action[date == day]) 53 | # 用下面这行代替 54 | Pi = np.bincount(date, weight * resp * action) 55 | t = np.sum(Pi) / np.sqrt(np.sum(Pi ** 2)) * np.sqrt(250 / count_i) 56 | u = np.clip(t, 0, 6) * np.sum(Pi) 57 | return u 58 | 59 | 60 | # 进行预测,生成提交文件,分类版 61 | def predict_clf(model): 62 | env = janestreet.make_env() 63 | iter_test = env.iter_test() 64 | for (test_df, sample_prediction_df) in iter_test: 65 | if test_df['weight'].item() > 0: 66 | # test_df = featureEngineer(test_df) 67 | X_test = test_df.loc[:, test_df.columns.str.contains('feature')] 68 | X_test = X_test.fillna(0.0) 69 | y_preds = model.predict(X_test)[0] 70 | else: 71 | y_preds = 0 72 | # print(y_preds) 73 | sample_prediction_df.action = y_preds 74 | env.predict(sample_prediction_df) 75 | 76 | 77 | if __name__ == "__main__": 78 | newpath = "/home/code" 79 | os.chdir(newpath) 80 | 81 | # data_explore() 82 | 83 | # 真正开始干活 84 | p = 0.0001 85 | train = loadData(p = p) 86 | train = featureEngineer(train) 87 | print(train.info()) 88 | # print(train.head()) 89 | 90 | # 计算模型评分 91 | # score = Score(model, train) 92 | # print("模型评分:%.2f" % score) 93 | test = loadData(p = p) 94 | test = featureEngineer(test) 95 | 96 | #训练数据预处理 97 | x_train, y_train = preprocessing(train) 98 | x_test, y_test = preprocessing(test) 99 | 100 | # 深度学习 101 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 102 | 103 | x_tensor = torch.from_numpy(x_train.values).float().to(device) 104 | y_tensor = torch.from_numpy(y_train.values).float().to(device) 105 | 106 | 107 | Model = nn.Sequential( 108 | nn.Linear(130, 118), 109 | nn.ReLU(), 110 | nn.Linear(118, 142), 111 | nn.Sigmoid(), 112 | nn.Linear(142, 1) 113 | ).to(device) 114 | 115 | # model = Model(x_tensor).to(device) 116 | # print(model.state_dict()) 117 | # 设置超参数 118 | lr = 0.000678 119 | n_epochs = 110 120 | 121 | # loss_fn = nn.BCELoss(reduction='sum') 122 | loss_fn = nn.MSELoss(reduction = "mean") 123 | optimizer = optim.Adam(Model.parameters(), lr = lr) 124 | # 创建训练器 125 | train_step = make_train_step(Model, loss_fn, optimizer) 126 | losses = [] 127 | 128 | print("开始训练") 129 | # 进行训练 130 | for epoch in range(n_epochs): 131 | # y_tensor = y_tensor.detach() 132 | loss = train_step(x_tensor, y_tensor) 133 | losses.append(loss) 134 | 135 | # print(model.state_dict()) 136 | print(losses) 137 | plt.figure() 138 | plt.plot(losses) 139 | plt.savefig("./output/loss.png") 140 | # 验证模型 141 | x_test_tensor = torch.from_numpy(x_test.values).float().to(device) 142 | y_test_tensor = torch.from_numpy(y_test.values).float().to(device) 143 | result = [] 144 | for x in Model(x_test_tensor): 145 | if x >= 0.5: 146 | result.append(1) 147 | else: 148 | result.append(0) 149 | y_test = y_test_tensor.numpy() 150 | # print(len(y_test)) 151 | # print(result) 152 | count = 0 153 | for i in range(len(result)): 154 | if y_test[i] == result[i]: 155 | count += 1 156 | print(count) 157 | print("预测正确率:%f" % (count/len(y_test))) 158 | # 进行预测 159 | # predict_clf(model) 160 | -------------------------------------------------------------------------------- /run.py: -------------------------------------------------------------------------------- 1 | # coding:utf-8 2 | # 将程序上传到服务器上执行 3 | import os 4 | import sys 5 | from functools import wraps 6 | import time 7 | 8 | 9 | # 上传代码至服务器并运行 10 | def run(gpus, server): 11 | # 上传本目录所有文件再执行指定文件 12 | if gpus == "all": 13 | # 清除服务器代码目录里所有源文件以及输出目录中的文件 14 | s = "ssh ubuntu@" + server + " \"sudo rm -rf ~/code/*.py\"" 15 | os.system(s) 16 | s = "ssh ubuntu@" + server + " \"sudo rm -rf ~/code/output/*\"" 17 | os.system(s) 18 | # 将本地目录所有文件上传至容器 19 | s = "scp -r ./*.py ubuntu@" + server + ":~/code" 20 | os.system(s) 21 | # 运行指定代码 22 | s = "ssh root@" + server + " -p 2222 \"python /home/code/" + sys.argv[2] + "\"" 23 | print("正在运行代码……\n") 24 | os.system(s) 25 | # 将代码目录里所有输出文件传回 26 | s = "scp -r ubuntu@" + server + ":~/code/output/* ./output/" 27 | os.system(s) 28 | # 将所有结果文件传回 29 | elif gpus == "copy": 30 | s = "scp -r ubuntu@" + server + ":~/code/output/* ./output/" 31 | os.system(s) 32 | # 上传指定文件并执行 33 | else: 34 | ## 清除服务器代码目录里所有源文件以及输出目录中的文件 35 | s = "ssh ubuntu@" + server + " \"sudo rm -rf ~/code/*.py\"" 36 | os.system(s) 37 | s = "ssh ubuntu@" + server + " \"sudo rm -rf ~/code/output/*\"" 38 | os.system(s) 39 | # 将本地目录指定文件上传至容器 40 | s = "scp " + sys.argv[1] + " ubuntu@" + server + ":~/code" 41 | os.system(s) 42 | # 运行指定代码 43 | s = "ssh root@" + server + " -p 2222 \"python /home/code/" + sys.argv[1] + "\"" 44 | os.system(s) 45 | # 将代码目录里所有文件传回 46 | s = "scp -r ubuntu@" + server + ":~/code/output/* ./output/" 47 | os.system(s) 48 | 49 | 50 | if __name__ == "__main__": 51 | gpus = sys.argv[1] 52 | # 读取服务器IP地址,自己编辑serverIP.txt去 53 | with open("serverIP.txt", "rt") as f: 54 | server = f.read() 55 | run(gpus, server) 56 | 57 | 58 | # 工具函数,在上传到服务器上运行时改变当前目录 59 | def change_dir(func): 60 | @wraps(func) 61 | def change(*args, **kwargs): 62 | oldpath = os.getcwd() 63 | newpath = "/home/code/" 64 | os.chdir(newpath) 65 | r = func(*args, **kwargs) 66 | os.chdir(oldpath) 67 | return r 68 | return change 69 | 70 | 71 | # 工具函数,计算函数运行时间 72 | def timethis(func): 73 | @wraps(func) 74 | def wrapper(*args, **kwargs): 75 | start = time.perf_counter() 76 | r = func(*args, **kwargs) 77 | end = time.perf_counter() 78 | print('{}.{}的运行时间为 : {}秒'.format(func.__module__, func.__name__, end - start)) 79 | return r 80 | return wrapper 81 | -------------------------------------------------------------------------------- /tc/FE.py: -------------------------------------------------------------------------------- 1 | # coding:utf-8 2 | # kaggle竞赛Jane Street Market Prediction 3 | # 特征工程代码 4 | 5 | 6 | from run import * 7 | from tools import * 8 | import pandas as pd 9 | import matplotlib.pyplot as plt 10 | 11 | 12 | """ 13 | # 特征工程 14 | @change_dir 15 | def featureEngineer(data): 16 | tages = pd.DataFrame() 17 | tagename = feature.columns 18 | for i in range(29): 19 | # tagename = "tag_" + str(i) 20 | # tages[tagename[i+1]] = feature[(feature[tagename[i+1]] == True)].iloc[:, i+1] 21 | #print(tages[i]) 22 | temp = feature["feature"][feature[tagename[i+1]] == True] 23 | temp.name = tagename[i+1] 24 | print(temp) 25 | #print(tages) 26 | # 填充空值 27 | print(data.isnull().sum()) 28 | for col in data.columns: 29 | mean_val = data[col].mean() 30 | data[col].fillna(mean_val, inplace=True) 31 | print(data.isnull().sum()) 32 | # 处理feature_0 33 | feature_0 = data["feature_0"].cumsum() 34 | plt.plot(feature_0) 35 | plt.savefig("./output/cumf_0.png") 36 | plt.close() 37 | data["feature_0"] = feature_0 38 | # print(feature_0) 39 | return data 40 | """ 41 | # 特征工程 42 | def featureEngineer(data): 43 | # data = data[data['weight'] != 0] 44 | data = data.fillna(0.0) 45 | weight = data['weight'].values 46 | resp = data['resp'].values 47 | data['action'] = ((weight * resp) > 0).astype('int') 48 | return data 49 | 50 | 51 | 52 | 53 | if __name__ == "__main__": 54 | train, feature = loadData() 55 | # feature = feature[feature == True] 56 | print(feature) 57 | train = featureEngineer(train) 58 | -------------------------------------------------------------------------------- /tc/optuna_DP.py: -------------------------------------------------------------------------------- 1 | # coding:utf-8 2 | # kaggle Jane Street Market Prediction代码 3 | # 实际自己工作的代码 4 | # 用optuna对深度学习模型调参 5 | 6 | 7 | import numpy as np 8 | import pandas as pd 9 | pd.set_option('display.max_columns', None) 10 | import janestreet 11 | 12 | import matplotlib.pyplot as plt 13 | from sklearn.model_selection import train_test_split 14 | from sklearn import metrics 15 | from sklearn.metrics import accuracy_score 16 | import torch 17 | import torch.nn as nn 18 | import torch.optim as optim 19 | import optuna 20 | 21 | import os 22 | 23 | from FE import featureEngineer 24 | from tools import * 25 | 26 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 27 | 28 | 29 | # 建模前处理数据 30 | def preprocessing(train): 31 | X = train.loc[:, train.columns.str.contains('feature')] 32 | # y_train = train.loc[:, 'resp'] 33 | Y = train.loc[:, 'action'] 34 | 35 | x_train, x_test, y_train, y_test = train_test_split(X, Y, random_state=666, test_size=0.2) 36 | 37 | return x_train, x_test, y_train, y_test 38 | 39 | 40 | # 评分函数 41 | def Score(model, data): 42 | # test_df = pd.read_csv("/kaggle/input/jane-street-market-prediction/train.csv") 43 | data = data.fillna(-999) 44 | X_test = data.loc[:, data.columns.str.contains('feature')] 45 | resp = model.predict(X_test) 46 | date = data["date"].values 47 | weight = data["weight"].values 48 | action = (resp > 0).astype("int") 49 | 50 | count_i = len(np.unique(date)) 51 | Pi = np.zeros(count_i) 52 | # 用循环太慢 53 | #for i, day in enumerate(np.unique(date)): 54 | # Pi[i] = np.sum(weight[date == day] * resp[date == day] * action[date == day]) 55 | # 用下面这行代替 56 | Pi = np.bincount(date, weight * resp * action) 57 | t = np.sum(Pi) / np.sqrt(np.sum(Pi ** 2)) * np.sqrt(250 / count_i) 58 | u = np.clip(t, 0, 6) * np.sum(Pi) 59 | return u 60 | 61 | 62 | # 进行预测,生成提交文件,分类版 63 | def predict_clf(model): 64 | env = janestreet.make_env() 65 | iter_test = env.iter_test() 66 | for (test_df, sample_prediction_df) in iter_test: 67 | if test_df['weight'].item() > 0: 68 | test_df = featureEngineer(test_df) 69 | X_test = test_df.loc[:, test_df.columns.str.contains('feature')] 70 | X_test = X_test.fillna(0.0) 71 | y_preds = model.predict(X_test)[0] 72 | else: 73 | y_preds = 0 74 | # print(y_preds) 75 | sample_prediction_df.action = y_preds 76 | env.predict(sample_prediction_df) 77 | 78 | 79 | # 获取数据 80 | def getData(): 81 | p = 0.1 82 | data = loadData(p = p) 83 | data = featureEngineer(data) 84 | # print(data.info()) 85 | 86 | #训练数据预处理 87 | x_train, x_test, y_train, y_test = preprocessing(data) 88 | 89 | return x_train, y_train, x_test, y_test 90 | 91 | 92 | # 获取模型准确率 93 | def getAccuracyRate(Model): 94 | result = [] 95 | for x in Model(x_test_tensor): 96 | if x >= 0.5: 97 | result.append(1) 98 | else: 99 | result.append(0) 100 | y_test = y_test_tensor.numpy() 101 | # print(y_test[:10]) 102 | # print(result[:10]) 103 | count = 0 104 | for i in range(len(result)): 105 | if y_test[i] == result[i]: 106 | count += 1 107 | 108 | return count/len(y_test) 109 | 110 | 111 | # 定义模型 112 | def define_model(trial): 113 | input_dim = 130 114 | hide1_dim = trial.suggest_int("hide1_dim", 100, 200) 115 | hide2_dim = trial.suggest_int("hide2_dim", 10, 200) 116 | output_dim = 1 117 | Model = nn.Sequential( 118 | nn.Linear(input_dim, hide1_dim), 119 | nn.ReLU(), 120 | nn.Linear(hide1_dim, hide2_dim), 121 | nn.Sigmoid(), 122 | nn.Linear(hide2_dim, output_dim) 123 | ) 124 | return Model 125 | 126 | 127 | # 加载数据,为避免反复读取和数据一致,用全局变量 128 | x_train, y_train, x_test, y_test = getData() 129 | x_tensor = torch.from_numpy(x_train.values).float().to(device) 130 | y_tensor = torch.from_numpy(y_train.values).float().to(device) 131 | x_test_tensor = torch.from_numpy(x_test.values).float().to(device) 132 | y_test_tensor = torch.from_numpy(y_test.values).float().to(device) 133 | 134 | 135 | # 优化目标函数 136 | @timethis 137 | def objective(trial): 138 | Model = define_model(trial).to(device) 139 | optimizer_name = trial.suggest_categorical("optimizer", ["Adam", "RMSprop", "SGD"]) 140 | lr = trial.suggest_loguniform("lr", 1e-5, 1e-1) 141 | optimizer = getattr(optim, optimizer_name)(Model.parameters(), lr=lr) 142 | n_epochs = trial.suggest_int("epochs", 50, 200) 143 | loss_fn = nn.MSELoss(reduction = "mean") 144 | 145 | # 创建训练器 146 | train_step = make_train_step(Model, loss_fn, optimizer) 147 | # losses = [] 148 | 149 | # 进行训练 150 | for epoch in range(n_epochs): 151 | # y_tensor = y_tensor.detach() 152 | loss = train_step(x_tensor, y_tensor) 153 | # losses.append(loss) 154 | accuracy = getAccuracyRate(Model) 155 | 156 | return accuracy 157 | 158 | 159 | if __name__ == "__main__": 160 | newpath = "/home/code" 161 | os.chdir(newpath) 162 | 163 | # 用optuna进行调参 164 | study = optuna.create_study(direction="maximize") 165 | study.optimize(objective, n_trials=10) 166 | 167 | print("结果:", study.best_params) 168 | print(study.best_value) 169 | print(study.best_trial) 170 | 171 | # 进行预测 172 | # predict_clf(model) 173 | -------------------------------------------------------------------------------- /tc/run.py: -------------------------------------------------------------------------------- 1 | # coding:utf-8 2 | # 将程序上传到服务器上执行 3 | import os 4 | import sys 5 | from functools import wraps 6 | import time 7 | 8 | 9 | # 上传代码至服务器并运行 10 | def run(gpus, server): 11 | # 上传本目录所有文件再执行指定文件 12 | if gpus == "all": 13 | # 清除服务器代码目录里所有源文件以及输出目录中的文件 14 | s = "ssh ubuntu@" + server + " \"sudo rm -rf ~/code/*.py\"" 15 | os.system(s) 16 | s = "ssh ubuntu@" + server + " \"sudo rm -rf ~/code/output/*\"" 17 | os.system(s) 18 | # 将本地目录所有文件上传至容器 19 | s = "scp -r ./*.py ubuntu@" + server + ":~/code" 20 | os.system(s) 21 | # 运行指定代码 22 | s = "ssh root@" + server + " -p 2222 \"python /home/code/" + sys.argv[2] + "\"" 23 | print("正在运行代码……\n") 24 | os.system(s) 25 | # 将代码目录里所有输出文件传回 26 | s = "scp -r ubuntu@" + server + ":~/code/output/* ./output/" 27 | os.system(s) 28 | # 将所有结果文件传回 29 | elif gpus == "copy": 30 | s = "scp -r ubuntu@" + server + ":~/code/output/* ./output/" 31 | os.system(s) 32 | # 上传指定文件并执行 33 | else: 34 | ## 清除服务器代码目录里所有源文件以及输出目录中的文件 35 | s = "ssh ubuntu@" + server + " \"sudo rm -rf ~/code/*.py\"" 36 | os.system(s) 37 | s = "ssh ubuntu@" + server + " \"sudo rm -rf ~/code/output/*\"" 38 | os.system(s) 39 | # 将本地目录指定文件上传至容器 40 | s = "scp " + sys.argv[1] + " ubuntu@" + server + ":~/code" 41 | os.system(s) 42 | # 运行指定代码 43 | s = "ssh root@" + server + " -p 2222 \"python /home/code/" + sys.argv[1] + "\"" 44 | os.system(s) 45 | # 将代码目录里所有文件传回 46 | s = "scp -r ubuntu@" + server + ":~/code/output/* ./output/" 47 | os.system(s) 48 | 49 | 50 | if __name__ == "__main__": 51 | gpus = sys.argv[1] 52 | # 读取服务器IP地址,自己编辑serverIP.txt去 53 | with open("serverIP.txt", "rt") as f: 54 | server = f.read() 55 | run(gpus, server) 56 | 57 | 58 | # 工具函数,在上传到服务器上运行时改变当前目录 59 | def change_dir(func): 60 | @wraps(func) 61 | def change(*args, **kwargs): 62 | oldpath = os.getcwd() 63 | newpath = "/home/code/" 64 | os.chdir(newpath) 65 | r = func(*args, **kwargs) 66 | os.chdir(oldpath) 67 | return r 68 | return change 69 | 70 | 71 | # 工具函数,计算函数运行时间 72 | def timethis(func): 73 | @wraps(func) 74 | def wrapper(*args, **kwargs): 75 | start = time.perf_counter() 76 | r = func(*args, **kwargs) 77 | end = time.perf_counter() 78 | print('{}.{}的运行时间为 : {}秒'.format(func.__module__, func.__name__, end - start)) 79 | return r 80 | return wrapper 81 | -------------------------------------------------------------------------------- /tc/tools.py: -------------------------------------------------------------------------------- 1 | # coding:utf-8 2 | # kaggle竞赛Jane Street Market Prediction 3 | # 工具函数 4 | 5 | from run import * 6 | import pandas as pd 7 | import matplotlib.pyplot as plt 8 | import numpy as np 9 | from sklearn.model_selection import cross_val_score, learning_curve 10 | from sklearn.metrics import classification_report, roc_curve, auc 11 | 12 | 13 | # 载入数据 14 | @change_dir 15 | def loadData(p = 0.01): 16 | # 抽样,读取1%数据 17 | # 参考https://mp.weixin.qq.com/s/2LSKnN9R-N-I2HcHePT9zA 18 | train = pd.read_csv("./train.csv", skiprows = lambda x: x>0 and np.random.rand() > p) 19 | # feature = pd.read_csv("./features.csv") 20 | return train 21 | 22 | 23 | # 对模型进行交叉验证 24 | def cross_val(model, X, Y, cv = 10): 25 | scores = cross_val_score(model, X, Y, cv=cv) 26 | score = scores.mean() 27 | return score 28 | 29 | 30 | # 模型评估 31 | def evalution(model, X, y_true): 32 | # X = test.loc[:, test.columns.str.contains("feature")].values 33 | # y_true = test.action.values 34 | y_pred = model.predict(X) 35 | target_names = ["1", "0"] 36 | result = classification_report(y_true, y_pred, target_names = target_names, output_dict = False ) 37 | return result 38 | 39 | 40 | # 对模型评分 41 | @timethis 42 | def score(model, test, modelName): 43 | if modelName == "XGBoost": 44 | X = test.loc[:, test.columns.str.contains("feature")] 45 | Y = test.action 46 | else: 47 | X = test.loc[:, test.columns.str.contains("feature")].values 48 | Y = test.action.values 49 | model_score = model.score(X, Y) 50 | cross_score = cross_val(model, X, Y) 51 | report = evalution(model, X, Y) 52 | print("模型评分:", model_score) 53 | print("交叉验证:", cross_score) 54 | print("模型评估:\n", report) 55 | Roc(model, X, Y, modelName) 56 | Lc(model, modelName, X, Y) 57 | 58 | 59 | # 画roc曲线 60 | @change_dir 61 | def Roc(model, X, Y, modelName): 62 | y_label = Y 63 | y_pred = model.predict(X) 64 | fpr, tpr, thersholds = roc_curve(y_label, y_pred) 65 | 66 | roc_auc = auc(fpr, tpr) 67 | 68 | plt.plot(fpr, tpr, 'k--', label = "ROC (area = {0:.2f})".format(roc_auc), lw = 2) 69 | plt.tick_params(axis='x', labelsize=15) 70 | plt.tick_params(axis='y', labelsize=15) 71 | plt.xlim([-0.05, 1.05]) 72 | plt.ylim([-0.05, 1.05]) 73 | plt.xlabel("False Positive Rate") 74 | plt.ylabel("True Positive Rate") 75 | plt.title(modelName + " ROC Curve") 76 | plt.legend(loc = "best") 77 | plt.savefig("./output/" + modelName + "_ROC.png") 78 | 79 | 80 | # 画学习曲线 81 | @change_dir 82 | def Lc(model, modelName, X, y, ylim = None, cv = None, n_jobs = 1, train_sizes = np.linspace(0.1, 1.0, 5), verbose = 0): 83 | plt.figure() 84 | plt.title(modelName+" Learning Curve") 85 | if ylim is not None: 86 | plt.ylim(*ylim) 87 | plt.xlabel("Training Samples") 88 | plt.ylabel("Score") 89 | train_sizes, train_scores, test_scores = learning_curve(model, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes) 90 | train_scores_mean = np.mean(train_scores, axis = 1) 91 | train_scores_std = np.std(train_scores, axis = 1) 92 | test_scores_mean = np.mean(test_scores, axis = 1) 93 | test_scores_std = np.std(test_scores, axis = 1) 94 | plt.grid() 95 | 96 | plt.fill_between(train_sizes, train_scores_mean - train_scores_std,train_scores_mean + train_scores_std, alpha=0.1, color="r") 97 | plt.fill_between(train_sizes,test_scores_mean - test_scores_std, test_scores_mean + test_scores_std, alpha=0.1, color="g") 98 | plt.plot(train_sizes, train_scores_mean, 'o-', color="r", label="Training score") 99 | plt.plot(train_sizes, test_scores_mean, 'o-', color="g", label="Cross-validation score") 100 | 101 | plt.legend(loc="best") 102 | plt.savefig("./output/" + modelName + "_Learning Curve.png") 103 | 104 | 105 | # 工具函数,返回神经网络训练的每一步 106 | def make_train_step(model, loss_fn, optimizer): 107 | # 执行在循环中训练过程 108 | def train_step(x, y): 109 | # 设置训练模式 110 | model.train() 111 | # 梯度置零 112 | optimizer.zero_grad() 113 | # 预测 114 | yhat = model(x) 115 | # print(yhat[:10]) 116 | # 计算损失 117 | # print("测试") 118 | yhat = yhat.squeeze(-1) 119 | # print(yhat.shape, y.shape) 120 | loss = loss_fn(yhat, y) 121 | # 计算梯度 122 | loss.backward() 123 | # 更新参数,梯度置零 124 | optimizer.step() 125 | # 返回损失值 126 | return loss.item() 127 | 128 | # 返回在训练循环中调用的函数 129 | return train_step 130 | 131 | 132 | -------------------------------------------------------------------------------- /test_dt.py: -------------------------------------------------------------------------------- 1 | # coding:utf-8 2 | # kaggle Jane Street Market Prediction代码 3 | # 测试datatable的代码 4 | 5 | 6 | import datatable as dt 7 | import pandas as pd 8 | from run import * 9 | 10 | 11 | # 测试计时函数 12 | @change_dir 13 | @timethis 14 | def testtime(): 15 | print(3) 16 | sum = 0 17 | N = 1000 18 | for i in range(N): 19 | for j in range(N): 20 | sum += i*j 21 | print("sum = {}".format(sum)) 22 | 23 | 24 | # 读取数据 25 | @change_dir 26 | @timethis 27 | def testread(): 28 | train_df = dt.fread("./train.csv") 29 | print(train_df.shape) 30 | print(train_df.info()) 31 | print(train_df.describe()) 32 | print(train_df.sum()) 33 | 34 | 35 | if __name__ == "__main__": 36 | testread() 37 | -------------------------------------------------------------------------------- /test_pytorch.py: -------------------------------------------------------------------------------- 1 | # coding:utf-8 2 | # kaggle Jane Street Market Prediction代码 3 | # 学习pytorch 4 | # 参考https://pytorch.apachecn.org/docs/1.0/pytorch_with_examples.html 5 | 6 | 7 | from run import * 8 | import matplotlib.pyplot as plt 9 | 10 | # 用numpy实现 11 | import numpy as np 12 | 13 | 14 | # 前向传播 15 | def fp_np(x, w1, w2): 16 | # 向前传播,计算预测值 17 | h = x.dot(w1) 18 | h_relu = np.maximum(h, 0) 19 | y_pred = h_relu.dot(w2) 20 | return y_pred, h_relu, h 21 | 22 | 23 | # 反向传播 24 | def bp_np(x, y, y_pred, h_relu, h, w1, w2): 25 | grad_y_pred = 2.0*(y_pred - y) 26 | grad_w2 = h_relu.T.dot(grad_y_pred) 27 | grad_h_relu = grad_y_pred.dot(w2.T) 28 | grad_h = grad_h_relu.copy() 29 | grad_h[h < 0] = 0 30 | grad_w1 = x.T.dot(grad_h) 31 | return w1, w2 32 | 33 | 34 | def nn_numpy(): 35 | print("numy版神经网络") 36 | # N是批大小;D_in是输入维度 37 | # H是隐藏层维度;D_out是输出维度 38 | N, D_in, H, D_out = 64, 1000, 100, 10 39 | 40 | # 产生随机输入和输出数据 41 | x = np.random.randn(N, D_in) 42 | y = np.random.randn(N, D_out) 43 | print(len(x)) 44 | print(len(y)) 45 | 46 | # 随机初始化权重 47 | w1 = np.random.randn(D_in, H) 48 | w2 = np.random.randn(H, D_out) 49 | learning_rate = 1e-6 50 | 51 | for t in range(500): 52 | # 向前传播,计算预测值 53 | y_pred, h_relu, h = fp_np(x, w1, w2) 54 | 55 | # 计算并显示loss(损失) 56 | loss = np.square(y_pred - y).sum() 57 | # print(t, loss) 58 | 59 | # 反向传播,计算w1,w2对loss的梯度 60 | grad_w1, grad_w2 = bp_np(x, y, y_pred, h_relu, h, w1, w2) 61 | 62 | # 更新权重 63 | w1 -= learning_rate * grad_w1 64 | w2 -= learning_rate * grad_w2 65 | 66 | x_test = np.random.randn(N, D_in) 67 | print(fp_np(x_test, w1, w2)) 68 | 69 | 70 | # 用pytorch实现 71 | import torch 72 | import torch.nn as nn 73 | import torch.utils.data as Data 74 | from torch.utils.data import Dataset, TensorDataset, DataLoader 75 | import torch.optim as optim 76 | from torchviz import make_dot 77 | 78 | 79 | def nn_pytorch(): 80 | print("pytorch版神经网络") 81 | N, D_in, H, D_out = 64, 1000, 100, 10 82 | x = torch.randn(N, D_in, device=device) 83 | y = torch.randn(N, D_out, device=device) 84 | 85 | # 产生随机权重tensor 86 | w1 = torch.randn(D_in, H, device=device, requires_grad=True) 87 | w2 = torch.randn(H, D_out, device=device, requires_grad=True) 88 | 89 | learning_rate = 1e-6 90 | for t in range(500): 91 | # 前向传播,自动计算梯度 92 | y_pred = x.mm(w1).clamp(min = 0).mm(w2) 93 | # 计算并输出loss 94 | loss = (y_pred - y).pow(2).sum() 95 | print(t, loss.item()) 96 | # 反向传播 97 | loss.backward() 98 | 99 | # 更新权重,不自动计算梯度 100 | with torch.no_grad(): 101 | w1 -= learning_rate * w1.grad 102 | w2 -= learning_rate * w2.grad 103 | 104 | # 梯度置零 105 | w1.grad.zero_() 106 | w2.grad.zero_() 107 | x_test = torch.randn(N, D_in) 108 | print(x_test.mm(w1).clamp(min = 0).mm(w2)) 109 | 110 | 111 | # 用pytorch.nn实现 112 | def nn_torch_nn(): 113 | print("pytorch_nn版神经网络") 114 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 115 | N, D_in, H, D_out = 64, 1000, 100, 10 116 | x = torch.randn(N, D_in, device=device) 117 | y = torch.randn(N, D_out, device=device) 118 | 119 | model = torch.nn.Sequential( 120 | torch.nn.Linear(D_in, H), 121 | torch.nn.ReLU(), 122 | torch.nn.Linear(H, D_out), 123 | ).to(device) 124 | 125 | loss_fn = torch.nn.MSELoss(reduction = "sum") 126 | 127 | learning_rate = 1e-4 128 | for t in range(500): 129 | y_pred = model(x) 130 | loss = loss_fn(y_pred, y) 131 | print(t, loss.item()) 132 | model.zero_grad() 133 | loss.backward() 134 | with torch.no_grad(): 135 | for param in model.parameters(): 136 | param.data -= learning_rate * param.grad 137 | 138 | x_test = torch.randn(N, D_in) 139 | print(model(x_test)) 140 | 141 | 142 | # Pytorch实现二分类器 143 | def pytorch_class(): 144 | class ClassifyModel(nn.Module): 145 | def __init__(self, input_dim, hide_dim, output_dim): 146 | super(ClassifyModel, self).__init__() 147 | self.linear1 = nn.Linear(input_dim, hide_dim) 148 | self.linear2 = nn.Linear(hide_dim, output_dim) 149 | 150 | def forward(self, x): 151 | hidden = self.linear1(x) 152 | activate = torch.relu(hidden) 153 | output = self.linear2(activate) 154 | return output 155 | 156 | # 准备数据 157 | x = torch.unsqueeze(torch.linspace(-10, 10, 50), 1) 158 | y = torch.cat((torch.ones(25), torch.zeros(25))).type(torch.LongTensor) 159 | print(x) 160 | print(y) 161 | dataset = Data.TensorDataset(x, y) 162 | dataloader = Data.DataLoader(dataset=dataset, batch_size=5, shuffle=True) 163 | model = ClassifyModel(1, 10, 2) 164 | model2 = torch.nn.Sequential( 165 | nn.Linear(1, 10), 166 | nn.ReLU(), 167 | nn.Linear(10, 2), 168 | ) 169 | 170 | optim = torch.optim.Adam(model2.parameters(), lr=0.0001) 171 | loss_fn = nn.CrossEntropyLoss() 172 | 173 | for e in range(1000): 174 | epoch_loss = 0 175 | epoch_acc = 0 176 | for i, (x, y) in enumerate(dataloader): 177 | optim.zero_grad() 178 | out = model2(x) 179 | loss = loss_fn(out, y) 180 | 181 | loss.backward() 182 | optim.step() 183 | 184 | epoch_loss += loss.data 185 | epoch_acc += get_acc(out, y) 186 | 187 | if e % 200 == 0: 188 | print('epoch: %d, loss: %f, acc: %f' % (e, epoch_loss / 50, epoch_acc / 50)) 189 | 190 | x_test = torch.unsqueeze(torch.linspace(-2, 2, 10), 1) 191 | print(x_test) 192 | y_pred = (model2(x_test)) 193 | print(y_pred) 194 | 195 | 196 | def get_acc(outputs, labels): 197 | _, predict = torch.max(outputs.data, 1) 198 | total_num = labels.shape[0]*1.0 199 | correct_num = (labels == predict).sum().item() 200 | acc = correct_num / total_num 201 | return acc 202 | 203 | 204 | # 新的尝试 205 | # https://towardsdatascience.com/understanding-pytorch-with-an-example-a-step-by-step-tutorial-81fc5f8c4e8e 206 | @change_dir 207 | def new_try(): 208 | # 1.一个简单的回归问题 209 | # 生成数据 210 | np.random.seed(42) 211 | x = np.random.rand(100, 1) 212 | y = 1 + 2*x + 0.1*np.random.randn(100, 1) 213 | # 打乱顺序 214 | idx = np.arange(100) 215 | np.random.shuffle(idx) 216 | # 使用前80个数据做训练集 217 | train_idx = idx[:80] 218 | # 剩下的做验证集 219 | val_idx = idx[80:] 220 | x_train, y_train = x[train_idx], y[train_idx] 221 | x_test, y_test = x[val_idx], y[val_idx] 222 | plt.figure() 223 | plt.scatter(x_train, y_train) 224 | plt.savefig("./output/train.png") 225 | plt.close() 226 | plt.figure() 227 | plt.scatter(x_test, y_test) 228 | plt.savefig("./output/test.png") 229 | plt.close() 230 | 231 | # 2.梯度下降 232 | # 第一步,计算损失值loss 233 | # 对于回归问题,用平均方差 234 | # Mean Square Error (MSE) 235 | # 第二步,计算梯度 236 | # 即当我们轻微变动两个参数a,b时MSE如何变化 237 | # 第三步,更新参数 238 | # 第四步,用新的参数重新进行上述步骤 239 | # 这个过程就是训练模型的过程 240 | 241 | # 3.使用numpy进行线性回归 242 | # 初始化步骤有两步 243 | # ①随机初始化参数和权重 244 | np.random.seed(42) 245 | a = np.random.randn(1) 246 | b = np.random.randn(1) 247 | print(a, b) 248 | # ②初始化超参数 249 | lr = 1e-1 250 | n_epochs = 1000 251 | 252 | # 训练过程 253 | for epoch in range(n_epochs): 254 | # 计算模型预测值:前向传播 255 | yhat = a + b*x_train 256 | # 计算损失值 257 | error = (y_train - yhat) 258 | loss = (error**2).mean() 259 | # 计算每个参数的梯度值 260 | a_grad = -2*error.mean() 261 | b_grad = -2*(x_train*error).mean() 262 | # 使用梯度和学习率更新参数 263 | a -= lr*a_grad 264 | b -= lr*b_grad 265 | 266 | print(a, b) 267 | 268 | # 检查一下对不对 269 | from sklearn.linear_model import LinearRegression 270 | linr = LinearRegression() 271 | linr.fit(x_train, y_train) 272 | print(linr.intercept_, linr.coef_[0]) 273 | 274 | # 4.使用pytorch 275 | # 张量tensor,有三个或更多的维度 276 | # 加载数据 277 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 278 | x_train_tensor = torch.from_numpy(x_train).float().to(device) 279 | y_train_tensor = torch.from_numpy(y_train).float().to(device) 280 | print(type(x_train), type(x_train_tensor), x_train_tensor.type()) 281 | # 创建参数 282 | # 第一种方法 283 | a = torch.randn(1, requires_grad = True, dtype = torch.float) 284 | b = torch.randn(1, requires_grad = True, dtype = torch.float) 285 | print(a, b) 286 | # 第二种方法 287 | a = torch.randn(1, requires_grad = True, dtype = torch.float).to(device) 288 | b = torch.randn(1, requires_grad = True, dtype = torch.float).to(device) 289 | print(a, b) 290 | # 第三种方法 291 | a = torch.randn(1, dtype = torch.float).to(device) 292 | b = torch.randn(1, dtype = torch.float).to(device) 293 | a.requires_grad_() 294 | b.requires_grad_() 295 | print(a, b) 296 | # 创建时即确定 297 | torch.manual_seed(42) 298 | a = torch.randn(1, requires_grad=True, dtype=torch.float, device=device) 299 | b = torch.randn(1, requires_grad=True, dtype=torch.float, device=device) 300 | print(a, b) 301 | 302 | # 5.自动梯度 303 | lr = 1e-1 304 | n_epochs = 1000 305 | 306 | for epoch in range(n_epochs): 307 | yhat = a + b*x_train_tensor 308 | error = y_train_tensor - yhat 309 | loss = (error**2).mean() 310 | 311 | # 不用自己手动计算梯度了 312 | loss.backward() 313 | # print(a.grad) 314 | # print(b.grad) 315 | 316 | # 更新参数,这时不需要自动计算梯度 317 | with torch.no_grad(): 318 | a -= lr*a.grad 319 | b -= lr*b.grad 320 | 321 | # 将梯度置零,使过程继续 322 | a.grad.zero_() 323 | b.grad.zero_() 324 | 325 | print(a, b) 326 | 327 | # 6.动态计算图 328 | torch.manual_seed(42) 329 | a = torch.randn(1, requires_grad=True, dtype=torch.float, device=device) 330 | b = torch.randn(1, requires_grad=True, dtype=torch.float, device=device) 331 | yhat = a + b*x_train_tensor 332 | error = y_train_tensor - yhat 333 | loss = (error**2).mean() 334 | graph = make_dot(yhat) 335 | # graph.view("./output/yhat") 336 | 337 | # 7.优化 338 | torch.manual_seed(42) 339 | a = torch.randn(1, requires_grad=True, dtype=torch.float, device=device) 340 | b = torch.randn(1, requires_grad=True, dtype=torch.float, device=device) 341 | print(a, b) 342 | 343 | lr = 1e-1 344 | n_epochs = 1000 345 | 346 | optimizer = optim.SGD([a, b], lr = lr) 347 | for epoch in range(n_epochs): 348 | yhat = a + b*x_train_tensor 349 | error = y_train_tensor - yhat 350 | loss = (error**2).mean() 351 | 352 | # 不用自己手动计算梯度了 353 | loss.backward() 354 | 355 | # 也不用自己手动更新参数了 356 | optimizer.step() 357 | # 也不用手动将梯度归零 358 | optimizer.zero_grad() 359 | 360 | print(a, b) 361 | 362 | # 8.损失函数 loss 363 | # pytorch提供了很多损失函数计算方法 364 | # 还可以通过reduction参数来决定如何聚合单个神经节的损失。 365 | torch.manual_seed(42) 366 | a = torch.randn(1, requires_grad=True, dtype=torch.float, device=device) 367 | b = torch.randn(1, requires_grad=True, dtype=torch.float, device=device) 368 | print(a, b) 369 | 370 | lr = 1e-1 371 | n_epochs = 1000 372 | 373 | # 使用pytorch的损失函数 374 | loss_fn = nn.MSELoss(reduction='mean') 375 | 376 | optimizer = optim.SGD([a, b], lr = lr) 377 | for epoch in range(n_epochs): 378 | yhat = a + b*x_train_tensor 379 | # error = y_train_tensor - yhat 380 | # loss = (error**2).mean() 381 | # 不用自己算中间值了 382 | loss = loss_fn(y_train_tensor, yhat) 383 | 384 | # 不用自己手动计算梯度了 385 | loss.backward() 386 | 387 | # 也不用自己手动更新参数了 388 | optimizer.step() 389 | # 也不用手动将梯度归零 390 | optimizer.zero_grad() 391 | 392 | print(a, b) 393 | 394 | # 9.模型 395 | # 在pytorch中模型是继承自Module的一个类 396 | # 至少要实现__init__,初始化参数 397 | # 和forward,是实际计算过程,给定参数x, 398 | # 输出预测。 399 | # 使用model(x)来做出预测 400 | # 模型和数据应该在同一设备中 401 | class ManualLinearRegression(nn.Module): 402 | def __init__(self): 403 | super().__init__() 404 | """使用nn.Parameter使a,b成为模型真正的参数,可以通过parameters()获得参数列表,还可以通过state_dict()获得所有参数的当前值""" 405 | self.a = nn.Parameter(torch.randn(1, requires_grad = True, dtype = torch.float)) 406 | self.b = nn.Parameter(torch.randn(1, requires_grad = True, dtype = torch.float)) 407 | 408 | def forward(self, x): 409 | # 实际的计算过程 410 | return self.a + self.b*x 411 | 412 | # 使用模型 413 | 414 | torch.manual_seed(42) 415 | 416 | #创建模型并传到相关设备上 417 | model = ManualLinearRegression().to(device) 418 | # 输出模型参数状态 419 | print(model.state_dict()) 420 | 421 | lr = 1e-1 422 | n_epochs = 1000 423 | 424 | loss_fn = nn.MSELoss(reduction = "mean") 425 | optimizer = optim.SGD(model.parameters(), lr = lr) 426 | 427 | for epoch in range(n_epochs): 428 | # 这里不是训练,只是开启训练模式 429 | # 因为有的模型会使用诸如Dropout等 430 | # 它们在训练阶段和评估阶段的行为不同 431 | model.train() 432 | 433 | # 不用手动计算了 434 | yhat = model(x_train_tensor) 435 | 436 | loss = loss_fn(y_train_tensor, yhat) 437 | loss.backward() 438 | optimizer.step() 439 | optimizer.zero_grad() 440 | 441 | print(model.state_dict()) 442 | 443 | #嵌套模型 nested models 444 | class LayerLinearRegression(nn.Module): 445 | def __init__(self): 446 | super().__init__() 447 | self.linear = nn.Linear(1, 1) 448 | 449 | def forward(self, x): 450 | return self.linear(x) 451 | 452 | # 使用模型 453 | 454 | torch.manual_seed(42) 455 | 456 | #创建模型并传到相关设备上 457 | model = LayerLinearRegression().to(device) 458 | # 输出模型参数状态 459 | print(model.state_dict()) 460 | 461 | lr = 1e-1 462 | n_epochs = 1000 463 | 464 | loss_fn = nn.MSELoss(reduction = "mean") 465 | optimizer = optim.SGD(model.parameters(), lr = lr) 466 | 467 | for epoch in range(n_epochs): 468 | # 这里不是训练,只是开启训练模式 469 | # 因为有的模型会使用诸如Dropout等 470 | # 它们在训练阶段和评估阶段的行为不同 471 | model.train() 472 | 473 | # 不用手动计算了 474 | yhat = model(x_train_tensor) 475 | 476 | loss = loss_fn(y_train_tensor, yhat) 477 | loss.backward() 478 | optimizer.step() 479 | optimizer.zero_grad() 480 | 481 | print(model.state_dict()) 482 | 483 | # 序列模型Sequential Models 484 | # 为了不用新建一个类 485 | #对于前馈模型,前一层输出可以作为后层的输入 486 | model = nn.Sequential(nn.Linear(1, 1)).to(device) 487 | 488 | # 可以写一个函数封装固定的训练过程 489 | def make_train_step(model, loss_fn, optimizer): 490 | # 执行在循环中训练过程 491 | def train_step(x, y): 492 | # 设置训练模式 493 | model.train() 494 | # 预测 495 | yhat = model(x) 496 | # 计算损失 497 | loss = loss_fn(y, yhat) 498 | # 计算梯度 499 | loss.backward() 500 | # 更新参数,梯度置零 501 | optimizer.step() 502 | optimizer.zero_grad() 503 | # 返回损失值 504 | return loss.item() 505 | 506 | # 返回在训练循环中调用的函数 507 | return train_step 508 | 509 | torch.manual_seed(42) 510 | 511 | #创建模型并传到相关设备上 512 | model = LayerLinearRegression().to(device) 513 | # 输出模型参数状态 514 | print(model.state_dict()) 515 | 516 | lr = 1e-1 517 | n_epochs = 1000 518 | 519 | loss_fn = nn.MSELoss(reduction = "mean") 520 | optimizer = optim.SGD(model.parameters(), lr = lr) 521 | train_step = make_train_step(model, loss_fn, optimizer) 522 | losses = [] 523 | 524 | for epoch in range(n_epochs): 525 | loss = train_step(x_train_tensor, y_train_tensor) 526 | losses.append(loss) 527 | 528 | print(model.state_dict()) 529 | 530 | # 10.数据集 dataset 531 | # 代表继承自Dataset的一个类 532 | # 可看成一个tuples列表,每个tuple代表一个(特征,标签)点 533 | # 数据很大时,建议在需要时再加载,用__get_item__ 534 | class CustomDataset(Dataset): 535 | # 用csv文件或tensor输入 536 | def __init__(self, x_tensor, y_tensor): 537 | self.x = x_tensor 538 | self.y = y_tensor 539 | 540 | def __getitem__(self, index): 541 | return (self.x[index], self.y[index]) 542 | 543 | def __len__(self): 544 | return len(self.x) 545 | 546 | x_train_tensor = torch.from_numpy(x_train).float() 547 | y_train_tensor = torch.from_numpy(y_train).float() 548 | 549 | train_data = CustomDataset(x_train_tensor, y_train_tensor) 550 | print(train_data[0]) 551 | # 如果一个数据集是一对张量,可以用TensorDataset 552 | train_data = TensorDataset(x_train_tensor, y_train_tensor) 553 | print(train_data[0]) 554 | # 别把所有训练数据都放到GPU里,太占显存了 555 | # 创建数据集的目的是可以使用DataLoader 556 | 557 | # 11.加载数据DataLoader 558 | # 对于大数据集,在训练中只加载一部分 559 | train_loader = DataLoader(dataset = train_data, batch_size = 16, shuffle = True) 560 | 561 | # 使用 562 | losses = [] 563 | train_step = make_train_step(model, loss_fn, optimizer) 564 | 565 | for epoch in range(n_epochs): 566 | for x_batch, y_batch in train_loader: 567 | x_batch = x_batch.to(device) 568 | y_batch = y_batch.to(device) 569 | 570 | loss = train_step(x_batch, y_batch) 571 | losses.append(loss) 572 | 573 | print(model.state_dict()) 574 | # 随机划分训练_验证集 575 | x_tensor = torch.from_numpy(x).float() 576 | y_tensor = torch.from_numpy(y).float() 577 | 578 | dataset = TensorDataset(x_tensor, y_tensor) 579 | 580 | train_dataset, val_dataset = Data.dataset.random_split(dataset, [80, 20]) 581 | 582 | train_loader = DataLoader(dataset = train_dataset, batch_size = 16) 583 | val_loader = DataLoader(dataset = val_dataset, batch_size = 20) 584 | 585 | # 12.评估 586 | losses = [] 587 | val_losses = [] 588 | train_step = make_train_step(model, loss_fn, optimizer) 589 | 590 | for epoch in range(n_epochs): 591 | for x_batch, y_batch in train_loader: 592 | x_batch = x_batch.to(device) 593 | y_batch = y_batch.to(device) 594 | 595 | loss = train_step(x_batch, y_batch) 596 | losses.append(loss) 597 | 598 | with torch.no_grad(): 599 | for x_val, y_val in val_loader: 600 | x_val = x_val.to(device) 601 | y_val = y_val.to(device) 602 | 603 | # 将模型置为评估阶段 604 | model.eval() 605 | 606 | yhat = model(x_val) 607 | val_loss = loss_fn(y_val, yhat) 608 | val_losses.append(val_loss.item()) 609 | 610 | print(model.state_dict()) 611 | 612 | 613 | if __name__ == "__main__": 614 | # nn_numpy() 615 | # nn_pytorch() 616 | # nn_torch_nn() 617 | # pytorch_class() 618 | # print(torch.__version__) 619 | # print(torch.version.cuda) 620 | new_try() -------------------------------------------------------------------------------- /test_work.py: -------------------------------------------------------------------------------- 1 | # coding:utf-8 2 | # kaggle Jane Street Market Prediction代码 3 | # 实际自己工作的代码 4 | 5 | 6 | import numpy as np 7 | import pandas as pd 8 | pd.set_option('display.max_columns', None) 9 | import janestreet 10 | 11 | import matplotlib.pyplot as plt 12 | from sklearn.model_selection import train_test_split 13 | from sklearn import metrics 14 | from sklearn.metrics import accuracy_score 15 | import optuna 16 | # 逻辑回归 17 | from sklearn.linear_model import LinearRegression, LogisticRegression 18 | # 支持向量机 19 | from sklearn.svm import SVC, LinearSVC 20 | # 随机森林 21 | from sklearn.ensemble import RandomForestClassifier 22 | # KNN算法 23 | from sklearn.neighbors import KNeighborsClassifier 24 | # 朴素贝叶斯算法 25 | from sklearn.naive_bayes import GaussianNB 26 | # SGD算法 27 | from sklearn.linear_model import SGDClassifier 28 | # 决策树算法 29 | from sklearn.tree import DecisionTreeClassifier 30 | 31 | import os 32 | 33 | from EDA import data_explore 34 | from FE import featureEngineer 35 | from tools import * 36 | 37 | 38 | 39 | # 建模前处理数据 40 | def preprocessing(train): 41 | X_train = train.loc[:, train.columns.str.contains('feature')] 42 | # y_train = train.loc[:, 'resp'] 43 | y_train = train.loc[:, 'action'] 44 | 45 | X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, random_state=666, test_size=0.2) 46 | 47 | return X_train, y_train 48 | 49 | 50 | # 评分函数 51 | def Score(model, data): 52 | # test_df = pd.read_csv("/kaggle/input/jane-street-market-prediction/train.csv") 53 | data = data.fillna(-999) 54 | X_test = data.loc[:, data.columns.str.contains('feature')] 55 | resp = model.predict(X_test) 56 | date = data["date"].values 57 | weight = data["weight"].values 58 | action = (resp > 0).astype("int") 59 | 60 | count_i = len(np.unique(date)) 61 | Pi = np.zeros(count_i) 62 | # 用循环太慢 63 | #for i, day in enumerate(np.unique(date)): 64 | # Pi[i] = np.sum(weight[date == day] * resp[date == day] * action[date == day]) 65 | # 用下面这行代替 66 | Pi = np.bincount(date, weight * resp * action) 67 | t = np.sum(Pi) / np.sqrt(np.sum(Pi ** 2)) * np.sqrt(250 / count_i) 68 | u = np.clip(t, 0, 6) * np.sum(Pi) 69 | return u 70 | 71 | 72 | # 进行预测,生成提交文件,求值版 73 | def predict_value(model): 74 | env = janestreet.make_env() 75 | iter_test = env.iter_test() 76 | for (test_df, sample_prediction_df) in iter_test: 77 | if test_df['weight'].item() > 0: 78 | test_df = featureEngineer(test_df) 79 | X_test = test_df.loc[:, test_df.columns.str.contains('feature')] 80 | # X_test = X_test.fillna(-999) 81 | y_resp = model.predict(X_test)[0] 82 | y_preds = 0 if y_resp < 0 else 1 83 | else: 84 | y_preds = 0 85 | # print(y_preds) 86 | sample_prediction_df.action = y_preds 87 | env.predict(sample_prediction_df) 88 | 89 | 90 | # 进行预测,生成提交文件,分类版 91 | def predict_clf(model): 92 | env = janestreet.make_env() 93 | iter_test = env.iter_test() 94 | for (test_df, sample_prediction_df) in iter_test: 95 | if test_df['weight'].item() > 0: 96 | test_df = featureEngineer(test_df) 97 | X_test = test_df.loc[:, test_df.columns.str.contains('feature')] 98 | X_test = X_test.fillna(0.0) 99 | y_preds = model.predict(X_test)[0] 100 | else: 101 | y_preds = 0 102 | # print(y_preds) 103 | sample_prediction_df.action = y_preds 104 | env.predict(sample_prediction_df) 105 | 106 | 107 | if __name__ == "__main__": 108 | newpath = "/home/code" 109 | os.chdir(newpath) 110 | 111 | # data_explore() 112 | 113 | # 真正开始干活 114 | p = 0.01 115 | train = loadData(p = p) 116 | train = featureEngineer(train) 117 | # print(train.head()) 118 | 119 | # 计算模型评分 120 | # score = Score(model, train) 121 | # print("模型评分:%.2f" % score) 122 | test = loadData(p = p) 123 | test = featureEngineer(test) 124 | 125 | #训练数据预处理 126 | X_train, y_train = preprocessing(train) 127 | 128 | # 逻辑回归 129 | print("逻辑回归") 130 | model = LogisticRegression(max_iter = 3000) 131 | model.fit(X_train, y_train) 132 | score(model, test, "Logist") 133 | 134 | # 支持向量机 135 | print("支持向量机") 136 | model = SVC() 137 | model.fit(X_train, y_train) 138 | score(model, test, "SVC") 139 | 140 | # 随机森林 141 | print("随机森林") 142 | model = RandomForestClassifier() 143 | model.fit(X_train, y_train) 144 | score(model, test, "RandomForest") 145 | 146 | # knn 147 | print("knn") 148 | model = KNeighborsClassifier(n_neighbors = 2) 149 | model.fit(X_train, y_train) 150 | score(model, test, "knn") 151 | 152 | # 朴素贝叶斯 153 | print("朴素贝叶斯") 154 | model = GaussianNB() 155 | model.fit(X_train, y_train) 156 | score(model, test, "Bayes") 157 | 158 | # SGD算法 159 | print("SGD算法") 160 | model = SGDClassifier() 161 | model.fit(X_train, y_train) 162 | score(model, test, "SGD") 163 | 164 | # 决策树 165 | print("决策树算法") 166 | model = DecisionTreeClassifier() 167 | model.fit(X_train, y_train) 168 | score(model, test, "DecisionTree") 169 | # 进行预测 170 | # predict_clf(model) 171 | -------------------------------------------------------------------------------- /tools.py: -------------------------------------------------------------------------------- 1 | # coding:utf-8 2 | # kaggle竞赛Jane Street Market Prediction 3 | # 工具函数 4 | 5 | from run import * 6 | import pandas as pd 7 | import matplotlib.pyplot as plt 8 | import numpy as np 9 | from sklearn.model_selection import cross_val_score, learning_curve 10 | from sklearn.metrics import classification_report, roc_curve, auc 11 | 12 | 13 | # 载入数据 14 | @change_dir 15 | def loadData(p = 0.01): 16 | # 抽样,读取1%数据 17 | # 参考https://mp.weixin.qq.com/s/2LSKnN9R-N-I2HcHePT9zA 18 | train = pd.read_csv("./train.csv", skiprows = lambda x: x>0 and np.random.rand() > p) 19 | # feature = pd.read_csv("./features.csv") 20 | return train 21 | 22 | 23 | # 对模型进行交叉验证 24 | def cross_val(model, X, Y, cv = 10): 25 | scores = cross_val_score(model, X, Y, cv=cv) 26 | score = scores.mean() 27 | return score 28 | 29 | 30 | # 模型评估 31 | def evalution(model, X, y_true): 32 | # X = test.loc[:, test.columns.str.contains("feature")].values 33 | # y_true = test.action.values 34 | y_pred = model.predict(X) 35 | target_names = ["1", "0"] 36 | result = classification_report(y_true, y_pred, target_names = target_names, output_dict = False ) 37 | return result 38 | 39 | 40 | # 对模型评分 41 | @timethis 42 | def score(model, test, modelName): 43 | if modelName == "XGBoost": 44 | X = test.loc[:, test.columns.str.contains("feature")] 45 | Y = test.action 46 | else: 47 | X = test.loc[:, test.columns.str.contains("feature")].values 48 | Y = test.action.values 49 | model_score = model.score(X, Y) 50 | cross_score = cross_val(model, X, Y) 51 | report = evalution(model, X, Y) 52 | print("模型评分:", model_score) 53 | print("交叉验证:", cross_score) 54 | print("模型评估:\n", report) 55 | Roc(model, X, Y, modelName) 56 | Lc(model, modelName, X, Y) 57 | 58 | 59 | # 画roc曲线 60 | @change_dir 61 | def Roc(model, X, Y, modelName): 62 | y_label = Y 63 | y_pred = model.predict(X) 64 | fpr, tpr, thersholds = roc_curve(y_label, y_pred) 65 | 66 | roc_auc = auc(fpr, tpr) 67 | 68 | plt.plot(fpr, tpr, 'k--', label = "ROC (area = {0:.2f})".format(roc_auc), lw = 2) 69 | plt.tick_params(axis='x', labelsize=15) 70 | plt.tick_params(axis='y', labelsize=15) 71 | plt.xlim([-0.05, 1.05]) 72 | plt.ylim([-0.05, 1.05]) 73 | plt.xlabel("False Positive Rate") 74 | plt.ylabel("True Positive Rate") 75 | plt.title(modelName + " ROC Curve") 76 | plt.legend(loc = "best") 77 | plt.savefig("./output/" + modelName + "_ROC.png") 78 | 79 | 80 | # 画学习曲线 81 | @change_dir 82 | def Lc(model, modelName, X, y, ylim = None, cv = None, n_jobs = 1, train_sizes = np.linspace(0.1, 1.0, 5), verbose = 0): 83 | plt.figure() 84 | plt.title(modelName+" Learning Curve") 85 | if ylim is not None: 86 | plt.ylim(*ylim) 87 | plt.xlabel("Training Samples") 88 | plt.ylabel("Score") 89 | train_sizes, train_scores, test_scores = learning_curve(model, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes) 90 | train_scores_mean = np.mean(train_scores, axis = 1) 91 | train_scores_std = np.std(train_scores, axis = 1) 92 | test_scores_mean = np.mean(test_scores, axis = 1) 93 | test_scores_std = np.std(test_scores, axis = 1) 94 | plt.grid() 95 | 96 | plt.fill_between(train_sizes, train_scores_mean - train_scores_std,train_scores_mean + train_scores_std, alpha=0.1, color="r") 97 | plt.fill_between(train_sizes,test_scores_mean - test_scores_std, test_scores_mean + test_scores_std, alpha=0.1, color="g") 98 | plt.plot(train_sizes, train_scores_mean, 'o-', color="r", label="Training score") 99 | plt.plot(train_sizes, test_scores_mean, 'o-', color="g", label="Cross-validation score") 100 | 101 | plt.legend(loc="best") 102 | plt.savefig("./output/" + modelName + "_Learning Curve.png") 103 | 104 | 105 | # 工具函数,返回神经网络训练的每一步 106 | def make_train_step(model, loss_fn, optimizer): 107 | # 执行在循环中训练过程 108 | def train_step(x, y): 109 | # 设置训练模式 110 | model.train() 111 | # 梯度置零 112 | optimizer.zero_grad() 113 | # 预测 114 | yhat = model(x) 115 | # print(yhat[:10]) 116 | # 计算损失 117 | # print("测试") 118 | yhat = yhat.squeeze(-1) 119 | # print(yhat.shape, y.shape) 120 | loss = loss_fn(yhat, y) 121 | # 计算梯度 122 | loss.backward() 123 | # 更新参数,梯度置零 124 | optimizer.step() 125 | # 返回损失值 126 | return loss.item() 127 | 128 | # 返回在训练循环中调用的函数 129 | return train_step 130 | 131 | 132 | -------------------------------------------------------------------------------- /works.py: -------------------------------------------------------------------------------- 1 | # coding:utf-8 2 | # kaggle Jane Street Market Prediction代码 3 | # 实际自己工作的代码 4 | 5 | 6 | import numpy as np 7 | import pandas as pd 8 | pd.set_option('display.max_columns', None) 9 | import janestreet 10 | 11 | import matplotlib.pyplot as plt 12 | from sklearn.model_selection import train_test_split 13 | from sklearn import metrics 14 | from sklearn.metrics import accuracy_score 15 | import optuna 16 | from sklearn.linear_model import LinearRegression, LogisticRegression 17 | 18 | import os 19 | 20 | from EDA import data_explore 21 | from FE import featureEngineer 22 | 23 | 24 | 25 | # 建模过程 26 | def modeling(train): 27 | print("开始建模") 28 | X_train = train.loc[:, train.columns.str.contains('feature')] 29 | # y_train = train.loc[:, 'resp'] 30 | y_train = train.loc[:, 'action'] 31 | 32 | X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, random_state=666, test_size=0.2) 33 | # model = LinearRegression() 34 | model = LogisticRegression() 35 | model.fit(X_train, y_train) 36 | 37 | return model 38 | 39 | 40 | # 评分函数 41 | def Score(model, data): 42 | # test_df = pd.read_csv("/kaggle/input/jane-street-market-prediction/train.csv") 43 | data = data.fillna(-999) 44 | X_test = data.loc[:, data.columns.str.contains('feature')] 45 | resp = model.predict(X_test) 46 | date = data["date"].values 47 | weight = data["weight"].values 48 | action = (resp > 0).astype("int") 49 | 50 | count_i = len(np.unique(date)) 51 | Pi = np.zeros(count_i) 52 | # 用循环太慢 53 | #for i, day in enumerate(np.unique(date)): 54 | # Pi[i] = np.sum(weight[date == day] * resp[date == day] * action[date == day]) 55 | # 用下面这行代替 56 | Pi = np.bincount(date, weight * resp * action) 57 | t = np.sum(Pi) / np.sqrt(np.sum(Pi ** 2)) * np.sqrt(250 / count_i) 58 | u = np.clip(t, 0, 6) * np.sum(Pi) 59 | return u 60 | 61 | 62 | # 进行预测,生成提交文件,求值版 63 | def predict_value(model): 64 | env = janestreet.make_env() 65 | iter_test = env.iter_test() 66 | for (test_df, sample_prediction_df) in iter_test: 67 | if test_df['weight'].item() > 0: 68 | X_test = test_df.loc[:, test_df.columns.str.contains('feature')] 69 | X_test = X_test.fillna(-999) 70 | y_resp = model.predict(X_test)[0] 71 | y_preds = 0 if y_resp < 0 else 1 72 | else: 73 | y_preds = 0 74 | # print(y_preds) 75 | sample_prediction_df.action = y_preds 76 | env.predict(sample_prediction_df) 77 | 78 | 79 | # 进行预测,生成提交文件,分类版 80 | def predict_clf(model): 81 | env = janestreet.make_env() 82 | iter_test = env.iter_test() 83 | for (test_df, sample_prediction_df) in iter_test: 84 | if test_df['weight'].item() > 0: 85 | X_test = test_df.loc[:, test_df.columns.str.contains('feature')] 86 | X_test = X_test.fillna(-999) 87 | y_preds = model.predict(X_test)[0] 88 | else: 89 | y_preds = 0 90 | # print(y_preds) 91 | sample_prediction_df.action = y_preds 92 | env.predict(sample_prediction_df) 93 | 94 | 95 | if __name__ == "__main__": 96 | newpath = "/home/code" 97 | os.chdir(newpath) 98 | 99 | # data_explore() 100 | 101 | # 真正开始干活 102 | train = pd.read_csv("./train.csv", nrows = 10000) 103 | feature = pd.read_csv("./features.csv") 104 | train = featureEngineer(train) 105 | model = modeling(train) 106 | # 计算模型评分 107 | # score = Score(model, train) 108 | # print("模型评分:%.2f" % score) 109 | 110 | # 进行预测 111 | predict_clf(model) 112 | --------------------------------------------------------------------------------