├── .gitignore
├── DL
    ├── __pycache__
    │   └── run.cpython-37.pyc
    ├── backward.py
    ├── improve.py
    ├── learning.py
    ├── nn.py
    ├── perceptron.py
    ├── run.py
    └── sample_weight.pkl
├── DP.py
├── EDA.py
├── FE.py
├── LSTM.py
├── LSTM_work.py
├── MyFrame.py
├── NNDL
    ├── minst.py
    ├── mnist.pkl.gz
    └── run.py
├── README
├── X.npy
├── Y.npy
├── __pycache__
    ├── run.cpython-37.pyc
    ├── run.cpython-38.pyc
    └── tools.cpython-38.pyc
├── copy_jsmp.py
├── hello.py
├── hidegpu
    ├── FE.py
    ├── nohup.out
    ├── optuna_test.py
    └── tools.py
├── janestreet
    ├── __init__.py
    ├── __pycache__
    │   ├── __init__.cpython-37.pyc
    │   └── __init__.cpython-38.pyc
    └── competition.cpython-37m-x86_64-linux-gnu.so
├── jsmp.py
├── jsmp_local.py
├── myxgboost.py
├── nn.py
├── optuna_DP.py
├── optuna_test.py
├── pic
    ├── 00.jpg
    ├── 01.jpg
    ├── 02.jpg
    ├── 03.jpg
    ├── 04.jpg
    ├── 05.jpg
    ├── 06.jpg
    ├── 07.jpg
    ├── 08.jpg
    ├── 09.jpg
    └── 10.jpg
├── preprocess.py
├── py_nn.py
├── py_nn_back.py
├── py_nn_use.py
├── pytorch_work.py
├── run.py
├── tc
    ├── FE.py
    ├── optuna_DP.py
    ├── run.py
    └── tools.py
├── test_dt.py
├── test_pytorch.py
├── test_work.py
├── tools.py
└── works.py


/.gitignore:
--------------------------------------------------------------------------------
1 | output
2 | *.csv
3 | serverIP.txt
4 | 


--------------------------------------------------------------------------------
/DL/__pycache__/run.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zwdnet/JSMPwork/b7ae1380986abc3ee5b10bea7f9d57cbb805a8f0/DL/__pycache__/run.cpython-37.pyc


--------------------------------------------------------------------------------
/DL/backward.py:
--------------------------------------------------------------------------------
  1 | # coding:utf-8
  2 | # kaggle Jane Street Market Prediction代码
  3 | # 《深度学习入门:基于python的理论与实现》
  4 | # 第五章 误差反向传播法
  5 | 
  6 | 
  7 | import numpy as np
  8 | import matplotlib.pyplot as plt
  9 | import run
 10 | import pandas as pd
 11 | from PIL import Image
 12 | import random
 13 | import pickle
 14 | from collections import OrderedDict
 15 | 
 16 | 
 17 | # 乘法层的实现
 18 | class MulLayer:
 19 |     def __init__(self):
 20 |         self.x = None
 21 |         self.y = None
 22 |         
 23 |     def forward(self, x, y):
 24 |         self.x = x
 25 |         self.y = y
 26 |         out = x*y
 27 |         return out
 28 |         
 29 |     def backward(self, dout):
 30 |         dx = dout*self.y # 翻转x和y
 31 |         dy = dout*self.x
 32 |         
 33 |         return dx, dy
 34 |         
 35 |         
 36 | def testMul():
 37 |     apple = 100
 38 |     apple_num = 2
 39 |     tax = 1.1
 40 |     
 41 |     mul_apple_layer = MulLayer()
 42 |     mul_tax_layer = MulLayer()
 43 |     
 44 |     # 前向传播
 45 |     apple_price = mul_apple_layer.forward(apple, apple_num)
 46 |     price = mul_tax_layer.forward(apple_price, tax)
 47 |     print(price)
 48 |     
 49 |     # 反向传播
 50 |     dprice = 1
 51 |     dapple_price, dtax = mul_tax_layer.backward(dprice)
 52 |     dapple, dapple_num = mul_apple_layer.backward(dapple_price)
 53 |     print(dapple_price, dtax, dapple, dapple_num)
 54 |     
 55 |     
 56 | # 加法层实现
 57 | class AddLayer:
 58 |     def __init__(self):
 59 |         pass
 60 |         
 61 |     def forward(self, x, y):
 62 |         out = x+y
 63 |         return out
 64 |         
 65 |     def backward(self, dout):
 66 |         dx = dout*1
 67 |         dy = dout*1
 68 |         return dx, dy
 69 |         
 70 |         
 71 | def testAdd():
 72 |     apple = 100
 73 |     apple_num = 2
 74 |     orange = 150
 75 |     orange_num = 3
 76 |     tax = 1.1
 77 |     
 78 |     mul_apple_layer = MulLayer()
 79 |     mul_orange_layer = MulLayer()
 80 |     add_apple_orange_layer = AddLayer()
 81 |     mul_tax_layer = MulLayer()
 82 |     
 83 |     # 前向传播
 84 |     apple_price = mul_apple_layer.forward(apple, apple_num)
 85 |     orange_price = mul_orange_layer.forward(orange, orange_num)
 86 |     all_price = add_apple_orange_layer.forward(apple_price, orange_price)
 87 |     price = mul_tax_layer.forward(all_price, tax)
 88 |     print(price)
 89 |     
 90 |     # 反向传播
 91 |     dprice = 1
 92 |     dall_price, dtax = mul_tax_layer.backward(dprice)
 93 |     dapple_price, dorange_price = add_apple_orange_layer.backward(dall_price)
 94 |     dorange, dorange_num = mul_orange_layer.backward(dorange_price)
 95 |     dapple, dapple_num = mul_apple_layer.backward(dapple_price)
 96 |     print(dapple_num, dapple, dorange, dorange_num, dtax)
 97 |     
 98 |     
 99 | # ReLU激活函数层
class ReLU:
    """ReLU activation layer operating on NumPy arrays."""

    def __init__(self):
        # Boolean array marking the inputs that were <= 0 in the last forward.
        self.mask = None

    def forward(self, x):
        """Return a copy of x with every non-positive entry set to 0."""
        self.mask = (x <= 0)
        out = x.copy()
        out[self.mask] = 0

        return out

    def backward(self, dout):
        """Return dout with the gradient blocked where the input was <= 0.

        The result is a new array: the caller's dout is left untouched, so
        it can safely be reused (for example for printing or for another
        branch of the graph).
        """
        dx = dout.copy()
        dx[self.mask] = 0

        return dx
116 |         
117 |         
def testReLU():
    """Demo: print a sample array, its negative mask and the ReLU passes."""
    sample = np.array([[1.0, -0.5], [-2.0, 3.0]])
    print(sample)
    print(sample < 0)

    layer = ReLU()
    activated = layer.forward(sample)
    # The forward output is fed back as the upstream gradient.
    grad = layer.backward(activated)
    print(activated, grad)
127 |     
128 |     
129 | # Sigmoid激活函数层
class Sigmoid:
    """Sigmoid activation layer."""

    def __init__(self):
        # Output of the most recent forward pass, reused by backward.
        self.out = None

    def forward(self, x):
        """Return 1 / (1 + exp(-x)) and cache it."""
        self.out = 1/(1+np.exp(-x))
        return self.out

    def backward(self, dout):
        """Return dout * (1 - y) * y, where y is the cached forward output."""
        y = self.out
        return dout*(1.0-y)*y
144 |         
145 |         
def testSigmoid():
    """Demo: print a sample array and the Sigmoid forward/backward results."""
    sample = np.array([[1.0, -0.5], [-2.0, 3.0]])
    print(sample)

    layer = Sigmoid()
    activated = layer.forward(sample)
    # The forward output is fed back as the upstream gradient.
    grad = layer.backward(activated)
    print(activated, grad)
153 |     
154 |     
def testSum():
    """Demo: print a 2x2 array with its axis-0, axis-1 and total sums."""
    print("求和")
    grid = np.array([[1, 2], [3, 4]])
    totals = [np.sum(grid, axis = axis) for axis in (0, 1)]
    print(grid, totals[0], totals[1], np.sum(grid))
162 |     
163 |     
164 | # Affine层
class Affine:
    """Fully connected layer computing x . W + b."""

    def __init__(self, W, b):
        self.W, self.b = W, b
        # Input of the last forward pass and the gradients from backward.
        self.x = None
        self.dW = None
        self.db = None

    def forward(self, x):
        """Cache x and return np.dot(x, W) + b."""
        self.x = x
        return np.dot(x, self.W) + self.b

    def backward(self, dout):
        """Store dW and db and return the gradient with respect to x."""
        self.dW = np.dot(self.x.T, dout)
        # The bias gradient is the upstream gradient summed over axis 0.
        self.db = np.sum(dout, axis = 0)
        return np.dot(dout, self.W.T)
184 |         
185 |         
def softmax(x):
    """Return the softmax of x.

    A 2-D input is normalised row by row; any other input is normalised
    over the whole array. The maximum is subtracted before exponentiating
    to avoid overflow.
    """
    if x.ndim != 2:
        shifted = x - np.max(x)
        exps = np.exp(shifted)
        return exps / np.sum(exps)

    # Work on the transpose so each original row becomes a column.
    columns = x.T
    columns = columns - np.max(columns, axis=0)
    exps = np.exp(columns)
    return (exps / np.sum(exps, axis=0)).T
195 |     
196 |     
def cross_entropy_error(y, t):
    """Return the mean cross-entropy of predictions y against targets t.

    t may hold class indices or be one-hot (same size as y); one-hot
    targets are converted to indices first. A 1-D y is treated as a batch
    of one sample.
    """
    if y.ndim == 1:
        t = t.reshape(1, t.size)
        y = y.reshape(1, y.size)

    # Same size as y means one-hot targets: reduce them to class indices.
    if t.size == y.size:
        t = t.argmax(axis=1)

    batch_size = y.shape[0]
    rows = np.arange(batch_size)
    picked = y[rows, t]
    # 1e-7 keeps log() finite when a picked probability is exactly 0.
    return -np.sum(np.log(picked + 1e-7)) / batch_size
208 |         
209 |         
210 | # softmax和loss函数结合层
class SoftmaxWithLoss:
    """Output layer combining softmax with the cross-entropy loss."""

    def __init__(self):
        self.loss = None  # loss value of the last forward pass
        self.y = None     # softmax output of the last forward pass
        self.t = None     # targets given to the last forward pass

    def forward(self, x, t):
        """Return the cross-entropy loss of softmax(x) against t.

        t is stored as given; cross_entropy_error accepts both one-hot
        targets and class-index targets.
        """
        self.t = t
        self.y = softmax(x)
        self.loss = cross_entropy_error(self.y, self.t)

        return self.loss

    def backward(self, dout = 1):
        """Return the gradient of the loss with respect to the forward input.

        Handles the same two target encodings as forward: one-hot targets
        (t has the same size as y) and class-index targets. The result is
        averaged over the batch and scaled by dout.
        """
        batch_size = self.t.shape[0]
        if self.t.size == self.y.size:
            # One-hot targets: softmax output minus target.
            dx = (self.y - self.t) / batch_size
        else:
            # Class indices: subtract 1 only at each sample's true class.
            dx = self.y.copy()
            dx[np.arange(batch_size), self.t] -= 1
            dx = dx / batch_size
        return dx * dout
228 |         
229 |         
230 | # 数值微分
def numerical_gradient(f, x):
    """Approximate the gradient of f at x with central differences.

    Each element of x is nudged by +h and -h in place, f(x) is evaluated
    both times, and the element is restored afterwards. Returns an array
    shaped like x.
    """
    step = 1e-4 # 0.0001
    grad = np.zeros_like(x)

    cursor = np.nditer(x, flags=['multi_index'], op_flags=['readwrite'])
    for _ in cursor:
        pos = cursor.multi_index
        saved = x[pos]

        x[pos] = float(saved) + step
        f_plus = f(x) # f(x+h)

        x[pos] = saved - step
        f_minus = f(x) # f(x-h)

        grad[pos] = (f_plus - f_minus) / (2*step)
        x[pos] = saved # restore the original value

    return grad
250 | 
251 | 
252 | # 用上面这些构建神经网络
class TwoLayerNet:
    """Two-layer network assembled from Affine, ReLU and SoftmaxWithLoss.

    Parameters live in self.params ("W1", "b1", "W2", "b2"); the Affine
    layers are built from those same arrays.
    """

    def __init__(self, input_size, hidden_size, output_size, weight_init_std = 0.01):
        # Weights are scaled Gaussian noise, biases start at zero.
        self.params = {
            "W1": weight_init_std*np.random.randn(input_size, hidden_size),
            "b1": np.zeros(hidden_size),
            "W2": weight_init_std*np.random.randn(hidden_size, output_size),
            "b2": np.zeros(output_size),
        }

        # Layers are kept in forward order; gradient() walks them backwards.
        self.layers = OrderedDict([
            ("Affine1", Affine(self.params["W1"], self.params["b1"])),
            ("Relu1", ReLU()),
            ("Affine2", Affine(self.params["W2"], self.params["b2"])),
        ])
        self.lastLayer = SoftmaxWithLoss()

    def predict(self, x):
        """Feed x through every layer in self.layers and return the result."""
        signal = x
        for layer in self.layers.values():
            signal = layer.forward(signal)
        return signal

    def loss(self, x, t):
        """Return the last layer's loss for inputs x and targets t."""
        return self.lastLayer.forward(self.predict(x), t)

    def accuracy(self, x, t):
        """Return the fraction of rows of x whose argmax prediction equals t.

        t may be 1-D class indices or any other shape, in which case its
        argmax along axis 1 is used.
        """
        predicted = np.argmax(self.predict(x), axis = 1)
        labels = t if t.ndim == 1 else np.argmax(t, axis = 1)
        return np.sum(predicted == labels) / float(x.shape[0])

    def numerical_gradient(self, x, t):
        """Return numerically estimated gradients for all four parameters."""
        def loss_W(W):
            # The argument is ignored: the loss reads the parameters that
            # the module-level numerical_gradient perturbs in place.
            return self.loss(x, t)

        return {
            key: numerical_gradient(loss_W, self.params[key])
            for key in ("W1", "b1", "W2", "b2")
        }

    def gradient(self, x, t):
        """Return gradients for all four parameters via backpropagation."""
        # forward
        self.loss(x, t)

        # backward
        upstream = self.lastLayer.backward(1)
        for layer in reversed(list(self.layers.values())):
            upstream = layer.backward(upstream)

        first, second = self.layers["Affine1"], self.layers["Affine2"]
        return {
            "W1": first.dW,
            "b1": first.db,
            "W2": second.dW,
            "b2": second.db,
        }
318 |         
319 |         
320 | # 手写数字识别
321 | # 加载数据
def _parse_mnist_csv(path):
    """Read one MNIST-style CSV file and return (pixels, labels) arrays.

    Each non-blank line is split on ','; the first field is the integer
    label and the remaining fields are pixel values, rescaled with
    value / 255.0 * 0.99 + 0.01.
    """
    pixels, labels = [], []
    # The with-block closes the file even if a row fails to parse.
    with open(path, 'r') as csv_file:
        for record in csv_file:
            if not record.strip():
                # Skip blank lines (e.g. a trailing newline at end of file).
                continue
            all_values = record.split(',')
            # np.asarray(..., dtype=float) replaces np.asfarray, which no
            # longer exists in NumPy 2.0.
            pixels.append(np.asarray(all_values[1:], dtype=float) / 255.0 * 0.99 + 0.01)
            labels.append(int(all_values[0]))
    return np.array(pixels), np.array(labels)


@run.change_dir
def loadData():
    """Load "mnist_train.csv" and "mnist_test.csv" from the working directory.

    Returns:
        (x_train, t_train, x_test, t_test); the label arrays are passed
        through one_hot() before being returned.
    """
    x_train, t_train = _parse_mnist_csv("mnist_train.csv")
    x_test, t_test = _parse_mnist_csv("mnist_test.csv")
    return x_train, one_hot(t_train), x_test, one_hot(t_test)
359 |     
360 |     
361 | # one_hot过程
def one_hot(t, num_classes=10):
    """Return a one-hot float matrix for the integer class labels in t.

    Args:
        t: 1-D sequence or array of integer class indices.
        num_classes: number of columns in the result (defaults to 10, the
            value that was previously hard-coded).

    Returns:
        Array of shape (len(t), num_classes) with a 1 at each label's
        column and 0 elsewhere.
    """
    labels = np.asarray(t)
    encoded = np.zeros((labels.shape[0], num_classes))
    if labels.size == 0:
        # Nothing to mark; also avoids indexing with an empty float array.
        return encoded
    encoded[np.arange(labels.shape[0]), labels] = 1
    return encoded
368 |     
369 |         
370 | # 梯度确认
def gradcheck():
    """Print the mean absolute gap between numerical and backprop gradients.

    Uses the first 10 training samples and a freshly initialised
    784-50-10 TwoLayerNet.
    """
    print("梯度确认")
    sample_count = 10
    x_train, t_train, _, _ = loadData()
    network = TwoLayerNet(input_size = 784, hidden_size = 50, output_size = 10)
    x_batch, t_batch = x_train[:sample_count], t_train[:sample_count]

    grad_numerical = network.numerical_gradient(x_batch, t_batch)
    grad_backprop = network.gradient(x_batch, t_batch)

    for key, numeric in grad_numerical.items():
        diff = np.average(np.abs(grad_backprop[key] - numeric))
        print(key + ":" + str(diff))
385 |         
386 |         
387 | # 实际解决手写输入识别问题
@run.change_dir
@run.timethis
def minst():
    """Train a 784-50-10 TwoLayerNet on the loaded data with mini-batch SGD.

    Runs 10000 iterations with batches of 100 and learning rate 0.1,
    prints train/test accuracy once per epoch, and saves the loss and
    accuracy curves under ./output/.
    """
    print("实际解题")
    x_train, t_train, x_test, t_test = loadData()
    network = TwoLayerNet(input_size = 784, hidden_size = 50, output_size = 10)

    iters_num = 10000
    train_size = x_train.shape[0]
    batch_size = 100
    learning_rate = 0.1
    train_loss_list = []
    train_acc_list = []
    test_acc_list = []

    # Integer division: with a float value the modulo test below is only
    # true at i == 0 whenever train_size is not a multiple of batch_size.
    iter_per_epoch = max(train_size // batch_size, 1)
    for i in range(iters_num):
        batch_mask = np.random.choice(train_size, batch_size)
        x_batch = x_train[batch_mask]
        t_batch = t_train[batch_mask]

        # Gradients via backpropagation.
        grad = network.gradient(x_batch, t_batch)

        # In-place update, so the Affine layers built from these same
        # arrays see the new values.
        for key in ["W1", "b1", "W2", "b2"]:
            network.params[key] -= learning_rate*grad[key]

        loss = network.loss(x_batch, t_batch)
        train_loss_list.append(loss)

        # Record accuracy once per epoch.
        if i % iter_per_epoch == 0:
            train_acc = network.accuracy(x_train, t_train)
            test_acc = network.accuracy(x_test, t_test)
            train_acc_list.append(train_acc)
            test_acc_list.append(test_acc)
            print("训练集准确率{}，测试集准确率{}".format(train_acc, test_acc))

    # Plot the learning curves.
    plt.figure()
    plt.plot(train_loss_list)
    plt.savefig("./output/loss.png")
    plt.close()
    plt.figure()
    plt.plot(train_acc_list)
    plt.plot(test_acc_list)
    plt.savefig("./output/accuracy.png")
    plt.close()
437 |         
438 | 
439 | 
if __name__ == "__main__":
    # Run every demo in order: layer checks first, then the gradient
    # check and finally the full training run.
    for demo in (testMul, testAdd, testReLU, testSigmoid, testSum, gradcheck, minst):
        demo()
448 |         


--------------------------------------------------------------------------------
/DL/learning.py:
--------------------------------------------------------------------------------
  1 | # coding:utf-8
  2 | # kaggle Jane Street Market Prediction代码
  3 | # 《深度学习入门:基于python的理论与实现》
  4 | # 第四章 神经网络的学习
  5 | 
  6 | 
  7 | import numpy as np
  8 | import matplotlib.pyplot as plt
  9 | import run
 10 | import pandas as pd
 11 | from PIL import Image
 12 | import random
 13 | import pickle
 14 | 
 15 | 
 16 | # 阶跃函数
 17 | def step_function(x):
 18 |     """
 19 |     if x > 0:
 20 |         return 1
 21 |     else:
 22 |         return 0
 23 |     """
 24 |     # 用支持numpy的形式
 25 |     y = x>0
 26 |     return y.astype(np.int)
 27 |     
 28 |     
 29 | # 画图
 30 | @run.change_dir
 31 | def draw_step():
 32 |     x = np.arange(-5.0, 5.0, 0.1)
 33 |     y = step_function(x)
 34 |     plt.plot(x, y)
 35 |     plt.ylim(-0.1, 1.1)
 36 |     plt.savefig("./output/step_function.png")
 37 |     plt.close()
 38 |     
 39 |     
 40 | # sigmoid函数
 41 | def sigmoid(x):
 42 |     return 1/(1+np.exp(-x))
 43 |     
 44 |     
 45 | # 画图
 46 | @run.change_dir
 47 | def draw_sigmoid():
 48 |     x = np.arange(-5.0, 5.0, 0.1)
 49 |     y = sigmoid(x)
 50 |     plt.plot(x, y)
 51 |     plt.ylim(-0.1, 1.1)
 52 |     plt.savefig("./output/sigmoid_function.png")
 53 |     plt.close()
 54 |     
 55 |     
 56 | # ReLU函数
 57 | def ReLU(x):
 58 |     return np.maximum(0, x)
 59 |     
 60 |     
 61 | # 画图
 62 | @run.change_dir
 63 | def draw_ReLU():
 64 |     x = np.arange(-5.0, 5.0, 0.1)
 65 |     y = ReLU(x)
 66 |     plt.plot(x, y)
 67 |     plt.savefig("./output/ReLU_function.png")
 68 |     plt.close()
 69 |     
 70 |     
 71 | # 恒等函数
 72 | def identity_function(x):
 73 |     return x
 74 |     
 75 |     
 76 | # softmax函数
 77 | def softmax(a):
 78 |     c = np.max(a)
 79 |     exp_a = np.exp(a-c) #防止数值太大，溢出
 80 |     sum_exp_a = np.sum(exp_a)
 81 |     y = exp_a/sum_exp_a
 82 |     return y
 83 |     
 84 |     
 85 | # 手写数字识别
 86 | # 加载数据
 87 | @run.change_dir
 88 | def loadData():
 89 |     training_data_file = open("mnist_train.csv", 'r')
 90 |     training_data_list = training_data_file.readlines()
 91 |     training_data_file.close()
 92 |     
 93 |     testing_data_file = open("mnist_test.csv", 'r')
 94 |     testing_data_list = testing_data_file.readlines()
 95 |     testing_data_file.close()
 96 |     
 97 |     x_train, t_train = [], []
 98 |     for record in training_data_list:
 99 |         # 通过','将数分段
100 |         all_values = record.split(',')
101 |         # 将所有的像素点的值转换为0.01-1.00
102 |         inputs = (np.asfarray(all_values[1:]) / 255.0 * 0.99 + 0.01)
103 |         # 创建标签输出值
104 |         target = int(all_values[0])
105 |         x_train.append(inputs)
106 |         t_train.append(target)
107 |     x_test, t_test = [], []
108 |     for record in testing_data_list:
109 |         # 通过','将数分段
110 |         all_values = record.split(',')
111 |         # 将所有的像素点的值转换为0.01-1.00
112 |         inputs = (np.asfarray(all_values[1:]) / 255.0 * 0.99 + 0.01)
113 |         # 创建标签输出值
114 |         target = int(all_values[0])
115 |         x_test.append(inputs)
116 |         t_test.append(target)
117 |     x_train = np.array(x_train)
118 |     t_train = np.array(t_train)
119 |     x_test = np.array(x_test)
120 |     t_test = np.array(t_test)
121 | #    print(x_train.shape)
122 | #    print(t_train.shape)
123 | #    print(x_test.shape)
124 | #    print(t_test.shape)
125 |     return x_train, t_train, x_test, t_test
126 | 
127 |     
128 | # 均方误差函数
def mse(y, t):
    """Return half the sum of squared differences between y and t."""
    residual = y-t
    return 0.5*np.sum(residual**2)
131 |     
132 |     
133 | # 交叉熵误差
def cee(y, t):
    """Return the cross-entropy -sum(dot(t, log(y + 1e-7)))."""
    # The small offset keeps log() finite when an entry of y is 0.
    eps = 1e-7
    log_y = np.log(y+eps)
    return -np.sum(np.dot(t, log_y))
137 |     
138 |     
139 | # mini-batch选取样本
def mini_batch(x_train, t_train, batch_size):
    """Return batch_size randomly chosen (x, t) rows, drawn with np.random.choice.

    Asserts that the training set holds at least batch_size rows.
    """
    train_size = x_train.shape[0]
    assert train_size >= batch_size
    picked = np.random.choice(train_size, batch_size)
    return x_train[picked], t_train[picked]
147 |     
148 |     
149 | # mini_batch版交叉熵误差
def mb_cee(y, t, one_hot = False):
    """Return the batch-averaged cross-entropy of y against t.

    With one_hot=True, t is multiplied element-wise with log(y); otherwise
    t is used as per-row class indices into y. A 1-D y is treated as a
    batch of one sample.
    """
    eps = 1e-7
    if y.ndim == 1:
        t = t.reshape(1, t.size)
        y = y.reshape(1, y.size)

    batch_size = y.shape[0]
    if not one_hot:
        picked = y[np.arange(batch_size), t]
        return -np.sum(np.log(picked+eps))/batch_size
    return -np.sum(t*np.log(y+eps))/batch_size
161 |         
162 |         
163 | # 计算f在x处的导数
def numerical_diff(f, x):
    """Return the central-difference derivative of f at x (step 1e-4)."""
    step = 1e-4
    ahead, behind = f(x+step), f(x-step)
    return (ahead - behind)/(2*step)
167 |     
168 |     
169 | # 定义求导的函数
def function_1(x):
    """Return 0.01*x**2 + 0.1*x."""
    quadratic = 0.01*x**2
    linear = 0.1*x
    return quadratic + linear
172 |     
173 |     
def function_2(x):
    """Return x[0]**2 + x[1]**2."""
    first, second = x[0], x[1]
    return first**2 + second**2
176 |     
177 |     
178 | # 测试数值微分
@run.change_dir
def test_diff():
    """Plot function_1 on [0, 20) and print its numerical derivative at 5 and 10."""
    xs = np.arange(0.0, 20.0, 0.1)
    plt.plot(xs, function_1(xs))
    plt.savefig("./output/num_diff.png")
    plt.close()

    for point in (5, 10):
        print(numerical_diff(function_1, point))
190 |     
191 |     
192 | # (3, 4)时对x0的偏导函数
def function_tmp1(x0):
    """Return x0**2 + 4.0**2 (function_2 with x1 fixed at 4)."""
    fixed_term = 4.0**2
    return x0**2+fixed_term
195 |     
196 |     
197 | # (3, 4)时对x1的偏导函数
def function_tmp2(x1):
    """Return 3.0**2 + x1**2 (function_2 with x0 fixed at 3)."""
    fixed_term = 3.0**2
    return fixed_term+x1**2
200 |     
201 |     
202 | # 测试偏导数
@run.change_dir
def test_pdiff():
    """Save a 3-D surface of x**2 + y**2 and print both partials at (3, 4)."""
    plt.figure()
    axes3d = plt.axes(projection='3d')
    ticks = np.arange(-5.0, 5.0, 0.5)
    X, Y = np.meshgrid(ticks, ticks)
    axes3d.plot_surface(X, Y, X**2 + Y**2)
    plt.savefig("./output/num_pdiff.png")
    plt.close()

    print(numerical_diff(function_tmp1, 3.0))
    print(numerical_diff(function_tmp2, 4.0))
217 |     
218 |    
219 | """
220 | # 求数值梯度
221 | def numerical_grad(f, x):
222 |     h = 1e-4
223 |     grad = np.zeros_like(x)
224 |     print(x.shape, x.size)
225 |     for idx in range(x.size):
226 |         print(idx)
227 |         tmp_val = x[idx]
228 |         # f(x+h)
229 |         x[idx] = tmp_val + h
230 |         fx1 = f(x)
231 |         # f(x-h)
232 |         x[idx] = tmp_val - h
233 |         fx2 = f(x)
234 |         
235 |         grad[idx] = (fx1 - fx2)/(2*h)
236 |         x[idx] = tmp_val
237 |         
238 |     return grad
239 | """
240 | 
241 | def _numerical_gradient_no_batch(f, x):
242 |     h = 1e-4 # 0.0001
243 |     grad = np.zeros_like(x)
244 |     
245 |     for idx in range(x.size):
246 |         tmp_val = x[idx]
247 |         x[idx] = float(tmp_val) + h
248 |         fxh1 = f(x) # f(x+h)
249 |         
250 |         x[idx] = tmp_val - h 
251 |         fxh2 = f(x) # f(x-h)
252 |         grad[idx] = (fxh1 - fxh2) / (2*h)
253 |         
254 |         x[idx] = tmp_val # 还原值
255 |         
256 |     return grad
257 | 
258 | 
def numerical_gradient(f, X):
    """Return the numerical gradient of f at X.

    A 1-D X is handled directly; otherwise the gradient is computed for
    each item of X in turn and stored in the matching row of the result.
    """
    if X.ndim == 1:
        return _numerical_gradient_no_batch(f, X)

    grad = np.zeros_like(X)
    for row_index, row in enumerate(X):
        grad[row_index] = _numerical_gradient_no_batch(f, row)
    return grad
269 |     
270 |     
def test_grad():
    """Print the numerical gradient of function_2 at three sample points."""
    for point in ([3.0, 4.0], [0.0, 2.0], [3.0, 0.0]):
        print(numerical_gradient(function_2, np.array(point)))
275 |     
276 |     
277 | # 梯度下降法
def gradient_descent(f, init_x, lr = 0.01, step_num = 100):
    """Run step_num gradient-descent steps on f and return the final point.

    Args:
        f: function to minimise; it is differentiated numerically.
        init_x: starting point (array-like). It is copied as a float
            array, so the caller's object is not modified and integer
            inputs are accepted.
        lr: learning rate.
        step_num: number of update steps.

    Returns:
        A new float array holding the position after the last step.
    """
    # Copy first: the in-place update below would otherwise overwrite the
    # caller's starting point.
    x = np.array(init_x, dtype=float)
    print("学习率{}".format(lr))
    for _ in range(step_num):
        grad = numerical_gradient(f, x)
        x -= lr*grad

    return x
286 |     
287 |     
288 | # 测试梯度下降法
def test_gd():
    """Run gradient descent on function_2 with three learning rates."""
    print("测试梯度下降")
    # The first run uses the default learning rate; every run starts from
    # a fresh copy of the same point.
    for options in ({}, {"lr": 10.0}, {"lr": 1e-10}):
        start = np.array([-3.0, 4.0])
        print(gradient_descent(function_2, start, **options))
297 |     
298 |     
299 | # 定义简单的神经网络
class simpleNet:
    """Single-layer network with a 2x3 weight matrix and no bias."""

    def __init__(self):
        # Weights are drawn from a standard Gaussian.
        self.W = np.random.randn(2, 3)

    def predict(self, x):
        """Return np.dot(x, W)."""
        return np.dot(x, self.W)

    def loss(self, x, t):
        """Return the cross-entropy of softmax(predict(x)) against t."""
        scores = self.predict(x)
        return cee(softmax(scores), t)
313 |         
314 |         
315 | # 测试神经网络
def test_nn():
    """Demo: print simpleNet's weights, prediction, loss and weight gradient."""
    print("测试神经网络")
    net = simpleNet()
    print(net.W)

    x = np.array([0.6, 0.9])
    scores = net.predict(x)
    print(scores)
    print(np.argmax(scores))

    t = np.array([0, 0, 1])
    print(net.loss(x, t))

    def current_loss(W):
        # The argument is ignored; the loss reads net.W, which the
        # numerical gradient perturbs in place.
        return net.loss(x, t)

    print(numerical_gradient(current_loss, net.W))
330 |     
331 |     
def sigmoid_grad(x):
    """Return (1 - sigmoid(x)) * sigmoid(x)."""
    activated = sigmoid(x)
    return (1.0 - activated) * activated
334 |     
335 |     
336 | # 两层神经网络
class TwoLayerNet:
    """Two-layer network: affine -> sigmoid -> affine -> softmax.

    Parameters live in self.params under "W1", "b1", "W2" and "b2".
    """

    def __init__(self, input_size, hidden_size, output_size, weight_init_std = 0.01):
        # Weights are scaled Gaussian noise, biases start at zero.
        self.params = {}
        self.params["W1"] = weight_init_std*np.random.randn(input_size, hidden_size)
        self.params["b1"] = np.zeros(hidden_size)
        self.params["W2"] = weight_init_std*np.random.randn(hidden_size, output_size)
        self.params["b2"] = np.zeros(output_size)

    def predict(self, x):
        """Return the softmax output of the network for input x."""
        W1, W2 = self.params["W1"], self.params["W2"]
        b1, b2 = self.params["b1"], self.params["b2"]

        a1 = np.dot(x, W1) + b1
        z1 = sigmoid(a1)
        a2 = np.dot(z1, W2) + b2
        return softmax(a2)

    def loss(self, x, t):
        """Return cee(predict(x), t)."""
        return cee(self.predict(x), t)

    def accuracy(self, x, t):
        """Return the fraction of rows of x whose argmax prediction equals t.

        t may be 1-D class indices or a 2-D array whose argmax along
        axis 1 gives the class. Works for any number of output classes.
        """
        t = np.asarray(t)
        if t.ndim != 1:
            t = np.argmax(t, axis = 1)
        y = np.argmax(self.predict(x), axis = 1)

        return np.sum(y == t) / float(x.shape[0])

    def numerical_gradient(self, x, t):
        """Return numerically estimated gradients for all four parameters."""
        # The lambda ignores its argument: the loss reads the parameters
        # that the module-level numerical_gradient perturbs in place.
        loss_W = lambda W : self.loss(x, t)

        grads = {}
        grads["W1"] = numerical_gradient(loss_W, self.params["W1"] )
        grads["b1"] = numerical_gradient(loss_W, self.params["b1"] )
        grads["W2"] = numerical_gradient(loss_W, self.params["W2"] )
        grads["b2"] = numerical_gradient(loss_W, self.params["b2"] )

        return grads

    def gradient(self, x, t):
        """Return gradients for all four parameters via backpropagation.

        t holds one integer class index per row of x. The targets used in
        the output error are 0.99 for the true class and 0.01 elsewhere;
        the number of classes is taken from the network output instead of
        being fixed at 10.
        """
        W1, W2 = self.params['W1'], self.params['W2']
        b1, b2 = self.params['b1'], self.params['b2']
        grads = {}

        batch_num = x.shape[0]

        # forward
        a1 = np.dot(x, W1) + b1
        z1 = sigmoid(a1)
        a2 = np.dot(z1, W2) + b2
        y = softmax(a2)

        # backward
        targets = np.full((t.shape[0], y.shape[1]), 0.01)
        targets[np.arange(t.shape[0]), t] = 0.99
        dy = (y - targets) / batch_num
        grads['W2'] = np.dot(z1.T, dy)
        grads['b2'] = np.sum(dy, axis=0)

        da1 = np.dot(dy, W2.T)
        dz1 = sigmoid_grad(a1) * da1
        grads['W1'] = np.dot(x.T, dz1)
        grads['b1'] = np.sum(dz1, axis=0)

        return grads
416 |         
417 |         
418 | # 测试两层神经网络
@run.change_dir
@run.timethis
def test_2_nn():
    """Train a 784-100-10 TwoLayerNet on the loaded data with mini-batch SGD.

    Runs 10000 iterations with batches of 100 and learning rate 0.1,
    prints train/test accuracy once per epoch, and saves the loss and
    accuracy curves under ./output/.
    """
    print("测试两层神经网络")

    # Load the data.
    x_train, t_train, x_test, t_test = loadData()

    train_loss_list = []
    train_acc_list = []
    test_acc_list = []

    # Hyper-parameters.
    iters_num = 10000
    train_size = x_train.shape[0]
    batch_size = 100
    learning_rate = 0.1
    # Integer division: with a float value the modulo test below is only
    # true at i == 0 whenever train_size is not a multiple of batch_size.
    iter_per_epoch = max(train_size // batch_size, 1)

    network = TwoLayerNet(input_size = 784, hidden_size = 100, output_size = 10)

    for i in range(iters_num):
        # Draw a mini-batch.
        batch_mask = np.random.choice(train_size, batch_size)
        x_batch = x_train[batch_mask]
        t_batch = t_train[batch_mask]

        # Gradients via backpropagation.
        grad = network.gradient(x_batch, t_batch)

        # Update the parameters in place.
        for key in ["W1", "b1", "W2", "b2"]:
            network.params[key] -= learning_rate*grad[key]

        # Record the loss after the update.
        loss = network.loss(x_batch, t_batch)
        train_loss_list.append(loss)

        # Record accuracy once per epoch.
        if i % iter_per_epoch == 0:
            train_acc = network.accuracy(x_train, t_train)
            test_acc = network.accuracy(x_test, t_test)
            train_acc_list.append(train_acc)
            test_acc_list.append(test_acc)
            print("训练集准确率{}，测试集准确率{}".format(train_acc, test_acc))

    # Plot the learning curves.
    plt.figure()
    plt.plot(train_loss_list)
    plt.savefig("./output/loss.png")
    plt.close()
    plt.figure()
    plt.plot(train_acc_list)
    plt.plot(test_acc_list)
    plt.savefig("./output/accuracy.png")
    plt.close()
489 |     
490 | 
if __name__ == "__main__":
    # Loss functions on a hand-written example whose true class is 2.
    t = np.eye(10)[2]
    y = np.array([0.1, 0.05, 0.6, 0.0, 0.05, 0.1, 0.0, 0.1, 0.0, 0.0])
    print(t, y)
    for loss_fn in (mse, cee):
        print(loss_fn(y, t))

    # Mini-batch sampling on the real data.
    x_train, t_train, x_test, t_test = loadData()
    x_batch, t_batch = mini_batch(x_train, t_train, 10)
    print(x_batch)
    print(t_batch)

    # Numerical differentiation demos, then the two network demos.
    for demo in (test_diff, test_pdiff, test_grad, test_gd, test_nn, test_2_nn):
        demo()
516 |     
517 |         


--------------------------------------------------------------------------------
/DL/nn.py:
--------------------------------------------------------------------------------
  1 | # coding:utf-8
  2 | # kaggle Jane Street Market Prediction代码
  3 | # 《深度学习入门:基于python的理论与实现》
  4 | # 第三章 神经网络
  5 | 
  6 | 
  7 | import numpy as np
  8 | import matplotlib.pyplot as plt
  9 | import run
 10 | import pandas as pd
 11 | from PIL import Image
 12 | import random
 13 | import pickle
 14 | 
 15 | 
 16 | # 阶跃函数
 17 | def step_function(x):
 18 |     """
 19 |     if x > 0:
 20 |         return 1
 21 |     else:
 22 |         return 0
 23 |     """
 24 |     # 用支持numpy的形式
 25 |     y = x>0
 26 |     return y.astype(np.int)
 27 |     
 28 |     
 29 | # 画图
 30 | @run.change_dir
 31 | def draw_step():
 32 |     x = np.arange(-5.0, 5.0, 0.1)
 33 |     y = step_function(x)
 34 |     plt.plot(x, y)
 35 |     plt.ylim(-0.1, 1.1)
 36 |     plt.savefig("./output/step_function.png")
 37 |     plt.close()
 38 |     
 39 |     
 40 | # sigmoid函数
 41 | def sigmoid(x):
 42 |     return 1/(1+np.exp(-x))
 43 |     
 44 |     
 45 | # 画图
 46 | @run.change_dir
 47 | def draw_sigmoid():
 48 |     x = np.arange(-5.0, 5.0, 0.1)
 49 |     y = sigmoid(x)
 50 |     plt.plot(x, y)
 51 |     plt.ylim(-0.1, 1.1)
 52 |     plt.savefig("./output/sigmoid_function.png")
 53 |     plt.close()
 54 |     
 55 |     
 56 | # ReLU函数
 57 | def ReLU(x):
 58 |     return np.maximum(0, x)
 59 |     
 60 |     
 61 | # 画图
 62 | @run.change_dir
 63 | def draw_ReLU():
 64 |     x = np.arange(-5.0, 5.0, 0.1)
 65 |     y = ReLU(x)
 66 |     plt.plot(x, y)
 67 |     plt.savefig("./output/ReLU_function.png")
 68 |     plt.close()
 69 |     
 70 |     
 71 | # 初始化神经网络
 72 | def init_network():
 73 |     network = {}
 74 |     network["W1"] = np.array([[0.1, 0.3, 0.5], [0.2, 0.4, 0.6]])
 75 |     network["b1"] = np.array([0.1, 0.2, 0.3])
 76 |     network["W2"] = np.array([[0.1, 0.4], [0.2, 0.5], [0.3, 0.6]])
 77 |     network["b2"] = np.array([0.1, 0.2])
 78 |     network["W3"] = np.array([[0.1, 0.3], [0.2, 0.4]])
 79 |     network["b3"] = np.array([0.1, 0.2])
 80 |     
 81 |     return network
 82 |     
 83 |     
 84 | # 恒等函数
 85 | def identity_function(x):
 86 |     return x
 87 |     
 88 |     
 89 | # softmax函数
 90 | def softmax(a):
 91 |     c = np.max(a)
 92 |     exp_a = np.exp(a-c) #防止数值太大，溢出
 93 |     sum_exp_a = np.sum(exp_a)
 94 |     y = exp_a/sum_exp_a
 95 |     return y
 96 |     
 97 |     
 98 | # 前向传播过程
 99 | def forward(network, x):
100 |     W1, W2, W3 = network["W1"], network["W2"], network["W3"]
101 |     b1, b2, b3 = network["b1"], network["b2"], network["b3"]
102 |     
103 |     a1 = np.dot(x, W1) + b1
104 |     z1 = sigmoid(a1)
105 |     a2 = np.dot(z1, W2) + b2
106 |     z2 = sigmoid(a2)
107 |     a3 = np.dot(z2, W3) + b3
108 |     
109 |     y = identity_function(a3)
110 |     
111 |     return y
112 |     
113 |     
114 | # 手写数字识别
115 | # 加载数据
def _read_mnist_csv(path):
    """Parse one MNIST CSV file (label first, pixel values after) into arrays.

    Returns:
        (pixels, labels): pixels scaled from 0..255 into 0.01..1.00, one row
        per record; labels as integers.
    """
    pixels, labels = [], []
    # The context manager closes the file even if a record fails to parse.
    with open(path, 'r') as csv_file:
        for record in csv_file:
            if not record.strip():
                # Ignore blank lines (e.g. a trailing newline at end of file).
                continue
            # Split the record on ','
            all_values = record.split(',')
            # np.asfarray no longer exists in NumPy 2.0; asarray with a float
            # dtype performs the same conversion.
            pixels.append(np.asarray(all_values[1:], dtype=float) / 255.0 * 0.99 + 0.01)
            labels.append(int(all_values[0]))
    return np.array(pixels), np.array(labels)


@run.change_dir
def loadData():
    """Load the handwritten-digit data from mnist_train.csv and mnist_test.csv.

    Returns:
        (x_train, t_train, x_test, t_test) as NumPy arrays.
    """
    x_train, t_train = _read_mnist_csv("mnist_train.csv")
    x_test, t_test = _read_mnist_csv("mnist_test.csv")
    return x_train, t_train, x_test, t_test
155 |     
156 |     
157 | # 绘制数据
@run.change_dir
def drawNum(data, target):
    """Pick a random sample, print its index/label/shapes and save it as ./output/number.png.

    Args:
        data: array whose rows can be reshaped to 28x28.
        target: labels indexed the same way as ``data``.
    """
    idx = random.randint(0, data.shape[0]-1)
    print(idx)
    flat = data[idx]
    print(target[idx])
    print(flat.shape)
    picture = flat.reshape(28, 28)
    print(picture.shape)
    # Scale by 255 and convert to 8-bit before handing the array to PIL.
    Image.fromarray(np.uint8(picture*255)).save("./output/number.png", "png")
170 |     
171 |     
172 | # 测试minst
@run.change_dir
def testMinst():
    """Run the stored network over the test set in batches and print the accuracy."""
    x_train, t_train, x_test, t_test = loadData()
    drawNum(x_train, t_train)

    # Load the pre-trained weights.
    # NOTE(review): pickle.load executes arbitrary code; only use a trusted file.
    with open("sample_weight.pkl", "rb") as f:
        network = pickle.load(f)

    batch_size = 100
    total = len(x_test)
    hits = 0
    for start in range(0, total, batch_size):
        stop = start + batch_size
        predicted = np.argmax(forward(network, x_test[start:stop]), axis = 1)
        hits += np.sum(predicted == t_test[start:stop])

    print("预测准确率:{}/{}={}".format(hits, total, hits/total))
191 |     
192 |     
193 | 
if __name__ == "__main__":
    # Save plots of the three activation functions.
    draw_step()
    draw_sigmoid()
    draw_ReLU()

    # Run the hand-initialised network on one input vector.
    network = init_network()
    x = np.array([1.0, 0.5])
    y = forward(network, x)
    print(y)

    # Test softmax with large scores (these would overflow a naive exp()).
    a = np.array([1010, 1000, 990])
    y = softmax(a)
    print(y)
    print(np.sum(y))

    # MNIST test
    testMinst()
213 |         


--------------------------------------------------------------------------------
/DL/perceptron.py:
--------------------------------------------------------------------------------
 1 | # coding:utf-8
 2 | # kaggle Jane Street Market Prediction代码
 3 | # 《深度学习入门:基于python的理论与实现》
 4 | # 第二章 感知机
 5 | 
 6 | 
 7 | import numpy as np
 8 | 
 9 | 
10 | # 用感知机来实现逻辑门
11 | # 与门
def AND(x1, x2):
    """AND gate as a perceptron: returns 1 when 0.5*x1 + 0.5*x2 exceeds the threshold 0.7, else 0."""
    w1, w2, theta = 0.5, 0.5, 0.7
    return 0 if w1*x1 + w2*x2 <= theta else 1
19 |         
20 |         
21 | # 另一种形式实现与门 b = -theta
def AND2(x1, x2):
    """AND gate in weight/bias form (bias b = -theta = -0.7); returns 0 or 1."""
    weights = np.array([0.5, 0.5])
    bias = -0.7
    activation = np.sum(weights*np.array([x1, x2]))+bias
    return 0 if activation <= 0 else 1
31 |         
32 |        
33 | # 与非门
def NAND(x1, x2):
    """NAND gate as a perceptron (weights -0.5, -0.5 and bias 0.7); returns 0 or 1."""
    weights = np.array([-0.5, -0.5])
    bias = 0.7
    activation = np.sum(weights*np.array([x1, x2]))+bias
    return 0 if activation <= 0 else 1
43 |         
44 |         
45 | # 或门
def OR(x1, x2):
    """OR gate as a perceptron (weights 0.5, 0.5 and bias -0.2); returns 0 or 1."""
    weights = np.array([0.5, 0.5])
    bias = -0.2
    activation = np.sum(weights*np.array([x1, x2]))+bias
    return 0 if activation <= 0 else 1
55 |         
56 |         
57 | # 用感知机组合实现异或门
def XOR(x1, x2):
    """XOR gate built from perceptrons: AND2 applied to NAND(x1, x2) and OR(x1, x2)."""
    return AND2(NAND(x1, x2), OR(x1, x2))
63 |     
64 | 
if __name__ == "__main__":
    # Print the truth table of every gate, one gate per line, for the inputs
    # (0,0), (0,1), (1,0), (1,1).
    truth_inputs = [(0, 0), (0, 1), (1, 0), (1, 1)]
    for gate in (AND, AND2, NAND, OR, XOR):
        print(*(gate(a, b) for a, b in truth_inputs))
71 |     


--------------------------------------------------------------------------------
/DL/run.py:
--------------------------------------------------------------------------------
 1 | # coding:utf-8
 2 | # 将程序上传到服务器上执行
 3 | import os
 4 | import sys
 5 | from functools import wraps
 6 | import time
 7 | 
 8 | 
 9 | # 上传代码至服务器并运行
def run(gpus, server):
    """Upload code to the server, run it there and fetch the outputs.

    Args:
        gpus: mode selector. "all" uploads every ./*.py file and runs the
            script named by sys.argv[2]; "copy" only fetches the remote
            output directory; any other value uploads and runs the single
            file named by sys.argv[1].
        server: host address appended to the ssh/scp user names.
    """
    remote = "ubuntu@" + server
    fetch_outputs = "scp -r " + remote + ":~/code/output/* ./output/"

    def clean_remote():
        # Remove the remote sources and everything in the remote output directory.
        os.system("ssh " + remote + " \"sudo rm -rf ~/code/*.py\"")
        os.system("ssh " + remote + " \"sudo rm -rf ~/code/output/*\"")

    def remote_python(script):
        # The script is started as root on port 2222, from /home/code.
        return "ssh root@" + server + " -p 2222 \"python /home/code/" + script + "\""

    if gpus == "copy":
        # Only copy the result files back.
        os.system(fetch_outputs)
        return

    clean_remote()
    if gpus == "all":
        # Upload every local .py file, then run the requested script.
        os.system("scp -r ./*.py " + remote + ":~/code")
        launch = remote_python(sys.argv[2])
        print("正在运行代码……\n")
        os.system(launch)
    else:
        # Upload just the given file and run it.
        os.system("scp " + sys.argv[1] + " " + remote + ":~/code")
        os.system(remote_python(sys.argv[1]))
    # Copy all output files back.
    os.system(fetch_outputs)
48 | 
49 | 
if __name__ == "__main__":
    if len(sys.argv) < 2:
        sys.exit("usage: python run.py <all|copy|file.py> [script.py]")
    gpus = sys.argv[1]
    # Read the server address; serverIP.txt has to be created by hand.
    with open("serverIP.txt", "rt") as f:
        # Strip surrounding whitespace: a trailing newline in the file would
        # otherwise end up inside every ssh/scp command string and split it.
        server = f.read().strip()
    run(gpus, server)
56 |         
57 |     
58 | # 工具函数，在上传到服务器上运行时改变当前目录
def change_dir(func):
    """Decorator: run ``func`` with /home/code/ as the working directory.

    The previous working directory is restored afterwards, including when
    ``func`` raises.

    Args:
        func: callable to wrap.

    Returns:
        Wrapper that forwards all arguments and returns ``func``'s result.
    """
    @wraps(func)
    def change(*args, **kwargs):
        oldpath = os.getcwd()
        newpath = "/home/code/"
        os.chdir(newpath)
        try:
            return func(*args, **kwargs)
        finally:
            # Always switch back so a failure does not leave the process in
            # the remote code directory.
            os.chdir(oldpath)
    return change
69 |     
70 |     
71 | # 工具函数，计算函数运行时间    
def timethis(func):
    """Decorator that prints how long each call to ``func`` took (time.perf_counter)."""
    @wraps(func)
    def wrapper(*args, **kwargs):
        began = time.perf_counter()
        outcome = func(*args, **kwargs)
        elapsed = time.perf_counter() - began
        print(f'{func.__module__}.{func.__name__}的运行时间为 : {elapsed}秒')
        return outcome
    return wrapper
81 |     


--------------------------------------------------------------------------------
/DL/sample_weight.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zwdnet/JSMPwork/b7ae1380986abc3ee5b10bea7f9d57cbb805a8f0/DL/sample_weight.pkl


--------------------------------------------------------------------------------
/FE.py:
--------------------------------------------------------------------------------
 1 | # coding:utf-8
 2 | # kaggle竞赛Jane Street Market Prediction
 3 | # 特征工程代码
 4 | 
 5 | 
 6 | from run import *
 7 | from tools import *
 8 | import pandas as pd
 9 | import matplotlib.pyplot as plt
10 | 
11 | 
12 | """
13 | # 特征工程
14 | @change_dir
15 | def featureEngineer(data):
16 |     tages = pd.DataFrame()
17 |     tagename = feature.columns
18 |     for i in range(29):
19 |         # tagename = "tag_" + str(i)
20 |         # tages[tagename[i+1]] = feature[(feature[tagename[i+1]] == True)].iloc[:, i+1]
21 |         #print(tages[i])
22 |         temp = feature["feature"][feature[tagename[i+1]] == True]
23 |         temp.name = tagename[i+1]
24 |         print(temp)
25 |     #print(tages)
26 |     # 填充空值
27 |     print(data.isnull().sum())
28 |     for col in data.columns:
29 |         mean_val = data[col].mean()
30 |         data[col].fillna(mean_val, inplace=True)
31 |     print(data.isnull().sum())
32 |     # 处理feature_0
33 |     feature_0 = data["feature_0"].cumsum()
34 |     plt.plot(feature_0)
35 |     plt.savefig("./output/cumf_0.png")
36 |     plt.close()
37 |     data["feature_0"] = feature_0
38 |     # print(feature_0)
39 |     return data
40 | """
41 | # 特征工程
def featureEngineer(data):
    """Fill missing values with 0.0 and add the binary ``action`` column.

    Args:
        data: DataFrame containing at least ``weight`` and ``resp``.

    Returns:
        A filled copy of ``data`` with ``action`` = 1 where weight * resp > 0,
        otherwise 0. The input frame is not modified.
    """
    filled = data.fillna(0.0)
    profitable = (filled['weight'].values * filled['resp'].values) > 0
    filled['action'] = profitable.astype('int')
    return filled
49 |     
50 | 
51 |     
52 |     
if __name__ == "__main__":
    # loadData comes from the star imports above; here it is expected to
    # return two objects (training data and feature metadata) -- TODO confirm.
    train, feature = loadData()
    # feature = feature[feature == True]
    print(feature)
    # Fill missing values and add the "action" column.
    train = featureEngineer(train)
58 |     


--------------------------------------------------------------------------------
/LSTM.py:
--------------------------------------------------------------------------------
  1 | # coding:utf-8
  2 | # kaggle Jane Street Market Prediction代码
  3 | """
  4 | 用pytorch实现LSTM模型
  5 | 参考:https://zhuanlan.zhihu.com/p/104475016
  6 | """
  7 | 
  8 | 
  9 | import numpy as np
 10 | import torch
 11 | from torch import nn
 12 | import matplotlib.pyplot as plt
 13 | from run import *
 14 | 
 15 | 
 16 | class LstmRNN(nn.Module):
 17 |     def __init__(self, input_size, hidden_size = 1, output_size = 1, num_layers = 1):
 18 |         super().__init__()
 19 |         self.lstm = nn.LSTM(input_size, hidden_size, num_layers)
 20 |         self.forwardCalculation = nn.Linear(hidden_size, output_size)
 21 |         
 22 |     def forward(self, _x):
 23 |         x, _ = self.lstm(_x)
 24 |         s, b, h = x.shape # seq_len, batch, hidden_size
 25 |         x = x.view(s*b, h)
 26 |         x = self.forwardCalculation(x)
 27 |         x = x.view(s, b, -1)
 28 |         return x
 29 |         
 30 |         
@change_dir
def LSTM():
    """Train an LstmRNN to map sin(t) to cos(t) and save a plot to ./output/LSTM.png.

    The first half of 200 samples on [0, 12*pi] is used for training, the
    second half for testing.
    """
    # Build the data: column 0 is sin(t), column 1 is cos(t)
    data_len = 200
    t = np.linspace(0, 12*np.pi, data_len)
    sin_t = np.sin(t)
    cos_t = np.cos(t)

    dataset = np.zeros((data_len, 2))
    dataset[:, 0] = sin_t
    dataset[:, 1] = cos_t
    dataset = dataset.astype("float32")

    # Split the data: sin is the input, cos the target
    train_data_ratio = 0.5
    train_data_len = int(data_len*train_data_ratio)
    train_x = dataset[:train_data_len, 0]
    train_y = dataset[:train_data_len, 1]
    INPUT_FEATURES_NUM = 1
    OUTPUT_FEATURES_NUM = 1
    t_for_training = t[:train_data_len]

    test_x = dataset[train_data_len:, 0]
    test_y = dataset[train_data_len:, 1]
    t_for_testing = t[train_data_len:]

    # Training
    train_x_tensor = train_x.reshape(-1, 5, INPUT_FEATURES_NUM) # middle dimension of size 5 (batches)
    train_y_tensor = train_y.reshape(-1, 5, OUTPUT_FEATURES_NUM) # middle dimension of size 5 (batches)
    train_x_tensor = torch.from_numpy(train_x_tensor)
    train_y_tensor = torch.from_numpy(train_y_tensor)

    lstm_model = LstmRNN(INPUT_FEATURES_NUM, 16, output_size = OUTPUT_FEATURES_NUM, num_layers = 1)
    print('LSTM model:', lstm_model)
    print('model.parameters:', lstm_model.parameters)

    loss_fn = nn.MSELoss()
    lr = 1e-2
    optimizer = torch.optim.Adam(lstm_model.parameters(), lr)

    max_epochs = 10000
    for epoch in range(max_epochs):
        output = lstm_model(train_x_tensor)
        loss = loss_fn(output, train_y_tensor)
        loss.backward()
        optimizer.step()
        # Clear the gradients so they do not accumulate into the next epoch.
        optimizer.zero_grad()

        # Stop early once the loss drops below 1e-4; otherwise report every 100 epochs.
        if loss.item() < 1e-4:
            print('Epoch [{}/{}], Loss: {:.5f}'.format(epoch+1, max_epochs, loss.item()))
            print("The loss value is reached")
            break
        elif (epoch+1) % 100 == 0:
            print('Epoch [{}/{}], Loss: {:.5f}'.format(epoch+1, max_epochs, loss.item()))

    # Predict with the model
    # On the training set
    predictive_y_for_training = lstm_model(train_x_tensor)
    predictive_y_for_training = predictive_y_for_training.view(-1, OUTPUT_FEATURES_NUM).data.numpy()

    # Switch to evaluation mode
    lstm_model = lstm_model.eval()
    # Predict on the test set
    test_x_tensor = test_x.reshape(-1, 5, INPUT_FEATURES_NUM) 
    test_x_tensor = torch.from_numpy(test_x_tensor)
    predictive_y_for_testing = lstm_model(test_x_tensor)
    predictive_y_for_testing = predictive_y_for_testing.view(-1, OUTPUT_FEATURES_NUM).data.numpy()

    # Plot inputs, reference targets and predictions for both halves
    plt.figure()
    plt.plot(t_for_training, train_x, 'g', label='sin_trn')
    plt.plot(t_for_training, train_y, 'b', label='ref_cos_trn')
    plt.plot(t_for_training, predictive_y_for_training, 'y--', label='pre_cos_trn')

    plt.plot(t_for_testing, test_x, 'c', label='sin_tst')
    plt.plot(t_for_testing, test_y, 'k', label='ref_cos_tst')
    plt.plot(t_for_testing, predictive_y_for_testing, 'm--', label='pre_cos_tst')

    plt.plot([t[train_data_len], t[train_data_len]], [-1.2, 4.0], 'r--', label='separation line') # separation line

    plt.xlabel('t')
    plt.ylabel('sin(t) and cos(t)')
    plt.xlim(t[0], t[-1])
    plt.ylim(-1.2, 4)
    plt.legend(loc='upper right')
    plt.text(14, 2, "train", size = 15, alpha = 1.0)
    plt.text(20, 2, "test", size = 15, alpha = 1.0)

    plt.savefig("./output/LSTM.png")
120 | 
121 | 
if __name__ == "__main__":
    # Run the sin -> cos LSTM demo when executed as a script.
    LSTM()


--------------------------------------------------------------------------------
/LSTM_work.py:
--------------------------------------------------------------------------------
  1 | # coding:utf-8
  2 | # kaggle Jane Street Market Prediction代码
  3 | # 实际自己工作的代码
  4 | # LSTM模型
  5 | 
  6 | 
  7 | import numpy as np
  8 | import pandas as pd
  9 | pd.set_option('display.max_columns', None)
 10 | import janestreet
 11 | 
 12 | import matplotlib.pyplot as plt
 13 | from sklearn.model_selection import train_test_split
 14 | from sklearn import metrics
 15 | from sklearn.metrics import accuracy_score
 16 | import torch
 17 | import torch.nn as nn
 18 | import torch.optim as optim
 19 | 
 20 | import os
 21 | 
 22 | from FE import featureEngineer
 23 | from tools import *
 24 | 
 25 |     
 26 |     
 27 | # 建模前处理数据
 28 | def preprocessing(train):
 29 |     X_train = train.loc[:, train.columns.str.contains('feature')]
 30 |     # y_train = train.loc[:, 'resp']
 31 |     y_train = train.loc[:, 'action']
 32 |     
 33 |     # X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, random_state=666, test_size=0.2)
 34 |     
 35 |     return X_train, y_train
 36 | 
 37 |     
 38 | # 评分函数
 39 | def Score(model, data):
 40 |     # test_df = pd.read_csv("/kaggle/input/jane-street-market-prediction/train.csv")
 41 |     data = data.fillna(-999)
 42 |     X_test = data.loc[:, data.columns.str.contains('feature')]
 43 |     resp = model.predict(X_test)
 44 |     date = data["date"].values
 45 |     weight = data["weight"].values
 46 |     action = (resp > 0).astype("int")
 47 |     
 48 |     count_i = len(np.unique(date))
 49 |     Pi = np.zeros(count_i)
 50 |     # 用循环太慢
 51 |     #for i, day in enumerate(np.unique(date)):
 52 | #        Pi[i] = np.sum(weight[date == day] * resp[date == day] * action[date == day])
 53 |     # 用下面这行代替
 54 |     Pi = np.bincount(date, weight * resp * action)
 55 |     t = np.sum(Pi) / np.sqrt(np.sum(Pi ** 2)) * np.sqrt(250 / count_i)
 56 |     u = np.clip(t, 0, 6) * np.sum(Pi)
 57 |     return u
 58 |     
 59 | 
 60 | # 进行预测，生成提交文件，分类版
 61 | def predict_clf(model):
 62 |     env = janestreet.make_env()
 63 |     iter_test = env.iter_test()
 64 |     for (test_df, sample_prediction_df) in iter_test:
 65 |         if test_df['weight'].item() > 0:
 66 |             # test_df = featureEngineer(test_df)
 67 |             X_test = test_df.loc[:, test_df.columns.str.contains('feature')]
 68 |             X_test = X_test.fillna(0.0)
 69 |             y_preds = model.predict(X_test)[0]
 70 |         else:
 71 |             y_preds = 0
 72 |         # print(y_preds)
 73 |         sample_prediction_df.action = y_preds
 74 |         env.predict(sample_prediction_df)
 75 |         
 76 |         
 77 | class LstmRNN(nn.Module):
 78 |     def __init__(self, input_size, hidden_size = 10, output_size = 1, num_layers = 1):
 79 |         super().__init__()
 80 |         self.linear = nn.Linear(input_size, hidden_size)
 81 |         self.lstm = nn.LSTM(input_size, hidden_size, num_layers)
 82 |         self.forwardCalculation = nn.Linear(hidden_size, output_size)
 83 |         self.sigmoid = nn.Sigmoid()
 84 |         
 85 |     def forward(self, _x):
 86 |         # s, b, h = _x.shape
 87 |         # x = _x.view(s*b, h)
 88 |         # x = self.linear(x)
 89 |         # x = x.view(s, b, h)
 90 |         x, _ = self.lstm(_x)
 91 |         s, b, h = x.shape # seq_len, batch, hidden_size
 92 |         x = x.view(s*b, h)
 93 |         x = self.forwardCalculation(x)
 94 |         x = self.sigmoid(x)
 95 |         x = x.view(s, b, -1)
 96 |         return x
 97 |         
 98 | 
 99 | if __name__ == "__main__":
100 |     newpath = "/home/code"
101 |     os.chdir(newpath)
102 |     
103 |     # data_explore()
104 |     
105 |     # 真正开始干活
106 |     p = 0.001
107 |     train = loadData(p = p)
108 |     train = featureEngineer(train)
109 |     print(train.info())
110 |     # print(train.head())
111 |     
112 |     # 计算模型评分
113 |     # score = Score(model, train)
114 |     # print("模型评分:%.2f" % score)
115 |     test = loadData(p = p)
116 |     test = featureEngineer(test)
117 |     
118 |     #训练数据预处理
119 |     x_train, y_train = preprocessing(train)
120 |     x_test, y_test = preprocessing(test)
121 |     
122 |     # 深度学习
123 |     device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
124 |     # x_train.values.reshape(-1, 1, 130)
125 |     # y_train.values.reshape(-1, 1, 1)
126 |     x_tensor = torch.from_numpy(x_train.values.reshape(-1, 1, 130)).float().to(device)
127 |     y_tensor = torch.from_numpy(y_train.values.reshape(-1, 1, 1)).float().to(device)
128 |     
129 | 
130 |     Model = LstmRNN(130, 5).to(device)
131 |             
132 |     # model = Model(x_tensor).to(device)
133 |     # print(model.state_dict())
134 |     # 设置超参数
135 |     lr = 0.000678
136 |     n_epochs = 110
137 |      
138 |     # loss_fn = nn.BCELoss(reduction='sum')
139 |     loss_fn = nn.MSELoss(reduction = "mean")
140 |     optimizer = optim.Adam(Model.parameters(), lr = lr)
141 |     # 创建训练器
142 |     train_step = make_train_step(Model, loss_fn, optimizer)
143 |     losses = []
144 |     
145 |     print("开始训练")
146 |     # 进行训练
147 |     for epoch in range(n_epochs):
148 |         # y_tensor = y_tensor.detach()
149 |         loss = train_step(x_tensor, y_tensor)
150 |         losses.append(loss)
151 |         
152 |     # print(model.state_dict())
153 |     print(losses)
154 |     plt.figure()
155 |     plt.plot(losses)
156 |     plt.savefig("./output/loss.png")
157 |     # 验证模型
158 |     # x_test.reshape(-1, 1, 130)
159 |     # y_test.reshape(-1, 1, 1)
160 |     x_test_tensor = torch.from_numpy(x_test.values.reshape(-1, 1, 130)).float().to(device)
161 |     y_test_tensor = torch.from_numpy(y_test.values.reshape(-1, 1, 1)).float().to(device)
162 |     result = []
163 |     preds = []
164 |     # dph = 0.0
165 |     for x in Model(x_test_tensor):
166 |         preds.append(x.detach().cpu().numpy()[0][0])
167 |         # dph = np.min(preds) + (np.max(preds) - np.min(preds))/2.0
168 |         # print(dph)
169 |         if x >= 0.5:
170 |             result.append(1)
171 |         else:
172 |             result.append(0)
173 |     y_test = y_test_tensor.numpy()
174 |     # print(len(y_test))
175 |     # print(result)
176 |     plt.figure()
177 |     plt.hist(preds)
178 |     plt.savefig("./output/predicts.png")
179 |     plt.close()
180 |     print("预测结果均值:{}".format(np.mean(preds)))
181 |     print("预测结果中位数:{}".format(np.median(preds)))
182 |     print("预测结果极值之差:{}-{}={}".format(np.max(preds), np.min(preds), np.max(preds) - np.min(preds)))
183 |     count = 0
184 |     for i in range(len(result)):
185 |         if y_test[i] == result[i]:
186 |             count += 1
187 |     print(count)
188 |     print("预测正确率:%f" % (count/len(y_test)))
189 |     # 进行预测
190 |     # predict_clf(model)
191 |     


--------------------------------------------------------------------------------
/MyFrame.py:
--------------------------------------------------------------------------------
  1 | # coding:utf-8
  2 | # 开课吧《创造你的第一个深度学习框架》实操
  3 | 
  4 | 
  5 | import numpy as np
  6 | import pandas as pd
  7 | import random
  8 | from run import *
  9 | from sklearn.datasets import load_boston
 10 | from sklearn.utils import shuffle, resample
 11 | from collections import defaultdict
 12 | from sklearn.model_selection import train_test_split
 13 | import matplotlib.pyplot as plt
 14 | import torch
 15 | from torch import nn
 16 | 
 17 | 
 18 | # 神经节类
 19 | class Node:
 20 |     def __init__(self, inputs = []):
 21 |         self.inputs = inputs
 22 |         self.outputs = []
 23 |         for n in self.inputs:
 24 |             n.outputs.append(self)
 25 |             
 26 |         self.value = None
 27 |         self.gradients = {}
 28 |         
 29 |     def forward(self):
 30 |         raise NotImplemented
 31 |         
 32 |     def backward(self):
 33 |         raise NotImplemented
 34 |         
 35 |         
 36 | # 占位节点，没有输入的节点，其值要指定
 37 | class Placeholder(Node):
 38 |     def __init__(self):
 39 |         Node.__init__(self)
 40 |         
 41 |     def forward(self, value = None):
 42 |         if value is not None:
 43 |             self.value = value
 44 |             
 45 |     def backward(self):
 46 |         self.gradients = {self:0}
 47 |         for n in self.outputs:
 48 |             grad_cost = n.gradients[self]
 49 |             self.gradients[self] = grad_cost*1
 50 |             
 51 |             
 52 | # 线性节点
 53 | class Linear(Node):
 54 |     def __init__(self, nodes, weights, bias):
 55 |         Node.__init__(self, inputs = [nodes, weights, bias])
 56 |         
 57 |     def forward(self):
 58 |         inputs = self.inputs[0].value
 59 |         weights = self.inputs[1].value
 60 |         bias = self.inputs[2].value
 61 |         # print("测试", type(inputs), type(weights))
 62 |         self.value = np.dot(inputs, weights) + bias
 63 |         
 64 |     def backward(self):
 65 |         self.gradients = {n: np.zeros_like(n.value) for n in self.inputs}
 66 |         for n in self.outputs:
 67 |             grad_cost = n.gradients[self]
 68 |             self.gradients[self.inputs[0]] = np.dot(grad_cost, self.inputs[1].value.T)
 69 |             self.gradients[self.inputs[1]] = np.dot(self.inputs[0].value.T, grad_cost)
 70 |             self.gradients[self.inputs[2]] = np.sum(grad_cost, axis=0, keepdims=False)
 71 |             
 72 |             
 73 | # Sigmoid激活节点
 74 | class Sigmoid(Node):
 75 |     def __init__(self, node):
 76 |         Node.__init__(self, inputs = [node])
 77 |         
 78 |     def _sigmoid(self, x):
 79 |         return 1./(1 + np.exp(-1 * x))
 80 |         
 81 |     def forward(self):
 82 |         self.x = self.inputs[0].value
 83 |         self.value = self._sigmoid(self.x)
 84 |         
 85 |     def backward(self):
 86 |         self.partial = self._sigmoid(self.x) * (1 - self._sigmoid(self.x))
 87 |         self.gradients = {n: np.zeros_like(n.value) for n in self.inputs}
 88 |         for n in self.outputs:
 89 |             grad_cost = n.gradients[self]
 90 |             self.gradients[self.inputs[0]] = grad_cost * self.partial
 91 |             
 92 |             
 93 | # 损失函数            
 94 | class MSE(Node):
 95 |     def __init__(self, y, yhat):
 96 |         Node.__init__(self, inputs = [y, yhat])
 97 |         
 98 |     def forward(self):
 99 |         y = self.inputs[0].value.reshape(-1, 1)
100 |         yhat = self.inputs[1].value.reshape(-1, 1)
101 |         assert(y.shape == yhat.shape)
102 |         
103 |         self.m = self.inputs[0].value.shape[0]
104 |         self.diff = y - yhat
105 |         self.value = np.mean(self.diff**2)
106 |         
107 |     def backward(self):
108 |         self.gradients[self.inputs[0]] = (2/self.m)*self.diff
109 |         self.gradients[self.inputs[1]] = (-2/self.m)*self.diff
110 | 
111 | 
112 | # sigmoid函数
def sigmoid(x):
    """Return the logistic sigmoid ``1 / (1 + exp(-x))`` of ``x``."""
    exponent = -1 * x
    return 1./(1 + np.exp(exponent))
115 | 
116 | 
117 | # 加载数据
def loaddata():
    """Load the Boston housing data, standardise the features and split 80/20.

    Returns:
        (x_train, x_test, y_train, y_test)

    NOTE(review): ``load_boston`` is no longer shipped by recent scikit-learn
    releases; this needs an older version or a different data source.
    """
    boston = load_boston()
    features, targets = boston["data"], boston["target"]
    # Standardise every feature column to zero mean and unit standard deviation.
    features = (features - np.mean(features, axis=0)) / np.std(features, axis=0)
    return tuple(train_test_split(features, targets, test_size = 0.2, random_state = 666))
126 |     
127 |     
128 | # 将节点数据转换为图
def convert_feed_dict_to_graph(feed_dict):
    """Walk the graph from the fed nodes and record each node's consumers.

    Placeholder nodes found along the way get their value from ``feed_dict``.

    Args:
        feed_dict: dict mapping Placeholder nodes to their initial values.

    Returns:
        defaultdict(list) mapping a node to the list of nodes it feeds; nodes
        without consumers get no entry.
    """
    graph = defaultdict(list)
    pending = list(feed_dict)
    while pending:
        current = pending.pop()
        if isinstance(current, Placeholder):
            current.value = feed_dict[current]

        # Already expanded: do not add its consumers a second time.
        if current in graph:
            continue

        for consumer in current.outputs:
            graph[current].append(consumer)
            pending.append(consumer)

    return graph
145 |     
146 |    
147 | # 将图进行拓扑排序，生成计算图
def toplogic(graph):
    """Topologically sort ``graph`` (node -> list of consumer nodes).

    Repeatedly picks, at random, a key that no remaining key points to, removes
    it from the graph and appends it to the result. When the last key is
    removed, its consumers are appended as well. The loop stops early if no
    such key exists (e.g. a cycle). ``graph`` is emptied/modified in place.

    Returns:
        list of nodes in a valid evaluation order.
    """
    ordered = []
    while len(graph) > 0:
        targets = set()
        sources = []
        for key in graph:
            targets.update(graph[key])
            sources.append(key)

        # Keys that nothing else points to have no unprocessed dependency.
        roots = set(sources) - targets
        if len(roots) == 0:
            break

        node = random.choice(list(roots))
        visited = [node]
        if len(graph) == 1:
            # Last key: its consumers never appear as keys, add them now.
            visited += graph[node]
        graph.pop(node)
        ordered += visited

        for links in graph.values():
            if node in links:
                links.remove(node)

    return ordered
178 |     
179 |     
180 | # 生成计算图
def topological_sort_feed_dict(feed_dict):
    """Build the graph reachable from ``feed_dict`` and return its nodes in topological order."""
    return toplogic(convert_feed_dict_to_graph(feed_dict))
184 |     
185 |     
186 | # 前向传播
def forward(graph):
    """Call ``forward()`` on every node of ``graph`` in order."""
    for node in graph:
        node.forward()
190 |         
191 | 
192 | # 后向传播
def backward(graph):
    """Call ``backward()`` on every node of ``graph`` in reverse order."""
    in_reverse = graph[::-1]
    for node in in_reverse:
        node.backward()
196 | 
197 | 
198 | # 更新参数
def optimizer(trainables, learning_rate=1e-2):
    """Gradient-descent step: update each node's value in place by ``learning_rate`` times its own gradient.

    Args:
        trainables: nodes whose ``gradients`` dict holds an entry for the node itself.
        learning_rate: step size.
    """
    for node in trainables:
        node.value -= learning_rate * node.gradients[node]
202 | 
203 | 
204 | # 进行预测
@change_dir
def MyFramePredict(w1_, b1_, w2_, b2_, losses, X_test, Y_test):
    """Predict with the trained two-layer network, print its SSE and save plots.

    Args:
        w1_, b1_, w2_, b2_: nodes whose ``value`` holds the trained parameters.
        losses: sequence of training losses, plotted to ./output/FrameCost.png.
        X_test: test features.
        Y_test: test targets.

    The per-sample error is plotted to ./output/FrameResult.png.
    """
    # Use the arguments, not module-level variables with similar names.
    hidden = sigmoid(np.dot(X_test, w1_.value) + b1_.value)
    # Flatten both sides so the difference is element-wise; a column vector
    # minus a 1-D vector would broadcast to a full matrix.
    y_pred = (np.dot(hidden, w2_.value) + b2_.value).flatten()
    y_true = np.asarray(Y_test).flatten()
    delta = y_pred - y_true
    # Score with the sum of squared errors
    sse = (delta**2).sum()
    print("框架评分:{}".format(sse))

    # Plot the training loss
    plt.figure()
    plt.plot(losses)
    plt.title("cost of model")
    plt.savefig("./output/FrameCost.png")
    plt.close()
    # Plot the prediction error per test sample
    plt.figure()
    plt.plot(delta, color = "green")
    plt.savefig("./output/FrameResult.png")
    plt.close()
229 |     
230 | 
# Predict with the PyTorch model and plot the results
@change_dir
def PytorchPredict(model, losses, x_test, y_test):
    """Evaluate a trained PyTorch model on a test set and save plots.

    Args:
        model: callable taking a float32 tensor of features.
        losses: sequence of training losses, plotted to PytorchCost.png.
        x_test: test features as a NumPy array.
        y_test: test targets as a NumPy array.

    Prints the sum of squared errors and writes ``./output/PytorchCost.png``
    and ``./output/PytorchResult.png``.
    """
    # Prepare the data
    x_tensor = torch.from_numpy(x_test.astype(np.float32))
    y_true = y_test.astype(np.float32).reshape(-1)

    # Run the model; flatten the output before subtracting so that an (n, 1)
    # prediction cannot broadcast against 1-D targets into an (n, n) matrix.
    y_pred = model(x_tensor).detach().numpy().reshape(-1)
    delta = y_pred - y_true
    sse = (delta ** 2).sum()
    print("pytorch评分:{}".format(sse))

    # Training-loss curve
    plt.figure()
    plt.plot(losses)
    plt.title("cost of pytorch")
    plt.savefig("./output/PytorchCost.png")
    plt.close()

    # Per-sample prediction error
    plt.figure()
    plt.plot(delta, color = "green")
    plt.savefig("./output/PytorchResult.png")
    plt.close()
258 | 
259 | 
# Train the hand-written framework
@change_dir
@timethis
def testMyFrame(X_, Y_):
    """Train a Linear -> Sigmoid -> Linear network with the custom framework.

    Args:
        X_: training features; ``X_.shape[1]`` is the number of features.
        Y_: training targets.

    Returns:
        ``(W1, b1, W2, b2, losses)``: the four parameter nodes and the mean
        batch loss recorded every 100th epoch.
    """
    # Initial parameter values
    feature_count = X_.shape[1]
    hidden_units = 10
    W1_ = np.random.randn(feature_count, hidden_units)
    b1_ = np.zeros(hidden_units)
    W2_ = np.random.randn(hidden_units, 1)
    b2_ = np.zeros(1)

    # Graph inputs
    X, y = Placeholder(), Placeholder()
    W1, b1 = Placeholder(), Placeholder()
    W2, b2 = Placeholder(), Placeholder()

    # Model definition
    hidden_linear = Linear(X, W1, b1)
    hidden_act = Sigmoid(hidden_linear)
    output_linear = Linear(hidden_act, W2, b2)
    cost = MSE(y, output_linear)

    # Initial values fed to the placeholders
    feed_dict = {X: X_, y: Y_, W1: W1_, b1: b1_, W2: W2_, b2: b2_}

    # Hyper-parameters
    epochs = 5000
    batch_size = 16
    rate = 1e-2
    m = X_.shape[0]
    steps_per_epoch = m // batch_size

    # Build the computation graph
    graph = topological_sort_feed_dict(feed_dict)
    trainables = [W1, b1, W2, b2]

    print("样本总数{}".format(m))

    # Training loop
    losses = []
    for epoch in range(epochs):
        epoch_loss = 0
        for _ in range(steps_per_epoch):
            # Step 1: draw a random batch and feed it to the inputs
            X.value, y.value = resample(X_, Y_, n_samples=batch_size)
            # Step 2: forward and backward pass
            forward(graph)
            backward(graph)
            # Step 3: update the parameters
            optimizer(trainables, rate)
            epoch_loss += graph[-1].value

        # Record the mean batch loss every 100 epochs
        if epoch % 100 == 0:
            losses.append(epoch_loss/steps_per_epoch)

    return (W1, b1, W2, b2, losses)
330 |             
331 |     
# Train the equivalent PyTorch model
@change_dir
@timethis
def testPytorch(x_train, y_train):
    """Train a Linear -> Sigmoid -> Linear network with PyTorch SGD.

    Args:
        x_train: training features as a NumPy array; ``x_train.shape[1]`` is
            the number of features.
        y_train: training targets as a NumPy array.

    Returns:
        ``(net, losses)``: the trained ``nn.Sequential`` model and the loss of
        the last batch, as a Python float, recorded every 100th epoch.
    """
    n_features = x_train.shape[1]
    n_hidden = 10
    # Network structure
    net = nn.Sequential(
        nn.Linear(n_features, n_hidden),
        nn.Sigmoid(),
        nn.Linear(n_hidden, 1)
    )
    # Loss function and optimizer. The local is called ``sgd`` so that it does
    # not shadow the module-level ``optimizer`` function.
    loss_fn = nn.MSELoss()
    sgd = torch.optim.SGD(net.parameters(), lr = 1e-2)

    # Training parameters. The number of steps per epoch is decided by the
    # DataLoader, so no manual step count is kept here.
    epochs = 5000
    batch_size = 16

    # Data loader
    x_tensor = torch.from_numpy(x_train.astype(np.float32))
    y_tensor = torch.from_numpy(y_train.astype(np.float32))
    train = torch.utils.data.TensorDataset(x_tensor, y_tensor)
    train_loader = torch.utils.data.DataLoader(train, batch_size = batch_size, shuffle = False)

    # Training
    losses = []
    for i in range(epochs):
        for x, y in train_loader:
            # Match the (batch, 1) shape of the network output
            y = y.view(-1, 1)
            # Clear the gradients left over from the previous step
            sgd.zero_grad()
            loss = loss_fn(net(x), y)
            loss.backward()
            sgd.step()
        # Record the last batch loss every 100 epochs as a plain float
        if i % 100 == 0:
            losses.append(loss.item())

    return (net, losses)
380 | 
381 | 
if __name__ == "__main__":
    x_train, x_test, y_train, y_test = loaddata()

    # Custom framework: train and evaluate on fresh copies of the data
    x_train_, x_test_ = x_train.copy(), x_test.copy()
    y_train_, y_test_ = y_train.copy(), y_test.copy()
    params = testMyFrame(x_train_, y_train_)
    MyFramePredict(*params[:5], x_test_, y_test_)

    # PyTorch: train and evaluate on fresh copies of the data
    x_train_, x_test_ = x_train.copy(), x_test.copy()
    y_train_, y_test_ = y_train.copy(), y_test.copy()
    model, losses = testPytorch(x_train_, y_train_)
    PytorchPredict(model, losses, x_test_, y_test_)
390 |     


--------------------------------------------------------------------------------
/NNDL/minst.py:
--------------------------------------------------------------------------------
  1 | # coding:utf-8
  2 | # 《神经网络与深度学习》代码实现
  3 | 
  4 | 
  5 | import numpy as np
  6 | import run
  7 | import random
  8 | 
  9 | import pickle
 10 | import gzip
 11 | 
 12 | @run.change_dir
 13 | def load_data():
 14 |     f = gzip.open('./mnist.pkl.gz', 'rb')
 15 |     training_data, validation_data, test_data = pickle.load(f, encoding='bytes')
 16 |     f.close()
 17 |     return (training_data, validation_data, test_data)
 18 |     
 19 |     
 20 | def load_data_wrapper():
 21 |     tr_d, va_d, te_d = load_data()
 22 |     training_inputs = [np.reshape(x, (784, 1)) for x in tr_d[0]]
 23 |     training_results = [vectorized_result(y) for y in tr_d[1]]
 24 |     training_data = list(zip(training_inputs, training_results))
 25 |     validation_inputs = [np.reshape(x, (784, 1)) for x in va_d[0]]
 26 |     validation_data = zip(validation_inputs, va_d[1])
 27 |     test_inputs = [np.reshape(x, (784, 1)) for x in te_d[0]]
 28 |     test_data = list(zip(test_inputs, te_d[1]))
 29 |     return (training_data, validation_data, test_data)
 30 |     
 31 |     
 32 | def vectorized_result(j):
 33 |     e = np.zeros((10, 1))
 34 |     e[j] = 1.0
 35 |     return e
 36 | 
 37 |     
 38 | # 定义神经网络
 39 | class Network:
 40 |     def __init__(self, sizes):
 41 |         self.num_layers = len(sizes)
 42 |         self.sizes = sizes
 43 |         self.biases = [np.random.randn(y, 1) for y in sizes[1:]]
 44 |         self.weights = [np.random.randn(y, x) for x, y in zip(sizes[:-1], sizes[1:])]
 45 |         
 46 |     def feedforward(self, a):
 47 |         for b, w in zip(self.biases, self.weights):
 48 |             a = sigmoid(np.dot(w, a) + b)
 49 |         return a
 50 |         
 51 |     def SGD(self, training_data, epochs, mini_batch_size, eta, test_data = None):
 52 |         if test_data:
 53 |             n_test = len(test_data)
 54 |         n = len(training_data)
 55 |         for j in range(epochs):
 56 |             random.shuffle(training_data)
 57 |             mini_batches = [
 58 |                 training_data[k:k+mini_batch_size] for k in range(0, n, mini_batch_size)]
 59 |             for mini_batch in mini_batches:
 60 |                 self.update_mini_batch(mini_batch, eta)
 61 |             if test_data:
 62 |                 print("Epoch {}: {}/{}".format(j, self.evaluate(test_data), n_test))
 63 |             else:
 64 |                 print("Epoch {}完成".format(j))
 65 |                 
 66 |     def update_mini_batch(self, mini_batch, eta):
 67 |         nabla_b = [np.zeros(b.shape) for b in self.biases]
 68 |         nabla_w = [np.zeros(w.shape) for w in self.weights]
 69 |         for x, y in mini_batch:
 70 |             delta_nabla_b, delta_nabla_w = self.backprop(x, y)
 71 |             nabla_b = [nb+dnb for nb, dnb in zip(nabla_b, delta_nabla_b)]
 72 |             nabla_w = [nw+dnw for nw, dnw in zip(nabla_w, delta_nabla_w)]
 73 |             self.weights = [w-(eta/len(mini_batch))*nw for w, nw in zip(self.weights, nabla_w)]
 74 |             self.biases = [b-(eta/len(mini_batch))*nb for b, nb in zip(self.biases, nabla_b)]
 75 |             
 76 |     def backprop(self, x, y):
 77 |         nabla_b = [np.zeros(b.shape) for b in self.biases]
 78 |         nabla_w = [np.zeros(w.shape) for w in self.weights]
 79 |         activation = x
 80 |         activations = [x]
 81 |         zs = []
 82 |         for b, w in zip(self.biases, self.weights):
 83 |             z = np.dot(w, activation) + b
 84 |             zs.append(z)
 85 |             activation = sigmoid(z)
 86 |             activations.append(activation)
 87 |         # 反向传播过程
 88 |         delta = self.cost_derivative(activations[-1], y)*sigmoid_prime(zs[-1])
 89 |         nabla_b[-1] = delta
 90 |         nabla_w[-1] = np.dot(delta, activations[-2].transpose())
 91 |         for l in range(2, self.num_layers):
 92 |             z = zs[-l]
 93 |             sp = sigmoid_prime(z)
 94 |             delta = np.dot(self.weights[-l+1].transpose(), delta)*sp
 95 |             nabla_b[-l] = delta
 96 |             nabla_w[-l] = np.dot(delta, activations[-l-1].transpose())
 97 |         return (nabla_b, nabla_w)
 98 |         
 99 |     def evaluate(self, test_data):
100 |         test_results = [(np.argmax(self.feedforward(x)), y) for (x, y) in test_data]
101 |         return sum(int(x == y) for (x, y) in test_results)
102 |         
103 |     def cost_derivative(self, output_activations, y):
104 |         return (output_activations-y)
105 |         
106 |         
# Logistic function
def sigmoid(z):
    """Return ``1 / (1 + exp(-z))``."""
    denominator = 1.0 + np.exp(-z)
    return 1.0 / denominator
110 |     
111 |     
def sigmoid_prime(z):
    """Return the derivative of ``sigmoid`` at ``z``: ``s * (1 - s)``."""
    s = sigmoid(z)
    return s*(1-s)
114 |     
115 |     
if __name__ == "__main__":
    # 784 inputs, one hidden layer of 30 neurons, 10 outputs
    training_data, validation_data, test_data = load_data_wrapper()
    net = Network([784, 30, 10])
    net.SGD(
        training_data,
        epochs = 30,
        mini_batch_size = 10,
        eta = 3.0,
        test_data = test_data,
    )
120 |     
121 |     


--------------------------------------------------------------------------------
/NNDL/mnist.pkl.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zwdnet/JSMPwork/b7ae1380986abc3ee5b10bea7f9d57cbb805a8f0/NNDL/mnist.pkl.gz


--------------------------------------------------------------------------------
/NNDL/run.py:
--------------------------------------------------------------------------------
 1 | # coding:utf-8
 2 | # 将程序上传到服务器上执行
 3 | import os
 4 | import sys
 5 | from functools import wraps
 6 | import time
 7 | 
 8 | 
 9 | # 上传代码至服务器并运行
10 | def run(gpus, server):
11 |     # 上传本目录所有文件再执行指定文件
12 |     if gpus == "all":
13 |         # 清除服务器代码目录里所有源文件以及输出目录中的文件
14 |         s = "ssh ubuntu@" + server + " \"sudo rm -rf ~/code/*.py\""
15 |         os.system(s)
16 |         s = "ssh ubuntu@" + server + " \"sudo rm -rf ~/code/output/*\""
17 |         os.system(s)
18 |         # 将本地目录所有文件上传至容器
19 |         s = "scp -r ./*.py ubuntu@" + server + ":~/code"
20 |         os.system(s)
21 |         # 运行指定代码
22 |         s = "ssh root@" + server +  " -p 2222 \"python /home/code/" + sys.argv[2] + "\""
23 |         print("正在运行代码……\n")
24 |         os.system(s)
25 |         # 将代码目录里所有输出文件传回
26 |         s = "scp -r ubuntu@" + server + ":~/code/output/* ./output/"
27 |         os.system(s)
28 |     # 将所有结果文件传回
29 |     elif gpus == "copy":
30 |         s = "scp -r ubuntu@" + server + ":~/code/output/* ./output/"
31 |         os.system(s)
32 |     # 上传指定文件并执行
33 |     else:
34 |         ## 清除服务器代码目录里所有源文件以及输出目录中的文件
35 |         s = "ssh ubuntu@" + server + " \"sudo rm -rf ~/code/*.py\""
36 |         os.system(s)
37 |         s = "ssh ubuntu@" + server + " \"sudo rm -rf ~/code/output/*\""
38 |         os.system(s)
39 |         # 将本地目录指定文件上传至容器
40 |         s = "scp " + sys.argv[1] + " ubuntu@" + server + ":~/code"
41 |         os.system(s)
42 |         # 运行指定代码
43 |         s = "ssh root@" + server +  " -p 2222 \"python /home/code/" + sys.argv[1] + "\""
44 |         os.system(s)
45 |         # 将代码目录里所有文件传回
46 |         s = "scp -r ubuntu@" + server + ":~/code/output/* ./output/"
47 |         os.system(s)
48 | 
49 | 
if __name__ == "__main__":
    gpus = sys.argv[1]
    # Read the server address from serverIP.txt (create and edit that file
    # yourself). Strip surrounding whitespace: a trailing newline would end up
    # in the middle of every shell command built from the address.
    with open("serverIP.txt", "rt") as f:
        server = f.read().strip()
    run(gpus, server)
56 |         
57 |     
# Utility decorator: run the function from the server's code directory
def change_dir(func):
    """Decorate ``func`` so it runs with ``/home/code/`` as working directory.

    The previous working directory is restored afterwards, including when
    ``func`` raises. The wrapper returns whatever ``func`` returns.
    """
    @wraps(func)
    def change(*args, **kwargs):
        oldpath = os.getcwd()
        newpath = "/home/code/"
        os.chdir(newpath)
        try:
            return func(*args, **kwargs)
        finally:
            # Always go back, so a failing call does not leave later code in
            # the wrong directory.
            os.chdir(oldpath)
    return change
69 |     
70 |     
# Utility decorator: report how long a call takes
def timethis(func):
    """Decorate ``func`` to print its wall-clock run time after each call.

    The wrapper returns whatever ``func`` returns.
    """
    @wraps(func)
    def wrapper(*args, **kwargs):
        started = time.perf_counter()
        result = func(*args, **kwargs)
        finished = time.perf_counter()
        report = '{}.{}的运行时间为 : {}秒'.format(func.__module__, func.__name__, finished - started)
        print(report)
        return result
    return wrapper
81 |     


--------------------------------------------------------------------------------
/README:
--------------------------------------------------------------------------------
1 | Kaggle竞赛Jane Street Market Prediction实操代码 
2 | 竞赛地址： https://www.kaggle.com/c/jane-street-market-prediction/overview


--------------------------------------------------------------------------------
/X.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zwdnet/JSMPwork/b7ae1380986abc3ee5b10bea7f9d57cbb805a8f0/X.npy


--------------------------------------------------------------------------------
/Y.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zwdnet/JSMPwork/b7ae1380986abc3ee5b10bea7f9d57cbb805a8f0/Y.npy


--------------------------------------------------------------------------------
/__pycache__/run.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zwdnet/JSMPwork/b7ae1380986abc3ee5b10bea7f9d57cbb805a8f0/__pycache__/run.cpython-37.pyc


--------------------------------------------------------------------------------
/__pycache__/run.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zwdnet/JSMPwork/b7ae1380986abc3ee5b10bea7f9d57cbb805a8f0/__pycache__/run.cpython-38.pyc


--------------------------------------------------------------------------------
/__pycache__/tools.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zwdnet/JSMPwork/b7ae1380986abc3ee5b10bea7f9d57cbb805a8f0/__pycache__/tools.cpython-38.pyc


--------------------------------------------------------------------------------
/copy_jsmp.py:
--------------------------------------------------------------------------------
  1 | # coding:utf-8
  2 | # kaggle Jane Street Market Prediction代码
  3 | # copy别人的代码:https://www.kaggle.com/c/jane-street-market-prediction/submissions
  4 | 
  5 | 
  6 | import numpy as np
  7 | import pandas as pd
  8 | import janestreet
  9 | 
 10 | import plotly.express as px
 11 | from plotly.subplots import make_subplots
 12 | import plotly.graph_objs as go
 13 | import plotly.io as pio
 14 | 
 15 | import matplotlib.pyplot as plt
 16 | from xgboost import XGBClassifier
 17 | from sklearn.model_selection import train_test_split
 18 | from sklearn import metrics
 19 | from sklearn.metrics import accuracy_score
 20 | import optuna
 21 | from optuna.samplers import TPESampler
 22 | 
 23 | import os
 24 | import time
 25 | 
 26 | 
 27 | # 数据探索
 28 | def data_explore():
 29 |     # 读取数据
 30 |     train = pd.read_csv("./train.csv", nrows = 10000)
 31 |     print(train.head())
 32 |     
 33 |     # 先画图看目标特征的分布
 34 |     # .plt.figure()
 35 |     plot_list = ['weight', 'resp_1', 'resp_2', 'resp_3', 'resp_4', 'resp']
 36 |     fig = make_subplots(rows=3, cols=2)
 37 |     traces = [
 38 |         go.Histogram(
 39 |             x = train[col],
 40 |             nbinsx = 100,
 41 |             name = col
 42 |         ) for col in plot_list
 43 |     ]
 44 |     
 45 |     for i in range(len(traces)):
 46 |         fig.append_trace(
 47 |             traces[i],
 48 |             (i // 2) + 1,
 49 |             (i % 2) + 1
 50 |         )
 51 |     
 52 |     fig.update_layout(
 53 |         title_text='Target features distributions',
 54 |         height = 900,
 55 |         width = 800
 56 |     )
 57 |     
 58 |     pio.write_image(fig, "./output/target_distribute.png")
 59 |     
 60 |     # 看特征值的分布
 61 |     features = train.columns
 62 |     features = features[7:]
 63 |     features = features[:130]
 64 |     fig = make_subplots(
 65 |         rows = 44,
 66 |         cols = 3
 67 |     )
 68 |     traces = [
 69 |         go.Histogram(
 70 |             x = train[col],
 71 |             nbinsx = 100,
 72 |             name = col
 73 |         ) for col in features
 74 |     ]
 75 |     
 76 |     for i in range(len(traces)):
 77 |         fig.append_trace(
 78 |             traces[i],
 79 |             (i // 3) + 1,
 80 |             (i % 3) + 1
 81 |         )
 82 |     
 83 |     fig.update_layout(
 84 |         title_text='Train features distributions',
 85 |         height = 5000
 86 |     )
 87 |     
 88 |     pio.write_image(fig, "./output/features_distribute.png")
 89 |     
 90 |     cols = features
 91 |     
 92 |     # 读取其它数据文件看看
 93 |     features = pd.read_csv("./features.csv")
 94 |     print(features)
 95 |     example_test = pd.read_csv("./example_test.csv")
 96 |     print(example_test)
 97 |     submission = pd.read_csv("./example_sample_submission.csv")
 98 |     print(submission)
 99 |     
100 |     # 开始建模
101 |     train = pd.read_csv("./small_train.csv")
102 |     # 先找到高度相关的特征
103 |     all_columns = []
104 |     for i in range(0, len(cols)):
105 |         for j in range(i+1, len(cols)):
106 |             if abs(train[cols[i]].corr(train[cols[j]])) > 0.95:
107 |                 all_columns = all_columns + [cols[i], cols[j]]
108 |     
109 |     all_columns = list(set(all_columns))
110 |     print('Number of columns:', len(all_columns))
111 |     # 画图
112 |     data = train[all_columns]
113 |     f = plt.figure(
114 |         figsize = (22, 22)
115 |     )
116 |     plt.matshow(
117 |         data.corr(),
118 |         fignum = f.number
119 |     )
120 |     plt.xticks(
121 |         range(data.shape[1]),
122 |         data.columns,
123 |         fontsize = 14,
124 |         rotation = 90
125 |     )
126 |     plt.yticks(
127 |         range(data.shape[1]),
128 |         data.columns,
129 |         fontsize = 14
130 |     )
131 |     cb = plt.colorbar()
132 |     cb.ax.tick_params(
133 |         labelsize = 14
134 |     )
135 |     plt.savefig("./output/features_corr.png")
136 |     
137 |     # 目标值的相关度
138 |     data = train[['weight', 'resp_1', 'resp_2', 'resp_3', 'resp_4', 'resp']]
139 |     f = plt.figure(
140 |         figsize = (12, 12)
141 |     )
142 |     plt.matshow(
143 |         data.corr(),
144 |         fignum = f.number
145 |     )
146 |     plt.xticks(
147 |         range(data.shape[1]),
148 |         data.columns,
149 |         fontsize = 14,
150 |         rotation = 90
151 |     )
152 |     plt.yticks(
153 |         range(data.shape[1]),
154 |         data.columns,
155 |         fontsize = 14
156 |     )
157 |     cb = plt.colorbar()
158 |     cb.ax.tick_params(
159 |         labelsize = 14
160 |     )
161 |     plt.savefig("./output/targets_corr.png")
162 |     
163 |     
# Modelling
def modeling():
    """Train two XGBoost classifiers on the first 10000 rows of train.csv.

    Rows with zero weight are dropped, ``action`` is 1 where
    ``weight * resp > 0``, and 20% of the rows are held out. Missing feature
    values are replaced by -999 in both the training and the held-out split.

    Returns:
        ``(model1, model3)``: the two fitted ``XGBClassifier`` instances.
    """
    print("开始建模")
    train = pd.read_csv("./train.csv", nrows = 10000)

    # Copy the filtered rows so that adding a column does not write into a
    # slice of the original frame.
    train = train[train['weight'] != 0].copy()
    train['action'] = ((train['weight'].values * train['resp'].values) > 0).astype('int')

    X_all = train.loc[:, train.columns.str.contains('feature')]
    y_all = train.loc[:, 'action']
    X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, random_state=666, test_size=0.2)

    del train

    # Encode missing values identically in both splits, so the model is
    # evaluated on the same representation it was trained on.
    X_train = X_train.fillna(-999)
    X_test = X_test.fillna(-999)
    tm = "auto"

    params1 = {
        'max_depth': 8,
        'n_estimators': 500,
        'learning_rate': 0.01,
        'subsample': 0.9,
        'tree_method': tm,
        'random_state': 666
    }

    params3 = {
        'max_depth': 10,
        'n_estimators': 500,
        'learning_rate': 0.03,
        'subsample': 0.9,
        'colsample_bytree': 0.7,
        'tree_method': tm,
        'random_state': 666
    }

    def _fit_and_report(params, title):
        # Fit once (a second fit would retrain from scratch and discard the
        # first), then print the hold-out accuracy under ``title``.
        model = XGBClassifier(**params)
        model.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_test, y_test)], eval_metric='auc',verbose=False)
        print(title)
        print("Accuracy : %.4g" % metrics.accuracy_score(y_test, model.predict(X_test)))
        return model

    start_time = time.time()
    model1 = _fit_and_report(params1, "模型1评分")
    model3 = _fit_and_report(params3, "模型3评分")
    end_time = time.time()
    print("建模时间:%.2f秒" % (end_time - start_time))

    return (model1, model3)
251 | 
252 | 
if __name__ == "__main__":
    newpath = "/home/code"
    os.chdir(newpath)
    # pio.orca.config.use_xvfb = True
    # pio.orca.config.executable = "/opt/conda/envs/tensorflow/bin/orca"
    pd.set_option('display.max_columns', None)

    # data_explore()

    # The real work starts here
    model1, model3 = modeling()

    # Predict through the competition environment
    env = janestreet.make_env()
    iter_test = env.iter_test()
    for (test_df, sample_prediction_df) in iter_test:
        # Default to "no action"; act only on a positive weight when both
        # models predict 1.
        y_preds = np.array([0])
        if test_df['weight'].item() > 0:
            X_test = test_df.loc[:, test_df.columns.str.contains('feature')]
            X_test = X_test.fillna(-999)
            votes = model1.predict(X_test) + model3.predict(X_test)
            if votes == 2:
                y_preds = np.array([1])
        sample_prediction_df.action = y_preds
        env.predict(sample_prediction_df)
281 |     


--------------------------------------------------------------------------------
/hello.py:
--------------------------------------------------------------------------------
 1 | # coding:utf-8
 2 | 
 3 | import numpy as np
 4 | import pandas as pd
 5 | import tools
 6 | 
 7 | 
 8 | p = 0.02
 9 | train = pd.read_csv("./small_train.csv", skiprows = lambda x: x>0 and np.random.rand() > p)
10 | train.to_csv("very_small.csv")
11 | 


--------------------------------------------------------------------------------
/hidegpu/FE.py:
--------------------------------------------------------------------------------
 1 | # coding:utf-8
 2 | # kaggle竞赛Jane Street Market Prediction
 3 | # 特征工程代码
 4 | 
 5 | 
 6 | import pandas as pd
 7 | import matplotlib.pyplot as plt
 8 | 
 9 | 
10 | """
11 | # 特征工程
12 | @change_dir
13 | def featureEngineer(data):
14 |     tages = pd.DataFrame()
15 |     tagename = feature.columns
16 |     for i in range(29):
17 |         # tagename = "tag_" + str(i)
18 |         # tages[tagename[i+1]] = feature[(feature[tagename[i+1]] == True)].iloc[:, i+1]
19 |         #print(tages[i])
20 |         temp = feature["feature"][feature[tagename[i+1]] == True]
21 |         temp.name = tagename[i+1]
22 |         print(temp)
23 |     #print(tages)
24 |     # 填充空值
25 |     print(data.isnull().sum())
26 |     for col in data.columns:
27 |         mean_val = data[col].mean()
28 |         data[col].fillna(mean_val, inplace=True)
29 |     print(data.isnull().sum())
30 |     # 处理feature_0
31 |     feature_0 = data["feature_0"].cumsum()
32 |     plt.plot(feature_0)
33 |     plt.savefig("./output/cumf_0.png")
34 |     plt.close()
35 |     data["feature_0"] = feature_0
36 |     # print(feature_0)
37 |     return data
38 | """
# Feature engineering
def featureEngineer(data):
    """Return a NaN-filled copy of ``data`` with an added ``action`` column.

    Missing values are replaced by 0.0. ``action`` is 1 where
    ``weight * resp > 0`` and 0 elsewhere. The input frame is not modified.
    """
    filled = data.fillna(0.0)
    is_profitable = (filled['weight'].values * filled['resp'].values) > 0
    filled['action'] = is_profitable.astype('int')
    return filled
47 |     
48 | 
49 |     
50 |     
if __name__ == "__main__":
    # NOTE(review): loadData is neither defined nor imported in this file, so
    # this line raises NameError as written — confirm the intended import and
    # whether it returns two values.
    train, feature = loadData()
    # feature = feature[feature == True]
    print(feature)
    # Fill missing values and add the binary "action" column
    train = featureEngineer(train)
56 |     
57 | 


--------------------------------------------------------------------------------
/hidegpu/optuna_test.py:
--------------------------------------------------------------------------------
 1 | # coding:utf-8
 2 | # kaggle Jane Street Market Prediction代码
 3 | # optuna的测试代码
 4 | 
 5 | import numpy as np
 6 | import pandas as pd
 7 | pd.set_option('display.max_columns', None)
 8 | import os
 9 | # from tools import *
10 | from FE import featureEngineer
11 | 
12 | import matplotlib.pyplot as plt
13 | from sklearn.model_selection import train_test_split, cross_val_score
14 | from sklearn import metrics
15 | from sklearn.metrics import accuracy_score
16 | import optuna
17 | from optuna.samplers import TPESampler
18 | 
19 | # XGBoost
20 | from xgboost import XGBClassifier
21 | 
22 | 
def objective(trial):
    """Toy objective: sample ``x`` uniformly in [-10, 10], return (x - 2)^2."""
    x = trial.suggest_uniform("x", -10, 10)
    distance = x - 2
    return distance**2
26 |     
27 |     
def objective2(trial, x, y):
    """Optuna objective: hold-out accuracy of an XGBoost classifier.

    The data is split 70/30 with a fixed seed, the model is trained on the
    70% part with hyper-parameters suggested by ``trial``, and its accuracy on
    the 30% part is returned.

    Args:
        trial: Optuna trial used to sample the hyper-parameters.
        x: feature matrix.
        y: labels.
    """
    train_x, test_x, train_y, test_y = train_test_split(x, y, test_size = 0.3, random_state = 101)
    param = {
        "eval_metric":trial.suggest_categorical("eval_metric", ["logloss"]),
        "tree_method":trial.suggest_categorical("tree_method", ["gpu_hist"]),
        "n_estimators" : trial.suggest_int('n_estimators', 1, 100),
        'max_depth':trial.suggest_int('max_depth', 2, 12),
        'learning_rate':trial.suggest_loguniform('learning_rate',0.001,0.5),
        "subsample":trial.suggest_loguniform("subsample", 0.5, 1.0)
    }
    model = XGBClassifier(**param)
    model.fit(train_x, train_y)

    # Score the model that was just trained. cross_val_score would clone the
    # estimator and refit it on the hold-out part only, discarding this fit.
    return model.score(test_x, test_y)
42 |     
43 |     
# Prepare the data for modelling
def preprocessing(train):
    """Return the training part of an 80/20 split of features and labels.

    Features are the columns whose name contains ``'feature'``; the label is
    the ``action`` column. The held-out 20% is discarded.
    """
    feature_mask = train.columns.str.contains('feature')
    features = train.loc[:, feature_mask]
    # labels = train.loc[:, 'resp']
    labels = train.loc[:, 'action']

    X_train, _, y_train, _ = train_test_split(features, labels, random_state=666, test_size=0.2)

    return X_train, y_train
53 | 
54 | 
if __name__ == "__main__":
#    study = optuna.create_study()
#    study.optimize(objective, n_trials = 100)
#    print("结果:", study.best_params)
#    print(study.best_value)
#    print(study.best_trial)
#    study.optimize(objective, n_trials = 100)
#    print("结果:", study.best_params)
#    print(study.best_value)
#    print(study.best_trial)
    
    
    # data_explore()
    
    # The real work starts here
    # NOTE(review): p is not used anywhere below in this script.
    p = 0.001
    train = pd.read_csv("small_train.csv")
    train = featureEngineer(train)
    # print(train.head())
    
    # Compute the model score
    # score = Score(model, train)
    # print("模型评分:%.2f" % score)
    
    # Preprocess the training data
    X_train, y_train = preprocessing(train)
    
    # XGBoost: maximise objective2 over 100 trials with the TPE sampler
    print("XGBoost")
    study = optuna.create_study(direction = "maximize", sampler = TPESampler())
    study.optimize(lambda trial:objective2(trial, X_train, y_train), n_trials = 100)
    print("结果:", study.best_params)
    print(study.best_value)
    print(study.best_trial)
89 |     
90 | 


--------------------------------------------------------------------------------
/hidegpu/tools.py:
--------------------------------------------------------------------------------
  1 | # coding:utf-8
  2 | # kaggle竞赛Jane Street Market Prediction
  3 | # 工具函数
  4 | 
  5 | from run import *
  6 | import pandas as pd
  7 | import matplotlib.pyplot as plt
  8 | import numpy as np
  9 | from sklearn.model_selection import cross_val_score, learning_curve
 10 | from sklearn.metrics import classification_report, roc_curve, auc
 11 | 
 12 | 
 13 | # 载入数据
 14 | @change_dir
 15 | def loadData(p = 0.01):
 16 |     # 抽样，读取1%数据
 17 |     # 参考https://mp.weixin.qq.com/s/2LSKnN9R-N-I2HcHePT9zA
 18 |     train = pd.read_csv("./train.csv", skiprows = lambda x: x>0 and np.random.rand() > p)
 19 |     # feature = pd.read_csv("./features.csv")
 20 |     return train
 21 |     
 22 |     
 23 | # 对模型进行交叉验证
 24 | def cross_val(model, X, Y, cv = 10):
 25 |     scores = cross_val_score(model, X, Y, cv=cv)
 26 |     score = scores.mean()
 27 |     return score
 28 |     
 29 |     
 30 | # 模型评估
 31 | def evalution(model, X, y_true):
 32 |     # X = test.loc[:, test.columns.str.contains("feature")].values
 33 |     # y_true = test.action.values
 34 |     y_pred = model.predict(X)
 35 |     target_names = ["1", "0"]
 36 |     result = classification_report(y_true, y_pred, target_names = target_names, output_dict = False )
 37 |     return result
 38 | 
 39 | 
 40 | # 对模型评分
 41 | @timethis
 42 | def score(model, test, modelName):
 43 |     if modelName == "XGBoost":
 44 |         X = test.loc[:, test.columns.str.contains("feature")]
 45 |         Y = test.action
 46 |     else:
 47 |         X = test.loc[:, test.columns.str.contains("feature")].values
 48 |         Y = test.action.values
 49 |     model_score = model.score(X, Y)
 50 |     cross_score = cross_val(model, X, Y)
 51 |     report = evalution(model, X, Y)
 52 |     print("模型评分:", model_score)
 53 |     print("交叉验证:", cross_score)
 54 |     print("模型评估:\n", report)
 55 |     Roc(model, X, Y, modelName)
 56 |     Lc(model, modelName, X, Y)
 57 |     
 58 |     
 59 | # 画roc曲线
 60 | @change_dir
 61 | def Roc(model, X, Y, modelName):
 62 |     y_label = Y
 63 |     y_pred = model.predict(X)
 64 |     fpr, tpr, thersholds = roc_curve(y_label, y_pred)
 65 |         
 66 |     roc_auc = auc(fpr, tpr)
 67 |     
 68 |     plt.plot(fpr, tpr, 'k--', label = "ROC (area = {0:.2f})".format(roc_auc), lw = 2)
 69 |     plt.tick_params(axis='x', labelsize=15)
 70 |     plt.tick_params(axis='y', labelsize=15)
 71 |     plt.xlim([-0.05, 1.05])
 72 |     plt.ylim([-0.05, 1.05])
 73 |     plt.xlabel("False Positive Rate")
 74 |     plt.ylabel("True Positive Rate")
 75 |     plt.title(modelName + " ROC Curve")
 76 |     plt.legend(loc = "best")
 77 |     plt.savefig("./output/" + modelName + "_ROC.png")
 78 |     
 79 |     
 80 | # 画学习曲线
 81 | @change_dir
 82 | def Lc(model, modelName, X, y, ylim = None, cv = None, n_jobs = 1, train_sizes = np.linspace(0.1, 1.0, 5), verbose = 0):
 83 |     plt.figure()
 84 |     plt.title(modelName+" Learning Curve")
 85 |     if ylim is not None:
 86 |         plt.ylim(*ylim)
 87 |     plt.xlabel("Training Samples")
 88 |     plt.ylabel("Score")
 89 |     train_sizes, train_scores, test_scores = learning_curve(model, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)
 90 |     train_scores_mean = np.mean(train_scores, axis = 1)
 91 |     train_scores_std = np.std(train_scores, axis = 1)
 92 |     test_scores_mean = np.mean(test_scores, axis = 1)
 93 |     test_scores_std = np.std(test_scores, axis = 1)
 94 |     plt.grid()
 95 |     
 96 |     plt.fill_between(train_sizes, train_scores_mean - train_scores_std,train_scores_mean + train_scores_std, alpha=0.1, color="r")
 97 |     plt.fill_between(train_sizes,test_scores_mean - test_scores_std, test_scores_mean + test_scores_std, alpha=0.1, color="g")
 98 |     plt.plot(train_sizes, train_scores_mean, 'o-', color="r", label="Training score")
 99 |     plt.plot(train_sizes, test_scores_mean, 'o-', color="g", label="Cross-validation score")
100 |     
101 |     plt.legend(loc="best")
102 |     plt.savefig("./output/" + modelName + "_Learning Curve.png")
103 |     
104 |     
105 | # 工具函数，返回神经网络训练的每一步
def make_train_step(model, loss_fn, optimizer):
    """Build and return a function that performs one optimisation step.

    The returned ``train_step(x, y)`` switches the model to training mode,
    runs the forward pass, drops the trailing dimension of the prediction,
    back-propagates the loss, updates the parameters, clears the gradients
    and returns the loss as a Python float.
    """
    def train_step(x, y):
        model.train()
        # Squeeze the last axis so the prediction lines up with y.
        prediction = model(x).squeeze(-1)
        batch_loss = loss_fn(prediction, y)
        batch_loss.backward()
        # Apply the update, then reset gradients for the next call.
        optimizer.step()
        optimizer.zero_grad()
        return batch_loss.item()

    return train_step
128 |         
129 | 
130 |     


--------------------------------------------------------------------------------
/janestreet/__init__.py:
--------------------------------------------------------------------------------
1 | 
2 | from .competition import make_env
3 | 
4 | __all__ = ['make_env']
5 | 


--------------------------------------------------------------------------------
/janestreet/__pycache__/__init__.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zwdnet/JSMPwork/b7ae1380986abc3ee5b10bea7f9d57cbb805a8f0/janestreet/__pycache__/__init__.cpython-37.pyc


--------------------------------------------------------------------------------
/janestreet/__pycache__/__init__.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zwdnet/JSMPwork/b7ae1380986abc3ee5b10bea7f9d57cbb805a8f0/janestreet/__pycache__/__init__.cpython-38.pyc


--------------------------------------------------------------------------------
/janestreet/competition.cpython-37m-x86_64-linux-gnu.so:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zwdnet/JSMPwork/b7ae1380986abc3ee5b10bea7f9d57cbb805a8f0/janestreet/competition.cpython-37m-x86_64-linux-gnu.so


--------------------------------------------------------------------------------
/jsmp.py:
--------------------------------------------------------------------------------
  1 | # coding:utf-8
  2 | # kaggle Jane Street Market Prediction代码
  3 | # 服务器版本
  4 | 
  5 | 
  6 | import pandas as pd
  7 | import matplotlib.pyplot as plt
  8 | from sklearn.linear_model import LinearRegression
  9 | from sklearn.model_selection import train_test_split
 10 | from sklearn import metrics
 11 | import numpy as np
 12 | from sklearn.externals import joblib
 13 | import pickle
 14 | from run import *
 15 | import socket
 16 | import sys
 17 | from sklearn.impute import SimpleImputer, MissingIndicator
 18 | from sklearn.pipeline import FeatureUnion
 19 | import janestreet
 20 | 
 21 | 
 22 | # 获取转换器
 23 | def getTransformer(X, y):
 24 |     transformer = FeatureUnion(
 25 |             transformer_list = [
 26 |                     ("features", SimpleImputer(strategy = "mean")),
 27 |                     ("indicators", MissingIndicator())
 28 |             ]
 29 |     )
 30 |     transformer = transformer.fit(X, y)
 31 |     return transformer
 32 | 
 33 | 
 34 | # 特征工程
 35 | def fp(data):
 36 |     print(data.info(verbose = True, null_counts = True))
 37 |     ds = data.describe()
 38 |     data = data[data["weight"] != 0]
 39 |     data["action"] =((data["weight"].values * data["resp"].values) > 0).astype("int")
 40 |     # 查看缺失值
 41 |     print(data.isnull().sum())
 42 |     # 复制数据，进行操作
 43 |     newdata = data.copy()
 44 |     # 特征列名称
 45 |     features = [c for c in newdata.columns if 'feature' in c] + ["date"]
 46 |     # 处理缺失值
 47 |     
 48 |     X = newdata.loc[:, features]
 49 |     y = newdata.loc[:, "action"]
 50 |     transformer = getTransformer(X, y)
 51 | #    X = transformer.transform(X)
 52 |     X = X.fillna(-999)
 53 |     print("特征工程结束")
 54 |     return (X, y, features, transformer)
 55 |     
 56 |     
 57 | # 形成提交文件
 58 | def makeSubmittion(model, features, transformer):
 59 |     print("正在生成提交文件")
 60 |     env = janestreet.make_env()
 61 |     iter_test = env.iter_test()
 62 |     
 63 |     for (test_df, pred_df) in iter_test:
 64 |         X_test = test_df.loc[:, features]
 65 |         #X_test = transformer.transform(X_test)
 66 |         X_test = X_test.fillna(-999)
 67 |         preds = model.predict(X_test)
 68 |         action = ((test_df['weight'].values * preds) > 0).astype('int')
 69 |         pred_df.action = action
 70 |         env.predict(pred_df)
 71 |     
 72 |     
 73 | # 线性回归模型
 74 | def LR(X, y):
 75 |     train_set, test_set, train_action, test_action = train_test_split(X, y, test_size = 0.2)
 76 |     print(len(train_set))
 77 |     # 训练
 78 |     linreg = LinearRegression()
 79 |     linreg.fit(train_set, train_action)
 80 |     # 预测
 81 |     train_pred = linreg.predict(train_set)
 82 |     test_pred = linreg.predict(test_set)
 83 |     # 模型评估
 84 |     print("train MSE:", metrics.mean_squared_error(train_action, train_pred))
 85 |     print("test MSE:", metrics.mean_squared_error(test_action, test_pred))
 86 |     print("train RMSE:", np.sqrt(metrics.mean_squared_error(train_action, train_pred)))
 87 |     print("test RMSE:", np.sqrt(metrics.mean_squared_error(test_action, test_pred)))
 88 |     # 保存模型到文件
 89 |     # joblib.dump(linreg, "LinesRegress.pkl")
 90 |     with open("/home/code/output/LinesRegress.pkl", "wb") as fw:
 91 |         pickle.dump(linreg, fw)
 92 |     print(test_pred)
 93 |     fig = plt.figure()
 94 |     plt.plot(test_pred[:100], "b.")
 95 |     plt.plot(test_action[:100], "rx")
 96 |     plt.savefig("/home/code/output/LR_result.png")
 97 |     return linreg
 98 |     
 99 |     
# Script entry point: load the small training sample, build features, fit the
# linear model and generate the submission.
if __name__ == "__main__":
    # NOTE(review): `os` is not imported directly in this file; presumably it
    # is re-exported by `from run import *` -- confirm.
    print(os.getcwd())
    data = pd.read_csv("/home/code/small_train.csv", index_col = 0)
    X, y, features, transformer = fp(data)
    model = LR(X, y)
    makeSubmittion(model, features, transformer)
106 |     


--------------------------------------------------------------------------------
/jsmp_local.py:
--------------------------------------------------------------------------------
 1 | # coding:utf-8
 2 | # kaggle Jane Street Market Prediction代码
 3 | # 本地运行版本
 4 | 
 5 | 
 6 | import pandas as pd
 7 | import matplotlib.pyplot as plt
 8 | from sklearn.linear_model import LinearRegression
 9 | from sklearn.model_selection import train_test_split
10 | from sklearn import metrics
11 | import numpy as np
12 | from sklearn.externals import joblib
13 | import pickle
14 | from run import *
15 | import socket
16 | import sys
17 | 
18 | 
19 | # 特征工程
def fp(data):
    """Feature engineering for the local run.

    Prints diagnostics, replaces missing feature values with 10.0, saves a
    histogram of weights below 50 to ``./output/weight_hist.png`` and adds an
    ``action`` column that is 1 where ``weight > 0.549``.

    Args:
        data: DataFrame with ``weight`` and feature columns.

    Returns:
        A copy of ``data`` with filled features and the ``action`` column.
    """
    # info() prints its own report and returns None, so it is not wrapped in
    # print() any more.
    data.info(verbose = True, null_counts = True)
    # Report missing values per column.
    print(data.isnull().sum())

    # Work on a copy so the caller's frame is left untouched.
    newdata = data.copy()
    # Feature column names.
    features = [c for c in newdata.columns if 'feature' in c]
    x_tt = newdata.loc[:, features].values

    # Fill missing values: NaN -> 0 (nan_to_num) + 10.0 (the isnan mask).
    if np.isnan(x_tt.sum()):
        x_tt = np.nan_to_num(x_tt) + np.isnan(x_tt)*10.0
        # Write back by position. The previous DataFrame.update() call built
        # a frame with a fresh 0..n-1 index and aligned it on labels, so the
        # filled values were lost whenever the frame's own index differed.
        newdata[features] = x_tt
    print(newdata.head())

    # Build the action target for the training set.
    print(data.weight.describe())
    p = data[data["weight"] < 50].weight.hist().get_figure()
    p.savefig("./output/weight_hist.png")
    # NOTE(review): the 0.549 threshold is applied to weight alone; its
    # origin is not visible here -- confirm.
    newdata["action"] = ((newdata["weight"].values) > 0.549).astype("int")
    print(newdata.action)
    print("特征工程结束")
    return newdata
44 |     
45 |     
46 | # 线性回归模型
def LR(data):
    """Fit a linear regression on columns ``feature_0``..``feature_129``
    against ``data.action`` using an 80/20 split.

    Prints MSE/RMSE for both splits, pickles the model to
    ``./output/LinesRegress.pkl`` and saves a histogram of the test
    predictions to ``./output/LR_result.png``.
    """
    feature_frame = data.loc[:, "feature_0":"feature_129"]
    train_set, test_set, train_action, test_action = train_test_split(feature_frame, data.action, test_size = 0.2)
    print(len(train_set))

    # Train.
    linreg = LinearRegression()
    linreg.fit(train_set, train_action)

    # Predict on both splits.
    train_pred = linreg.predict(train_set)
    test_pred = linreg.predict(test_set)

    # Evaluate.
    train_mse = metrics.mean_squared_error(train_action, train_pred)
    test_mse = metrics.mean_squared_error(test_action, test_pred)
    print("train MSE:", train_mse)
    print("test MSE:", test_mse)
    print("train RMSE:", np.sqrt(train_mse))
    print("test RMSE:", np.sqrt(test_mse))

    # Persist the model.
    with open("./output/LinesRegress.pkl", "wb") as model_file:
        pickle.dump(linreg, model_file)

    print(test_pred)
    plt.figure()
    plt.hist(test_pred)
    plt.savefig("./output/LR_result.png")
69 |     
70 |     
# Script entry point: load the small training sample, run feature engineering
# and train the linear model.
if __name__ == "__main__":
    # NOTE(review): `os` is not imported directly in this file; presumably it
    # is re-exported by `from run import *` -- confirm.
    print(os.getcwd())
    data = pd.read_csv("small_train.csv", index_col = 0)
    newdata = fp(data)
    print(newdata.info(verbose = True, null_counts = True))
    print(newdata.date)
    # Train with the multiple linear regression model.
    LR(newdata)
79 |     


--------------------------------------------------------------------------------
/myxgboost.py:
--------------------------------------------------------------------------------
  1 | # coding:utf-8
  2 | # kaggle Jane Street Market Prediction代码
  3 | # 实际自己工作的代码
  4 | 
  5 | 
  6 | import numpy as np
  7 | import pandas as pd
  8 | pd.set_option('display.max_columns', None)
  9 | import janestreet
 10 | 
 11 | import matplotlib.pyplot as plt
 12 | from sklearn.model_selection import train_test_split
 13 | from sklearn import metrics
 14 | from sklearn.metrics import accuracy_score
 15 | import optuna
 16 | # 逻辑回归
 17 | from sklearn.linear_model import LinearRegression, LogisticRegression
 18 | # 支持向量机
 19 | from sklearn.svm import SVC, LinearSVC
 20 | # 随机森林
 21 | from sklearn.ensemble import RandomForestClassifier
 22 | # KNN算法
 23 | from sklearn.neighbors import KNeighborsClassifier
 24 | # 朴素贝叶斯算法
 25 | from sklearn.naive_bayes import GaussianNB
 26 | # SGD算法
 27 | from sklearn.linear_model import SGDClassifier
 28 | # 决策树算法
 29 | from sklearn.tree import DecisionTreeClassifier
 30 | # XGBoost
 31 | from xgboost import XGBClassifier
 32 | from xgboost import plot_importance
 33 | 
 34 | import os
 35 | 
 36 | from EDA import data_explore
 37 | from FE import featureEngineer
 38 | from tools import *
 39 | from run import *
 40 | 
 41 |     
 42 |     
 43 | # 建模前处理数据
 44 | def preprocessing(train):
 45 |     X_train = train.loc[:, train.columns.str.contains('feature')]
 46 |     # y_train = train.loc[:, 'resp']
 47 |     y_train = train.loc[:, 'action']
 48 |     
 49 |     X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, random_state=666, test_size=0.2)
 50 |     
 51 |     return X_train, y_train
 52 | 
 53 |     
 54 | # 评分函数
 55 | def Score(model, data):
 56 |     # test_df = pd.read_csv("/kaggle/input/jane-street-market-prediction/train.csv")
 57 |     data = data.fillna(-999)
 58 |     X_test = data.loc[:, data.columns.str.contains('feature')]
 59 |     resp = model.predict(X_test)
 60 |     date = data["date"].values
 61 |     weight = data["weight"].values
 62 |     action = (resp > 0).astype("int")
 63 |     
 64 |     count_i = len(np.unique(date))
 65 |     Pi = np.zeros(count_i)
 66 |     # 用循环太慢
 67 |     #for i, day in enumerate(np.unique(date)):
 68 | #        Pi[i] = np.sum(weight[date == day] * resp[date == day] * action[date == day])
 69 |     # 用下面这行代替
 70 |     Pi = np.bincount(date, weight * resp * action)
 71 |     t = np.sum(Pi) / np.sqrt(np.sum(Pi ** 2)) * np.sqrt(250 / count_i)
 72 |     u = np.clip(t, 0, 6) * np.sum(Pi)
 73 |     return u
 74 |     
 75 |     
 76 | # 进行预测，生成提交文件，求值版
 77 | def predict_value(model):
 78 |     env = janestreet.make_env()
 79 |     iter_test = env.iter_test()
 80 |     for (test_df, sample_prediction_df) in iter_test:
 81 |         if test_df['weight'].item() > 0:
 82 |             test_df = featureEngineer(test_df)
 83 |             X_test = test_df.loc[:, test_df.columns.str.contains('feature')]
 84 |             # X_test = X_test.fillna(-999)
 85 |             y_resp = model.predict(X_test)[0]
 86 |             y_preds = 0 if y_resp < 0 else 1
 87 |         else:
 88 |             y_preds = 0
 89 |         # print(y_preds)
 90 |         sample_prediction_df.action = y_preds
 91 |         env.predict(sample_prediction_df)
 92 |         
 93 |         
 94 | # 进行预测，生成提交文件，分类版
 95 | def predict_clf(model):
 96 |     env = janestreet.make_env()
 97 |     iter_test = env.iter_test()
 98 |     for (test_df, sample_prediction_df) in iter_test:
 99 |         if test_df['weight'].item() > 0:
100 |             test_df = featureEngineer(test_df)
101 |             X_test = test_df.loc[:, test_df.columns.str.contains('feature')]
102 |             X_test = X_test.fillna(0.0)
103 |             y_preds = model.predict(X_test)[0]
104 |         else:
105 |             y_preds = 0
106 |         # print(y_preds)
107 |         sample_prediction_df.action = y_preds
108 |         env.predict(sample_prediction_df)
109 |         
110 |         
111 | from sklearn.model_selection import GridSearchCV
112 | from sklearn.model_selection import StratifiedKFold        
113 | # 调参
@change_dir
@timethis
def tc(X, Y, param_grid, param_name):
    """Grid-search an XGBClassifier over ``param_grid``.

    Uses 10-fold stratified cross-validation scored by negative log loss,
    prints the best and per-candidate results, and saves a plot of the mean
    test scores to ``./output/<param_name>.png``.
    """
    estimator = XGBClassifier(use_label_encoder=False, eval_metric = "logloss")
    folds = StratifiedKFold(n_splits=10, shuffle=True, random_state=7)
    search = GridSearchCV(estimator, param_grid, scoring="neg_log_loss", n_jobs=-1, cv=folds)
    fitted = search.fit(X, Y)
    print("Best: %f using %s" % (fitted.best_score_, fitted.best_params_))

    # Report the score of every parameter combination.
    results = fitted.cv_results_
    mean_scores = results['mean_test_score']
    for mean, stdev, param in zip(mean_scores, results['std_test_score'], results['params']):
        print("%f (%f) with: %r" % (mean, stdev, param))

    plt.plot(list(mean_scores))
    plt.savefig("./output/"+param_name+".png")
132 |         
133 | 
# Script entry point: sample the data, engineer features and grid-search
# XGBoost hyper-parameters.
if __name__ == "__main__":
    # All relative paths used below are resolved against this directory.
    newpath = "/home/code"
    os.chdir(newpath)
    
    # data_explore()
    
    # The real work starts here.
    # Fraction of rows sampled by loadData.
    p = 0.0001
    train = loadData(p = p)
    train = featureEngineer(train)
    # print(train.head())
    
    # Compute the model score.
    # score = Score(model, train)
    # print("模型评分:%.2f" % score)
    # A second, independent random sample; it is only referenced by the
    # disabled code inside the string literal below.
    test = loadData(p = p)
    test = featureEngineer(test)
    
    # Pre-process the training data.
    X_train, y_train = preprocessing(train)
    
    # xgboost
    print("XGBoost")
    max_depth = [3, 4, 5]
    subsample = [0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
    param_grid = dict(learning_rate = [0.1], max_depth = max_depth, subsample = subsample)
    tc(X_train, y_train, param_grid, "subsample")
    # The string literal below holds disabled code (fit, score, feature
    # importance plot); it is evaluated as an expression and discarded.
    """
    model = XGBClassifier()
    eval_set = [(X_train, y_train)]
    model.fit(X_train, y_train, early_stopping_rounds = 10, eval_metric = "logloss", eval_set = eval_set, verbose = True)
    # X_test, y_test = preprocessing(test)
    # y_pred = model.predict(X_test)
    # print(y_pred[:10])
    # predictions = [round(value) for value in y_pred]
    # print(predictions[:10])
    score(model, test, "XGBoost")
    fig, ax = plt.subplots(nrows = 1, ncols = 1, figsize = (10, 200))
    plot_importance(model, ax = ax)
    plt.savefig("./output/feature_importance.png")
    plt.close()
    """

    # Run the prediction.
    # predict_clf(model)
179 |     


--------------------------------------------------------------------------------
/nn.py:
--------------------------------------------------------------------------------
  1 | # coding:utf-8
  2 | # kaggle Jane Street Market Prediction代码
  3 | # 神经网络及深度学习练习
  4 | 
  5 | 
  6 | # 先用手撸
  7 | # 参考 https://b23.tv/srXty3
  8 | from numpy import array, exp, random, dot
  9 | 
 10 | 
 11 | # 正向传播
 12 | def fp(X, weights):
 13 |     z = dot(X, weights)
 14 |     return 1/(1+exp(-z))
 15 |     
 16 |     
 17 | # 反向传播
 18 | def bp(y, output):
 19 |     error = y - output
 20 |     return error * output * (1-output)
 21 | 
 22 | 
 23 | # 手撸单层神经网络
 24 | def nn():
 25 |     # X = array([[0,0,1], [1,1,1], [1,0,1], [0,1,1]])
 26 |     # y = array([[0,1,1,0]]).T
 27 |     X = array([[0,0,1], [0,1,1], [1,0,1], [1,1,1]])
 28 |     y = array([[0,1,1,0]]).T
 29 |     random.seed(1)
 30 |     weights = 2*random.random((3,1)) - 1
 31 |     for it in range(10000):
 32 |         output = fp(X, weights)
 33 |         delta = bp(y, output)
 34 |         weights += dot(X.T, delta)
 35 |     print(weights)
 36 |     print(fp([0, 0, 1], weights))
 37 |     
 38 |     
 39 | # 多层正向传播
 40 | def mfp(X, w0, w1):
 41 |     l1 = 1/(1+exp(-dot(X, w0)))
 42 |     l2 = 1/(1+exp(-dot(l1, w1)))
 43 |     return l1, l2
 44 |     
 45 |     
 46 | # 反向传播
 47 | def mbp(l1, l2, y, w1):
 48 |     error = y - l2
 49 |     slope = l2 * (1-l2)
 50 |     l1_delta = error*slope
 51 |     
 52 |     l0_error = l1_delta.dot(w1.T)
 53 |     l0_slope = l1 * (1-l1)
 54 |     l0_delta = l0_error*l0_slope
 55 |     return l0_delta, l1_delta
 56 |     
 57 |     
 58 | # 手撸多层神经网络
 59 | def mnn():
 60 |     # X = array([[0,0,1], [1,1,1], [1,0,1], [0,1,1]])
 61 |     # y = array([[0,1,1,0]]).T
 62 |     X = array([[0,0,1], [0,1,1], [1,0,1], [1,1,1]])
 63 |     y = array([[0,1,1,0]]).T
 64 |     random.seed(1)
 65 |     # weights = 2*random.random((3,1)) - 1
 66 |     w0 = 2*random.random((3, 4)) - 1
 67 |     w1 = 2*random.random((4, 1)) - 1
 68 |     for it in range(10000):
 69 |         l0 = X
 70 |         l1, l2 = mfp(X, w0, w1)
 71 |         l0_delta, l1_delta = mbp(l1, l2, y, w1)
 72 |         w1 += dot(l1.T, l1_delta)
 73 |         w0 += dot(l0.T, l0_delta)
 74 |     # print(weights)
 75 |     print(mfp([0, 0, 0], w0, w1)[1])
 76 |     
 77 |     
 78 | # 再尝试pytorch
 79 | import torch
 80 | import torch.nn as nn
 81 | import torch.nn.functional as F
 82 | 
 83 | 
def testTorch():
    """Walk through basic PyTorch tensor operations and autograd, printing
    every intermediate result. Takes no arguments and returns nothing."""
    # Tensor operations
    print("张量操作")
    # Different ways of constructing a 5x3 tensor.
    x = torch.empty(5, 3)
    print(x)
    x = torch.rand(5, 3)
    print(x)
    x = torch.zeros(5, 3, dtype = torch.long)
    print(x)
    # Build a tensor directly from data.
    x = torch.tensor([5.5, 3])
    print(x)
    # New tensors derived from an existing one.
    x = x.new_ones(5, 3, dtype = torch.double)
    print(x)
    x = torch.randn_like(x, dtype = torch.float)
    print(x)
    print(x.size())
    # Three spellings of addition: operator, function, function with `out`.
    y = torch.rand(5, 3)
    print(x+y)
    print(torch.add(x, y))
    result = torch.empty(5, 3)
    torch.add(x, y, out = result)
    print(result)
    # In-place addition: y is modified.
    y.add_(x)
    print(y)
    # Column slicing.
    print(x[:, 1])
    # Reshaping with view; -1 lets the size of that dimension be inferred.
    x = torch.randn(4, 4)
    y = x.view(16)
    z = x.view(-1, 8)
    print(x.size(), y.size(), z.size())
    # item() extracts the value of a one-element tensor as a Python number.
    x = torch.randn(1)
    print(x)
    print(x.item())
    # Automatic differentiation
    print("自动微分")
    x = torch.ones(2, 2, requires_grad = True)
    print(x)
    y = x+2
    print(y)
    print(y.grad_fn)
    z = y*y*3
    out = z.mean()
    print(z, out)
    # requires_grad can be switched on in place after creation.
    a = torch.randn(2, 2)
    a = ((a*3) / (a-1))
    print(a.requires_grad)
    a.requires_grad_(True)
    print(a.requires_grad)
    b = (a*a).sum()
    print(b.grad_fn)
    # Back-propagate from the scalar `out` and read d(out)/dx.
    out.backward()
    print(x.grad)
    x = torch.randn(3, requires_grad = True)
    y = x*2
    # Keep doubling until the norm reaches 1000.
    while y.data.norm() < 1000:
        y = y*2
    print(y)
    # y is not a scalar here, so backward() is given a vector of the same
    # length as y.
    v = torch.tensor([0.1, 1.0, 0.0001], dtype = torch.float)
    y.backward(v)
    print(x.grad)
    print(x.requires_grad)
    print((x**2).requires_grad)
    
    # The same expression evaluated inside a no_grad block.
    with torch.no_grad():
        print((x**2).requires_grad)
148 |     
149 |     
# Script entry point: run the hand-written networks, then the PyTorch tour.
if __name__ == "__main__":
    # NOTE(review): `import torch.nn as nn` further up in this module runs
    # after `def nn()` and rebinds the name `nn`, so this call presumably
    # targets the torch module instead of the function -- verify and rename
    # one of the two.
    nn()
    mnn()
    testTorch()
154 | 


--------------------------------------------------------------------------------
/optuna_DP.py:
--------------------------------------------------------------------------------
  1 | # coding:utf-8
  2 | # kaggle Jane Street Market Prediction代码
  3 | # 实际自己工作的代码
  4 | # 用optuna对深度学习模型调参
  5 | 
  6 | 
  7 | import numpy as np
  8 | import pandas as pd
  9 | pd.set_option('display.max_columns', None)
 10 | import janestreet
 11 | 
 12 | import matplotlib.pyplot as plt
 13 | from sklearn.model_selection import train_test_split
 14 | from sklearn import metrics
 15 | from sklearn.metrics import accuracy_score
 16 | import torch
 17 | import torch.nn as nn
 18 | import torch.optim as optim
 19 | import optuna
 20 | 
 21 | import os
 22 | 
 23 | from FE import featureEngineer
 24 | from tools import *
 25 | 
 26 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
 27 |     
 28 |     
 29 | # 建模前处理数据
 30 | def preprocessing(train):
 31 |     X = train.loc[:, train.columns.str.contains('feature')]
 32 |     # y_train = train.loc[:, 'resp']
 33 |     Y = train.loc[:, 'action']
 34 |     
 35 |     x_train, x_test, y_train, y_test = train_test_split(X, Y, random_state=666, test_size=0.2)
 36 |     
 37 |     return x_train, x_test, y_train, y_test 
 38 | 
 39 |     
 40 | # 评分函数
 41 | def Score(model, data):
 42 |     # test_df = pd.read_csv("/kaggle/input/jane-street-market-prediction/train.csv")
 43 |     data = data.fillna(-999)
 44 |     X_test = data.loc[:, data.columns.str.contains('feature')]
 45 |     resp = model.predict(X_test)
 46 |     date = data["date"].values
 47 |     weight = data["weight"].values
 48 |     action = (resp > 0).astype("int")
 49 |     
 50 |     count_i = len(np.unique(date))
 51 |     Pi = np.zeros(count_i)
 52 |     # 用循环太慢
 53 |     #for i, day in enumerate(np.unique(date)):
 54 | #        Pi[i] = np.sum(weight[date == day] * resp[date == day] * action[date == day])
 55 |     # 用下面这行代替
 56 |     Pi = np.bincount(date, weight * resp * action)
 57 |     t = np.sum(Pi) / np.sqrt(np.sum(Pi ** 2)) * np.sqrt(250 / count_i)
 58 |     u = np.clip(t, 0, 6) * np.sum(Pi)
 59 |     return u
 60 |     
 61 | 
 62 | # 进行预测，生成提交文件，分类版
 63 | def predict_clf(model):
 64 |     env = janestreet.make_env()
 65 |     iter_test = env.iter_test()
 66 |     for (test_df, sample_prediction_df) in iter_test:
 67 |         if test_df['weight'].item() > 0:
 68 |             test_df = featureEngineer(test_df)
 69 |             X_test = test_df.loc[:, test_df.columns.str.contains('feature')]
 70 |             X_test = X_test.fillna(0.0)
 71 |             y_preds = model.predict(X_test)[0]
 72 |         else:
 73 |             y_preds = 0
 74 |         # print(y_preds)
 75 |         sample_prediction_df.action = y_preds
 76 |         env.predict(sample_prediction_df)
 77 |         
 78 |         
 79 | # 进行预测，生成提交文件，神经网络模型版
 80 | def predict_nn(model):
 81 |     env = janestreet.make_env()
 82 |     iter_test = env.iter_test()
 83 |     for (test_df, sample_prediction_df) in iter_test:
 84 |         if test_df['weight'].item() > 0:
 85 |             # test_df = featureEngineer(test_df)
 86 |             X_test = test_df.loc[:, test_df.columns.str.contains('feature')]
 87 |             X_test = X_test.fillna(0.0)
 88 |             X_test_tensor = torch.from_numpy(X_test.values).float().to(device)
 89 |             pred = model(X_test_tensor).detach().cpu().numpy()
 90 |             if pred >= 0.5:
 91 |                 y_preds = 1
 92 |             else:
 93 |                 y_preds = 0
 94 |         else:
 95 |             y_preds = 0
 96 |         # print(y_preds)
 97 |         sample_prediction_df.action = y_preds
 98 |         env.predict(sample_prediction_df)
 99 |         
100 |         
101 | # 获取数据
def getData():
    """Load a 10% sample, engineer features and split it.

    Returns ``(x_train, y_train, x_test, y_test)`` -- note the order differs
    from the tuple returned by ``preprocessing``.
    """
    sample = featureEngineer(loadData(p = 0.1))

    # Pre-process the training data.
    x_train, x_test, y_train, y_test = preprocessing(sample)
    return x_train, y_train, x_test, y_test
112 |     
113 |     
114 | # 获取模型准确率
def getAccuracyRate(Model):
    """Return the accuracy of ``Model`` on the module-level test tensors.

    An output of at least 0.5 counts as class 1, anything else as class 0;
    the result is the fraction of rows in ``y_test_tensor`` that match.

    Args:
        Model: network whose output for ``x_test_tensor`` has one value per
            row (trailing dimension of size 1).

    Returns:
        Fraction of correct predictions as a float.
    """
    # Inference only: no need to record a graph for the whole test set.
    with torch.no_grad():
        scores = Model(x_test_tensor).squeeze(-1)
    predicted = (scores >= 0.5).cpu().numpy().astype(int)
    # Move to host memory first; calling .numpy() on a tensor that lives on
    # the GPU raises, and this module places its tensors on `device`.
    expected = y_test_tensor.cpu().numpy()
    return int((predicted == expected).sum()) / len(expected)
131 |     
132 |     
133 | # 定义模型
def define_model(trial):
    """Build a 130 -> hide1 -> hide2 -> 1 fully connected network.

    The two hidden widths are sampled from ``trial`` ("hide1_dim" in
    [100, 200], "hide2_dim" in [10, 200]); activations are ReLU then
    Sigmoid, and the output layer is linear.
    """
    input_dim, output_dim = 130, 1
    hide1_dim = trial.suggest_int("hide1_dim", 100, 200)
    hide2_dim = trial.suggest_int("hide2_dim", 10, 200)
    layers = [
        nn.Linear(input_dim, hide1_dim),
        nn.ReLU(),
        nn.Linear(hide1_dim, hide2_dim),
        nn.Sigmoid(),
        nn.Linear(hide2_dim, output_dim),
    ]
    return nn.Sequential(*layers)
147 |     
148 |     
149 | # 加载数据，为避免反复读取和数据一致，用全局变量
# Load the data once at import time and keep it in module-level variables so
# every trial sees the same sample and the file is not re-read.
x_train, y_train, x_test, y_test = getData()
# Float tensors on `device`; read by objective() and getAccuracyRate().
x_tensor = torch.from_numpy(x_train.values).float().to(device)
y_tensor = torch.from_numpy(y_train.values).float().to(device)
x_test_tensor = torch.from_numpy(x_test.values).float().to(device)
y_test_tensor = torch.from_numpy(y_test.values).float().to(device)
155 |     
156 |     
157 | # 优化目标函数
@timethis
def objective(trial):
    """Optuna objective: train one sampled network and return its accuracy.

    Samples the architecture, the optimizer (Adam/RMSprop/SGD), a
    log-uniform learning rate in [1e-5, 1e-1] and the number of epochs in
    [50, 200]; trains with mean-reduced MSE loss on the module-level
    training tensors, one full-batch step per epoch.
    """
    net = define_model(trial).to(device)
    optimizer_name = trial.suggest_categorical("optimizer", ["Adam", "RMSprop", "SGD"])
    lr = trial.suggest_loguniform("lr", 1e-5, 1e-1)
    optimizer_cls = getattr(optim, optimizer_name)
    optimizer = optimizer_cls(net.parameters(), lr=lr)
    n_epochs = trial.suggest_int("epochs", 50, 200)
    loss_fn = nn.MSELoss(reduction = "mean")

    # Build the per-step trainer.
    train_step = make_train_step(net, loss_fn, optimizer)

    # Train.
    for _ in range(n_epochs):
        train_step(x_tensor, y_tensor)

    return getAccuracyRate(net)
179 |     
180 | 
# Script entry point: tune the network with optuna and print the best trial.
if __name__ == "__main__":
    # Relative paths used below are resolved against this directory.
    newpath = "/home/code"
    os.chdir(newpath)
    
    # Hyper-parameter search with optuna; the objective returns accuracy,
    # hence direction="maximize".
    study = optuna.create_study(direction="maximize")
    study.optimize(objective, n_trials=10)
    
    print("结果:", study.best_params)
    print(study.best_value)
    print(study.best_trial)
    
    # Run the prediction.
    # predict_clf(model)
195 |     


--------------------------------------------------------------------------------
/optuna_test.py:
--------------------------------------------------------------------------------
 1 | # coding:utf-8
 2 | # kaggle Jane Street Market Prediction代码
 3 | # optuna的测试代码
 4 | 
 5 | import numpy as np
 6 | import pandas as pd
 7 | pd.set_option('display.max_columns', None)
 8 | import os
 9 | from tools import *
10 | from FE import featureEngineer
11 | 
12 | import matplotlib.pyplot as plt
13 | from sklearn.model_selection import train_test_split, cross_val_score
14 | from sklearn import metrics
15 | from sklearn.metrics import accuracy_score
16 | import optuna
17 | from optuna.samplers import TPESampler
18 | 
19 | # XGBoost
20 | from xgboost import XGBClassifier
21 | 
22 | 
def objective(trial):
    """Toy optuna objective: ``(x - 2) ** 2`` with ``x`` sampled uniformly
    from [-10, 10]."""
    offset = trial.suggest_uniform("x", -10, 10) - 2
    return offset**2
26 |     
27 |     
def objective2(trial, x, y):
    """Optuna objective: hold-out accuracy of an XGBClassifier.

    Splits ``(x, y)`` 70/30 with a fixed seed, samples ``n_estimators``,
    ``max_depth`` and ``learning_rate`` from ``trial``, fits on the training
    part and scores on the held-out part.

    Args:
        trial: optuna trial used to sample hyper-parameters.
        x: feature frame.
        y: target labels.

    Returns:
        Mean accuracy of the fitted model on the held-out 30%.
    """
    train_x, test_x, train_y, test_y = train_test_split(x, y, test_size = 0.3, random_state = 101)
    param = {
        # At least one tree: an ensemble with zero estimators learns nothing.
        "n_estimators" : trial.suggest_int('n_estimators', 1, 1000),
        'max_depth':trial.suggest_int('max_depth', 2, 25),
        'learning_rate':trial.suggest_loguniform('learning_rate',0.005,0.5)
    }
    model = XGBClassifier(**param)
    model.fit(train_x, train_y)

    # Score the model that was just fitted. The previous
    # cross_val_score(model, test_x, test_y) cloned and re-fitted the
    # estimator on folds of the test split only, discarding this fit.
    return model.score(test_x, test_y)
39 |     
40 |     
41 | # 建模前处理数据
def preprocessing(train):
    """Return the training part ``(X_train, y_train)`` of an 80/20 split.

    Features are the columns whose name contains "feature"; the target is
    the ``action`` column. The split uses ``random_state=666`` and the
    held-out part is discarded.
    """
    feature_mask = train.columns.str.contains('feature')
    features = train.loc[:, feature_mask]
    labels = train.loc[:, 'action']

    # train_test_split yields [X_train, X_test, y_train, y_test].
    parts = train_test_split(features, labels, random_state=666, test_size=0.2)
    return parts[0], parts[2]
50 | 
51 | 
# Script entry point: sample the data and tune XGBoost with optuna.
if __name__ == "__main__":
    # Disabled warm-up experiment with the toy objective:
#    study = optuna.create_study()
#    study.optimize(objective, n_trials = 100)
#    print("结果:", study.best_params)
#    print(study.best_value)
#    print(study.best_trial)
#    study.optimize(objective, n_trials = 100)
#    print("结果:", study.best_params)
#    print(study.best_value)
#    print(study.best_trial)
    
    # Relative paths used below are resolved against this directory.
    newpath = "/home/code"
    os.chdir(newpath)
    
    # data_explore()
    
    # The real work starts here.
    # Fraction of rows sampled by loadData.
    p = 0.001
    train = loadData(p = p)
    train = featureEngineer(train)
    # print(train.head())
    
    # Compute the model score.
    # score = Score(model, train)
    # print("模型评分:%.2f" % score)
    # A second, independent random sample; not used further in this block.
    test = loadData(p = p)
    test = featureEngineer(test)
    
    # Pre-process the training data.
    X_train, y_train = preprocessing(train)
    
    # xgboost
    print("XGBoost")
    # TPE-sampled study; the lambda binds the data to the objective.
    study = optuna.create_study(direction = "maximize", sampler = TPESampler())
    study.optimize(lambda trial:objective2(trial, X_train, y_train), n_trials = 50)
    print("结果:", study.best_params)
    print(study.best_value)
    print(study.best_trial)
90 |     


--------------------------------------------------------------------------------
/pic/00.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zwdnet/JSMPwork/b7ae1380986abc3ee5b10bea7f9d57cbb805a8f0/pic/00.jpg


--------------------------------------------------------------------------------
/pic/01.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zwdnet/JSMPwork/b7ae1380986abc3ee5b10bea7f9d57cbb805a8f0/pic/01.jpg


--------------------------------------------------------------------------------
/pic/02.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zwdnet/JSMPwork/b7ae1380986abc3ee5b10bea7f9d57cbb805a8f0/pic/02.jpg


--------------------------------------------------------------------------------
/pic/03.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zwdnet/JSMPwork/b7ae1380986abc3ee5b10bea7f9d57cbb805a8f0/pic/03.jpg


--------------------------------------------------------------------------------
/pic/04.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zwdnet/JSMPwork/b7ae1380986abc3ee5b10bea7f9d57cbb805a8f0/pic/04.jpg


--------------------------------------------------------------------------------
/pic/05.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zwdnet/JSMPwork/b7ae1380986abc3ee5b10bea7f9d57cbb805a8f0/pic/05.jpg


--------------------------------------------------------------------------------
/pic/06.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zwdnet/JSMPwork/b7ae1380986abc3ee5b10bea7f9d57cbb805a8f0/pic/06.jpg


--------------------------------------------------------------------------------
/pic/07.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zwdnet/JSMPwork/b7ae1380986abc3ee5b10bea7f9d57cbb805a8f0/pic/07.jpg


--------------------------------------------------------------------------------
/pic/08.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zwdnet/JSMPwork/b7ae1380986abc3ee5b10bea7f9d57cbb805a8f0/pic/08.jpg


--------------------------------------------------------------------------------
/pic/09.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zwdnet/JSMPwork/b7ae1380986abc3ee5b10bea7f9d57cbb805a8f0/pic/09.jpg


--------------------------------------------------------------------------------
/pic/10.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zwdnet/JSMPwork/b7ae1380986abc3ee5b10bea7f9d57cbb805a8f0/pic/10.jpg


--------------------------------------------------------------------------------
/preprocess.py:
--------------------------------------------------------------------------------
 1 | # coding:utf-8
 2 | # kaggle Jane Street Market Prediction代码
 3 | # 数据探索及预处理
 4 | 
 5 | 
 6 | import pandas as pd
 7 | from run import *
 8 | import matplotlib.pyplot as plt
 9 | import dask.dataframe as dd
10 | 
11 | 
12 | # 初步探索花了一天
@change_dir
def drawData():
    """Plot every column of interest of ``./train.csv`` and return the frame.

    The file is read with dask, its info and column index are printed, and
    one line plot per column is saved under ``./output/`` as ``<column>.png``
    for ``weight``, ``resp_1`` .. ``resp_4``, ``resp`` and ``feature_0`` ..
    ``feature_129``.

    Returns:
        The dask dataframe read from ``./train.csv``.
    """
    data = dd.read_csv("./train.csv")
    print(data.info())
    print(data.columns)

    # Same columns, in the same order, as the three former copy-pasted loops.
    columns = ["weight"]
    columns += ["resp_" + str(i) for i in range(1, 5)]
    columns.append("resp")
    columns += ["feature_" + str(i) for i in range(0, 130)]

    for col in columns:
        plt.figure()
        plt.plot(data[col].values.compute())
        plt.savefig("./output/" + col + ".png")
        # Close right after saving so no figure (including the last one)
        # stays open.
        plt.close()

    return data
52 |     
53 |     
54 | # 读取数据，提取前1/10做研究
@change_dir
def smallData():
    """Plot the first tenth of ``./train.csv`` and save it as a smaller CSV.

    Reads ``int(n / 10)`` rows with pandas, prints the frame info, saves one
    line plot per column under ``./output/`` as ``<column>_small.png`` for
    ``weight``, ``resp_1`` .. ``resp_4``, ``resp`` and ``feature_0`` ..
    ``feature_129``, then writes the subset to ``./small_train.csv``.
    """
    n = 2390491  # presumably the total row count of train.csv -- TODO confirm
    row_read = int(n/10)
    data = pd.read_csv("./train.csv", nrows = row_read)
    print(data.info())

    # Same columns, in the same order, as the three former copy-pasted loops.
    columns = ["weight"]
    columns += ["resp_" + str(i) for i in range(1, 5)]
    columns.append("resp")
    columns += ["feature_" + str(i) for i in range(0, 130)]

    for col in columns:
        plt.figure()
        plt.plot(data[col].values)
        plt.savefig("./output/" + col + "_small.png")
        # Close right after saving so no figure (including the last one)
        # stays open.
        plt.close()

    data.to_csv("./small_train.csv")
87 |         
88 |     
if __name__ == "__main__":
    # Load the reduced data set and print what info() returns.
    data = pd.read_csv("small_train.csv")
    summary = data.info()
    print(summary)
92 |     


--------------------------------------------------------------------------------
/py_nn.py:
--------------------------------------------------------------------------------
  1 | # coding:utf-8
  2 | # 《python神经网络编程》实操代码
  3 | 
  4 | 
  5 | import numpy as np
  6 | import scipy.special
  7 | import run
  8 | import pandas as pd
  9 | import matplotlib.pyplot as plt
 10 | import optuna
 11 | import optuna.visualization as pv
 12 | 
 13 | 
 14 | # 神经网络类
 15 | class NN:
 16 |     def __init__(self, inputnodes, hiddennodes, outputnodes, learningrate):
 17 |         # 设置输入、隐藏和输出层维度
 18 |         self.inodes = inputnodes
 19 |         self.hnodes = hiddennodes
 20 |         self.onodes = outputnodes
 21 | 
 22 | 
 23 |         # simple random number
 24 |         # self.wih = (np.random.rand(self.hnodes, self.inodes) - 0.5)
 25 |         # self.who = (np.random.rand(self.onodes, self.hnodes) - 0.5)
 26 | 
 27 |         # Normal distribution
 28 |         # average = 0
 29 |         # Standard deviation = 1/evolution of number of nodes passed in
 30 |         # 用正态分布随机数初始化权重
 31 |         self.wih = np.random.normal(0, pow(self.hnodes, -0.5), (self.hnodes, self.inodes))
 32 |         self.who = np.random.normal(0, pow(self.onodes, -0.5), (self.onodes, self.hnodes))
 33 | 
 34 |         # 学习率
 35 |         self.lr = learningrate
 36 | 
 37 |         # 用sigmoid函数做激活函数
 38 |         self.activation_function = lambda x: scipy.special.expit(x)
 39 | 
 40 | 
 41 |     # 训练神经网络
 42 |     def train(self, inputs_list, targets_list):
 43 |         # 将数据转换为二维数组
 44 |         inputs = np.array(inputs_list, ndmin=2).T
 45 |         targets = np.array(targets_list, ndmin=2).T
 46 | 
 47 |         # 利用传输矩阵wih，计算隐藏层输入
 48 |         hidden_inputs = np.dot(self.wih, inputs)
 49 |         # 计算隐藏层输出，激活函数
 50 |         hidden_outputs = self.activation_function(hidden_inputs)
 51 |         # 利用传输矩阵who，计算输出层输入
 52 |         final_inputs = np.dot(self.who, hidden_outputs)
 53 |         # 用激活函数计算输出信号
 54 |         final_outputs = self.activation_function(final_inputs)
 55 | 
 56 |         # 计算误差值
 57 |         output_errors = targets - final_outputs
 58 | 
 59 |         # 按权重分配误差
 60 |         hidden_errors = np.dot(self.who.T, output_errors)
 61 |         # update the weights for the links between the hidden and output layers
 62 |         # wj,k = learningrate * error * sigmoid(ok) * (1 - sigmoid(ok)) · oj^T
 63 |         # 更新隐藏层及输出层之间的权重值
 64 |         self.who += self.lr * np.dot(
 65 |             (output_errors * final_outputs * (1.0 - final_outputs)),
 66 |             np.transpose(hidden_outputs))
 67 |         # update the weights for the links between the input and hidden layers
 68 |         # 更新输入层及隐藏层之间的权重值
 69 |         self.wih += self.lr * np.dot(
 70 |             (hidden_errors * hidden_outputs * (1.0 - hidden_outputs)),
 71 |             np.transpose(inputs))
 72 | 
 73 | 
 74 |     # 前向传播
 75 |     def query(self, inputs_list):
 76 |         # 输入矩阵
 77 |         inputs = np.array(inputs_list, ndmin=2).T
 78 | 
 79 |         # calculate signals into hidden layer
 80 |         # 利用传输矩阵wih，计算隐藏层输入
 81 |         hidden_inputs = np.dot(self.wih, inputs)
 82 |         # calculate the signals emerging from hidden layer
 83 |         # 计算隐藏层输出，激活函数
 84 |         hidden_outputs = self.activation_function(hidden_inputs)
 85 |         # calculate signals into final output layer
 86 |         # 利用传输矩阵who，计算输出层输入
 87 |         final_inputs = np.dot(self.who, hidden_outputs)
 88 |         # calculate the signals emerging from final output layer
 89 |         final_outputs = self.activation_function(final_inputs)
 90 | 
 91 |         return final_outputs
 92 |         
 93 |         
 94 | # 加载数据
 95 | def loadData():
 96 |     # load the mnist training data CSV file into a list
 97 |     training_data_file = open("mnist_train.csv", 'r')
 98 |     training_data_list = training_data_file.readlines()
 99 |     training_data_file.close()
100 |     
101 |     testing_data_file = open("mnist_test.csv", 'r')
102 |     testing_data_list = testing_data_file.readlines()
103 |     testing_data_file.close()
104 |     
105 |     return training_data_list, testing_data_list
106 |     
107 |     
108 | # 创建模型
def init_model(input_nodes, hidden_nodes, output_nodes, learning_rate):
    """Build and return an ``NN`` with the given layer sizes and learning rate."""
    return NN(input_nodes, hidden_nodes, output_nodes, learning_rate)
114 |     
115 |     
116 | # 训练过程
def train(n, epochs, training_data_list, output_nodes):
    """Train network ``n`` on comma-separated records for ``epochs`` passes.

    Each record is ``label,v1,v2,...``.  The values after the label are
    mapped from 0..255 to 0.01..1.00, and the target vector holds 0.99 at
    the label index and 0.01 elsewhere.

    Args:
        n: object exposing ``train(inputs, targets)``.
        epochs: number of passes over ``training_data_list``.
        training_data_list: iterable of CSV text lines.
        output_nodes: length of the target vector.

    Returns:
        The same network object ``n``.
    """
    for e in range(epochs):
        for record in training_data_list:
            # Split the record on commas.
            all_values = record.split(',')
            # np.asfarray was removed in NumPy 2.0; asarray with an explicit
            # float dtype performs the same string -> float conversion.
            inputs = (np.asarray(all_values[1:], dtype=np.float64) / 255.0 * 0.99 + 0.01)
            # Target vector: 0.99 for the labelled class, 0.01 otherwise.
            targets = np.zeros(output_nodes) + 0.01
            targets[int(all_values[0])] = 0.99
            n.train(inputs, targets)
    return n
136 |     
137 |     
138 | # 获取预测准确率
def getScores(n, testing_data_list):
    """Score network ``n`` on comma-separated test records.

    Each record is ``label,v1,v2,...``; the values after the label are
    mapped from 0..255 to 0.01..1.00 before being passed to ``n.query``.

    Args:
        n: object exposing ``query(inputs)``.
        testing_data_list: iterable of CSV text lines.

    Returns:
        A NumPy array with one entry per record: 1 when the index of the
        largest output equals the label, 0 otherwise.
    """
    scorecard = []
    for record in testing_data_list:
        all_values = record.split(',')
        # The first field is the correct label.
        correct_label = int(all_values[0])
        # np.asfarray was removed in NumPy 2.0; asarray with an explicit
        # float dtype performs the same string -> float conversion.
        inputs = (np.asarray(all_values[1:], dtype=np.float64) / 255.0 * 0.99 + 0.01)
        outputs = n.query(inputs)
        # The predicted class is the index of the strongest output.
        label = np.argmax(outputs)
        scorecard.append(1 if label == correct_label else 0)

    scorecard_array = np.asarray(scorecard)

    return scorecard_array
166 |         
167 |         
168 | # 解MINST手写数字识别问题
@run.change_dir
@run.timethis
def minst(trial):
    """Optuna objective for the MNIST digit task.

    Samples the hidden-layer size, the learning rate and the epoch count
    from ``trial``, trains a fresh network on the training records and
    returns the fraction of test records it labels correctly.
    """
    n_inputs, n_outputs = 784, 10
    n_hidden = trial.suggest_categorical("hidden_dim", [50, 100, 200, 300])
    # Learning rate.
    step_size = trial.suggest_discrete_uniform("learning_rate", 0.01, 0.81, 0.1)
    network = init_model(n_inputs, n_hidden, n_outputs, step_size)
    train_records, test_records = loadData()
    # Training.
    n_epochs = trial.suggest_int("epochs:", 1, 10)
    network = train(network, n_epochs, train_records, n_outputs)
    # Evaluation: mean of the 0/1 score card.
    hits = getScores(network, test_records)
    return hits.sum() / hits.size
185 |     
186 |     
187 | # 画图
@run.change_dir
def draw_results(study):
    """Write four Optuna visualisations of ``study`` to ``./output/`` as PNG files."""
    # (plotting function in optuna.visualization, output file), in order:
    # optimisation history, contour, empirical distribution, parallel coordinates.
    jobs = (
        ("plot_optimization_history", "./output/opt_his.png"),
        ("plot_contour", "./output/opt_contour.png"),
        ("plot_edf", "./output/opt_edf.png"),
        ("plot_parallel_coordinate", "./output/opt_coordinate.png"),
    )
    for plot_name, target in jobs:
        plt.figure()
        # Resolve the function only when its turn comes, as the original did.
        fig = getattr(pv, plot_name)(study)
        fig.write_image(target)
        plt.close()
210 |     
211 | 
if __name__ == "__main__":
    # Disabled toy example on a 3-3-3 network with learning rate 0.3:
    # n = NN(3, 3, 3, 0.3)
    # print(n.query([1.0, 0.5, -0.5]))

    # Search hidden size, learning rate and epoch count; maximise accuracy.
    study = optuna.create_study(direction="maximize")
    study.optimize(minst, n_trials=100)
    print("结果:", study.best_params)
    print(study.best_value)
    print(study.best_trial)
    # is_available is a function: it must be called, otherwise the bare
    # function object is always truthy and the else branch can never run.
    if pv.is_available():
        print("结果作图")
        draw_results(study)
    else:
        print("不能作图")
234 | 


--------------------------------------------------------------------------------
/py_nn_back.py:
--------------------------------------------------------------------------------
  1 | # coding:utf-8
  2 | # 《python神经网络编程》实操代码
  3 | # 反向查询看看
  4 | 
  5 | 
  6 | import numpy as np
  7 | import scipy.special
  8 | import run
  9 | import pandas as pd
 10 | import matplotlib.pyplot as plt
 11 | import optuna
 12 | import optuna.visualization as pv
 13 | import cv2
 14 | import glob
 15 | 
 16 | 
 17 | # 神经网络类
 18 | class NN:
 19 |     def __init__(self, inputnodes, hiddennodes, outputnodes, learningrate):
 20 |         # 设置输入、隐藏和输出层维度
 21 |         self.inodes = inputnodes
 22 |         self.hnodes = hiddennodes
 23 |         self.onodes = outputnodes
 24 | 
 25 | 
 26 |         # simple random number
 27 |         # self.wih = (np.random.rand(self.hnodes, self.inodes) - 0.5)
 28 |         # self.who = (np.random.rand(self.onodes, self.hnodes) - 0.5)
 29 | 
 30 |         # Normal distribution
 31 |         # average = 0
 32 |         # Standard deviation = 1/evolution of number of nodes passed in
 33 |         # 用正态分布随机数初始化权重
 34 |         self.wih = np.random.normal(0, pow(self.hnodes, -0.5), (self.hnodes, self.inodes))
 35 |         self.who = np.random.normal(0, pow(self.onodes, -0.5), (self.onodes, self.hnodes))
 36 | 
 37 |         # 学习率
 38 |         self.lr = learningrate
 39 | 
 40 |         # 用sigmoid函数做激活函数
 41 |         self.activation_function = lambda x: scipy.special.expit(x)
 42 |         # 激活函数的反函数
 43 |         self.inverse_activation_function = lambda x: scipy.special.logit(x)
 44 | 
 45 | 
 46 |     # 训练神经网络
 47 |     def train(self, inputs_list, targets_list):
 48 |         # 将数据转换为二维数组
 49 |         inputs = np.array(inputs_list, ndmin=2).T
 50 |         targets = np.array(targets_list, ndmin=2).T
 51 | 
 52 |         # 利用传输矩阵wih，计算隐藏层输入
 53 |         hidden_inputs = np.dot(self.wih, inputs)
 54 |         # 计算隐藏层输出，激活函数
 55 |         hidden_outputs = self.activation_function(hidden_inputs)
 56 |         # 利用传输矩阵who，计算输出层输入
 57 |         final_inputs = np.dot(self.who, hidden_outputs)
 58 |         # 用激活函数计算输出信号
 59 |         final_outputs = self.activation_function(final_inputs)
 60 | 
 61 |         # 计算误差值
 62 |         output_errors = targets - final_outputs
 63 | 
 64 |         # 按权重分配误差
 65 |         hidden_errors = np.dot(self.who.T, output_errors)
 66 |         # update the weights for the links between the hidden and output layers
 67 |         # wj,k = learningrate * error * sigmoid(ok) * (1 - sigmoid(ok)) · oj^T
 68 |         # 更新隐藏层及输出层之间的权重值
 69 |         self.who += self.lr * np.dot(
 70 |             (output_errors * final_outputs * (1.0 - final_outputs)),
 71 |             np.transpose(hidden_outputs))
 72 |         # update the weights for the links between the input and hidden layers
 73 |         # 更新输入层及隐藏层之间的权重值
 74 |         self.wih += self.lr * np.dot(
 75 |             (hidden_errors * hidden_outputs * (1.0 - hidden_outputs)),
 76 |             np.transpose(inputs))
 77 | 
 78 | 
 79 |     # 前向传播
 80 |     def query(self, inputs_list):
 81 |         # 输入矩阵
 82 |         inputs = np.array(inputs_list, ndmin=2).T
 83 | 
 84 |         # calculate signals into hidden layer
 85 |         # 利用传输矩阵wih，计算隐藏层输入
 86 |         hidden_inputs = np.dot(self.wih, inputs)
 87 |         # calculate the signals emerging from hidden layer
 88 |         # 计算隐藏层输出，激活函数
 89 |         hidden_outputs = self.activation_function(hidden_inputs)
 90 |         # calculate signals into final output layer
 91 |         # 利用传输矩阵who，计算输出层输入
 92 |         final_inputs = np.dot(self.who, hidden_outputs)
 93 |         # calculate the signals emerging from final output layer
 94 |         final_outputs = self.activation_function(final_inputs)
 95 | 
 96 |         return final_outputs
 97 |         
 98 |     # 反向查询，给定输出值，看输入会是啥
 99 |     def backquery(self, targets_list):
100 |         # 转换为垂直向量
101 |         final_outputs = np.array(targets_list, ndmin = 2).T
102 |         # 计算最后的输入信号，用激活函数的反函数
103 |         final_inputs = self.inverse_activation_function(final_outputs)
104 |         # 计算隐藏层的输出
105 |         hidden_outputs = np.dot(self.who.T, final_inputs)
106 |         # 归一化
107 |         hidden_outputs -= np.min(hidden_outputs)
108 |         hidden_outputs /= np.max(hidden_outputs)
109 |         hidden_outputs *= 0.98
110 |         hidden_outputs += 0.01
111 |         # 计算进入隐藏层的信号
112 |         hidden_inputs = self.inverse_activation_function(hidden_outputs)
113 |         
114 |         # 计算输入层的输出信号
115 |         inputs = np.dot(self.wih.T, hidden_inputs)
116 |         # 归一化
117 |         inputs -= np.min(inputs)
118 |         inputs /= np.max(inputs)
119 |         inputs *= 0.98
120 |         inputs += 0.01
121 |         
122 |         return inputs
123 |         
124 |         
125 | # 加载数据
@run.change_dir
def loadData():
    """Read the MNIST CSV files.

    Returns:
        ``(training_data_list, testing_data_list)``: the raw text lines of
        ``mnist_train.csv`` and ``mnist_test.csv``.

    Raises:
        OSError: if either file cannot be opened.
    """
    # `with` guarantees the handles are closed even if reading fails.
    with open("mnist_train.csv", 'r') as training_data_file:
        training_data_list = training_data_file.readlines()

    with open("mnist_test.csv", 'r') as testing_data_file:
        testing_data_list = testing_data_file.readlines()

    return training_data_list, testing_data_list
138 |     
139 |     
140 | # 创建模型
def init_model(input_nodes, hidden_nodes, output_nodes, learning_rate):
    """Build and return an ``NN`` with the given layer sizes and learning rate."""
    return NN(input_nodes, hidden_nodes, output_nodes, learning_rate)
146 |     
147 |     
148 | # 训练过程
def train(n, epochs, training_data_list, output_nodes):
    """Train network ``n`` on comma-separated records for ``epochs`` passes.

    Each record is ``label,v1,v2,...``.  The values after the label are
    mapped from 0..255 to 0.01..1.00, and the target vector holds 0.99 at
    the label index and 0.01 elsewhere.  The epoch index is printed at the
    start of every pass.

    Args:
        n: object exposing ``train(inputs, targets)``.
        epochs: number of passes over ``training_data_list``.
        training_data_list: iterable of CSV text lines.
        output_nodes: length of the target vector.

    Returns:
        The same network object ``n``.
    """
    for e in range(epochs):
        print("第{}轮".format(e))
        for record in training_data_list:
            # Split the record on commas.
            all_values = record.split(',')
            # np.asfarray was removed in NumPy 2.0; asarray with an explicit
            # float dtype performs the same string -> float conversion.
            inputs = (np.asarray(all_values[1:], dtype=np.float64) / 255.0 * 0.99 + 0.01)
            # Target vector: 0.99 for the labelled class, 0.01 otherwise.
            targets = np.zeros(output_nodes) + 0.01
            targets[int(all_values[0])] = 0.99
            n.train(inputs, targets)
    return n
169 |     
170 |     
171 | # 获取预测准确率
def getScores(n, testing_data_list):
    """Score network ``n`` on comma-separated test records.

    Each record is ``label,v1,v2,...``; the values after the label are
    mapped from 0..255 to 0.01..1.00 before being passed to ``n.query``.

    Args:
        n: object exposing ``query(inputs)``.
        testing_data_list: iterable of CSV text lines.

    Returns:
        A NumPy array with one entry per record: 1 when the index of the
        largest output equals the label, 0 otherwise.
    """
    scorecard = []
    for record in testing_data_list:
        all_values = record.split(',')
        # The first field is the correct label.
        correct_label = int(all_values[0])
        # np.asfarray was removed in NumPy 2.0; asarray with an explicit
        # float dtype performs the same string -> float conversion.
        inputs = (np.asarray(all_values[1:], dtype=np.float64) / 255.0 * 0.99 + 0.01)
        outputs = n.query(inputs)
        # The predicted class is the index of the strongest output.
        label = np.argmax(outputs)
        scorecard.append(1 if label == correct_label else 0)

    scorecard_array = np.asarray(scorecard)

    return scorecard_array
199 |         
200 |         
201 | # 解MINST手写数字识别问题
@run.change_dir
@run.timethis
def minst(trial):
    """Optuna objective for the MNIST digit task.

    Samples the hidden-layer size, the learning rate and the epoch count
    from ``trial``, trains a fresh network on the training records and
    returns the fraction of test records it labels correctly.
    """
    n_inputs, n_outputs = 784, 10
    n_hidden = trial.suggest_categorical("hidden_dim", [50, 100, 200, 300])
    # Learning rate.
    step_size = trial.suggest_discrete_uniform("learning_rate", 0.01, 0.81, 0.1)
    network = init_model(n_inputs, n_hidden, n_outputs, step_size)
    train_records, test_records = loadData()
    # Training.
    n_epochs = trial.suggest_int("epochs:", 1, 10)
    network = train(network, n_epochs, train_records, n_outputs)
    # Evaluation: mean of the 0/1 score card.
    hits = getScores(network, test_records)
    return hits.sum() / hits.size
218 |     
219 |     
220 | # 画图
@run.change_dir
def draw_results(study):
    """Write four Optuna visualisations of ``study`` to ``./output/`` as PNG files."""
    # (plotting function in optuna.visualization, output file), in order:
    # optimisation history, contour, empirical distribution, parallel coordinates.
    jobs = (
        ("plot_optimization_history", "./output/opt_his.png"),
        ("plot_contour", "./output/opt_contour.png"),
        ("plot_edf", "./output/opt_edf.png"),
        ("plot_parallel_coordinate", "./output/opt_coordinate.png"),
    )
    for plot_name, target in jobs:
        plt.figure()
        # Resolve the function only when its turn comes, as the original did.
        fig = getattr(pv, plot_name)(study)
        fig.write_image(target)
        plt.close()
243 |     
244 |     
245 | # 手写数字识别应用
246 | # 处理输入数据
@run.change_dir
def data_process():
    """Load the hand-written digit images in ``./pic`` as flat pixel vectors.

    For every ``./pic/*.png`` file the label is the part of the file name
    before the first dot, and the image is resized to 28x28, converted to
    grey scale, inverted (``255 - pixel``) and flattened to 784 values.

    Returns:
        ``(targets, datas)``: a list of integer labels and a list of
        ``uint8`` arrays of length 784, in the order returned by ``glob``.
    """
    import os  # local import: not part of this file's import block

    targets = []
    datas = []
    for file in glob.glob(r"./pic/*.png"):
        # basename() copes with either path separator, unlike split("/")[2].
        targets.append(int(os.path.basename(file).split(".")[0]))
        img_array = cv2.imread(file)
        img_array = cv2.resize(img_array, (28, 28))
        img_array = cv2.cvtColor(img_array, cv2.COLOR_BGR2GRAY)
        # Vectorised inversion, replacing the per-pixel Python loop.
        inverted = (np.uint8(255) - img_array).astype(np.uint8)
        datas.append(inverted.reshape(784))
    # NOTE(review): train()/trainModel() in this file feed the network values
    # scaled to 0.01..1.00, while these vectors stay in 0..255 -- confirm
    # whether the caller is expected to rescale before query().
    return (targets, datas)
264 |     
265 |     
266 | # 训练模型
@run.timethis
def trainModel():
    """Train an ``NN`` on the MNIST training records with fixed settings.

    Uses 784 inputs, 300 hidden nodes, 10 outputs, learning rate 0.11 and
    8 passes over the training data returned by ``loadData``.

    Returns:
        The trained ``NN`` instance.
    """
    print("开始训练")
    input_nodes = 784
    hidden_nodes = 300
    output_nodes = 10
    learning_rate = 0.11
    epochs = 8

    model = NN(input_nodes, hidden_nodes, output_nodes, learning_rate)
    training_data_list, _ = loadData()

    for e in range(epochs):
        for record in training_data_list:
            # Split the record on commas.
            all_values = record.split(',')
            # np.asfarray was removed in NumPy 2.0; asarray with an explicit
            # float dtype performs the same string -> float conversion.
            inputs = (np.asarray(all_values[1:], dtype=np.float64) / 255.0 * 0.99 + 0.01)
            # Target vector: 0.99 for the labelled class, 0.01 otherwise.
            targets = np.zeros(output_nodes) + 0.01
            targets[int(all_values[0])] = 0.99
            model.train(inputs, targets)

    return model
294 |     
295 |     
296 | # 用模型识别实际数据
297 | def testModel(model, test_datas, targets):
298 |     n = len(test_datas)
299 |     correct = 0
300 |     for i in range(n):
301 |         # 用模型得出预测值
302 |         outputs = model.query(test_datas[i])
303 |         # 转换为结果
304 |         label = np.argmax(outputs)
305 |         print("预测结果{}，实际结果{}".format(label, targets[i]))
306 |         if label == targets[i]:
307 |             correct += 1
308 |             
309 |     return correct/n
310 |     
311 |     
312 | # 反向查询给定输出的输入
@run.change_dir
def back(model):
    """Back-query ``model`` for each digit 0-9 and save the result as an image.

    For every label a target vector (0.99 at the label, 0.01 elsewhere) is
    passed to ``model.backquery``; the returned pattern is reshaped to 28x28
    and written to ``./output/<label>.png``.
    """
    output_nodes = 10
    for digit in range(10):
        wanted = np.full(output_nodes, 0.01)
        wanted[digit] = 0.99
        pattern = model.backquery(wanted)
        filename = "./output/{}.png".format(digit)
        print(filename)
        plt.figure()
        plt.imshow(pattern.reshape(28,28), cmap='Greys', interpolation='None')
        plt.savefig(filename)
        plt.close()
327 |     
328 | 
if __name__ == "__main__":
    # The string below is the disabled hyper-parameter search; it is an
    # expression statement with no effect.
    """
    input_nodes = 3
    hidden_nodes = 3
    output_nodes = 3
    
    learning_rate = 0.3
    
    # n = NN(input_nodes, hidden_nodes, output_nodes, learning_rate)
    # print(n.query([1.0, 0.5, -0.5]))
    
    # minst()
    
    study = optuna.create_study(direction="maximize")
    study.optimize(minst, n_trials=100)
    print("结果:", study.best_params)
    print(study.best_value)
    print(study.best_trial)
    if pv.is_available:
        print("结果作图")
        draw_results(study)
    else:
        print("不能作图")
    """
    # Apply the model.  Best parameters found so far:
    # {'hidden_dim': 300, 'learning_rate': 0.11, 'epochs:': 9}
    # Recognising the hand-written pictures is currently disabled:
    #   targets, datas = data_process()
    #   score = testModel(model, datas, targets)
    #   print("模型预测准确率:{}".format(score))
    trained = trainModel()
    back(trained)
360 | 


--------------------------------------------------------------------------------
/py_nn_use.py:
--------------------------------------------------------------------------------
  1 | # coding:utf-8
  2 | # 《python神经网络编程》实操代码
  3 | # 具体应用
  4 | 
  5 | 
  6 | import numpy as np
  7 | import scipy.special
  8 | import run
  9 | import pandas as pd
 10 | import matplotlib.pyplot as plt
 11 | import optuna
 12 | import optuna.visualization as pv
 13 | import cv2
 14 | import glob
 15 | 
 16 | 
 17 | # 神经网络类
 18 | class NN:
 19 |     def __init__(self, inputnodes, hiddennodes, outputnodes, learningrate):
 20 |         # 设置输入、隐藏和输出层维度
 21 |         self.inodes = inputnodes
 22 |         self.hnodes = hiddennodes
 23 |         self.onodes = outputnodes
 24 | 
 25 | 
 26 |         # simple random number
 27 |         # self.wih = (np.random.rand(self.hnodes, self.inodes) - 0.5)
 28 |         # self.who = (np.random.rand(self.onodes, self.hnodes) - 0.5)
 29 | 
 30 |         # Normal distribution
 31 |         # average = 0
 32 |         # Standard deviation = 1/evolution of number of nodes passed in
 33 |         # 用正态分布随机数初始化权重
 34 |         self.wih = np.random.normal(0, pow(self.hnodes, -0.5), (self.hnodes, self.inodes))
 35 |         self.who = np.random.normal(0, pow(self.onodes, -0.5), (self.onodes, self.hnodes))
 36 | 
 37 |         # 学习率
 38 |         self.lr = learningrate
 39 | 
 40 |         # 用sigmoid函数做激活函数
 41 |         self.activation_function = lambda x: scipy.special.expit(x)
 42 | 
 43 | 
 44 |     # 训练神经网络
 45 |     def train(self, inputs_list, targets_list):
 46 |         # 将数据转换为二维数组
 47 |         inputs = np.array(inputs_list, ndmin=2).T
 48 |         targets = np.array(targets_list, ndmin=2).T
 49 | 
 50 |         # 利用传输矩阵wih，计算隐藏层输入
 51 |         hidden_inputs = np.dot(self.wih, inputs)
 52 |         # 计算隐藏层输出，激活函数
 53 |         hidden_outputs = self.activation_function(hidden_inputs)
 54 |         # 利用传输矩阵who，计算输出层输入
 55 |         final_inputs = np.dot(self.who, hidden_outputs)
 56 |         # 用激活函数计算输出信号
 57 |         final_outputs = self.activation_function(final_inputs)
 58 | 
 59 |         # 计算误差值
 60 |         output_errors = targets - final_outputs
 61 | 
 62 |         # 按权重分配误差
 63 |         hidden_errors = np.dot(self.who.T, output_errors)
 64 |         # update the weights for the links between the hidden and output layers
 65 |         # wj,k = learningrate * error * sigmoid(ok) * (1 - sigmoid(ok)) · oj^T
 66 |         # 更新隐藏层及输出层之间的权重值
 67 |         self.who += self.lr * np.dot(
 68 |             (output_errors * final_outputs * (1.0 - final_outputs)),
 69 |             np.transpose(hidden_outputs))
 70 |         # update the weights for the links between the input and hidden layers
 71 |         # 更新输入层及隐藏层之间的权重值
 72 |         self.wih += self.lr * np.dot(
 73 |             (hidden_errors * hidden_outputs * (1.0 - hidden_outputs)),
 74 |             np.transpose(inputs))
 75 | 
 76 | 
 77 |     # 前向传播
 78 |     def query(self, inputs_list):
 79 |         # 输入矩阵
 80 |         inputs = np.array(inputs_list, ndmin=2).T
 81 | 
 82 |         # calculate signals into hidden layer
 83 |         # 利用传输矩阵wih，计算隐藏层输入
 84 |         hidden_inputs = np.dot(self.wih, inputs)
 85 |         # calculate the signals emerging from hidden layer
 86 |         # 计算隐藏层输出，激活函数
 87 |         hidden_outputs = self.activation_function(hidden_inputs)
 88 |         # calculate signals into final output layer
 89 |         # 利用传输矩阵who，计算输出层输入
 90 |         final_inputs = np.dot(self.who, hidden_outputs)
 91 |         # calculate the signals emerging from final output layer
 92 |         final_outputs = self.activation_function(final_inputs)
 93 | 
 94 |         return final_outputs
 95 |         
 96 |         
 97 | # 加载数据
 98 | @run.change_dir
 99 | def loadData():
100 |     # load the mnist training data CSV file into a list
101 |     training_data_file = open("mnist_train.csv", 'r')
102 |     training_data_list = training_data_file.readlines()
103 |     training_data_file.close()
104 |     
105 |     testing_data_file = open("mnist_test.csv", 'r')
106 |     testing_data_list = testing_data_file.readlines()
107 |     testing_data_file.close()
108 |     
109 |     return training_data_list, testing_data_list
110 |     
111 |     
112 | # 创建模型
def init_model(input_nodes, hidden_nodes, output_nodes, learning_rate):
    """Build and return a new ``NN`` instance with the given layer sizes
    and learning rate."""
    return NN(input_nodes, hidden_nodes, output_nodes, learning_rate)
118 |     
119 |     
120 | # 训练过程
def train(n, epochs, training_data_list, output_nodes):
    """Train network ``n`` on CSV records for ``epochs`` passes.

    Each record is a comma-separated string: the first field is the integer
    label, the remaining fields are pixel values.

    Args:
        n: object exposing ``train(inputs, targets)``.
        epochs: number of passes over ``training_data_list``.
        training_data_list: iterable of CSV record strings.
        output_nodes: length of the target vector.

    Returns:
        The same ``n`` that was passed in.
    """
    for e in range(epochs):
        print("第{}轮".format(e))
        for record in training_data_list:
            all_values = record.split(',')
            # Rescale pixel values from 0-255 to 0.01-1.00.
            # np.asfarray was removed in NumPy 2.0; asarray with an explicit
            # float dtype is the equivalent conversion.
            inputs = (np.asarray(all_values[1:], dtype=float) / 255.0 * 0.99 + 0.01)
            # Target vector: 0.99 at the label index, 0.01 elsewhere.
            targets = np.zeros(output_nodes) + 0.01
            targets[int(all_values[0])] = 0.99
            n.train(inputs, targets)
    return n
141 |     
142 |     
143 | # 获取预测准确率
def getScores(n, testing_data_list):
    """Score network ``n`` on CSV test records.

    Args:
        n: object exposing ``query(inputs)``.
        testing_data_list: iterable of CSV record strings (label first,
            pixel values after).

    Returns:
        A NumPy array with one entry per record: 1 where the index of the
        largest output equals the label, 0 otherwise.
    """
    scorecard = []
    for record in testing_data_list:
        all_values = record.split(',')
        correct_label = int(all_values[0])
        # Same 0.01-1.00 rescaling as used for training.
        # np.asfarray was removed in NumPy 2.0; use asarray with dtype=float.
        inputs = (np.asarray(all_values[1:], dtype=float) / 255.0 * 0.99 + 0.01)
        outputs = n.query(inputs)
        # The predicted label is the index of the largest output.
        label = np.argmax(outputs)
        scorecard.append(1 if label == correct_label else 0)

    return np.asarray(scorecard)
171 |         
172 |         
173 | # 解MINST手写数字识别问题
@run.change_dir
@run.timethis
def minst(trial):
    """Optuna objective for the MNIST network.

    Draws the hidden-layer size, learning rate and epoch count from
    ``trial``, trains a fresh network and returns the fraction of test
    records classified correctly.
    """
    n_inputs, n_outputs = 784, 10
    n_hidden = trial.suggest_categorical("hidden_dim", [50, 100, 200, 300])
    step_size = trial.suggest_discrete_uniform("learning_rate", 0.01, 0.81, 0.1)
    network = init_model(n_inputs, n_hidden, n_outputs, step_size)

    train_records, test_records = loadData()

    n_epochs = trial.suggest_int("epochs:", 1, 10)
    network = train(network, n_epochs, train_records, n_outputs)

    hits = getScores(network, test_records)
    return hits.sum() / hits.size
190 |     
191 |     
192 | # 画图
@run.change_dir
def draw_results(study):
    """Write four Optuna visualisations of ``study`` to ``./output``.

    In order: optimization history, contour plot, empirical distribution
    function and parallel-coordinate plot.
    """
    charts = (
        ("plot_optimization_history", "./output/opt_his.png"),
        ("plot_contour", "./output/opt_contour.png"),
        ("plot_edf", "./output/opt_edf.png"),
        ("plot_parallel_coordinate", "./output/opt_coordinate.png"),
    )
    for plot_name, target in charts:
        plt.figure()
        fig = getattr(pv, plot_name)(study)
        fig.write_image(target)
        plt.close()
215 |     
216 |     
217 | # 手写数字识别应用
218 | # 处理输入数据
@run.change_dir
def data_process():
    """Load the hand-written digit photos in ``./pic`` as network inputs.

    The label is taken from the file name stem (``10`` stands for the digit
    0). Each image is resized to 28x28, converted to grayscale, inverted and
    flattened to 784 values.

    The pixel values are rescaled to 0.01-1.00 with the same formula the
    training code in this file uses (``pixel / 255.0 * 0.99 + 0.01``), so
    the data passed to the network at test time is on the scale it was
    trained on.

    Returns:
        (targets, datas): list of integer labels and list of 784-element
        float arrays, in the order returned by ``glob``.
    """
    import os

    targets = []
    datas = []
    for file in glob.glob(r"./pic/*.jpg"):
        # Use the file name stem rather than a fixed "/"-split position so
        # the label does not depend on the path separator or depth.
        label = int(os.path.splitext(os.path.basename(file))[0])
        targets.append(0 if label == 10 else label)

        img_array = cv2.imread(file)
        img_array = cv2.resize(img_array, (28, 28))
        img_array = cv2.cvtColor(img_array, cv2.COLOR_BGR2GRAY)
        # Invert the gray levels in one array operation.
        inverted = 255 - img_array
        datas.append(inverted.reshape(784) / 255.0 * 0.99 + 0.01)
    return (targets, datas)
238 |     
239 |     
240 | # 训练模型
@run.timethis
def trainModel():
    """Train an ``NN`` on the MNIST training records with fixed settings.

    Uses 784 inputs, 300 hidden nodes, 10 outputs, learning rate 0.11 and
    8 epochs.

    Returns:
        The trained ``NN`` instance.
    """
    print("开始训练")
    input_nodes = 784
    hidden_nodes = 300
    output_nodes = 10
    learning_rate = 0.11
    epochs = 8

    model = NN(input_nodes, hidden_nodes, output_nodes, learning_rate)
    training_data_list, _ = loadData()

    for e in range(epochs):
        for record in training_data_list:
            all_values = record.split(',')
            # Rescale pixel values from 0-255 to 0.01-1.00.
            # np.asfarray was removed in NumPy 2.0; use asarray with
            # dtype=float, which performs the same conversion.
            inputs = (np.asarray(all_values[1:], dtype=float) / 255.0 * 0.99 + 0.01)
            # Target vector: 0.99 at the label index, 0.01 elsewhere.
            targets = np.zeros(output_nodes) + 0.01
            targets[int(all_values[0])] = 0.99
            model.train(inputs, targets)

    return model
268 |     
269 |     
270 | # 用模型识别实际数据
271 | def testModel(model, test_datas, targets):
272 |     n = len(test_datas)
273 |     correct = 0
274 |     for i in range(n):
275 |         # 用模型得出预测值
276 |         outputs = model.query(test_datas[i])
277 |         # 转换为结果
278 |         label = np.argmax(outputs)
279 |         print("预测结果{}，实际结果{}".format(label, targets[i]))
280 |         if label == targets[i]:
281 |             correct += 1
282 |             
283 |     return correct/n
284 |     
285 | 
if __name__ == "__main__":
    # The string below is a disabled earlier experiment (a small demo
    # network and the Optuna hyper-parameter search); it is never executed.
    """
    input_nodes = 3
    hidden_nodes = 3
    output_nodes = 3
    
    learning_rate = 0.3
    
    # n = NN(input_nodes, hidden_nodes, output_nodes, learning_rate)
    # print(n.query([1.0, 0.5, -0.5]))
    
    # minst()
    
    study = optuna.create_study(direction="maximize")
    study.optimize(minst, n_trials=100)
    print("结果:", study.best_params)
    print(study.best_value)
    print(study.best_trial)
    if pv.is_available:
        print("结果作图")
        draw_results(study)
    else:
        print("不能作图")
    """
    # Apply the model to real photos.
    # Best parameters found so far: {'hidden_dim': 300, 'learning_rate': 0.11, 'epochs:': 9}
    # Load the photos first, then train, then report accuracy on the photos.
    targets, datas = data_process()
    model = trainModel()
    score = testModel(model, datas, targets)
    print("模型预测准确率:{}".format(score))
316 | 


--------------------------------------------------------------------------------
/pytorch_work.py:
--------------------------------------------------------------------------------
  1 | # coding:utf-8
  2 | # kaggle Jane Street Market Prediction代码
  3 | # 实际自己工作的代码
  4 | # 用pytorch
  5 | 
  6 | 
  7 | import numpy as np
  8 | import pandas as pd
  9 | pd.set_option('display.max_columns', None)
 10 | import janestreet
 11 | 
 12 | import matplotlib.pyplot as plt
 13 | from sklearn.model_selection import train_test_split
 14 | from sklearn import metrics
 15 | from sklearn.metrics import accuracy_score
 16 | import torch
 17 | import torch.nn as nn
 18 | import torch.optim as optim
 19 | 
 20 | import os
 21 | 
 22 | from FE import featureEngineer
 23 | from tools import *
 24 | 
 25 |     
 26 |     
 27 | # 建模前处理数据
 28 | def preprocessing(train):
 29 |     X_train = train.loc[:, train.columns.str.contains('feature')]
 30 |     # y_train = train.loc[:, 'resp']
 31 |     y_train = train.loc[:, 'action']
 32 |     
 33 |     # X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, random_state=666, test_size=0.2)
 34 |     
 35 |     return X_train, y_train
 36 | 
 37 |     
 38 | # 评分函数
 39 | def Score(model, data):
 40 |     # test_df = pd.read_csv("/kaggle/input/jane-street-market-prediction/train.csv")
 41 |     data = data.fillna(-999)
 42 |     X_test = data.loc[:, data.columns.str.contains('feature')]
 43 |     resp = model.predict(X_test)
 44 |     date = data["date"].values
 45 |     weight = data["weight"].values
 46 |     action = (resp > 0).astype("int")
 47 |     
 48 |     count_i = len(np.unique(date))
 49 |     Pi = np.zeros(count_i)
 50 |     # 用循环太慢
 51 |     #for i, day in enumerate(np.unique(date)):
 52 | #        Pi[i] = np.sum(weight[date == day] * resp[date == day] * action[date == day])
 53 |     # 用下面这行代替
 54 |     Pi = np.bincount(date, weight * resp * action)
 55 |     t = np.sum(Pi) / np.sqrt(np.sum(Pi ** 2)) * np.sqrt(250 / count_i)
 56 |     u = np.clip(t, 0, 6) * np.sum(Pi)
 57 |     return u
 58 |     
 59 | 
 60 | # 进行预测，生成提交文件，分类版
 61 | def predict_clf(model):
 62 |     env = janestreet.make_env()
 63 |     iter_test = env.iter_test()
 64 |     for (test_df, sample_prediction_df) in iter_test:
 65 |         if test_df['weight'].item() > 0:
 66 |             # test_df = featureEngineer(test_df)
 67 |             X_test = test_df.loc[:, test_df.columns.str.contains('feature')]
 68 |             X_test = X_test.fillna(0.0)
 69 |             y_preds = model.predict(X_test)[0]
 70 |         else:
 71 |             y_preds = 0
 72 |         # print(y_preds)
 73 |         sample_prediction_df.action = y_preds
 74 |         env.predict(sample_prediction_df)
 75 |         
 76 | 
 77 | if __name__ == "__main__":
 78 |     newpath = "/home/code"
 79 |     os.chdir(newpath)
 80 |     
 81 |     # data_explore()
 82 |     
 83 |     # 真正开始干活
 84 |     p = 0.0001
 85 |     train = loadData(p = p)
 86 |     train = featureEngineer(train)
 87 |     print(train.info())
 88 |     # print(train.head())
 89 |     
 90 |     # 计算模型评分
 91 |     # score = Score(model, train)
 92 |     # print("模型评分:%.2f" % score)
 93 |     test = loadData(p = p)
 94 |     test = featureEngineer(test)
 95 |     
 96 |     #训练数据预处理
 97 |     x_train, y_train = preprocessing(train)
 98 |     x_test, y_test = preprocessing(test)
 99 |     
100 |     # 深度学习
101 |     device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
102 |     
103 |     x_tensor = torch.from_numpy(x_train.values).float().to(device)
104 |     y_tensor = torch.from_numpy(y_train.values).float().to(device)
105 |     
106 | 
107 |     Model = nn.Sequential(
108 |             nn.Linear(130, 118),
109 |             nn.ReLU(),
110 |             nn.Linear(118, 142),
111 |             nn.Sigmoid(),
112 |             nn.Linear(142, 1)
113 |     ).to(device)
114 |             
115 |     # model = Model(x_tensor).to(device)
116 |     # print(model.state_dict())
117 |     # 设置超参数
118 |     lr = 0.000678
119 |     n_epochs = 110
120 |      
121 |     # loss_fn = nn.BCELoss(reduction='sum')
122 |     loss_fn = nn.MSELoss(reduction = "mean")
123 |     optimizer = optim.Adam(Model.parameters(), lr = lr)
124 |     # 创建训练器
125 |     train_step = make_train_step(Model, loss_fn, optimizer)
126 |     losses = []
127 |     
128 |     print("开始训练")
129 |     # 进行训练
130 |     for epoch in range(n_epochs):
131 |         # y_tensor = y_tensor.detach()
132 |         loss = train_step(x_tensor, y_tensor)
133 |         losses.append(loss)
134 |         
135 |     # print(model.state_dict())
136 |     print(losses)
137 |     plt.figure()
138 |     plt.plot(losses)
139 |     plt.savefig("./output/loss.png")
140 |     # 验证模型
141 |     x_test_tensor = torch.from_numpy(x_test.values).float().to(device)
142 |     y_test_tensor = torch.from_numpy(y_test.values).float().to(device)
143 |     result = []
144 |     for x in Model(x_test_tensor):
145 |         if x >= 0.5:
146 |             result.append(1)
147 |         else:
148 |             result.append(0)
149 |     y_test = y_test_tensor.numpy()
150 |     # print(len(y_test))
151 |     # print(result)
152 |     count = 0
153 |     for i in range(len(result)):
154 |         if y_test[i] == result[i]:
155 |             count += 1
156 |     print(count)
157 |     print("预测正确率:%f" % (count/len(y_test)))
158 |     # 进行预测
159 |     # predict_clf(model)
160 |     


--------------------------------------------------------------------------------
/run.py:
--------------------------------------------------------------------------------
 1 | # coding:utf-8
 2 | # 将程序上传到服务器上执行
 3 | import os
 4 | import sys
 5 | from functools import wraps
 6 | import time
 7 | 
 8 | 
 9 | # 上传代码至服务器并运行
def run(gpus, server):
    """Upload code to ``server``, run it there and fetch the outputs.

    ``gpus`` selects the mode:
      * ``"all"``  - upload every ``*.py`` file, run ``sys.argv[2]``.
      * ``"copy"`` - only copy the remote output files back.
      * otherwise  - upload ``sys.argv[1]`` and run it.

    All work is done through ``ssh``/``scp`` commands passed to
    ``os.system``.
    """
    account = "ubuntu@" + server
    fetch_outputs = "scp -r " + account + ":~/code/output/* ./output/"

    def remote_python(script):
        # Command that runs a script under /home/code via root on port 2222.
        return "ssh root@" + server + " -p 2222 \"python /home/code/" + script + "\""

    if gpus == "copy":
        os.system(fetch_outputs)
        return

    # Both upload modes start by clearing remote sources and outputs.
    os.system("ssh " + account + " \"sudo rm -rf ~/code/*.py\"")
    os.system("ssh " + account + " \"sudo rm -rf ~/code/output/*\"")

    if gpus == "all":
        os.system("scp -r ./*.py " + account + ":~/code")
        command = remote_python(sys.argv[2])
        print("正在运行代码……\n")
        os.system(command)
    else:
        os.system("scp " + sys.argv[1] + " " + account + ":~/code")
        os.system(remote_python(sys.argv[1]))

    os.system(fetch_outputs)
48 | 
49 | 
if __name__ == "__main__":
    gpus = sys.argv[1]
    # Read the server address from serverIP.txt (edit that file yourself).
    # Strip surrounding whitespace: a trailing newline would otherwise end up
    # in the middle of every ssh/scp command string that run() builds.
    with open("serverIP.txt", "rt") as f:
        server = f.read().strip()
    run(gpus, server)
56 |         
57 |     
58 | # 工具函数，在上传到服务器上运行时改变当前目录
def change_dir(func):
    """Decorator: run ``func`` with ``/home/code/`` as the working directory.

    The previous working directory is restored afterwards, including when
    ``func`` raises.
    """
    @wraps(func)
    def change(*args, **kwargs):
        oldpath = os.getcwd()
        newpath = "/home/code/"
        os.chdir(newpath)
        try:
            return func(*args, **kwargs)
        finally:
            # Restore the caller's directory even on error.
            os.chdir(oldpath)
    return change
69 |     
70 |     
71 | # 工具函数，计算函数运行时间    
def timethis(func):
    """Decorator: print how long each call to ``func`` takes.

    Timing uses ``time.perf_counter``; the wrapped function's return value
    is passed through unchanged.
    """
    @wraps(func)
    def wrapper(*args, **kwargs):
        began = time.perf_counter()
        outcome = func(*args, **kwargs)
        finished = time.perf_counter()
        report = '{}.{}的运行时间为 : {}秒'.format(
            func.__module__, func.__name__, finished - began)
        print(report)
        return outcome
    return wrapper
81 |     


--------------------------------------------------------------------------------
/tc/FE.py:
--------------------------------------------------------------------------------
 1 | # coding:utf-8
 2 | # kaggle竞赛Jane Street Market Prediction
 3 | # 特征工程代码
 4 | 
 5 | 
 6 | from run import *
 7 | from tools import *
 8 | import pandas as pd
 9 | import matplotlib.pyplot as plt
10 | 
11 | 
12 | """
13 | # 特征工程
14 | @change_dir
15 | def featureEngineer(data):
16 |     tages = pd.DataFrame()
17 |     tagename = feature.columns
18 |     for i in range(29):
19 |         # tagename = "tag_" + str(i)
20 |         # tages[tagename[i+1]] = feature[(feature[tagename[i+1]] == True)].iloc[:, i+1]
21 |         #print(tages[i])
22 |         temp = feature["feature"][feature[tagename[i+1]] == True]
23 |         temp.name = tagename[i+1]
24 |         print(temp)
25 |     #print(tages)
26 |     # 填充空值
27 |     print(data.isnull().sum())
28 |     for col in data.columns:
29 |         mean_val = data[col].mean()
30 |         data[col].fillna(mean_val, inplace=True)
31 |     print(data.isnull().sum())
32 |     # 处理feature_0
33 |     feature_0 = data["feature_0"].cumsum()
34 |     plt.plot(feature_0)
35 |     plt.savefig("./output/cumf_0.png")
36 |     plt.close()
37 |     data["feature_0"] = feature_0
38 |     # print(feature_0)
39 |     return data
40 | """
41 | # 特征工程
def featureEngineer(data):
    """Fill missing values with 0.0 and add the binary ``action`` target.

    ``action`` is 1 where ``weight * resp`` is positive and 0 otherwise.
    Returns the filled frame; the frame passed in is not modified.
    """
    filled = data.fillna(0.0)
    profitable = (filled['weight'].values * filled['resp'].values) > 0
    filled['action'] = profitable.astype('int')
    return filled
49 |     
50 | 
51 |     
52 |     
if __name__ == "__main__":
    # loadData() in tc/tools.py returns only the sampled training frame (the
    # features.csv read is commented out there), so there is no second value
    # to unpack and no feature table to print.
    # NOTE(review): assumes `from tools import *` resolves to tc/tools.py.
    train = loadData()
    train = featureEngineer(train)
58 |     


--------------------------------------------------------------------------------
/tc/optuna_DP.py:
--------------------------------------------------------------------------------
  1 | # coding:utf-8
  2 | # kaggle Jane Street Market Prediction代码
  3 | # 实际自己工作的代码
  4 | # 用optuna对深度学习模型调参
  5 | 
  6 | 
  7 | import numpy as np
  8 | import pandas as pd
  9 | pd.set_option('display.max_columns', None)
 10 | import janestreet
 11 | 
 12 | import matplotlib.pyplot as plt
 13 | from sklearn.model_selection import train_test_split
 14 | from sklearn import metrics
 15 | from sklearn.metrics import accuracy_score
 16 | import torch
 17 | import torch.nn as nn
 18 | import torch.optim as optim
 19 | import optuna
 20 | 
 21 | import os
 22 | 
 23 | from FE import featureEngineer
 24 | from tools import *
 25 | 
 26 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
 27 |     
 28 |     
 29 | # 建模前处理数据
 30 | def preprocessing(train):
 31 |     X = train.loc[:, train.columns.str.contains('feature')]
 32 |     # y_train = train.loc[:, 'resp']
 33 |     Y = train.loc[:, 'action']
 34 |     
 35 |     x_train, x_test, y_train, y_test = train_test_split(X, Y, random_state=666, test_size=0.2)
 36 |     
 37 |     return x_train, x_test, y_train, y_test 
 38 | 
 39 |     
 40 | # 评分函数
 41 | def Score(model, data):
 42 |     # test_df = pd.read_csv("/kaggle/input/jane-street-market-prediction/train.csv")
 43 |     data = data.fillna(-999)
 44 |     X_test = data.loc[:, data.columns.str.contains('feature')]
 45 |     resp = model.predict(X_test)
 46 |     date = data["date"].values
 47 |     weight = data["weight"].values
 48 |     action = (resp > 0).astype("int")
 49 |     
 50 |     count_i = len(np.unique(date))
 51 |     Pi = np.zeros(count_i)
 52 |     # 用循环太慢
 53 |     #for i, day in enumerate(np.unique(date)):
 54 | #        Pi[i] = np.sum(weight[date == day] * resp[date == day] * action[date == day])
 55 |     # 用下面这行代替
 56 |     Pi = np.bincount(date, weight * resp * action)
 57 |     t = np.sum(Pi) / np.sqrt(np.sum(Pi ** 2)) * np.sqrt(250 / count_i)
 58 |     u = np.clip(t, 0, 6) * np.sum(Pi)
 59 |     return u
 60 |     
 61 | 
 62 | # 进行预测，生成提交文件，分类版
 63 | def predict_clf(model):
 64 |     env = janestreet.make_env()
 65 |     iter_test = env.iter_test()
 66 |     for (test_df, sample_prediction_df) in iter_test:
 67 |         if test_df['weight'].item() > 0:
 68 |             test_df = featureEngineer(test_df)
 69 |             X_test = test_df.loc[:, test_df.columns.str.contains('feature')]
 70 |             X_test = X_test.fillna(0.0)
 71 |             y_preds = model.predict(X_test)[0]
 72 |         else:
 73 |             y_preds = 0
 74 |         # print(y_preds)
 75 |         sample_prediction_df.action = y_preds
 76 |         env.predict(sample_prediction_df)
 77 |         
 78 |         
 79 | # 获取数据
 80 | def getData():
 81 |     p = 0.1
 82 |     data = loadData(p = p)
 83 |     data = featureEngineer(data)
 84 |     # print(data.info())
 85 |     
 86 |     #训练数据预处理
 87 |     x_train, x_test, y_train, y_test  = preprocessing(data)
 88 |     
 89 |     return x_train, y_train, x_test, y_test
 90 |     
 91 |     
 92 | # 获取模型准确率
 93 | def getAccuracyRate(Model):
 94 |     result = []
 95 |     for x in Model(x_test_tensor):
 96 |         if x >= 0.5:
 97 |             result.append(1)
 98 |         else:
 99 |             result.append(0)
100 |     y_test = y_test_tensor.numpy()
101 |     # print(y_test[:10])
102 |     # print(result[:10])
103 |     count = 0
104 |     for i in range(len(result)):
105 |         if y_test[i] == result[i]:
106 |             count += 1
107 |     
108 |     return count/len(y_test)
109 |     
110 |     
111 | # 定义模型
def define_model(trial):
    """Build a three-layer network whose hidden sizes are chosen by ``trial``.

    Layout: Linear(130, hide1) -> ReLU -> Linear(hide1, hide2) -> Sigmoid
    -> Linear(hide2, 1), with ``hide1_dim`` in [100, 200] and ``hide2_dim``
    in [10, 200].
    """
    widths = [
        130,
        trial.suggest_int("hide1_dim", 100, 200),
        trial.suggest_int("hide2_dim", 10, 200),
        1,
    ]
    # Consecutive width pairs give the (in, out) sizes of each linear layer.
    first, second, head = (
        nn.Linear(n_in, n_out) for n_in, n_out in zip(widths, widths[1:])
    )
    return nn.Sequential(first, nn.ReLU(), second, nn.Sigmoid(), head)
125 |     
126 |     
# Load the data once at import time and keep it in module-level globals, so
# that every trial reuses the same split instead of re-reading the file.
# objective() and getAccuracyRate() read these tensors directly.
x_train, y_train, x_test, y_test = getData()
x_tensor = torch.from_numpy(x_train.values).float().to(device)
y_tensor = torch.from_numpy(y_train.values).float().to(device)
x_test_tensor = torch.from_numpy(x_test.values).float().to(device)
y_test_tensor = torch.from_numpy(y_test.values).float().to(device)
133 |     
134 |     
135 | # 优化目标函数
@timethis
def objective(trial):
    """Optuna objective: train one candidate network, return its accuracy.

    The trial picks the hidden sizes (via ``define_model``), the optimizer
    class, the learning rate and the number of epochs. Training uses MSE
    loss on the module-level training tensors.
    """
    net = define_model(trial).to(device)
    chosen = trial.suggest_categorical("optimizer", ["Adam", "RMSprop", "SGD"])
    step_size = trial.suggest_loguniform("lr", 1e-5, 1e-1)
    optimizer_cls = getattr(optim, chosen)
    optimizer = optimizer_cls(net.parameters(), lr=step_size)
    n_epochs = trial.suggest_int("epochs", 50, 200)
    criterion = nn.MSELoss(reduction = "mean")

    # Full-batch training: one optimizer step per epoch.
    step = make_train_step(net, criterion, optimizer)
    for _ in range(n_epochs):
        step(x_tensor, y_tensor)

    return getAccuracyRate(net)
157 |     
158 | 
if __name__ == "__main__":
    # Switch to the code directory used on the server.
    newpath = "/home/code"
    os.chdir(newpath)

    # Tune hyper-parameters with Optuna, maximising the returned accuracy.
    study = optuna.create_study(direction="maximize")
    study.optimize(objective, n_trials=10)

    print("结果:", study.best_params)
    print(study.best_value)
    print(study.best_trial)

    # Submission step, currently disabled:
    # predict_clf(model)
173 |     


--------------------------------------------------------------------------------
/tc/run.py:
--------------------------------------------------------------------------------
 1 | # coding:utf-8
 2 | # 将程序上传到服务器上执行
 3 | import os
 4 | import sys
 5 | from functools import wraps
 6 | import time
 7 | 
 8 | 
 9 | # 上传代码至服务器并运行
def run(gpus, server):
    """Upload code to ``server``, run it there and fetch the outputs.

    ``gpus`` selects the mode:
      * ``"all"``  - upload every ``*.py`` file, run ``sys.argv[2]``.
      * ``"copy"`` - only copy the remote output files back.
      * otherwise  - upload ``sys.argv[1]`` and run it.

    All work is done through ``ssh``/``scp`` commands passed to
    ``os.system``.
    """
    account = "ubuntu@" + server
    fetch_outputs = "scp -r " + account + ":~/code/output/* ./output/"

    def remote_python(script):
        # Command that runs a script under /home/code via root on port 2222.
        return "ssh root@" + server + " -p 2222 \"python /home/code/" + script + "\""

    if gpus == "copy":
        os.system(fetch_outputs)
        return

    # Both upload modes start by clearing remote sources and outputs.
    os.system("ssh " + account + " \"sudo rm -rf ~/code/*.py\"")
    os.system("ssh " + account + " \"sudo rm -rf ~/code/output/*\"")

    if gpus == "all":
        os.system("scp -r ./*.py " + account + ":~/code")
        command = remote_python(sys.argv[2])
        print("正在运行代码……\n")
        os.system(command)
    else:
        os.system("scp " + sys.argv[1] + " " + account + ":~/code")
        os.system(remote_python(sys.argv[1]))

    os.system(fetch_outputs)
48 | 
49 | 
if __name__ == "__main__":
    gpus = sys.argv[1]
    # Read the server address from serverIP.txt (edit that file yourself).
    # Strip surrounding whitespace: a trailing newline would otherwise end up
    # in the middle of every ssh/scp command string that run() builds.
    with open("serverIP.txt", "rt") as f:
        server = f.read().strip()
    run(gpus, server)
56 |         
57 |     
58 | # 工具函数，在上传到服务器上运行时改变当前目录
def change_dir(func):
    """Decorator: run ``func`` with ``/home/code/`` as the working directory.

    The previous working directory is restored afterwards, including when
    ``func`` raises.
    """
    @wraps(func)
    def change(*args, **kwargs):
        oldpath = os.getcwd()
        newpath = "/home/code/"
        os.chdir(newpath)
        try:
            return func(*args, **kwargs)
        finally:
            # Restore the caller's directory even on error.
            os.chdir(oldpath)
    return change
69 |     
70 |     
71 | # 工具函数，计算函数运行时间    
def timethis(func):
    """Decorator: print how long each call to ``func`` takes.

    Timing uses ``time.perf_counter``; the wrapped function's return value
    is passed through unchanged.
    """
    @wraps(func)
    def wrapper(*args, **kwargs):
        began = time.perf_counter()
        outcome = func(*args, **kwargs)
        finished = time.perf_counter()
        report = '{}.{}的运行时间为 : {}秒'.format(
            func.__module__, func.__name__, finished - began)
        print(report)
        return outcome
    return wrapper
81 |     


--------------------------------------------------------------------------------
/tc/tools.py:
--------------------------------------------------------------------------------
  1 | # coding:utf-8
  2 | # kaggle竞赛Jane Street Market Prediction
  3 | # 工具函数
  4 | 
  5 | from run import *
  6 | import pandas as pd
  7 | import matplotlib.pyplot as plt
  8 | import numpy as np
  9 | from sklearn.model_selection import cross_val_score, learning_curve
 10 | from sklearn.metrics import classification_report, roc_curve, auc
 11 | 
 12 | 
 13 | # 载入数据
 14 | @change_dir
 15 | def loadData(p = 0.01):
 16 |     # 抽样，读取1%数据
 17 |     # 参考https://mp.weixin.qq.com/s/2LSKnN9R-N-I2HcHePT9zA
 18 |     train = pd.read_csv("./train.csv", skiprows = lambda x: x>0 and np.random.rand() > p)
 19 |     # feature = pd.read_csv("./features.csv")
 20 |     return train
 21 |     
 22 |     
 23 | # 对模型进行交叉验证
 24 | def cross_val(model, X, Y, cv = 10):
 25 |     scores = cross_val_score(model, X, Y, cv=cv)
 26 |     score = scores.mean()
 27 |     return score
 28 |     
 29 |     
 30 | # 模型评估
 31 | def evalution(model, X, y_true):
 32 |     # X = test.loc[:, test.columns.str.contains("feature")].values
 33 |     # y_true = test.action.values
 34 |     y_pred = model.predict(X)
 35 |     target_names = ["1", "0"]
 36 |     result = classification_report(y_true, y_pred, target_names = target_names, output_dict = False )
 37 |     return result
 38 | 
 39 | 
 40 | # 对模型评分
 41 | @timethis
 42 | def score(model, test, modelName):
 43 |     if modelName == "XGBoost":
 44 |         X = test.loc[:, test.columns.str.contains("feature")]
 45 |         Y = test.action
 46 |     else:
 47 |         X = test.loc[:, test.columns.str.contains("feature")].values
 48 |         Y = test.action.values
 49 |     model_score = model.score(X, Y)
 50 |     cross_score = cross_val(model, X, Y)
 51 |     report = evalution(model, X, Y)
 52 |     print("模型评分:", model_score)
 53 |     print("交叉验证:", cross_score)
 54 |     print("模型评估:\n", report)
 55 |     Roc(model, X, Y, modelName)
 56 |     Lc(model, modelName, X, Y)
 57 |     
 58 |     
 59 | # 画roc曲线
 60 | @change_dir
 61 | def Roc(model, X, Y, modelName):
 62 |     y_label = Y
 63 |     y_pred = model.predict(X)
 64 |     fpr, tpr, thersholds = roc_curve(y_label, y_pred)
 65 |         
 66 |     roc_auc = auc(fpr, tpr)
 67 |     
 68 |     plt.plot(fpr, tpr, 'k--', label = "ROC (area = {0:.2f})".format(roc_auc), lw = 2)
 69 |     plt.tick_params(axis='x', labelsize=15)
 70 |     plt.tick_params(axis='y', labelsize=15)
 71 |     plt.xlim([-0.05, 1.05])
 72 |     plt.ylim([-0.05, 1.05])
 73 |     plt.xlabel("False Positive Rate")
 74 |     plt.ylabel("True Positive Rate")
 75 |     plt.title(modelName + " ROC Curve")
 76 |     plt.legend(loc = "best")
 77 |     plt.savefig("./output/" + modelName + "_ROC.png")
 78 |     
 79 |     
 80 | # 画学习曲线
 81 | @change_dir
 82 | def Lc(model, modelName, X, y, ylim = None, cv = None, n_jobs = 1, train_sizes = np.linspace(0.1, 1.0, 5), verbose = 0):
 83 |     plt.figure()
 84 |     plt.title(modelName+" Learning Curve")
 85 |     if ylim is not None:
 86 |         plt.ylim(*ylim)
 87 |     plt.xlabel("Training Samples")
 88 |     plt.ylabel("Score")
 89 |     train_sizes, train_scores, test_scores = learning_curve(model, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)
 90 |     train_scores_mean = np.mean(train_scores, axis = 1)
 91 |     train_scores_std = np.std(train_scores, axis = 1)
 92 |     test_scores_mean = np.mean(test_scores, axis = 1)
 93 |     test_scores_std = np.std(test_scores, axis = 1)
 94 |     plt.grid()
 95 |     
 96 |     plt.fill_between(train_sizes, train_scores_mean - train_scores_std,train_scores_mean + train_scores_std, alpha=0.1, color="r")
 97 |     plt.fill_between(train_sizes,test_scores_mean - test_scores_std, test_scores_mean + test_scores_std, alpha=0.1, color="g")
 98 |     plt.plot(train_sizes, train_scores_mean, 'o-', color="r", label="Training score")
 99 |     plt.plot(train_sizes, test_scores_mean, 'o-', color="g", label="Cross-validation score")
100 |     
101 |     plt.legend(loc="best")
102 |     plt.savefig("./output/" + modelName + "_Learning Curve.png")
103 |     
104 |     
105 | # 工具函数，返回神经网络训练的每一步
def make_train_step(model, loss_fn, optimizer):
    """Return a closure that performs one optimisation step.

    The returned ``train_step(x, y)`` puts the model in training mode,
    clears gradients, computes the loss between ``model(x)`` (with its last
    dimension squeezed) and ``y``, back-propagates, steps the optimizer and
    returns the loss as a Python number.
    """
    def train_step(x, y):
        model.train()
        optimizer.zero_grad()
        # Drop the trailing dimension so predictions line up with y.
        prediction = model(x).squeeze(-1)
        batch_loss = loss_fn(prediction, y)
        batch_loss.backward()
        optimizer.step()
        return batch_loss.item()

    return train_step
130 |         
131 | 
132 |     


--------------------------------------------------------------------------------
/test_dt.py:
--------------------------------------------------------------------------------
 1 | # coding:utf-8
 2 | # kaggle Jane Street Market Prediction代码
 3 | # 测试datatable的代码
 4 | 
 5 | 
 6 | import datatable as dt
 7 | import pandas as pd
 8 | from run import *
 9 | 
10 | 
11 | # 测试计时函数
@change_dir
@timethis
def testtime():
    """Exercise the timing decorator with a deliberately slow double loop.

    Prints 3, then the sum of ``i*j`` over all pairs with i, j in
    range(1000).
    """
    print(3)
    N = 1000
    total = sum(i*j for i in range(N) for j in range(N))
    print("sum = {}".format(total))
22 |     
23 |     
24 | # 读取数据
@change_dir
@timethis
def testread():
    """Read ``./train.csv`` with datatable and print a few summaries.

    Prints the frame's shape, then the results of its ``info()``,
    ``describe()`` and ``sum()`` methods, in that order.
    """
    train_df = dt.fread("./train.csv")
    print(train_df.shape)
    for summary in ("info", "describe", "sum"):
        print(getattr(train_df, summary)())
33 | 
34 | 
if __name__ == "__main__":
    # When run as a script, only the datatable read check is executed.
    testread()
37 | 


--------------------------------------------------------------------------------
/test_pytorch.py:
--------------------------------------------------------------------------------
  1 | # coding:utf-8
  2 | # kaggle Jane Street Market Prediction代码
  3 | # 学习pytorch
  4 | # 参考https://pytorch.apachecn.org/docs/1.0/pytorch_with_examples.html
  5 | 
  6 | 
  7 | from run import *
  8 | import matplotlib.pyplot as plt
  9 | 
 10 | # 用numpy实现
 11 | import numpy as np
 12 | 
 13 | 
 14 | # 前向传播
 15 | def fp_np(x, w1, w2):
 16 |     # 向前传播，计算预测值
 17 |      h = x.dot(w1)
 18 |      h_relu = np.maximum(h, 0)
 19 |      y_pred = h_relu.dot(w2)
 20 |      return y_pred, h_relu, h
 21 |      
 22 |      
 23 | # 反向传播
 24 | def bp_np(x, y, y_pred, h_relu, h, w1, w2):
 25 |     grad_y_pred = 2.0*(y_pred - y)
 26 |     grad_w2 = h_relu.T.dot(grad_y_pred)
 27 |     grad_h_relu = grad_y_pred.dot(w2.T)
 28 |     grad_h = grad_h_relu.copy()
 29 |     grad_h[h < 0] = 0
 30 |     grad_w1 = x.T.dot(grad_h)
 31 |     return w1, w2
 32 |     
 33 |     
 34 | def nn_numpy():
 35 |     print("numy版神经网络")
 36 |     # N是批大小；D_in是输入维度
 37 |     # H是隐藏层维度；D_out是输出维度
 38 |     N, D_in, H, D_out = 64, 1000, 100, 10
 39 |     
 40 |     # 产生随机输入和输出数据
 41 |     x = np.random.randn(N, D_in)
 42 |     y = np.random.randn(N, D_out)
 43 |     print(len(x))
 44 |     print(len(y))
 45 |     
 46 |     # 随机初始化权重
 47 |     w1 = np.random.randn(D_in, H)
 48 |     w2 = np.random.randn(H, D_out)
 49 |     learning_rate = 1e-6
 50 |     
 51 |     for t in range(500):
 52 |         # 向前传播，计算预测值
 53 |         y_pred, h_relu, h = fp_np(x, w1, w2)
 54 |         
 55 |         # 计算并显示loss(损失)
 56 |         loss = np.square(y_pred - y).sum()
 57 |         # print(t, loss)
 58 |         
 59 |         # 反向传播，计算w1,w2对loss的梯度
 60 |         grad_w1, grad_w2 = bp_np(x, y, y_pred, h_relu, h, w1, w2)
 61 |         
 62 |         # 更新权重
 63 |         w1 -= learning_rate * grad_w1
 64 |         w2 -= learning_rate * grad_w2
 65 |         
 66 |     x_test = np.random.randn(N, D_in)
 67 |     print(fp_np(x_test, w1, w2))
 68 |     
 69 |     
 70 | # 用pytorch实现
 71 | import torch
 72 | import torch.nn as nn
 73 | import torch.utils.data as Data
 74 | from torch.utils.data import Dataset, TensorDataset, DataLoader
 75 | import torch.optim as optim
 76 | from torchviz import make_dot
 77 | 
 78 | 
 79 | def nn_pytorch():
 80 |     print("pytorch版神经网络")
 81 |     N, D_in, H, D_out = 64, 1000, 100, 10
 82 |     x = torch.randn(N, D_in, device=device)
 83 |     y = torch.randn(N, D_out, device=device)
 84 |     
 85 |     # 产生随机权重tensor
 86 |     w1 = torch.randn(D_in, H, device=device, requires_grad=True)
 87 |     w2 = torch.randn(H, D_out, device=device, requires_grad=True)
 88 |     
 89 |     learning_rate = 1e-6
 90 |     for t in range(500):
 91 |         # 前向传播，自动计算梯度
 92 |         y_pred = x.mm(w1).clamp(min = 0).mm(w2)
 93 |         # 计算并输出loss
 94 |         loss = (y_pred - y).pow(2).sum()
 95 |         print(t, loss.item())
 96 |         # 反向传播
 97 |         loss.backward()
 98 |         
 99 |         # 更新权重，不自动计算梯度
100 |         with torch.no_grad():
101 |             w1 -= learning_rate * w1.grad
102 |             w2 -= learning_rate * w2.grad
103 |             
104 |             # 梯度置零
105 |             w1.grad.zero_()
106 |             w2.grad.zero_()
107 |     x_test = torch.randn(N, D_in)
108 |     print(x_test.mm(w1).clamp(min = 0).mm(w2))
109 |     
110 |     
111 | # 用pytorch.nn实现
def nn_torch_nn():
    """Train the two-layer network built from ``torch.nn`` modules.

    Prints the loss at each of the 500 steps and finally the model output on
    a fresh random batch.
    """
    print("pytorch_nn版神经网络")
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    N, D_in, H, D_out = 64, 1000, 100, 10
    x = torch.randn(N, D_in, device=device)
    y = torch.randn(N, D_out, device=device)

    model = torch.nn.Sequential(
            torch.nn.Linear(D_in, H),
            torch.nn.ReLU(),
            torch.nn.Linear(H, D_out),
    ).to(device)

    loss_fn = torch.nn.MSELoss(reduction="sum")

    learning_rate = 1e-4
    for t in range(500):
        y_pred = model(x)
        loss = loss_fn(y_pred, y)
        print(t, loss.item())
        # Clear old gradients before computing the new ones.
        model.zero_grad()
        loss.backward()
        # Manual SGD step, excluded from the autograd graph.
        with torch.no_grad():
            for param in model.parameters():
                param -= learning_rate * param.grad

    # Fix: create the test batch on the model's device; a CPU tensor fed to
    # a CUDA model raises a device-mismatch error.
    x_test = torch.randn(N, D_in, device=device)
    print(model(x_test))
140 |     
141 |     
142 | # Pytorch实现二分类器
def pytorch_class():
    """Train a small binary classifier on a 1-D toy dataset.

    The first 25 points of ``linspace(-10, 10, 50)`` are labelled 1 and the
    remaining 25 are labelled 0. Every 200 epochs the mean batch loss and
    mean batch accuracy of that epoch are printed; finally the raw outputs
    for ten test points in [-2, 2] are printed.
    """
    class ClassifyModel(nn.Module):
        """Two-layer perceptron written as an explicit ``nn.Module``."""

        def __init__(self, input_dim, hide_dim, output_dim):
            super(ClassifyModel, self).__init__()
            self.linear1 = nn.Linear(input_dim, hide_dim)
            self.linear2 = nn.Linear(hide_dim, output_dim)

        def forward(self, x):
            hidden = self.linear1(x)
            activate = torch.relu(hidden)
            output = self.linear2(activate)
            return output

    # Prepare the data.
    x = torch.unsqueeze(torch.linspace(-10, 10, 50), 1)
    y = torch.cat((torch.ones(25), torch.zeros(25))).type(torch.LongTensor)
    print(x)
    print(y)
    dataset = Data.TensorDataset(x, y)
    dataloader = Data.DataLoader(dataset=dataset, batch_size=5, shuffle=True)
    # `model` is the class-based variant, kept for comparison only; the
    # equivalent Sequential `model2` is the one actually trained below.
    model = ClassifyModel(1, 10, 2)
    model2 = torch.nn.Sequential(
             nn.Linear(1, 10),
             nn.ReLU(),
             nn.Linear(10, 2),
     )

    # Named `optimizer` so the imported `torch.optim` module alias is not shadowed.
    optimizer = torch.optim.Adam(model2.parameters(), lr=0.0001)
    loss_fn = nn.CrossEntropyLoss()

    for e in range(1000):
        epoch_loss = 0.0
        epoch_acc = 0.0
        n_batches = 0
        # Distinct batch names so the full dataset tensors x / y are not clobbered.
        for x_batch, y_batch in dataloader:
            optimizer.zero_grad()
            out = model2(x_batch)
            loss = loss_fn(out, y_batch)

            loss.backward()
            optimizer.step()

            epoch_loss += loss.item()
            epoch_acc += get_acc(out, y_batch)
            n_batches += 1

        # Fix: report once per epoch, after all batches, and average over the
        # number of batches. The old code printed partial sums inside the
        # batch loop and divided per-batch means by the sample count (50).
        if e % 200 == 0 and n_batches:
            print('epoch: %d, loss: %f, acc: %f' % (e, epoch_loss / n_batches, epoch_acc / n_batches))

    x_test = torch.unsqueeze(torch.linspace(-2, 2, 10), 1)
    print(x_test)
    y_pred = (model2(x_test))
    print(y_pred)
194 |             
195 |             
def get_acc(outputs, labels):
    """Return the fraction of rows whose highest-scoring class equals the label."""
    # Index of the maximum along dim 1 is the predicted class.
    predicted = torch.max(outputs.data, 1)[1]
    hits = (labels == predicted).sum().item()
    return hits / (labels.shape[0] * 1.0)
202 |     
203 |     
204 | # 新的尝试
205 | # https://towardsdatascience.com/understanding-pytorch-with-an-example-a-step-by-step-tutorial-81fc5f8c4e8e
@change_dir
def new_try():
    """Step-by-step PyTorch walkthrough on a toy linear-regression problem.

    Fits ``y = a + b*x`` repeatedly, each time replacing one hand-written
    piece with its PyTorch counterpart: numpy gradients, autograd, an
    optimizer, a loss module, ``nn.Module`` models, a train-step closure,
    ``Dataset``/``DataLoader`` mini-batches and finally a validation loop.
    Results are printed; two scatter plots are written under ``./output/``.
    The ``change_dir`` decorator comes from ``run`` — presumably it sets the
    working directory so the relative paths resolve (TODO confirm).
    """
    # 1. A simple regression problem
    # Generate the data
    np.random.seed(42)
    x = np.random.rand(100, 1)
    y = 1 + 2*x + 0.1*np.random.randn(100, 1)
    # Shuffle the sample order
    idx = np.arange(100)
    np.random.shuffle(idx)
    # Use the first 80 samples as the training set
    train_idx = idx[:80]
    # The remaining ones form the validation set
    val_idx = idx[80:]
    x_train, y_train = x[train_idx], y[train_idx]
    x_test, y_test = x[val_idx], y[val_idx]
    plt.figure()
    plt.scatter(x_train, y_train)
    plt.savefig("./output/train.png")
    plt.close()
    plt.figure()
    plt.scatter(x_test, y_test)
    plt.savefig("./output/test.png")
    plt.close()

    # 2. Gradient descent
    # Step one: compute the loss
    # For a regression problem use the
    # Mean Square Error (MSE)
    # Step two: compute the gradients,
    # i.e. how the MSE changes when the two parameters a, b are nudged
    # Step three: update the parameters
    # Step four: repeat the steps above with the new parameters
    # This procedure is what "training the model" means

    # 3. Linear regression with numpy
    # Initialisation has two parts
    # (1) randomly initialise the parameters / weights
    np.random.seed(42)
    a = np.random.randn(1)
    b = np.random.randn(1)
    print(a, b)
    # (2) initialise the hyper-parameters
    lr = 1e-1
    n_epochs = 1000

    # Training loop
    for epoch in range(n_epochs):
        # Model prediction: forward pass
        yhat = a + b*x_train
        # Compute the loss
        error = (y_train - yhat)
        loss = (error**2).mean()
        # Gradient of the MSE with respect to each parameter
        a_grad = -2*error.mean()
        b_grad = -2*(x_train*error).mean()
        # Update the parameters using the gradients and the learning rate
        a -= lr*a_grad
        b -= lr*b_grad

    print(a, b)

    # Sanity check against scikit-learn's closed-form fit
    from sklearn.linear_model import LinearRegression
    linr = LinearRegression()
    linr.fit(x_train, y_train)
    print(linr.intercept_, linr.coef_[0])

    # 4. Using pytorch
    # Tensors: arrays with three or more dimensions
    # Load the data
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    x_train_tensor = torch.from_numpy(x_train).float().to(device)
    y_train_tensor = torch.from_numpy(y_train).float().to(device)
    print(type(x_train), type(x_train_tensor), x_train_tensor.type())
    # Create the parameters
    # First approach
    a = torch.randn(1, requires_grad = True, dtype = torch.float)
    b = torch.randn(1, requires_grad = True, dtype = torch.float)
    print(a, b)
    # Second approach
    a = torch.randn(1, requires_grad = True, dtype = torch.float).to(device)
    b = torch.randn(1, requires_grad = True, dtype = torch.float).to(device)
    print(a, b)
    # Third approach
    a = torch.randn(1, dtype = torch.float).to(device)
    b = torch.randn(1, dtype = torch.float).to(device)
    a.requires_grad_()
    b.requires_grad_()
    print(a, b)
    # Specify device and requires_grad at creation time
    torch.manual_seed(42)
    a = torch.randn(1, requires_grad=True, dtype=torch.float, device=device)
    b = torch.randn(1, requires_grad=True, dtype=torch.float, device=device)
    print(a, b)

    # 5. Autograd
    lr = 1e-1
    n_epochs = 1000

    for epoch in range(n_epochs):
        yhat = a + b*x_train_tensor
        error = y_train_tensor - yhat
        loss = (error**2).mean()

        # No need to compute the gradients by hand any more
        loss.backward()
        # print(a.grad)
        # print(b.grad)

        # Update the parameters; autograd must not track this step
        with torch.no_grad():
            a -= lr*a.grad
            b -= lr*b.grad

        # Zero the gradients so the next iteration starts clean
        a.grad.zero_()
        b.grad.zero_()

    print(a, b)

    # 6. Dynamic computation graph
    torch.manual_seed(42)
    a = torch.randn(1, requires_grad=True, dtype=torch.float, device=device)
    b = torch.randn(1, requires_grad=True, dtype=torch.float, device=device)
    yhat = a + b*x_train_tensor
    error = y_train_tensor - yhat
    loss = (error**2).mean()
    # The graph object is built but only rendered if the line below is enabled
    graph = make_dot(yhat)
    # graph.view("./output/yhat")

    # 7. Optimizer
    torch.manual_seed(42)
    a = torch.randn(1, requires_grad=True, dtype=torch.float, device=device)
    b = torch.randn(1, requires_grad=True, dtype=torch.float, device=device)
    print(a, b)

    lr = 1e-1
    n_epochs = 1000

    optimizer = optim.SGD([a, b], lr = lr)
    for epoch in range(n_epochs):
        yhat = a + b*x_train_tensor
        error = y_train_tensor - yhat
        loss = (error**2).mean()

        # No need to compute the gradients by hand
        loss.backward()

        # No need to update the parameters by hand either
        optimizer.step()
        # Nor to zero the gradients by hand
        optimizer.zero_grad()

    print(a, b)

    # 8. Loss function
    # pytorch ships many loss functions
    # The reduction argument controls how the per-element losses are aggregated.
    torch.manual_seed(42)
    a = torch.randn(1, requires_grad=True, dtype=torch.float, device=device)
    b = torch.randn(1, requires_grad=True, dtype=torch.float, device=device)
    print(a, b)

    lr = 1e-1
    n_epochs = 1000

    # Use pytorch's loss function
    loss_fn = nn.MSELoss(reduction='mean')

    optimizer = optim.SGD([a, b], lr = lr)
    for epoch in range(n_epochs):
        yhat = a + b*x_train_tensor
        # error = y_train_tensor - yhat
        # loss = (error**2).mean()
        # No need to compute the intermediate values ourselves
        # NOTE(review): arguments are passed as (target, prediction); MSE is
        # symmetric so the value is unaffected.
        loss = loss_fn(y_train_tensor, yhat)

        # No need to compute the gradients by hand
        loss.backward()

        # No need to update the parameters by hand either
        optimizer.step()
        # Nor to zero the gradients by hand
        optimizer.zero_grad()

    print(a, b)

    # 9. Model
    # In pytorch a model is a class that inherits from Module
    # It must implement at least __init__, which sets up the parameters,
    # and forward, the actual computation: given the input x,
    # return the prediction.
    # Call model(x) to make a prediction
    # The model and the data should live on the same device
    class ManualLinearRegression(nn.Module):
        def __init__(self):
            super().__init__()
            # The bare string below is the original note (not a docstring):
            # wrapping a and b in nn.Parameter makes them real model
            # parameters, listed by parameters() and reported by state_dict().
            """使用nn.Parameter使a,b成为模型真正的参数,可以通过parameters()获得参数列表，还可以通过state_dict()获得所有参数的当前值"""
            self.a = nn.Parameter(torch.randn(1, requires_grad = True, dtype = torch.float))
            self.b = nn.Parameter(torch.randn(1, requires_grad = True, dtype = torch.float))

        def forward(self, x):
            # The actual computation
            return self.a + self.b*x

    # Use the model

    torch.manual_seed(42)

    # Create the model and move it to the chosen device
    model = ManualLinearRegression().to(device)
    # Print the model's parameter state
    print(model.state_dict())

    lr = 1e-1
    n_epochs = 1000

    loss_fn = nn.MSELoss(reduction = "mean")
    optimizer = optim.SGD(model.parameters(), lr = lr)

    for epoch in range(n_epochs):
        # This does not train anything; it only switches to training mode,
        # because some models use layers such as Dropout
        # that behave differently during training and evaluation
        model.train()

        # No manual forward computation needed
        yhat = model(x_train_tensor)

        loss = loss_fn(y_train_tensor, yhat)
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

    print(model.state_dict())

    # Nested models
    class LayerLinearRegression(nn.Module):
        def __init__(self):
            super().__init__()
            self.linear = nn.Linear(1, 1)

        def forward(self, x):
            return self.linear(x)

    # Use the model

    torch.manual_seed(42)

    # Create the model and move it to the chosen device
    model = LayerLinearRegression().to(device)
    # Print the model's parameter state
    print(model.state_dict())

    lr = 1e-1
    n_epochs = 1000

    loss_fn = nn.MSELoss(reduction = "mean")
    optimizer = optim.SGD(model.parameters(), lr = lr)

    for epoch in range(n_epochs):
        # This does not train anything; it only switches to training mode,
        # because some models use layers such as Dropout
        # that behave differently during training and evaluation
        model.train()

        # No manual forward computation needed
        yhat = model(x_train_tensor)

        loss = loss_fn(y_train_tensor, yhat)
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

    print(model.state_dict())

    # Sequential models
    # Avoids having to define a new class:
    # for feed-forward models each layer's output feeds the next layer
    # (this Sequential instance is replaced by LayerLinearRegression below
    # before any training happens)
    model = nn.Sequential(nn.Linear(1, 1)).to(device)

    # The fixed training procedure can be wrapped in a function
    def make_train_step(model, loss_fn, optimizer):
        # Performs the per-iteration training procedure
        def train_step(x, y):
            # Switch to training mode
            model.train()
            # Predict
            yhat = model(x)
            # Compute the loss
            loss = loss_fn(y, yhat)
            # Compute the gradients
            loss.backward()
            # Update the parameters, then zero the gradients
            optimizer.step()
            optimizer.zero_grad()
            # Return the loss value
            return loss.item()

        # Return the function to be called inside the training loop
        return train_step

    torch.manual_seed(42)

    # Create the model and move it to the chosen device
    model = LayerLinearRegression().to(device)
    # Print the model's parameter state
    print(model.state_dict())

    lr = 1e-1
    n_epochs = 1000

    loss_fn = nn.MSELoss(reduction = "mean")
    optimizer = optim.SGD(model.parameters(), lr = lr)
    train_step = make_train_step(model, loss_fn, optimizer)
    losses = []

    for epoch in range(n_epochs):
        loss = train_step(x_train_tensor, y_train_tensor)
        losses.append(loss)

    print(model.state_dict())

    # 10. Dataset
    # Represented by a class that inherits from Dataset
    # Think of it as a list of tuples, each one a (features, label) point
    # For large data, load samples on demand inside __getitem__
    class CustomDataset(Dataset):
        # Built from tensors here (a csv file could be used as the source too)
        def __init__(self, x_tensor, y_tensor):
            self.x = x_tensor
            self.y = y_tensor

        def __getitem__(self, index):
            return (self.x[index], self.y[index])

        def __len__(self):
            return len(self.x)

    # Rebuilt without .to(device): these tensors stay on the CPU
    x_train_tensor = torch.from_numpy(x_train).float()
    y_train_tensor = torch.from_numpy(y_train).float()

    train_data = CustomDataset(x_train_tensor, y_train_tensor)
    print(train_data[0])
    # When the dataset is just a pair of tensors, TensorDataset does the job
    train_data = TensorDataset(x_train_tensor, y_train_tensor)
    print(train_data[0])
    # Do not put the whole training set on the GPU; it eats too much memory
    # The point of building a dataset is to be able to use a DataLoader

    # 11. DataLoader
    # For large datasets only a part is loaded at a time during training
    train_loader = DataLoader(dataset = train_data, batch_size = 16, shuffle = True)

    # Usage
    losses = []
    # Same model and optimizer as above, so training continues from the
    # previously fitted parameters
    train_step = make_train_step(model, loss_fn, optimizer)

    for epoch in range(n_epochs):
        for x_batch, y_batch in train_loader:
            # Move only the current mini-batch to the device
            x_batch = x_batch.to(device)
            y_batch = y_batch.to(device)

            loss = train_step(x_batch, y_batch)
            losses.append(loss)

    print(model.state_dict())
    # Random train / validation split
    x_tensor = torch.from_numpy(x).float()
    y_tensor = torch.from_numpy(y).float()

    dataset = TensorDataset(x_tensor, y_tensor)

    train_dataset, val_dataset = Data.dataset.random_split(dataset, [80, 20])

    train_loader = DataLoader(dataset = train_dataset, batch_size = 16)
    val_loader = DataLoader(dataset = val_dataset, batch_size = 20)

    # 12. Evaluation
    losses = []
    val_losses = []
    train_step = make_train_step(model, loss_fn, optimizer)

    for epoch in range(n_epochs):
        for x_batch, y_batch in train_loader:
            x_batch = x_batch.to(device)
            y_batch = y_batch.to(device)

            loss = train_step(x_batch, y_batch)
            losses.append(loss)

        # Validation pass: no gradients are needed
        with torch.no_grad():
            for x_val, y_val in val_loader:
                x_val = x_val.to(device)
                y_val = y_val.to(device)

                # Put the model in evaluation mode
                model.eval()

                yhat = model(x_val)
                val_loss = loss_fn(y_val, yhat)
                val_losses.append(val_loss.item())

    print(model.state_dict())
611 |     
612 | 
if __name__ == "__main__":
    # Only the step-by-step tutorial runs by default. Uncomment one of the
    # lines below to run another demo or to print the torch / CUDA versions.
    # nn_numpy()
    # nn_pytorch()
    # nn_torch_nn()
    # pytorch_class()
    # print(torch.__version__)
    # print(torch.version.cuda)
    new_try()


--------------------------------------------------------------------------------
/test_work.py:
--------------------------------------------------------------------------------
  1 | # coding:utf-8
  2 | # kaggle Jane Street Market Prediction代码
  3 | # 实际自己工作的代码
  4 | 
  5 | 
  6 | import numpy as np
  7 | import pandas as pd
  8 | pd.set_option('display.max_columns', None)
  9 | import janestreet
 10 | 
 11 | import matplotlib.pyplot as plt
 12 | from sklearn.model_selection import train_test_split
 13 | from sklearn import metrics
 14 | from sklearn.metrics import accuracy_score
 15 | import optuna
 16 | # 逻辑回归
 17 | from sklearn.linear_model import LinearRegression, LogisticRegression
 18 | # 支持向量机
 19 | from sklearn.svm import SVC, LinearSVC
 20 | # 随机森林
 21 | from sklearn.ensemble import RandomForestClassifier
 22 | # KNN算法
 23 | from sklearn.neighbors import KNeighborsClassifier
 24 | # 朴素贝叶斯算法
 25 | from sklearn.naive_bayes import GaussianNB
 26 | # SGD算法
 27 | from sklearn.linear_model import SGDClassifier
 28 | # 决策树算法
 29 | from sklearn.tree import DecisionTreeClassifier
 30 | 
 31 | import os
 32 | 
 33 | from EDA import data_explore
 34 | from FE import featureEngineer
 35 | from tools import *
 36 | 
 37 |     
 38 |     
 39 | # 建模前处理数据
 40 | def preprocessing(train):
 41 |     X_train = train.loc[:, train.columns.str.contains('feature')]
 42 |     # y_train = train.loc[:, 'resp']
 43 |     y_train = train.loc[:, 'action']
 44 |     
 45 |     X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, random_state=666, test_size=0.2)
 46 |     
 47 |     return X_train, y_train
 48 | 
 49 |     
 50 | # 评分函数
 51 | def Score(model, data):
 52 |     # test_df = pd.read_csv("/kaggle/input/jane-street-market-prediction/train.csv")
 53 |     data = data.fillna(-999)
 54 |     X_test = data.loc[:, data.columns.str.contains('feature')]
 55 |     resp = model.predict(X_test)
 56 |     date = data["date"].values
 57 |     weight = data["weight"].values
 58 |     action = (resp > 0).astype("int")
 59 |     
 60 |     count_i = len(np.unique(date))
 61 |     Pi = np.zeros(count_i)
 62 |     # 用循环太慢
 63 |     #for i, day in enumerate(np.unique(date)):
 64 | #        Pi[i] = np.sum(weight[date == day] * resp[date == day] * action[date == day])
 65 |     # 用下面这行代替
 66 |     Pi = np.bincount(date, weight * resp * action)
 67 |     t = np.sum(Pi) / np.sqrt(np.sum(Pi ** 2)) * np.sqrt(250 / count_i)
 68 |     u = np.clip(t, 0, 6) * np.sum(Pi)
 69 |     return u
 70 |     
 71 |     
 72 | # 进行预测，生成提交文件，求值版
 73 | def predict_value(model):
 74 |     env = janestreet.make_env()
 75 |     iter_test = env.iter_test()
 76 |     for (test_df, sample_prediction_df) in iter_test:
 77 |         if test_df['weight'].item() > 0:
 78 |             test_df = featureEngineer(test_df)
 79 |             X_test = test_df.loc[:, test_df.columns.str.contains('feature')]
 80 |             # X_test = X_test.fillna(-999)
 81 |             y_resp = model.predict(X_test)[0]
 82 |             y_preds = 0 if y_resp < 0 else 1
 83 |         else:
 84 |             y_preds = 0
 85 |         # print(y_preds)
 86 |         sample_prediction_df.action = y_preds
 87 |         env.predict(sample_prediction_df)
 88 |         
 89 |         
 90 | # 进行预测，生成提交文件，分类版
 91 | def predict_clf(model):
 92 |     env = janestreet.make_env()
 93 |     iter_test = env.iter_test()
 94 |     for (test_df, sample_prediction_df) in iter_test:
 95 |         if test_df['weight'].item() > 0:
 96 |             test_df = featureEngineer(test_df)
 97 |             X_test = test_df.loc[:, test_df.columns.str.contains('feature')]
 98 |             X_test = X_test.fillna(0.0)
 99 |             y_preds = model.predict(X_test)[0]
100 |         else:
101 |             y_preds = 0
102 |         # print(y_preds)
103 |         sample_prediction_df.action = y_preds
104 |         env.predict(sample_prediction_df)
105 |         
106 | 
if __name__ == "__main__":
    # All relative paths below are resolved against this directory.
    newpath = "/home/code"
    os.chdir(newpath)

    # data_explore()

    # Load two independent random samples: one to fit on, one to score on.
    p = 0.01
    train = featureEngineer(loadData(p = p))
    test = featureEngineer(loadData(p = p))

    # Pre-process the training data.
    X_train, y_train = preprocessing(train)

    # (banner printed, name passed to score(), factory for a fresh estimator)
    candidates = (
        ("逻辑回归", "Logist", lambda: LogisticRegression(max_iter = 3000)),
        ("支持向量机", "SVC", lambda: SVC()),
        ("随机森林", "RandomForest", lambda: RandomForestClassifier()),
        ("knn", "knn", lambda: KNeighborsClassifier(n_neighbors = 2)),
        ("朴素贝叶斯", "Bayes", lambda: GaussianNB()),
        ("SGD算法", "SGD", lambda: SGDClassifier()),
        ("决策树算法", "DecisionTree", lambda: DecisionTreeClassifier()),
    )
    for banner, model_name, build in candidates:
        print(banner)
        model = build()
        model.fit(X_train, y_train)
        score(model, test, model_name)
    # Submission step, currently disabled:
    # predict_clf(model)


--------------------------------------------------------------------------------
/tools.py:
--------------------------------------------------------------------------------
  1 | # coding:utf-8
  2 | # kaggle竞赛Jane Street Market Prediction
  3 | # 工具函数
  4 | 
  5 | from run import *
  6 | import pandas as pd
  7 | import matplotlib.pyplot as plt
  8 | import numpy as np
  9 | from sklearn.model_selection import cross_val_score, learning_curve
 10 | from sklearn.metrics import classification_report, roc_curve, auc
 11 | 
 12 | 
 13 | # 载入数据
 14 | @change_dir
 15 | def loadData(p = 0.01):
 16 |     # 抽样，读取1%数据
 17 |     # 参考https://mp.weixin.qq.com/s/2LSKnN9R-N-I2HcHePT9zA
 18 |     train = pd.read_csv("./train.csv", skiprows = lambda x: x>0 and np.random.rand() > p)
 19 |     # feature = pd.read_csv("./features.csv")
 20 |     return train
 21 |     
 22 |     
 23 | # 对模型进行交叉验证
 24 | def cross_val(model, X, Y, cv = 10):
 25 |     scores = cross_val_score(model, X, Y, cv=cv)
 26 |     score = scores.mean()
 27 |     return score
 28 |     
 29 |     
 30 | # 模型评估
 31 | def evalution(model, X, y_true):
 32 |     # X = test.loc[:, test.columns.str.contains("feature")].values
 33 |     # y_true = test.action.values
 34 |     y_pred = model.predict(X)
 35 |     target_names = ["1", "0"]
 36 |     result = classification_report(y_true, y_pred, target_names = target_names, output_dict = False )
 37 |     return result
 38 | 
 39 | 
 40 | # 对模型评分
 41 | @timethis
 42 | def score(model, test, modelName):
 43 |     if modelName == "XGBoost":
 44 |         X = test.loc[:, test.columns.str.contains("feature")]
 45 |         Y = test.action
 46 |     else:
 47 |         X = test.loc[:, test.columns.str.contains("feature")].values
 48 |         Y = test.action.values
 49 |     model_score = model.score(X, Y)
 50 |     cross_score = cross_val(model, X, Y)
 51 |     report = evalution(model, X, Y)
 52 |     print("模型评分:", model_score)
 53 |     print("交叉验证:", cross_score)
 54 |     print("模型评估:\n", report)
 55 |     Roc(model, X, Y, modelName)
 56 |     Lc(model, modelName, X, Y)
 57 |     
 58 |     
 59 | # 画roc曲线
 60 | @change_dir
 61 | def Roc(model, X, Y, modelName):
 62 |     y_label = Y
 63 |     y_pred = model.predict(X)
 64 |     fpr, tpr, thersholds = roc_curve(y_label, y_pred)
 65 |         
 66 |     roc_auc = auc(fpr, tpr)
 67 |     
 68 |     plt.plot(fpr, tpr, 'k--', label = "ROC (area = {0:.2f})".format(roc_auc), lw = 2)
 69 |     plt.tick_params(axis='x', labelsize=15)
 70 |     plt.tick_params(axis='y', labelsize=15)
 71 |     plt.xlim([-0.05, 1.05])
 72 |     plt.ylim([-0.05, 1.05])
 73 |     plt.xlabel("False Positive Rate")
 74 |     plt.ylabel("True Positive Rate")
 75 |     plt.title(modelName + " ROC Curve")
 76 |     plt.legend(loc = "best")
 77 |     plt.savefig("./output/" + modelName + "_ROC.png")
 78 |     
 79 |     
 80 | # 画学习曲线
 81 | @change_dir
 82 | def Lc(model, modelName, X, y, ylim = None, cv = None, n_jobs = 1, train_sizes = np.linspace(0.1, 1.0, 5), verbose = 0):
 83 |     plt.figure()
 84 |     plt.title(modelName+" Learning Curve")
 85 |     if ylim is not None:
 86 |         plt.ylim(*ylim)
 87 |     plt.xlabel("Training Samples")
 88 |     plt.ylabel("Score")
 89 |     train_sizes, train_scores, test_scores = learning_curve(model, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)
 90 |     train_scores_mean = np.mean(train_scores, axis = 1)
 91 |     train_scores_std = np.std(train_scores, axis = 1)
 92 |     test_scores_mean = np.mean(test_scores, axis = 1)
 93 |     test_scores_std = np.std(test_scores, axis = 1)
 94 |     plt.grid()
 95 |     
 96 |     plt.fill_between(train_sizes, train_scores_mean - train_scores_std,train_scores_mean + train_scores_std, alpha=0.1, color="r")
 97 |     plt.fill_between(train_sizes,test_scores_mean - test_scores_std, test_scores_mean + test_scores_std, alpha=0.1, color="g")
 98 |     plt.plot(train_sizes, train_scores_mean, 'o-', color="r", label="Training score")
 99 |     plt.plot(train_sizes, test_scores_mean, 'o-', color="g", label="Cross-validation score")
100 |     
101 |     plt.legend(loc="best")
102 |     plt.savefig("./output/" + modelName + "_Learning Curve.png")
103 |     
104 |     
105 | # 工具函数，返回神经网络训练的每一步
def make_train_step(model, loss_fn, optimizer):
    """Build a closure that runs one optimisation step for ``model``.

    Args:
        model: callable network exposing ``train()``.
        loss_fn: callable ``loss_fn(prediction, target)`` returning a scalar
            tensor that supports ``backward()`` and ``item()``.
        optimizer: optimizer exposing ``zero_grad()`` and ``step()``.

    Returns:
        A function ``train_step(x, y)`` that returns the batch loss as a
        Python float.
    """
    def train_step(x, y):
        # Switch to training mode and clear gradients left from the last step.
        model.train()
        optimizer.zero_grad()
        # Forward pass; the trailing dimension is squeezed before the loss.
        prediction = model(x).squeeze(-1)
        batch_loss = loss_fn(prediction, y)
        # Backward pass followed by the parameter update.
        batch_loss.backward()
        optimizer.step()
        return batch_loss.item()

    # The closure is what the training loop calls for every batch.
    return train_step
130 |         
131 | 
132 |     


--------------------------------------------------------------------------------
/works.py:
--------------------------------------------------------------------------------
  1 | # coding:utf-8
  2 | # kaggle Jane Street Market Prediction代码
  3 | # 实际自己工作的代码
  4 | 
  5 | 
  6 | import numpy as np
  7 | import pandas as pd
  8 | pd.set_option('display.max_columns', None)
  9 | import janestreet
 10 | 
 11 | import matplotlib.pyplot as plt
 12 | from sklearn.model_selection import train_test_split
 13 | from sklearn import metrics
 14 | from sklearn.metrics import accuracy_score
 15 | import optuna
 16 | from sklearn.linear_model import LinearRegression, LogisticRegression
 17 | 
 18 | import os
 19 | 
 20 | from EDA import data_explore
 21 | from FE import featureEngineer
 22 | 
 23 |     
 24 |     
 25 | # 建模过程
 26 | def modeling(train):
 27 |     print("开始建模")
 28 |     X_train = train.loc[:, train.columns.str.contains('feature')]
 29 |     # y_train = train.loc[:, 'resp']
 30 |     y_train = train.loc[:, 'action']
 31 |     
 32 |     X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, random_state=666, test_size=0.2)
 33 |     # model = LinearRegression()
 34 |     model = LogisticRegression()
 35 |     model.fit(X_train, y_train)
 36 |     
 37 |     return model
 38 | 
 39 |     
 40 | # 评分函数
 41 | def Score(model, data):
 42 |     # test_df = pd.read_csv("/kaggle/input/jane-street-market-prediction/train.csv")
 43 |     data = data.fillna(-999)
 44 |     X_test = data.loc[:, data.columns.str.contains('feature')]
 45 |     resp = model.predict(X_test)
 46 |     date = data["date"].values
 47 |     weight = data["weight"].values
 48 |     action = (resp > 0).astype("int")
 49 |     
 50 |     count_i = len(np.unique(date))
 51 |     Pi = np.zeros(count_i)
 52 |     # 用循环太慢
 53 |     #for i, day in enumerate(np.unique(date)):
 54 | #        Pi[i] = np.sum(weight[date == day] * resp[date == day] * action[date == day])
 55 |     # 用下面这行代替
 56 |     Pi = np.bincount(date, weight * resp * action)
 57 |     t = np.sum(Pi) / np.sqrt(np.sum(Pi ** 2)) * np.sqrt(250 / count_i)
 58 |     u = np.clip(t, 0, 6) * np.sum(Pi)
 59 |     return u
 60 |     
 61 |     
 62 | # 进行预测，生成提交文件，求值版
 63 | def predict_value(model):
 64 |     env = janestreet.make_env()
 65 |     iter_test = env.iter_test()
 66 |     for (test_df, sample_prediction_df) in iter_test:
 67 |         if test_df['weight'].item() > 0:
 68 |             X_test = test_df.loc[:, test_df.columns.str.contains('feature')]
 69 |             X_test = X_test.fillna(-999)
 70 |             y_resp = model.predict(X_test)[0]
 71 |             y_preds = 0 if y_resp < 0 else 1
 72 |         else:
 73 |             y_preds = 0
 74 |         # print(y_preds)
 75 |         sample_prediction_df.action = y_preds
 76 |         env.predict(sample_prediction_df)
 77 |         
 78 |         
 79 | # 进行预测，生成提交文件，分类版
 80 | def predict_clf(model):
 81 |     env = janestreet.make_env()
 82 |     iter_test = env.iter_test()
 83 |     for (test_df, sample_prediction_df) in iter_test:
 84 |         if test_df['weight'].item() > 0:
 85 |             X_test = test_df.loc[:, test_df.columns.str.contains('feature')]
 86 |             X_test = X_test.fillna(-999)
 87 |             y_preds = model.predict(X_test)[0]
 88 |         else:
 89 |             y_preds = 0
 90 |         # print(y_preds)
 91 |         sample_prediction_df.action = y_preds
 92 |         env.predict(sample_prediction_df)
 93 | 
 94 | 
 95 | if __name__ == "__main__":
 96 |     newpath = "/home/code"
 97 |     os.chdir(newpath)
 98 |     
 99 |     # data_explore()
100 |     
101 |     # 真正开始干活
102 |     train = pd.read_csv("./train.csv", nrows = 10000)
103 |     feature = pd.read_csv("./features.csv")
104 |     train = featureEngineer(train)
105 |     model = modeling(train)
106 |     # 计算模型评分
107 |     # score = Score(model, train)
108 |     # print("模型评分:%.2f" % score)
109 |     
110 |     # 进行预测
111 |     predict_clf(model)
112 |     


--------------------------------------------------------------------------------