├── LICENSE ├── README.md └── stock_image_clf ├── cnn_hyper.py ├── main.py ├── random_forest.py ├── somemodels.py ├── someplots.py └── vectorization.py /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 ernest222 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # stock_img_clf 2 | CNN、随机森林应用于股票四种形态A形(先涨后跌),U形(先跌后涨),R形(上涨),D形(下跌)的识别,基于keras,后端为tensorflow。准确率为96%左右。 3 | cnn_hyper.py 为超参数优化;someplots.py为cnn层输、通道、热力图可视化等;vectorization.py为数据预处理;random_forest.py为cnn特征提取,拟合随机森林分类器。 4 | 5 | -------------------------------------------------------------------------------- /stock_image_clf/cnn_hyper.py: -------------------------------------------------------------------------------- 1 | import os 2 | import vectorization as vr 3 | from keras import layers 4 | from keras import models 5 | import numpy as np 6 | from hyperopt import Trials, STATUS_OK, tpe 7 | from hyperas import optim 8 | from hyperas.distributions import choice, uniform 9 | 10 | # 超参数优化 11 | 12 | def data(): 13 | x_train, y_train = vr.ImgVectorization('train').vec_all() 14 | x_test, y_test = vr.ImgVectorization('val').vec_all() 15 | return x_train, y_train, x_test, y_test 16 | 17 | def create_model(x_train, y_train, x_test, y_test): 18 | model = models.Sequential() 19 | model.add(layers.Conv2D({{choice([16, 32, 64, 128])}}, (3, 3), activation='relu', input_shape=(80,80,3))) 20 | model.add(layers.MaxPool2D((2, 2))) 21 | model.add(layers.Conv2D({{choice([16, 32, 64, 128])}}, (3, 3), activation='relu')) 22 | model.add(layers.MaxPool2D((2, 2))) 23 | model.add(layers.Conv2D({{choice([16, 32, 64, 128])}}, (3, 3), activation='relu')) 24 | model.add(layers.MaxPool2D((2, 2))) 25 | model.add(layers.Flatten()) 26 | model.add(layers.Dropout({{uniform(0, 1)}})) 27 | model.add(layers.Dense({{choice([16, 32, 64, 128])}}, activation='relu')) 28 | model.add(layers.Dense(4, activation='softmax')) 29 | model.compile(loss='categorical_crossentropy', metrics=['accuracy'], 30 | optimizer='rmsprop') 31 | result = model.fit(x_train, y_train, 32 | batch_size={{choice([32, 64, 128])}}, 33 | epochs={{choice([10,20])}}, 34 | validation_data=(x_test, y_test)) 35 | val_acc = np.amax(result.history['val_acc']) 36 | print('Best validation acc of epoch:', val_acc) 37 | return {'loss': -val_acc, 'status': STATUS_OK, 'model': model} 38 | 39 | 40 | if __name__ == '__main__': 41 | this_dir= os.getcwd() 42 | result_dir = this_dir + os.sep + 'result' 43 | X_train, Y_train, X_test, Y_test = data() 44 | best_run, best_model = optim.minimize(model=create_model, 45 | data=data, 46 | algo=tpe.suggest, 47 | max_evals=20, 48 | trials=Trials()) 49 | best_model.save(result_dir + os.sep + 'cnn_hyper.h5') 50 | print("Best performing model chosen hyper-parameters:") 51 | print(best_run) 52 | -------------------------------------------------------------------------------- /stock_image_clf/main.py: -------------------------------------------------------------------------------- 1 | import os 2 | import somemodels as md 3 | import vectorization as vr 4 | from keras.models import load_model 5 | import matplotlib.pyplot as plt 6 | import numpy as np 7 | 8 | 9 | class FitModel(): 10 | def __init__(self,data_enhance=False,img_size=(80,80),epochs=10,batch_size=32): 11 | self.img_size=img_size 12 | self.epochs=epochs 13 | self.batch_size=batch_size 14 | if data_enhance: 15 | self.l=[0,0.1,0.1,0.1] 16 | else: 17 | self.l=[0,0,0,0] 18 | 19 | def fm(self): 20 | train_input = vr.ImgVectorization('train',img_size=self.img_size,batch_size=self.batch_size, 21 | rotation=self.l[0],hs=self.l[1],ws=self.l[2],zr=self.l[3]).vec_generator() 22 | val_input = vr.ImgVectorization('val',img_size=self.img_size,batch_size=self.batch_size).vec_generator() 23 | model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy']) # 损失函数为分类交叉熵 24 | history = model.fit_generator(train_input, steps_per_epoch=50, epochs=self.epochs, validation_data=val_input, validation_steps=12) 25 | best_acc=np.amax(history.history['val_acc']) 26 | print(best_acc) 27 | model.save(result_dir + os.sep + model_name + '_'+str(pic_size)+'_'+str(dropout)+'_'+str(batch_size)+'_'+str(self.epochs)+'.h5') 28 | self.plot(history) 29 | 30 | def plot(self,history): 31 | acc = history.history['acc'] 32 | val_acc = history.history['val_acc'] 33 | loss = history.history['loss'] 34 | val_loss = history.history['val_loss'] 35 | epochs = range(1, len(acc) + 1) 36 | plt.figure() 37 | plt.subplot(2, 1, 1) 38 | plt.plot(epochs, acc, 'bo', label='train') 39 | plt.plot(epochs, val_acc, 'b', label='val') 40 | plt.title('accuracy') 41 | plt.subplot(2, 1, 2) 42 | plt.plot(epochs, loss, 'bo', label='train') 43 | plt.plot(epochs, val_loss, 'b', label='val') 44 | plt.title('loss') 45 | plt.legend() 46 | plt.savefig(result_dir + os.sep + model_name + str(pic_size)+'_'+str(self.epochs)+'_plot') 47 | plt.show() 48 | 49 | 50 | def evaluate_on_test(model_file,img_size=(80,80),batch_size=32): 51 | read_model = load_model( model_file) 52 | X_test,Y_test = vr.ImgVectorization('test',img_size=img_size,batch_size=batch_size).vec_all() # 评估验证集 53 | test_score = read_model.evaluate(X_test,Y_test) 54 | print(test_score) 55 | 56 | if __name__ == '__main__': 57 | this_dir = os.getcwd() 58 | result_dir = this_dir+os.sep+'result' 59 | pic_size=80 60 | epochs = 10 61 | batch_size=16 62 | dropout=0.2 63 | # model_name, model = md.dense_model() 64 | model_name, model = md.cnn_model(shape=(pic_size,pic_size,3),dropout=dropout) # 选择模型 65 | FitModel(data_enhance=False,img_size=(pic_size,pic_size),epochs=epochs,batch_size=batch_size).fm() 66 | model_file=result_dir + os.sep + model_name + '_'+str(pic_size)+'_'+str(dropout)+'_'+str(batch_size)+'_'+str(epochs)+'.h5' 67 | evaluate_on_test(model_file,img_size=(pic_size,pic_size),batch_size=batch_size) # 在测试集评估所有模型 68 | 69 | 70 | 71 | -------------------------------------------------------------------------------- /stock_image_clf/random_forest.py: -------------------------------------------------------------------------------- 1 | import os 2 | from sklearn.ensemble import RandomForestClassifier 3 | from sklearn.tree import DecisionTreeClassifier 4 | from sklearn.metrics import accuracy_score 5 | from sklearn.model_selection import GridSearchCV 6 | import numpy as np 7 | from keras.models import load_model 8 | from keras import models 9 | import vectorization as vr 10 | import pandas as pd 11 | from sklearn import tree 12 | import matplotlib.pyplot as plt 13 | import pydotplus 14 | 15 | # 将cnn的第一层dense层的特征提取,输入随机森林和决策树分类器 16 | 17 | def data(shuff=False): 18 | x_train, Y_onhot = vr.ImgVectorization('train',shuff=shuff).vec_all() 19 | x_val, y_onehot = vr.ImgVectorization('val',shuff=shuff).vec_all() 20 | y_train=vr.ImgVectorization('train').lable_list() 21 | y_val=vr.ImgVectorization('val').lable_list() 22 | return x_train,y_train, x_val,y_val,Y_onhot,y_onehot 23 | 24 | 25 | def get_layer_features(model,x_data,layer_name,channels): # cnn中dense层特征提取 26 | layer_model = models.Model(inputs=model.input,outputs=model.get_layer(layer_name).output) 27 | i=0 28 | features = np.zeros(shape=(x_data.shape[0], channels)) 29 | for x in x_data: 30 | x = np.expand_dims(x, axis=0) 31 | layer_output = layer_model.predict(x) 32 | features[i] = layer_output 33 | i += 1 34 | feature_col = [] 35 | for r in range(channels): 36 | feature_col.append(str(r)) 37 | r += 1 38 | df = pd.DataFrame(data=features, columns=feature_col) 39 | return df 40 | 41 | def fit_random_forest(x_train,y_train,x_val,y_val): # 随机森林分类器 42 | rf = RandomForestClassifier(max_depth= 11,min_samples_leaf= 40, min_samples_split= 6, n_estimators=30,max_features='sqrt') 43 | rf.fit(x_train,y_train) 44 | # plot(rf,x_train) 45 | print(rf) 46 | predictions = rf.predict(x_val) 47 | acc = accuracy_score(predictions, y_val) 48 | print(acc) 49 | 50 | def plot(clf,x): # 随机森林可视化及特征重要性 51 | Estimators = clf.estimators_ 52 | for index, model in enumerate(Estimators): 53 | filename = 'tree_' + str(index) + '.pdf' 54 | dot_data = tree.export_graphviz(model, out_file=None, 55 | feature_names=x.columns, 56 | class_names=['A_shape','D_shape','R_shape','U_shape'], 57 | filled=True, rounded=True, 58 | special_characters=True) 59 | graph = pydotplus.graph_from_dot_data(dot_data) 60 | graph.write_pdf(this_dir+os.sep+'randomforest'+os.sep+filename) 61 | df=pd.DataFrame({'features':x.columns,'importances':clf.feature_importances_}) 62 | df=df.sort_values(by='importances',ascending=False).head(10) 63 | plt.bar(df.features, df.importances) 64 | plt.xticks(np.arange(len(df.features)),df.features) 65 | plt.ylabel('Importances') 66 | plt.title('Features Importances') 67 | plt.show() 68 | 69 | 70 | def find_param(x,y): # 随机森林调参 71 | param_test2 = {'max_depth':range(10,20),'min_samples_split':range(5,15), 'min_samples_leaf':range(10,60,10),'n_estimators': range(10, 71, 10)} 72 | gsearch2 = GridSearchCV(estimator=RandomForestClassifier(), param_grid=param_test2, 73 | scoring='roc_auc', cv=5) 74 | gsearch2.fit(x, y) 75 | print(gsearch2.best_params_, gsearch2.best_score_) 76 | 77 | def fit_tree(x_train,y_train,x_val,y_val): # 决策树分类器 78 | clf = DecisionTreeClassifier() 79 | clf.fit(x_train,y_train) 80 | print(clf) 81 | predictions = clf.predict(x_val) 82 | acc = accuracy_score(predictions, y_val) 83 | print(acc) 84 | 85 | if __name__ == '__main__': 86 | this_dir= os.getcwd() 87 | result_dir = this_dir + os.sep + 'result' 88 | model_path=result_dir+os.sep+'cnn_80_0.2_16_10.h5' 89 | model = load_model(model_path) 90 | model.summary() 91 | os.environ["PATH"] += os.pathsep + path 92 | x_train, y_train,x_val,y_val,Y_onehot,y_onhot=data(shuff=False) # 调参时需要用打乱的数据; fit的时候lable是按次序读文件夹名的,此时shuffle要设为false 93 | train_features=get_layer_features(model,x_train,'dense_1',128) 94 | val_features=get_layer_features(model,x_val,'dense_1',128) 95 | fit_random_forest(train_features,y_train,val_features,y_val) 96 | # find_param(train_features,Y_onehot) 97 | # fit_tree(train_features,y_train,val_features,y_val) -------------------------------------------------------------------------------- /stock_image_clf/somemodels.py: -------------------------------------------------------------------------------- 1 | from keras import layers 2 | from keras import models 3 | 4 | 5 | 6 | 7 | 8 | def cnn_model(shape=(80,80,3),dropout=0.5,last_activation='softmax'): 9 | model=models.Sequential() 10 | model.add(layers.Conv2D(64,(3,3),activation='relu',input_shape=shape)) 11 | model.add(layers.MaxPool2D((2,2))) 12 | model.add(layers.Conv2D(64,(3,3),activation='relu')) 13 | model.add(layers.MaxPool2D((2,2))) 14 | model.add(layers.Conv2D(128, (3, 3), activation='relu')) 15 | model.add(layers.MaxPool2D((2, 2))) 16 | model.add(layers.Flatten()) 17 | model.add(layers.Dropout(dropout)) 18 | model.add(layers.Dense(128,activation='relu')) 19 | model.add(layers.Dense(4,activation=last_activation)) 20 | model.summary() 21 | return 'cnn', model 22 | 23 | 24 | def dense_model(shape=(80*80,),last_activation='softmax'): 25 | model = models.Sequential() 26 | model.add(layers.Dense(128, activation='relu',input_shape=shape)) 27 | model.add(layers.Dense(4, activation=last_activation)) 28 | model.summary() 29 | return 'simple_dense',model 30 | 31 | def inception_model(): 32 | pass 33 | 34 | 35 | 36 | 37 | -------------------------------------------------------------------------------- /stock_image_clf/someplots.py: -------------------------------------------------------------------------------- 1 | import os 2 | from keras.models import load_model 3 | from keras import models 4 | from keras import backend as K 5 | from keras_preprocessing import image 6 | import numpy as np 7 | import matplotlib.pyplot as plt 8 | from keras.models import load_model 9 | import cv2 10 | 11 | # cnn可视化,参考deep learning with python一书 12 | 13 | class PredictImg(): 14 | def __init__(self,model_path,img_path): 15 | self.model_path=model_path 16 | self.img_path=img_path 17 | 18 | def img_to_tensor(self): 19 | img = image.load_img(self.img_path, target_size=(80, 80)) 20 | img_tensor = image.img_to_array(img) 21 | img_tensor = np.expand_dims(img_tensor, axis=0) 22 | img_tensor /= 255 23 | print(img_tensor.shape) 24 | return img_tensor 25 | 26 | def predict(self): 27 | model = load_model(self.model_path) 28 | pred=model.predict(self.img_to_tensor()) 29 | print(pred) 30 | result=np.argmax(pred[0]) 31 | print(result) 32 | if result==0: 33 | print('A_shape') 34 | elif result==1: 35 | print('down_shape') 36 | elif result==2: 37 | print('rise_shape') 38 | elif result==3: 39 | print('U_shape') 40 | return result 41 | 42 | def plot_img_tensor(self): 43 | plt.imshow(self.img_to_tensor()[0]) 44 | print(self.img_to_tensor().shape) 45 | plt.show() 46 | 47 | def plot_onelayer_onechannel(self,layerid,channel_id,layer_before): 48 | model = load_model(self.model_path) 49 | layer_outputs = [layer.output for layer in model.layers[:layer_before]] # layer_before 前多少层,layer_before=5,获取前5层 50 | activation_model = models.Model(inputs=model.input, outputs=layer_outputs) 51 | activations = activation_model.predict(self.img_to_tensor()) 52 | choose_layer_activation = activations[layerid] # layerid=0 查看第一层激活输出 53 | print(choose_layer_activation.shape) 54 | plt.matshow(choose_layer_activation[0, :, :, channel_id], cmap='viridis') # channel_id 查看layerid层的第channel_id通道图片 55 | plt.show() 56 | 57 | def plot_layer_allchannel(self,begin_layer,end_layer): # 查看层数越多,打印图片对内存要求越高 58 | model = load_model(self.model_path) 59 | layer_outputs = [layer.output for layer in model.layers[begin_layer:end_layer]] 60 | layer_names = [] 61 | for layer in model.layers[begin_layer:end_layer]: 62 | layer_names.append(layer.name) 63 | images_per_row = 16 64 | activation_model = models.Model(inputs=model.input, outputs=layer_outputs) 65 | activations = activation_model.predict(self.img_to_tensor()) 66 | for layer_name, layer_activation in zip(layer_names, activations): 67 | n_features = layer_activation.shape[-1] 68 | size = layer_activation.shape[1] 69 | n_cols = n_features // images_per_row 70 | display_grid = np.zeros((size * n_cols, images_per_row * size)) 71 | for col in range(n_cols): 72 | for row in range(images_per_row): 73 | channel_image = layer_activation[0, :, :,col * images_per_row + row] 74 | channel_image -= channel_image.mean() 75 | channel_image /= channel_image.std() 76 | channel_image *= 64 77 | channel_image += 128 78 | channel_image = np.clip(channel_image, 0, 255).astype('uint8') 79 | display_grid[col * size: (col + 1) * size, 80 | row * size: (row + 1) * size] = channel_image 81 | scale = 1. / size 82 | plt.figure(figsize=(scale * display_grid.shape[1], 83 | scale * display_grid.shape[0])) 84 | plt.title(layer_name) 85 | plt.grid(False) 86 | plt.imshow(display_grid, aspect='auto', cmap='viridis') 87 | plt.show() 88 | 89 | 90 | def plot_heatmaps(self,con2dlayer_name,con2dlayer_channel): # 打印某层的热力图 91 | model = load_model(self.model_path) 92 | predict_result=self.predict() 93 | shape_output = model.output[:, predict_result] 94 | last_conv_layer = model.get_layer(con2dlayer_name) 95 | grads = K.gradients(shape_output, last_conv_layer.output)[0] 96 | pooled_grads = K.mean(grads, axis=(0, 1, 2)) 97 | iterate = K.function([model.input], [pooled_grads, last_conv_layer.output[0]]) 98 | pooled_grads_value, conv_layer_output_value = iterate([self.img_to_tensor()]) 99 | for i in range(con2dlayer_channel): 100 | conv_layer_output_value[:, :, i] *= pooled_grads_value[i] 101 | heatmap = np.mean(conv_layer_output_value, axis=-1) 102 | heatmap = np.maximum(heatmap, 0) 103 | heatmap /= np.max(heatmap) 104 | # print(heatmap) 105 | plt.matshow(heatmap) 106 | plt.show() 107 | plt.matshow(heatmap) 108 | img = cv2.imread(self.img_path) 109 | heatmap = cv2.resize(heatmap, (img.shape[1], img.shape[0])) 110 | heatmap = np.uint8(255 * heatmap) 111 | heatmap = cv2.applyColorMap(heatmap, cv2.COLORMAP_JET) 112 | superimposed_img = heatmap * 0.9 + img 113 | cv2.imwrite(result_dir+os.sep+str(predict_result)+'_heatmap.jpg', superimposed_img) 114 | 115 | 116 | if __name__ == '__main__': 117 | this_dir= os.getcwd() 118 | result_dir = this_dir + os.sep + 'result' 119 | img_path=this_dir+os.sep+'U_shape.png' 120 | model_path=result_dir+os.sep+'cnn_80_0.2_16_10.h5' 121 | PI=PredictImg(model_path,img_path) 122 | PI.predict() 123 | # PI.plot_heatmaps('conv2d_5',64) 124 | # PI.plot_layer_allchannel(begin_layer=0,end_layer=6) 125 | # PI.plot_onelayer_onechannel(layerid=2,channel_id=5,layer_before=3) 126 | # PI.img_to_tensor() 127 | # PI.plot_img_tensor() -------------------------------------------------------------------------------- /stock_image_clf/vectorization.py: -------------------------------------------------------------------------------- 1 | import os 2 | from keras.preprocessing.image import ImageDataGenerator 3 | 4 | this_dir=os.getcwd() 5 | 6 | 7 | # 图像数据预处理 8 | class ImgVectorization(): 9 | def __init__(self,datatype='train',img_size=(80,80),batch_size=20,mode='categorical',rotation=0,ws=0,hs=0,zr=0,hf=False,shuff=True): 10 | self.data_dir=this_dir+os.sep+datatype 11 | self.img_size = img_size 12 | self.shuff=shuff 13 | self.batch_size=batch_size 14 | self.mode=mode 15 | self.rotation=rotation 16 | self.ws=ws 17 | self.hs=hs 18 | self.zr=zr 19 | self.hf=hf 20 | 21 | def check_img_amount(self): # 检查各类图片数量一致 22 | list=[] 23 | for folder in os.listdir(self.data_dir): 24 | img_amount = len(os.listdir(self.data_dir + os.sep + folder)) 25 | img_class = folder 26 | print(img_class, img_amount) 27 | list.append(img_amount) 28 | if len(set(list))==1: 29 | return list[0]*len(list) # 返回该目录图片数量 30 | else: 31 | print('样本分类数量不一致') 32 | 33 | def lable_list(self): # 随机森林使用的label_list 34 | list_lable=[] 35 | for folder in os.listdir(self.data_dir): 36 | img_amount = len(os.listdir(self.data_dir + os.sep + folder)) 37 | for im in range(img_amount): 38 | list_lable.append(int(folder)) 39 | return list_lable 40 | 41 | def vec_generator(self): # 按批 向量化生成器 42 | data = ImageDataGenerator(rescale=1. / 255, rotation_range=self.rotation, width_shift_range=self.ws, 43 | height_shift_range=self.hs, zoom_range=self.zr, 44 | horizontal_flip=self.hf) 45 | data_generator = data.flow_from_directory(self.data_dir, target_size=self.img_size, batch_size=self.batch_size, 46 | class_mode=self.mode,shuffle=self.shuff) 47 | return data_generator 48 | 49 | def check_input_type(self): # 检查生成数据格式 50 | for data_batch,lables_batch in self.vec_generator(): 51 | print(data_batch.shape) 52 | print(lables_batch.shape) 53 | break 54 | 55 | def vec_all(self): # 向量化全部数据 56 | data_amount=self.check_img_amount() 57 | self.batch_size=data_amount 58 | list=[] 59 | for data_batch,data_label in self.vec_generator(): 60 | list.append(data_batch) 61 | list.append(data_label) 62 | break 63 | return list[0],list[1] 64 | 65 | 66 | 67 | 68 | # 文本数据预处理 69 | class TextVectorization(): 70 | pass --------------------------------------------------------------------------------