├── myThread.py
├── data
│   └── config.json
├── log.py
├── customized_gru.py
├── param.py
├── linear.py
├── test.py
├── config.py
├── util.py
├── plt.py
├── model.py
├── trajectoryNet.py
└── data_funs.py

/myThread.py:
--------------------------------------------------------------------------------
1 | import threading
2 | 
3 | class MyThread(threading.Thread):
4 |     def __init__(self, func, args):
5 |         threading.Thread.__init__(self)
6 |         self.func = func
7 |         self.args = args
8 |         self.res = None
9 | 
10 |     def get_result(self):
11 |         return self.res
12 | 
13 |     def run(self):
14 |         self.res = self.func(*self.args)
--------------------------------------------------------------------------------
/data/config.json:
--------------------------------------------------------------------------------
1 | {
2 |   "task": "trajectoryNet",
3 |   "testmode": "lobo",
4 |   "val_id": [6],
5 |   "test_id": [0,1,2,3,4,5],
6 |   "hidden_size": 100,
7 |   "learning_rate": 0.1,
8 |   "batch_size": 128,
9 |   "num_layers": 1,
10 |   "num_epochs": 100,
11 |   "activation": "maxout",
12 |   "deep_gate": false,
13 |   "checkpoint": true,
14 |   "restore":false,
15 |   "exp_seq_len":100,
16 |   "init_scale": 0.001,
17 |   "weight_initializer": "uniform",
18 |   "evaluate_freq": 50,
19 |   "num_threads": 100,
20 |   "tensorboard":false,
21 |   "useGPU":true,
22 |   "test_mode": false,
23 |   "num_classes":4,
24 |   "maxOut_numUnits":5,
25 |   "num_features":5,
26 |   "embeded_dims":50,
27 |   "l2_preparam":0.001,
28 |   "rnn_type":"gru_b"
29 | 
30 | }
31 | 
--------------------------------------------------------------------------------
/log.py:
--------------------------------------------------------------------------------
1 | 
2 | import time
3 | 
4 | class Log(object):
5 | 
6 |     def __init__(self,path,name):
7 |         self.train_log_path = path
8 |         second = time.localtime(time.time())
9 |         time_str = time.strftime('%Y-%m-%d-%H-%M-%S',second)
10 |         # summary log file, named with the current timestamp
11 |         self.summary_log_file_name = "summary"+time_str +name+".csv"
12 |         self.summary_log_file = open(path +self.summary_log_file_name,"w+")
13 | 
14 |         # training log file, named "training" + current timestamp
15 |         self.train_log_file_name = "training" + time_str + name+".csv"
16 |         self.train_log_file = open(path +self.train_log_file_name,"w+")
17 | 
18 |         self.addheader()
19 | 
20 |     def addheader(self):
21 |         self.summary_log_file.write("iteration, trainLoss, valLoss, testLoss, trainAcc, valAcc, testAcc\n")
22 |         #self.train_log_file.write("iteration, trainLoss,trainAcc\n")
23 | 
24 |     def summary_log(self,data,batch_iter):
25 |         (cost_train, acc_train, cost_val, acc_val,cost_test, acc_test) = data
26 |         self.summary_log_file.write("{0}, {1:0.3f}, {2:0.3f}, {3:0.3f}, {4:0.3f}, {5:0.3f}, {6:0.3f}\n".format(batch_iter, cost_train, cost_val, cost_test, acc_train, acc_val, acc_test))
27 |         self.summary_log_file.flush()
28 | 
29 | 
30 |     def training_log(self,data):
31 |         #trainLoss,trainAcc =
32 |         self.train_log_file.write(data)
33 |         self.train_log_file.write("\n")
34 |         self.train_log_file.flush()
35 | 
36 | 
37 |     def close(self):
38 |         self.summary_log_file.close()
39 |         self.train_log_file.close()
40 | 
41 | 
42 | if __name__ == "__main__":
43 | 
44 |     print(str(3)+"dd")
--------------------------------------------------------------------------------
/customized_gru.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 | import linear
3 | from tensorflow.python.ops.rnn_cell import RNNCell
4 | from tensorflow.python.ops.math_ops import tanh
5 | from tensorflow.python.platform import tf_logging as logging 6
| from tensorflow.python.ops import variable_scope as vs 7 | from tensorflow.python.ops import array_ops 8 | from tensorflow.python.ops.math_ops import sigmoid 9 | from tensorflow.python.ops.math_ops import tanh 10 | 11 | 12 | class CustomizedGRU(RNNCell): 13 | 14 | def __init__(self,num_units,maxOut_numUnits,activation = tanh): 15 | self._num_units = num_units 16 | self._activation = activation 17 | self._maxOut_numUnits = maxOut_numUnits 18 | 19 | 20 | @property 21 | def state_size(self): 22 | return self._num_units 23 | 24 | @property 25 | def output_size(self): 26 | return self._num_units 27 | 28 | def __call__(self, inputs, state, scope=None): 29 | 30 | with vs.variable_scope(scope or "gru_cell"): 31 | with vs.variable_scope("gates"): 32 | gate = linear._linear([inputs,state],2*self._num_units,True,1.0,scope = scope) 33 | 34 | r,u = array_ops.split(gate,num_or_size_splits=2,axis=1) 35 | 36 | r,u = sigmoid(r),sigmoid(u) 37 | #r,u = tanh(r),tanh(u) 38 | 39 | with vs.variable_scope("candidate"): 40 | 41 | c = self.maxout(inputs,r*state,self._maxOut_numUnits,0,self._num_units,scope= scope) 42 | 43 | new_h = u*state +(1-u)*c 44 | 45 | return new_h,new_h 46 | 47 | 48 | 49 | def maxout(self, input1, input2, num_units, ini_value, output_size, scope=None): 50 | shape = input1.get_shape().as_list() 51 | dim = shape[-1] 52 | outputs = None 53 | for i in range(num_units): 54 | with tf.variable_scope(str(i)): 55 | y = self._activation(linear._linear([input1, input2],output_size, True, ini_value,scope=scope)) 56 | if outputs is None: 57 | outputs = y 58 | else: 59 | outputs = tf.maximum(outputs, y) 60 | c = outputs 61 | return c 62 | -------------------------------------------------------------------------------- /param.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | import tensorflow as tf 3 | 4 | WIDTH = 30 5 | FENWEI_MAX = 0.95 6 | FILTER_K = 10 7 | 8 | 9 | train_file_pattern = "interval_[5]_train_*.tfrecords" 10 | valid_file_pattern = "interval_[5]_valid_*.tfrecords" 11 | test_file_pattern = "interval_[5]_test_*.tfrecords" 12 | 13 | 14 | SPEED_SEC = "speed_sec" 15 | ACC_SEC = "acc_sec" 16 | AVG_SPEED = "avg_speed" 17 | STD_SPEED = "std_speed" 18 | MEAN_ACC = "mean_acc" 19 | STD_ACC = "std_acc" 20 | HEAD = "head" 21 | HEAD_MEAN = "head_mean" 22 | STD_HEAD = "std_head" 23 | MAX_ACC = "max_acc" 24 | MAX_SPEED = "max_speed" 25 | MAX_HEAD = "max_head" 26 | EARLY = "early" 27 | LABEL = "label" 28 | 29 | feature = { 30 | SPEED_SEC: tf.FixedLenFeature([],tf.string), 31 | AVG_SPEED : tf.FixedLenFeature([],tf.string), 32 | STD_SPEED : tf.FixedLenFeature([],tf.string), 33 | ACC_SEC : tf.FixedLenFeature([],tf.string), 34 | MEAN_ACC : tf.FixedLenFeature([],tf.string), 35 | STD_ACC : tf.FixedLenFeature([],tf.string), 36 | HEAD : tf.FixedLenFeature([],tf.string), 37 | HEAD_MEAN : tf.FixedLenFeature([],tf.string), 38 | STD_HEAD : tf.FixedLenFeature([],tf.string), 39 | MAX_SPEED : tf.FixedLenFeature([],tf.string), 40 | MAX_ACC : tf.FixedLenFeature([],tf.string), 41 | MAX_HEAD : tf.FixedLenFeature([],tf.string), 42 | EARLY : tf.FixedLenFeature([],tf.int64), 43 | LABEL:tf.FixedLenFeature([],tf.int64) 44 | } 45 | 46 | class RNNType(Enum): 47 | LSTM = 1 # LSTM unidirectional 48 | LSTM_b = 2 # LSTM bidirectional 49 | GRU = 3 # GRU 50 | GRU_b = 4 # GRU, bidirectional 51 | NORM_GRU = 5 52 | NORM_GRU_b = 6 53 | 54 | class NetType(Enum): 55 | DNN_MAXOUT = 0 56 | DNN = 1 57 | CNN = 2 58 | RNN_NV1 = 3 59 | RNN_NVN = 4 60 | 61 | class DirName(Enum): 62 | 
DNN = "dnn/" 63 | DNN_MAXOUT = "dnn_maxout/" 64 | CNN = "cnn/" 65 | RNN_NV1 = "rnn_nv1/" 66 | RNN_NVN = "rnn_nvn/" 67 | 68 | class FeatureName(Enum): 69 | SPEED_SEC = "speed_sec" 70 | ACC_SEC = "acc_sec" 71 | AVG_SPEED = "avg_speed" 72 | STD_SPEED = "std_speed" 73 | MEAN_ACC = "mean_acc" 74 | STD_ACC = "std_acc" -------------------------------------------------------------------------------- /linear.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import print_function 4 | 5 | 6 | from tensorflow.python.ops import array_ops 7 | from tensorflow.python.ops import init_ops 8 | from tensorflow.python.ops import math_ops 9 | from tensorflow.python.ops import nn_ops 10 | from tensorflow.python.ops import variable_scope as vs 11 | 12 | 13 | from tensorflow.python.util import nest 14 | 15 | 16 | def _linear(args, output_size, bias, bias_start=0.0, scope=None): 17 | """Linear map: sum_i(args[i] * W[i]), where W[i] is a variable. 18 | Args: 19 | args: a 2D Tensor or a list of 2D, batch x n, Tensors. 20 | output_size: int, second dimension of W[i]. 21 | bias: boolean, whether to add a bias term or not. 22 | bias_start: starting value to initialize the bias; 0 by default. 23 | scope: (optional) Variable scope to create parameters in. 24 | Returns: 25 | A 2D Tensor with shape [batch x output_size] equal to 26 | sum_i(args[i] * W[i]), where W[i]s are newly created matrices. 27 | Raises: 28 | ValueError: if some of the arguments has unspecified or wrong shape. 29 | """ 30 | if args is None or (nest.is_sequence(args) and not args): 31 | raise ValueError("`args` must be specified") 32 | if not nest.is_sequence(args): 33 | args = [args] 34 | 35 | # Calculate the total size of arguments on dimension 1. 36 | total_arg_size = 0 37 | shapes = [a.get_shape() for a in args] 38 | for shape in shapes: 39 | if shape.ndims != 2: 40 | raise ValueError("linear is expecting 2D arguments: %s" % shapes) 41 | if shape[1].value is None: 42 | raise ValueError("linear expects shape[1] to be provided for shape %s, " 43 | "but saw %s" % (shape, shape[1])) 44 | else: 45 | total_arg_size += shape[1].value 46 | 47 | dtype = [a.dtype for a in args][0] 48 | 49 | # Now the computation. 
50 | scope = vs.get_variable_scope() 51 | with vs.variable_scope(scope) as outer_scope: 52 | weights = vs.get_variable( 53 | "weights", [total_arg_size, output_size], dtype=dtype) 54 | if len(args) == 1: 55 | res = math_ops.matmul(args[0], weights) 56 | else: 57 | res = math_ops.matmul(array_ops.concat(args, 1), weights) 58 | if not bias: 59 | return res 60 | with vs.variable_scope(outer_scope) as inner_scope: 61 | inner_scope.set_partitioner(None) 62 | biases = vs.get_variable( 63 | "biases", [output_size], 64 | dtype=dtype, 65 | initializer=init_ops.constant_initializer(bias_start, dtype=dtype)) 66 | return nn_ops.bias_add(res, biases) 67 | -------------------------------------------------------------------------------- /test.py: -------------------------------------------------------------------------------- 1 | import json 2 | import numpy as np 3 | import random 4 | import math 5 | import config 6 | import pandas as pd 7 | from param import FeatureName 8 | from param import RNNType 9 | import tensorflow as tf 10 | import os 11 | import param 12 | 13 | a = np.random.randint(0,10,[2,2]) 14 | b = np.random.randint(0,10,[2,2]) 15 | c = np.random.randint(0,10,[2,2]) 16 | 17 | a_df = pd.DataFrame(a) 18 | b_df = pd.DataFrame(b) 19 | c_df = pd.DataFrame(c) 20 | cv = pd.concat([a_df,b_df,c_df],axis=1) 21 | print(24%10) 22 | 23 | # data_dir ="G:/all_data/tfrecords/" 24 | # filenames = os.listdir(data_dir) 25 | # filenames = [os.path.join(data_dir,i) for i in filenames] 26 | # 27 | # feature = { 28 | # FeatureName.SPEED_SEC.value : tf.FixedLenFeature([],tf.string), 29 | # FeatureName.AVG_SPEED.value : tf.FixedLenFeature([],tf.string), 30 | # FeatureName.STD_SPEED.value : tf.FixedLenFeature([],tf.string), 31 | # FeatureName.ACC_SEC.value : tf.FixedLenFeature([],tf.string), 32 | # FeatureName.MEAN_ACC.value : tf.FixedLenFeature([],tf.string), 33 | # FeatureName.STD_ACC.value : tf.FixedLenFeature([],tf.string), 34 | # "label":tf.FixedLenFeature([],tf.int64) 35 | # } 36 | # 37 | # filename_queue = tf.train.string_input_producer(filenames,num_epochs=1) 38 | # reader = tf.TFRecordReader() 39 | # _,serialized_example = reader.read(filename_queue) 40 | # 41 | # features = tf.parse_single_example(serialized_example,features= feature) 42 | # speed_sec_flat = tf.decode_raw(features[param.SPEED_SEC],tf.int64) 43 | # speed_sec = tf.reshape(speed_sec_flat,[-1,param.width]) 44 | # label = tf.cast(features[param.LABEL],tf.int64) 45 | # 46 | # 47 | # 48 | # with tf.Session() as sess: 49 | # sess.run(tf.global_variables_initializer()) 50 | # sess.run(tf.local_variables_initializer()) 51 | # coord = tf.train.Coordinator() 52 | # threads = tf.train.start_queue_runners(coord=coord) 53 | # 54 | # for i in range(2): 55 | # speed_sec1,label1 = sess.run([speed_sec,label]) 56 | # print(type(speed_sec1)) 57 | # full_seq_num = speed_sec1.shape[0] // 100 58 | # print(full_seq_num) 59 | # list = [] 60 | # begin = 0 61 | # if full_seq_num >0: 62 | # for e in range(full_seq_num): 63 | # begin = e*30 64 | # list.append(tf.slice(speed_sec1,[begin,30],[100,30])) 65 | # remain = speed_sec1.shape[0] - full_seq_num*100 66 | # remain_tensor = tf.slice(speed_sec1,[begin,30],[remain,30]) 67 | # 68 | # remain_tensor_pad = tf.pad(remain_tensor,[[0,100 - remain],[0,0]]) 69 | # list.append(remain_tensor_pad) 70 | # for h in list: 71 | # print(h) 72 | # 73 | # tf.train.shuffle_batch() 74 | # 75 | # 76 | # 77 | # coord.request_stop() 78 | # coord.join(threads) 79 | 
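As a reference for the commented-out experiment above, the sketch below shows the same queue-based TFRecord read path in a minimal, runnable form. It assumes TensorFlow 1.x, the feature spec and constants defined in param.py, and a hypothetical TFRecord file path; it is an illustration, not part of the original pipeline.

import tensorflow as tf
import param

# Hypothetical input file; the real data lives under the tfrecords directory used elsewhere.
filenames = ["G:/all_data/tfrecords/example.tfrecords"]

filename_queue = tf.train.string_input_producer(filenames, num_epochs=1)
reader = tf.TFRecordReader()
_, serialized_example = reader.read(filename_queue)

# Parse one serialized Example with the shared feature dictionary from param.py.
features = tf.parse_single_example(serialized_example, features=param.feature)

# Decode the raw byte string and reshape to [seq_len, WIDTH].
# Note: the commented-out code above refers to param.width, but param.py defines WIDTH.
speed_sec = tf.reshape(tf.decode_raw(features[param.SPEED_SEC], tf.int64), [-1, param.WIDTH])
label = tf.cast(features[param.LABEL], tf.int64)

with tf.Session() as sess:
    sess.run([tf.global_variables_initializer(), tf.local_variables_initializer()])
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(coord=coord)
    speed_sec_val, label_val = sess.run([speed_sec, label])
    print(speed_sec_val.shape, label_val)
    coord.request_stop()
    coord.join(threads)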
--------------------------------------------------------------------------------
/config.py:
--------------------------------------------------------------------------------
1 | import json
2 | from tensorflow.python.ops.math_ops import tanh
3 | from tensorflow.python.ops.math_ops import sigmoid
4 | from enum import Enum
5 | from param import RNNType
6 | from param import NetType
7 | 
8 | 
9 | class Config(object):
10 |     def __init__(self,configFile="data/config.json"):
11 |         dconf = json.load(open(configFile))
12 |         # test set user IDs
13 |         self.test_id = dconf['test_id']
14 |         # validation set user IDs
15 |         self.val_id = dconf['val_id']
16 |         # expected sequence length
17 |         self.exp_seq_len = dconf["exp_seq_len"]
18 |         # learning rate
19 |         self.learning_rate = dconf["learning_rate"]
20 |         # batch size
21 |         self.batch_size = dconf["batch_size"]
22 |         # number of hidden layers
23 |         self.num_layers = dconf["num_layers"]
24 |         # number of training epochs
25 |         self.num_epochs = dconf["num_epochs"]
26 |         # whether to enable TensorBoard
27 |         self.tensorboard = dconf["tensorboard"]
28 |         self.init_scale = dconf["init_scale"]
29 |         # number of threads
30 |         self.num_threads = dconf["num_threads"]
31 |         # number of hidden units in the GRU
32 |         self.hidden_size = dconf["hidden_size"]
33 |         # task name
34 |         self.task = dconf["task"]
35 |         # whether to use GPU acceleration
36 |         self.useGPU = dconf["useGPU"]
37 |         # weight initialization scheme
38 |         self.weight_initializer = dconf["weight_initializer"]
39 |         # evaluation frequency
40 |         self.evaluate_freq = dconf["evaluate_freq"]
41 |         self.testmode = dconf["testmode"]
42 |         # whether to save checkpoints
43 |         self.checkpoint = dconf["checkpoint"]
44 |         # whether to restore variables
45 |         self.restore = dconf["restore"]
46 |         # activation function
47 |         self.activation = dconf["activation"]
48 |         self.test_mode = dconf["test_mode"]
49 |         # number of classes
50 |         self.num_classes = dconf["num_classes"]
51 |         # number of maxout units
52 |         self.maxOut_numUnits = dconf["maxOut_numUnits"]
53 |         # number of features
54 |         self.num_features = dconf["num_features"]
55 |         # dimensionality of the embedded vectors
56 |         self.embeded_dims = dconf["embeded_dims"]
57 |         # L2 regularization hyperparameter
58 |         self.l2_preparam = dconf["l2_preparam"]
59 |         # RNN cell type
60 |         self.rnn_type = dconf["rnn_type"]
61 | 
62 |         self.use_tfrecord = dconf["use_tfrecord"]
63 | 
64 |         self.tfrecord_path = dconf["tfrecord_path"]
65 | 
66 |         self.shuffle = dconf["shuffle"]
67 | 
68 |         self.keep_prob = dconf["keep_prob"]
69 | 
70 |         self.discretization_width = dconf["discretization_width"]
71 | 
72 |         self.net_type = dconf["net_type"]
73 | 
74 |         self.use_dropout = dconf["use_dropout"]
75 | 
76 | 
77 | 
78 | class TrainingConfig(object):
79 |     def __init__(self,is_training,is_validation,is_test,batch_size,len_features,net_type = NetType.RNN_NV1,rnn_type = RNNType.GRU):
80 |         self.is_training = is_training
81 |         self.is_validation = is_validation
82 |         self.is_test = is_test
83 |         self.batch_size = batch_size
84 |         self.rnn_type = rnn_type
85 |         self.net_type = net_type
86 | 
87 |         # feature length, i.e. the total one-hot length
88 |         self.len_features = len_features
89 |         self.train_seq_len = []
90 |         self.val_seq_len = []
91 |         self.test_seq_len = []
92 |         self.activation = tanh
93 | 
94 | 
--------------------------------------------------------------------------------
/util.py:
--------------------------------------------------------------------------------
1 | from math import radians, cos, sin, asin, sqrt,atan,pi
2 | import os
3 | from glob import glob
4 | import param
5 | from param import RNNType
6 | from param import NetType
7 | from param import DirName
8 | from log import Log
9 | 
10 | def jwd2dis(lat1,lon1,lat2,lon2):
11 |     lat1,lon1,lat2,lon2 = map(radians,[lat1,lon1,lat2,lon2])
12 |     dlon = lon2 - lon1
13 |     dlat = lat2 - lat1
14 |     a = sin(dlat / 2) ** 2 + cos(lat1) * cos(lat2) * sin(dlon / 2) ** 2
15 |     c = 2 * asin(sqrt(a))
16 |     r = 6371  # mean Earth radius, in kilometres
17 |
return c * r * 1000 18 | 19 | def jwd2angle(lat1,lon1,lat2,lon_2): 20 | dy = lon_2 - lon1 21 | dx = lat2 - lat1 22 | angle = 0 23 | 24 | if dx == 0 and dy ==0: 25 | angle = 0 26 | elif dx == 0 and dy >0: 27 | angle = 90 28 | elif dx == 0 and dy <0: 29 | angle = 270 30 | elif dy == 0 and dx >0: 31 | angle = 0 32 | elif dy == 0 and dx <0: 33 | angle = 180 34 | elif dy > 0 and dx >0: 35 | angle = atan(dy/dx) * 180/pi 36 | elif dy >0 and dx <0: 37 | angle = atan(abs(dy/dx))* 180/pi + 90 38 | elif dy <0 and dx <0: 39 | angle = atan(abs(dy / dx))* 180/pi + 180 40 | else: 41 | #小于零 42 | angle = atan(abs(dy / dx))* 180/pi + 270 43 | 44 | return angle 45 | 46 | def timestamp2second(time1,time2): 47 | 48 | return abs(time1-time2)*3600*24 49 | 50 | def switch_mode(str): 51 | str = str.strip() 52 | if(str == "bike"): 53 | return "0" 54 | 55 | if(str == "car"): 56 | return "1" 57 | 58 | if (str == "walk"): 59 | return "2" 60 | 61 | if (str == "bus"): 62 | return "3" 63 | 64 | if (str == "train"): 65 | return "4" 66 | 67 | if (str == "subway"): 68 | return "5" 69 | 70 | if (str == "airplane"): 71 | return "6" 72 | 73 | if (str == "taxi"): 74 | return "7" 75 | if (str == "boat"): 76 | return "8" 77 | if (str == "run"): 78 | return "9" 79 | if (str == "motorcycle"): 80 | return "10" 81 | else: 82 | print(str) 83 | return "11" 84 | 85 | def rename_file(): 86 | 87 | data_dir = "G:/新建文件夹/Geolife Trajectories 1.3/Data/" 88 | 89 | valiable_user_data = open("./data/have_label_user.txt", "r") 90 | user_list = valiable_user_data.readlines() 91 | for i in user_list[1:]: 92 | user_id = i[0:3] 93 | data_txt_name = data_dir + user_id + "/userdata.csv" 94 | features_name = data_dir+user_id + "/user_features.csv" 95 | new_data_name = data_dir + user_id + "/userdata_interval_1.csv" 96 | new_features_name = data_dir + user_id + "/user_features_interval_1.csv" 97 | os.rename(data_txt_name,new_data_name) 98 | os.rename(features_name,new_features_name) 99 | 100 | def delete_file(): 101 | data_dir = "G:/新建文件夹/Geolife Trajectories 1.3/Data/" 102 | 103 | valiable_user_data = open("./data/have_label_user.txt", "r") 104 | user_list = valiable_user_data.readlines() 105 | for i in user_list: 106 | user_id = i[0:3] 107 | data_txt_name = data_dir + user_id + "/user_features_interval_2.csv" 108 | 109 | os.remove(data_txt_name) 110 | 111 | def search_file(pattern,path): 112 | paths = glob(os.path.join(path,pattern)) 113 | filenames = [ path.split("\\")[1] for path in paths] 114 | filenames = [os.path.join(path,name) for name in filenames] 115 | return filenames 116 | 117 | def get_net_type(net_type): 118 | return param.NetType(net_type) 119 | 120 | def get_rnn_type(rnn_type): 121 | return param.RNNType(rnn_type) 122 | 123 | 124 | def init_environment(net_type,rnn_type): 125 | log_path = "./logdir/shiyanxiuzheng/" 126 | data_path = "./data/tfrecord12/" 127 | if net_type == NetType.DNN: 128 | data_dir = DirName.DNN.value 129 | log_dir = DirName.DNN.value 130 | elif net_type == NetType.DNN_MAXOUT: 131 | data_dir = DirName.DNN_MAXOUT.value 132 | log_dir = DirName.DNN_MAXOUT.value 133 | elif net_type == NetType.RNN_NV1: 134 | data_dir = DirName.RNN_NV1.value 135 | log_dir = DirName.RNN_NV1.value 136 | else: 137 | data_dir = DirName.RNN_NVN.value 138 | log_dir = DirName.RNN_NVN.value 139 | 140 | log_path = log_path + log_dir 141 | data_path = data_path + data_dir 142 | net_name = str(NetType(net_type)).split(".")[-1] 143 | nn_name = str(RNNType(rnn_type)).split(".")[-1] 144 | LOGGER = Log(log_path, "_" + net_name + "_" + nn_name) 145 | 146 
| return log_path,data_path,LOGGER 147 | 148 | if __name__ == "__main__": 149 | #print(search_file("interval_[0-1]_*_train.tfrecords","G:/all_data/tfrecords/")) 150 | print(jwd2angle(39.974879,116.33258899999998,39.97487,116.332673)) -------------------------------------------------------------------------------- /plt.py: -------------------------------------------------------------------------------- 1 | import matplotlib 2 | import matplotlib.pyplot as plt 3 | import pandas as pd 4 | import numpy as np 5 | 6 | matplotlib.rcParams['font.family']='SimHei' 7 | 8 | logdir = "./logdir/shiyanxiuzheng/" 9 | 10 | def read_csv_file(name): 11 | 12 | file = open(name) 13 | 14 | df = pd.read_csv(file) 15 | 16 | result_arr = np.array(df,np.float64) 17 | 18 | return result_arr 19 | 20 | 21 | def draw_chart(arr): 22 | plt.plot(arr[:,-1]) 23 | plt.show() 24 | 25 | def draw_4_features(): 26 | temp_dir = "12features/" 27 | features_3 = read_csv_file(logdir +temp_dir+"RNN_NV1_GRU_b_3.csv") 28 | features_6 = read_csv_file(logdir + temp_dir+"RNN_NV1_GRU_b_6.csv") 29 | features_9 = read_csv_file(logdir + temp_dir+"RNN_NV1_GRU_b_9.csv") 30 | features_12 = read_csv_file(logdir + temp_dir+"RNN_NV1_GRU_b_12.csv") 31 | 32 | limit = 20 33 | x = range(1,limit+1) 34 | 35 | plt.plot(x,features_3[0:limit, -1], "bx-", label="3个特征") 36 | plt.plot(x,features_6[0:limit, -1], "rx-", label="6个特征") 37 | plt.plot(x,features_9[0:limit, -1], "gx-", label="9个特征") 38 | plt.plot(x,features_12[0:limit, -1], "yx-", label="12个特征") 39 | 40 | plt.xlabel("mini-batch") 41 | plt.ylabel("accuarcy") 42 | plt.ylim(0.8, 0.95) 43 | plt.xlim(0, limit+2) 44 | plt.legend(loc=0) 45 | plt.title("RNN_Nv1 双层双向MaxoutGRU模型不同特征的精度(测试集)") 46 | plt.savefig(logdir +temp_dir +"rnn_nv1_features") 47 | 48 | 49 | plt.show() 50 | 51 | def draw_width(): 52 | temp_dir = "width/" 53 | features_3 = read_csv_file(logdir +temp_dir+"RNN_NV1_GRU_b_10.csv") 54 | features_6 = read_csv_file(logdir + temp_dir+"RNN_NV1_GRU_b_20.csv") 55 | features_9 = read_csv_file(logdir + temp_dir+"RNN_NV1_GRU_b_30.csv") 56 | features_12 = read_csv_file(logdir + temp_dir+"RNN_NV1_GRU_b_40.csv") 57 | 58 | limit = 20 59 | x = range(1,limit+1) 60 | 61 | plt.plot(x,features_3[0:limit, -1], "bx-", label="10") 62 | plt.plot(x,features_6[0:limit, -1], "rx-", label="20") 63 | plt.plot(x,features_9[0:limit, -1], "gx-", label="30") 64 | plt.plot(x,features_12[0:limit, -1], "yx-", label="40") 65 | 66 | plt.xlabel("mini-batch") 67 | plt.ylabel("accuarcy") 68 | plt.ylim(0.8, 0.96) 69 | plt.xlim(0, limit+2) 70 | plt.legend(loc=0) 71 | plt.title("RNN_Nv1 双层双向MaxoutGRU模型 离散宽度(测试集)") 72 | plt.savefig(logdir +temp_dir +"rnn_nv1_width") 73 | 74 | 75 | plt.show() 76 | 77 | def draw_dnn(): 78 | dnn = read_csv_file(logdir + "result_dnn/dnn.csv") 79 | dnn_dropout = read_csv_file(logdir + "result_dnn/dnn_dropout.csv") 80 | dnn_maxout = read_csv_file(logdir + "result_dnn/dnn_maxout.csv") 81 | 82 | x = range(1,20) 83 | 84 | plt.plot(x,dnn[:, -1], "bx-", label="dnn") 85 | plt.plot(x,dnn_dropout[:, -1], "rx-", label="dnn_dropout") 86 | plt.plot(x,dnn_maxout[:, -1], "gx-", label="dnn_maxout") 87 | 88 | plt.xlabel("mini-batch") 89 | plt.ylabel("accuarcy") 90 | plt.ylim(0.5, 1) 91 | plt.xlim(0, 21) 92 | plt.legend(loc=1) 93 | plt.title("三种DNN模型的精度(测试集)") 94 | plt.savefig(logdir + "result_dnn/dnn_3") 95 | 96 | plt.show() 97 | 98 | def rnn_3(): 99 | 100 | temp_dir = "3_rnn/" 101 | 102 | lstm = read_csv_file(logdir + temp_dir+"RNN_NV13 2.csv") 103 | maxoutgru = read_csv_file(logdir + temp_dir+ "RNN_NV13 4.csv") 104 | 
normal_gru = read_csv_file(logdir + temp_dir+ "RNN_NV13 6.csv") 105 | 106 | limit = 29 107 | x = range(1,limit+1) 108 | 109 | plt.plot(x,lstm[0:limit, -1], "bx-", label="lstm") 110 | plt.plot(x,normal_gru[0:limit, -1], "rx-", label="normal_gru") 111 | plt.plot(x,maxoutgru[0:limit, -1], "gx-", label="maxout_gru") 112 | 113 | plt.xlabel("mini-batch") 114 | plt.ylabel("accuarcy") 115 | plt.ylim(0.8, 0.95) 116 | plt.xlim(0, limit+2) 117 | plt.legend(loc=1) 118 | plt.title("三种RNN模型的精度(测试集)") 119 | plt.savefig(logdir +temp_dir +"3_rnn") 120 | 121 | plt.show() 122 | 123 | def gru_2(): 124 | temp_dir = "3_rnn/" 125 | 126 | gru = read_csv_file(logdir + temp_dir + "RNN_NV13 3.csv") 127 | gru_b= read_csv_file(logdir + temp_dir + "RNN_NV13 4.csv") 128 | 129 | limit = 48 130 | x = range(1, limit + 1) 131 | 132 | plt.plot(x, gru[0:limit, -1], "bx-", label="单向MaxoutGRU") 133 | plt.plot(x, gru_b[0:limit, -1], "rx-", label="双向MaxoutGRU") 134 | 135 | plt.xlabel("mini-batch") 136 | plt.ylabel("accuarcy") 137 | plt.ylim(0.8, 0.95) 138 | plt.xlim(0, limit + 2) 139 | plt.legend(loc=1) 140 | plt.title("单双向MaxoutGRU模型的精度(测试集)") 141 | plt.savefig(logdir + temp_dir + "gru_2") 142 | 143 | plt.show() 144 | 145 | def rnn_nvn(): 146 | temp_dir = "3nvn_gru/" 147 | 148 | gru = read_csv_file(logdir + temp_dir+"RNN_NVN_GRU.csv") 149 | gru_b_3 = read_csv_file(logdir + temp_dir+ "RNN_NVN_GRU_b 3 feature.csv") 150 | gru_b_9 = read_csv_file(logdir + temp_dir+ "RNN_NVN_GRU_b 9 feature.csv") 151 | 152 | limit = 30 153 | x = range(1,limit+1) 154 | 155 | plt.plot(x,gru[0:limit, -1], "bx-", label="单向MaxoutGRU") 156 | plt.plot(x,gru_b_3[0:limit, -1], "rx-", label="双向MaxoutGRU 3个特征") 157 | plt.plot(x,gru_b_9[0:limit, -1], "gx-", label="双向MaxoutGRU 9个特征") 158 | 159 | plt.xlabel("mini-batch") 160 | plt.ylabel("accuarcy") 161 | plt.ylim(0.8, 0.95) 162 | plt.xlim(0, limit+2) 163 | plt.legend(loc=1) 164 | plt.title("RNN_NVN模型的精度(测试集)") 165 | plt.savefig(logdir +temp_dir +"3_nvn_gru") 166 | 167 | plt.show() 168 | 169 | def draw_3_model(): 170 | temp_dir = "3_model/" 171 | 172 | gru = read_csv_file(logdir + temp_dir+"dnn no dropout.csv") 173 | gru_b_3 = read_csv_file(logdir + temp_dir+ "rnn_nvn.csv") 174 | gru_b_9 = read_csv_file(logdir + temp_dir+ "rnn_nv1.csv") 175 | 176 | limit = 39 177 | x = range(1,limit+1) 178 | 179 | plt.plot(x,gru[0:limit, -1], "bx-", label="双层DNN") 180 | plt.plot(x,gru_b_3[0:limit, -1], "rx-", label="双层双向MaxoutGRU RNN_NVN") 181 | plt.plot(x,gru_b_9[0:limit, -1], "gx-", label="双层双向MaxoutGRU RNN_NV1") 182 | 183 | plt.xlabel("mini-batch") 184 | plt.ylabel("accuarcy") 185 | plt.ylim(0.8, 0.95) 186 | plt.xlim(0, limit+2) 187 | plt.legend(loc=0) 188 | plt.title("三种模型的精度(测试集)") 189 | plt.savefig(logdir +temp_dir +"3_model") 190 | 191 | plt.show() 192 | 193 | def draw_nvn_2(): 194 | temp_dir = "3nvn_gru/" 195 | 196 | gru = read_csv_file(logdir + temp_dir+"RNN_NVN_LSTM_b.csv") 197 | gru_b_3 = read_csv_file(logdir + temp_dir+ "RNN_NVN_GRU_b 9 feature.csv") 198 | #gru_b_9 = read_csv_file(logdir + temp_dir+ "rnn_nv1.csv") 199 | 200 | limit = 39 201 | x = range(1,limit+1) 202 | 203 | plt.plot(x,gru[0:limit, -1], "bx-", label="双层双向LSTM RNN_NVN") 204 | plt.plot(x,gru_b_3[0:limit, -1], "rx-", label="双层双向MaxoutGRU RNN_NVN") 205 | #plt.plot(x,gru_b_9[0:limit, -1], "gx-", label="双层双向MaxoutGRU RNN_NV1") 206 | 207 | plt.xlabel("mini-batch") 208 | plt.ylabel("accuarcy") 209 | plt.ylim(0.8, 0.95) 210 | plt.xlim(0, limit+2) 211 | plt.legend(loc=0) 212 | plt.title("RNN_NvN模型两种网络结构的精度(测试集)") 213 | plt.savefig(logdir +temp_dir +"2_rnn") 214 
| 215 | plt.show() 216 | 217 | def draw_baifenwei(): 218 | temp_dir = "baifenweiduibi/" 219 | 220 | gru = read_csv_file(logdir + temp_dir+"baifenwei95.csv") 221 | gru_b_3 = read_csv_file(logdir + temp_dir+ "baifenwei99.csv") 222 | #gru_b_9 = read_csv_file(logdir + temp_dir+ "rnn_nv1.csv") 223 | 224 | limit = 30 225 | x = range(1,limit+1) 226 | 227 | plt.plot(x,gru[0:limit, -1], "bx-", label="百分位95") 228 | plt.plot(x,gru_b_3[0:limit, -1], "rx-", label="百分位99") 229 | #plt.plot(x,gru_b_9[0:limit, -1], "gx-", label="双层双向MaxoutGRU RNN_NV1") 230 | 231 | plt.xlabel("mini-batch") 232 | plt.ylabel("accuarcy") 233 | plt.ylim(0.84, 0.95) 234 | plt.xlim(0, limit+2) 235 | plt.legend(loc=0) 236 | plt.title("百分位对比(测试集)") 237 | plt.savefig(logdir +temp_dir +"baifenwei") 238 | 239 | plt.show() 240 | 241 | def draw_batch_size(): 242 | 243 | temp_dir = "3_model/" 244 | 245 | gru = read_csv_file(logdir + temp_dir+"rnn_nv1.csv") 246 | gru_b_3 = read_csv_file(logdir + temp_dir+ "rnn_nv1_256.csv") 247 | #gru_b_9 = read_csv_file(logdir + temp_dir+ "rnn_nv1.csv") 248 | 249 | limit = 30 250 | x = range(1,limit+1) 251 | 252 | plt.plot(x,gru[0:limit, -1], "bx-", label="batch size 128") 253 | plt.plot(x,gru_b_3[0:limit, -1], "rx-", label="batch size 256") 254 | #plt.plot(x,gru_b_9[0:limit, -1], "gx-", label="双层双向MaxoutGRU RNN_NV1") 255 | 256 | plt.xlabel("mini-batch") 257 | plt.ylabel("accuarcy") 258 | plt.ylim(0.84, 0.95) 259 | plt.xlim(0, limit+2) 260 | plt.legend(loc=0) 261 | plt.title("batch size 对比(测试集)") 262 | plt.savefig(logdir +temp_dir +"batch size") 263 | 264 | plt.show() 265 | 266 | def draw_hidden(): 267 | temp_dir = "3_model/" 268 | 269 | gru = read_csv_file(logdir + temp_dir+"rnn_nv1_hidden_50.csv") 270 | gru_b_3 = read_csv_file(logdir + temp_dir+ "rnn_nv1_hidden_200.csv") 271 | gru_b_9 = read_csv_file(logdir + temp_dir+ "rnn_nv1.csv") 272 | 273 | limit = 41 274 | x = range(1,limit+1) 275 | 276 | plt.plot(x,gru[0:limit, -1], "bx-", label="hidden size 50") 277 | plt.plot(x,gru_b_3[0:limit, -1], "rx-", label="hidden size 200") 278 | plt.plot(x,gru_b_9[0:limit, -1], "gx-", label="hidden size 100") 279 | 280 | plt.xlabel("mini-batch") 281 | plt.ylabel("accuarcy") 282 | plt.ylim(0.8, 0.95) 283 | plt.xlim(0, limit+2) 284 | plt.legend(loc=0) 285 | plt.title("HIDDEN SIZE(测试集)") 286 | plt.savefig(logdir +temp_dir +"2_rnn") 287 | 288 | plt.show() 289 | 290 | def draw_activation(): 291 | temp_dir = "3_model/" 292 | 293 | gru = read_csv_file(logdir + temp_dir+"rnn_nv1_tahn.csv") 294 | gru_b_3 = read_csv_file(logdir + temp_dir+ "rnn_nv1.csv") 295 | #gru_b_9 = read_csv_file(logdir + temp_dir+ "rnn_nv1.csv") 296 | 297 | limit = 66 298 | x = range(1,limit+1) 299 | 300 | plt.plot(x,gru[0:limit, -1], "bx-", label="tahn") 301 | plt.plot(x,gru_b_3[0:limit, -1], "rx-", label="sigmod") 302 | #plt.plot(x,gru_b_9[0:limit, -1], "gx-", label="hidden size 100") 303 | 304 | plt.xlabel("mini-batch") 305 | plt.ylabel("accuarcy") 306 | plt.ylim(0.8, 0.95) 307 | plt.xlim(0, limit+2) 308 | plt.legend(loc=0) 309 | plt.title("MaxoutGRU的两种activation(测试集)") 310 | plt.savefig(logdir +temp_dir +"activation") 311 | 312 | plt.show() 313 | 314 | if __name__ == "__main__": 315 | draw_width() -------------------------------------------------------------------------------- /model.py: -------------------------------------------------------------------------------- 1 | import config 2 | import tensorflow as tf 3 | from customized_gru import CustomizedGRU as GRUCell 4 | from tensorflow.python.ops.rnn_cell import GRUCell as BasicGRUCell 5 | import 
tensorflow.contrib as tf_ct 6 | from tensorflow.contrib.rnn import BasicLSTMCell 7 | from param import RNNType 8 | from param import NetType 9 | import linear 10 | import param 11 | import util 12 | 13 | 14 | 15 | 16 | class Model(object): 17 | 18 | def __init__(self,conf,config): 19 | 20 | self.init_conf(conf) 21 | self.init_config(config) 22 | 23 | self.current_step = tf.Variable(0,trainable=False) 24 | self._learning_rate = tf.train.exponential_decay(self.learning_rate,self.current_step,decay_steps=10,decay_rate=0.98,staircase=True) 25 | 26 | 27 | #self.current_step = tf.Variable(0) 28 | if self.net_type == NetType.DNN: 29 | self.init_dnn_type() 30 | elif self.net_type == NetType.CNN: 31 | self.init_cnn_type() 32 | elif self.net_type == NetType.RNN_NV1: 33 | self.init_rnn_type_nv1() 34 | elif self.net_type == NetType.RNN_NVN: 35 | self.init_rnn_type_nvn() 36 | elif self.net_type == NetType.DNN_MAXOUT: 37 | self.init_dnn_type_with_maxout() 38 | 39 | #init 文件里的配置 40 | def init_conf(self,conf): 41 | self.num_threads = conf.num_threads 42 | self.hidden_size = conf.hidden_size # 隐藏层节点 43 | self.learning_rate = conf.learning_rate # 学习速率 44 | self.num_layers = conf.num_layers # 隐藏层数 45 | self.num_epochs = conf.num_epochs # 训练周期 46 | self.exp_seq_len = conf.exp_seq_len # 序列长度 47 | self.num_classes = conf.num_classes # 分类个数 48 | self.num_features = conf.num_features # 特征数量 49 | self.maxOut_numUnits = conf.maxOut_numUnits # maxout节点 50 | self.embeded_dims = conf.embeded_dims # 嵌入维数 51 | self.bias_initializer = tf.random_uniform_initializer(0, 0.001) # bias初始器 52 | self.l2_preparam = conf.l2_preparam # l2正则化超参数 53 | self.tensorboard =conf.tensorboard 54 | self.use_tfrecord = conf.use_tfrecord 55 | self.tfrecord_path = conf.tfrecord_path 56 | self.shuffle = conf.shuffle 57 | self.keep_prob = conf.keep_prob 58 | self.use_dropout = conf.use_dropout 59 | 60 | #init 创建模型时的配置 61 | def init_config(self,config): 62 | # 将一些要创建时的数据通过config类传进来 包括模式,数据长度等等 63 | self.net_type = config.net_type # 网络类型 64 | self.rnn_type = config.rnn_type # rnn类型 65 | self.is_training = config.is_training # 是否为训练模式 66 | self.is_test = config.is_test # 是否为测试模式 67 | self.is_validation = config.is_validation # 是否为验证模式 68 | self.len_features = config.len_features # 特征长度 69 | self.train_seq_len = config.train_seq_len # 训练集序列长度列表 70 | self.valid_seq_len = config.val_seq_len 71 | self.test_seq_len = config.test_seq_len 72 | self.activation = config.activation # 激励函数 73 | self.batch_size = config.batch_size # batch尺寸 74 | 75 | def init_rnn_type_nv1(self): 76 | 77 | # 输入数据 78 | self._input_data = tf.placeholder(tf.float32, [self.exp_seq_len, self.batch_size, self.len_features], 79 | name="input_data") 80 | self._targets = tf.placeholder(tf.int32,[self.batch_size],name="label") 81 | #self._valid_target = self._targets 82 | 83 | # 用于提前结束每个batch 84 | self._early_stop = tf.placeholder(tf.int32, shape=[self.batch_size], name="early-stop") 85 | 86 | if self.is_training: 87 | self.seq_len = self.exp_seq_len * self.batch_size 88 | elif self.is_validation: 89 | self.seq_len = sum(self.valid_seq_len) 90 | else: 91 | self.seq_len = sum(self.test_seq_len) 92 | 93 | # 获得多层双向gru的cell 94 | with tf.name_scope("mutil_rnn_cell"): 95 | cell = self.get_mutil_rnn_cell() 96 | 97 | # with tf.name_scope("embeded"): 98 | # self.get_embeded_vec() 99 | 100 | # 初始化cell 101 | self.set_initial_states(cell) 102 | 103 | # 获得gru的输出 104 | with tf.name_scope("rnn_outputs"): 105 | self.get_rnn_outputs(cell) 106 | 107 | 
#w,b=self.init_mutil_dnn_weights(self.hidden_size*2,self.hidden_size*2,self.hidden_size*2,self.num_layers -1) 108 | #self._output = self.mlp(self._output,w,b,0.5) 109 | 110 | # softmax层的权重 111 | with tf.name_scope("softmax_layer") as scope: 112 | self.get_softmax_layer_output() 113 | 114 | # 获得混淆矩阵 115 | with tf.name_scope("confusion_matrix") as scope: 116 | self._confusion_matrix = tf.confusion_matrix(self._targets, self._digit_predictions,self.num_classes) 117 | 118 | with tf.name_scope("cross_entropy") as scope: 119 | self._onehot_labels = tf.one_hot(self._targets,depth=self.num_classes) 120 | self._loss = tf.nn.softmax_cross_entropy_with_logits(labels=self._onehot_labels,logits=self._predictions) 121 | self._cross_entropy = tf.reduce_sum(self._loss) 122 | self._cost = tf.reduce_mean(self._loss) 123 | self.add_l2_regulation() 124 | 125 | with tf.name_scope("accuracy") as scope: 126 | self._correct_prediction = tf.equal(self._targets, self._digit_predictions) 127 | self._accuracy = tf.reduce_mean(tf.cast(self._correct_prediction,tf.float32)) 128 | #self._accuracy = tf.metrics.accuracy( self._target,self._digit_predictions)[1] 129 | 130 | with tf.name_scope("optimization") as scope: 131 | self._train_op = tf.train.AdamOptimizer(self._learning_rate).minimize(self._cost,global_step=self.current_step) 132 | 133 | if self.tensorboard: 134 | self.w_hist = tf.summary.histogram("weights", self._softmax_w) 135 | self.b_hist = tf.summary.histogram("biases", self._softmax_b) 136 | self.y_hist_train = tf.summary.histogram("train-predictions", self._predictions) 137 | self.y_hist_test = tf.summary.histogram("test-predictions", self._predictions) 138 | self.mse_summary_train = tf.summary.scalar("train-cross-entropy-cost", self._cost) 139 | self.mse_summary_test = tf.summary.scalar("test-cross-entropy-cost", self._cost) 140 | 141 | def init_rnn_type_nvn(self): 142 | self._input_data = tf.placeholder(tf.float32, [self.exp_seq_len, self.batch_size, self.len_features], 143 | name="input_data") 144 | self._targets = tf.placeholder(tf.int32, [self.batch_size, self.exp_seq_len], name="targets") 145 | 146 | self._weight_sequence_loss = tf.placeholder(tf.float32,[self.batch_size,self.exp_seq_len],name="weight_sequence_loss") 147 | 148 | if self.is_training: 149 | self.seq_len = self.exp_seq_len * self.batch_size 150 | elif self.is_validation: 151 | self.seq_len = sum(self.valid_seq_len) 152 | else: 153 | self.seq_len = sum(self.test_seq_len) 154 | 155 | # 获得多层双向gru的cell 156 | with tf.name_scope("mutil_rnn_cell"): 157 | cell = self.get_mutil_rnn_cell() 158 | 159 | # 用于提前结束每个batch 160 | self._early_stop = tf.placeholder(tf.int32, shape=[self.batch_size], name="early-stop") 161 | 162 | # with tf.name_scope("embeded"): 163 | # self.get_embeded_vec() 164 | 165 | # 初始化cell 166 | self.set_initial_states(cell) 167 | 168 | # 获得gru的输出 169 | with tf.name_scope("rnn_outputs"): 170 | self.get_rnn_outputs(cell) 171 | # 获得去除padding的标签 172 | # self._valid_target = self.get_valid_sequence( 173 | # tf.reshape(self._targets, [self.exp_seq_len * self.batch_size]), 174 | # self.num_classes) 175 | # softmax层的权重 176 | with tf.name_scope("softmax-layer") as scope: 177 | self.get_softmax_layer_output() 178 | 179 | # 获得混淆矩阵 180 | with tf.name_scope("confusion-matrix") as scope: 181 | self._confusion_matrix = tf.confusion_matrix(tf.reshape(self._targets,[self.batch_size*self.exp_seq_len]), self._digit_predictions,self.num_classes) 182 | 183 | with tf.name_scope("seq2seq-loss-by-example") as scpoe: 184 | self._loss = 
tf_ct.legacy_seq2seq.sequence_loss_by_example([self._predictions], 185 | [tf.reshape(self.targets,[self.batch_size*self.exp_seq_len])], 186 | [tf.reshape(self._weight_sequence_loss,[self.batch_size*self.exp_seq_len])]) 187 | # self._loss = tf.contrib.legacy_seq2seq.sequence_loss_by_example( 188 | # [self._predictions], 189 | # [self._targets], 190 | # [tf.ones([tf.cast(self.getTensorShape(self._targets)[0],tf.int32)])]) 191 | self._cross_entropy = tf.reduce_sum(self._loss) 192 | self._cost = tf.reduce_mean(self._loss) 193 | self.add_l2_regulation() 194 | # 计算l2cost 195 | # tv = tf.trainable_variables() 196 | # #tf_ct.layers.l2_regularizer() 197 | # 198 | # self._regularization_cost = self.l2_preparam*tf.reduce_sum([tf.nn.l2_loss(v) for v in tv]) 199 | # #总cost为 基础cost + l2cost 200 | # #self._regularization_cost = tf.reduce_sum(tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)) 201 | # self._cost = self._cost+self._regularization_cost 202 | self._accuracy = tf_ct.metrics.accuracy(self._digit_predictions, tf.reshape(self._targets,[self.batch_size*self.exp_seq_len])) 203 | 204 | with tf.name_scope("optimization") as scope: 205 | self._train_op = tf.train.AdamOptimizer(self._learning_rate).minimize(self._cost,global_step=self.current_step) 206 | 207 | if self.tensorboard: 208 | self.w_hist = tf.summary.histogram("weights", self._softmax_w) 209 | self.b_hist = tf.summary.histogram("biases", self._softmax_b) 210 | self.y_hist_train = tf.summary.histogram("train-predictions", self._predictions) 211 | self.y_hist_test = tf.summary.histogram("test-predictions", self._predictions) 212 | self.mse_summary_train = tf.summary.scalar("train-cross-entropy-cost", self._cost) 213 | self.mse_summary_test = tf.summary.scalar("test-cross-entropy-cost", self._cost) 214 | 215 | def init_dnn_type(self): 216 | # 输入数据 217 | self._input_data = tf.placeholder(tf.float32, [None, self.len_features], 218 | name="input_data") 219 | self._early_stop = tf.placeholder(tf.int32, shape=[None], name="early-stop") 220 | self._targets = tf.placeholder(tf.int32, [None], name="label") 221 | 222 | with tf.name_scope("init_weights") as scope: 223 | weights,biases = self.init_mutil_dnn_weights(self.len_features,self.hidden_size,self.num_classes,self.num_layers) 224 | 225 | with tf.name_scope("mlp") as scope: 226 | 227 | self._predictions = self.mlp(self._input_data,weights,biases,self.keep_prob) 228 | 229 | with tf.name_scope("cost") as scope: 230 | 231 | self._onehot_labels = tf.one_hot(self._targets, depth=self.num_classes) 232 | 233 | self._loss = tf.reduce_sum(tf.nn.softmax_cross_entropy_with_logits_v2(labels=self._onehot_labels,logits=self._predictions)) 234 | self._cross_entropy = tf.reduce_sum(self._loss) 235 | self._cost = tf.reduce_mean(self._loss) 236 | 237 | self.add_l2_regulation() 238 | 239 | with tf.name_scope("accuracy") as scope: 240 | self._prob_predictions = tf.nn.softmax(self._predictions) 241 | # 获得每个数据最大的索引 242 | self._digit_predictions = tf.argmax(self._prob_predictions, axis=1, output_type=tf.int32) 243 | 244 | self._correct_prediction = tf.equal(self._targets, self._digit_predictions) 245 | self._accuracy = tf.reduce_mean(tf.cast(self._correct_prediction,tf.float32)) 246 | #self._accuracy = tf.metrics.accuracy( self._valid_target,self._digit_predictions)[1] 247 | 248 | with tf.name_scope("confusion-matrix") as scope: 249 | self._confusion_matrix = tf.confusion_matrix(self._targets, self._digit_predictions,self.num_classes) 250 | 251 | with tf.name_scope("optimization") as scope: 252 | self._train_op = 
tf.train.AdamOptimizer(self._learning_rate).minimize(self._cost,global_step=self.current_step) 253 | 254 | def init_dnn_type_with_maxout(self): 255 | # 输入数据 256 | self._input_data = tf.placeholder(tf.float32, [None, self.len_features], 257 | name="input_data") 258 | self._early_stop = tf.placeholder(tf.int32, shape=[None], name="early-stop") 259 | self._targets = tf.placeholder(tf.int32, [None], name="label") 260 | 261 | with tf.name_scope("init_hidden_weights") as scope: 262 | weights,biases = self.init_mutil_dnn_weights(self.len_features,self.hidden_size,self.hidden_size,self.num_layers -1) 263 | 264 | with tf.name_scope("hidden_mlp") as scope: 265 | 266 | self._hidden_outputs = self.mlp(self._input_data,weights,biases,self.keep_prob) 267 | 268 | with tf.name_scope("maxout_layer") as scope: 269 | self._maxout_output = self.maxout(self._hidden_outputs,self.maxOut_numUnits,self.hidden_size,0,scope) 270 | 271 | 272 | with tf.name_scope("softmax_layer") as scope: 273 | self._softmax_w = tf.get_variable("softmax_w", [self.hidden_size, self.num_classes]) 274 | 275 | self._softmax_b = tf.get_variable("softmax_b", [self.num_classes], initializer=self.bias_initializer) 276 | 277 | self._predictions = tf.matmul(self._maxout_output,self._softmax_w) + self._softmax_b 278 | 279 | self._prob_predictions = tf.nn.softmax(self._predictions) 280 | 281 | with tf.name_scope("cost") as scope: 282 | 283 | self._onehot_labels = tf.one_hot(self._targets, depth=self.num_classes) 284 | 285 | self._loss = tf.nn.softmax_cross_entropy_with_logits_v2(labels=self._onehot_labels,logits=self._predictions) 286 | 287 | self._cross_entropy = tf.reduce_sum(self._loss) 288 | 289 | self._cost = tf.reduce_mean(self._loss) 290 | 291 | self.add_l2_regulation() 292 | 293 | with tf.name_scope("accuracy") as scope: 294 | # 获得每个数据最大的索引 295 | self._digit_predictions = tf.argmax(self._prob_predictions, axis=1, output_type=tf.int32) 296 | 297 | self._correct_prediction = tf.equal(self._targets, self._digit_predictions) 298 | self._accuracy = tf.reduce_mean(tf.cast(self._correct_prediction,tf.float32)) 299 | #self._accuracy = tf.metrics.accuracy( self._valid_target,self._digit_predictions)[1] 300 | 301 | with tf.name_scope("confusion-matrix") as scope: 302 | self._confusion_matrix = tf.confusion_matrix(self._targets, self._digit_predictions,self.num_classes) 303 | 304 | with tf.name_scope("optimization") as scope: 305 | self._train_op = tf.train.AdamOptimizer(self._learning_rate).minimize(self._cost,global_step=self.current_step) 306 | 307 | 308 | def init_cnn_type(self): 309 | pass 310 | 311 | def init_mutil_dnn_weights(self,ils, hls, ols,hl_num): 312 | weights, bias = {}, {} 313 | stddev = 0.1 314 | for i in range(hl_num + 1): 315 | fan_in = ils if i == 0 else hls 316 | fan_out = ols if i == hl_num else hls 317 | weights[i] = tf.get_variable("weight_" + str(i),shape=[fan_in, fan_out]) 318 | bias[i] = tf.get_variable("bias_"+str(i),shape = [fan_out]) 319 | return weights, bias 320 | 321 | def mlp(self,_x, _w, _b, _keep_prob): 322 | layers = {} 323 | for i in range(len(_w)): 324 | if i == 0: 325 | if self.use_dropout: 326 | layers[i] = tf.nn.dropout(self.activation(tf.add(tf.matmul(_x, _w[i]), _b[i])), _keep_prob) 327 | else: 328 | layers[i] = self.activation(tf.add(tf.matmul(_x, _w[i]), _b[i])) 329 | elif i < len(_w) - 1: 330 | if self.use_dropout: 331 | layers[i] = tf.nn.dropout(self.activation(tf.add(tf.matmul(layers[i - 1], _w[i]), _b[i])), _keep_prob) 332 | else: 333 | layers[i] = self.activation(tf.add(tf.matmul(layers[i - 1], 
_w[i]), _b[i])) 334 | else: 335 | layers[i] = tf.add(tf.matmul(layers[i - 1], _w[i]), _b[i]) 336 | return layers[len(_w) - 1] 337 | 338 | def get_embeded_vec(self): 339 | 340 | self._embeding_weights = tf.get_variable(name="embeding",shape=[self.len_features,self.embeded_dims],dtype=tf.float32) 341 | 342 | embed_input = tf.reshape(self._input_data,[self.exp_seq_len*self.batch_size,self.len_features]) 343 | 344 | #embeding_bias = tf.get_variable(name="embeding_bias",shape=[self.embeded_dims],dtype=tf.float32,initializer=self.bias_initializer) 345 | 346 | embed_result = tf.matmul(embed_input,self.embeding_weights)# + embeding_bias 347 | 348 | self._embeded_result = tf.reshape(embed_result,[self.exp_seq_len,self.batch_size,self.embeded_dims]) 349 | 350 | def get_mutil_rnn_cell(self): 351 | if self.use_dropout: 352 | if self.rnn_type == RNNType.GRU: 353 | cell = tf_ct.rnn.MultiRNNCell( 354 | [tf_ct.rnn.DropoutWrapper(GRUCell(self.hidden_size, self.maxOut_numUnits, activation=self.activation),self.keep_prob,self.keep_prob,self.keep_prob) for _ in range(self.num_layers)]) 355 | return cell 356 | elif self.rnn_type == RNNType.GRU_b: 357 | cell_fw = tf_ct.rnn.MultiRNNCell( 358 | [tf_ct.rnn.DropoutWrapper(GRUCell(self.hidden_size, self.maxOut_numUnits, activation=self.activation),self.keep_prob,self.keep_prob,self.keep_prob) for _ in range(self.num_layers)]) 359 | cell_bw = tf_ct.rnn.MultiRNNCell( 360 | [tf_ct.rnn.DropoutWrapper(GRUCell(self.hidden_size, self.maxOut_numUnits, activation=self.activation),self.keep_prob,self.keep_prob,self.keep_prob) for _ in range(self.num_layers)]) 361 | return (cell_fw,cell_bw) 362 | elif self.rnn_type == RNNType.LSTM: 363 | cell = tf_ct.rnn.MultiRNNCell( 364 | [tf_ct.rnn.DropoutWrapper(BasicLSTMCell(self.hidden_size,activation=self.activation),self.keep_prob,self.keep_prob,self.keep_prob) for _ in range(self.num_layers)]) 365 | return cell 366 | elif self.rnn_type == RNNType.LSTM_b: 367 | cell_fw = tf_ct.rnn.MultiRNNCell( 368 | [tf_ct.rnn.DropoutWrapper(BasicLSTMCell(self.hidden_size,activation=self.activation),self.keep_prob,self.keep_prob,self.keep_prob) for _ in range(self.num_layers)]) 369 | cell_bw = tf_ct.rnn.MultiRNNCell( 370 | [tf_ct.rnn.DropoutWrapper(BasicLSTMCell(self.hidden_size,activation=self.activation),self.keep_prob,self.keep_prob,self.keep_prob) for _ in range(self.num_layers)]) 371 | return (cell_fw,cell_bw) 372 | elif self.rnn_type == RNNType.NORM_GRU: 373 | cell = tf_ct.rnn.MultiRNNCell( 374 | [tf_ct.rnn.DropoutWrapper( 375 | BasicGRUCell(self.hidden_size, activation=self.activation), self.keep_prob, 376 | self.keep_prob, self.keep_prob) for _ in range(self.num_layers)]) 377 | return cell 378 | elif self.rnn_type == RNNType.NORM_GRU_b: 379 | cell_fw = tf_ct.rnn.MultiRNNCell( 380 | [tf_ct.rnn.DropoutWrapper(BasicGRUCell(self.hidden_size, activation=self.activation), 381 | self.keep_prob, self.keep_prob, self.keep_prob) for _ in 382 | range(self.num_layers)]) 383 | cell_bw = tf_ct.rnn.MultiRNNCell( 384 | [tf_ct.rnn.DropoutWrapper(BasicGRUCell(self.hidden_size, activation=self.activation), 385 | self.keep_prob, self.keep_prob, self.keep_prob) for _ in 386 | range(self.num_layers)]) 387 | return (cell_fw, cell_bw) 388 | else: 389 | 390 | if self.rnn_type == RNNType.GRU: 391 | cell = tf_ct.rnn.MultiRNNCell( 392 | [GRUCell(self.hidden_size, self.maxOut_numUnits, activation=self.activation) for _ in range(self.num_layers)]) 393 | return cell 394 | elif self.rnn_type == RNNType.GRU_b: 395 | cell_fw = tf_ct.rnn.MultiRNNCell( 396 | 
[GRUCell(self.hidden_size, self.maxOut_numUnits, activation=self.activation) for _ in range(self.num_layers)]) 397 | cell_bw = tf_ct.rnn.MultiRNNCell( 398 | [GRUCell(self.hidden_size, self.maxOut_numUnits, activation=self.activation) for _ in range(self.num_layers)]) 399 | return (cell_fw,cell_bw) 400 | elif self.rnn_type == RNNType.LSTM: 401 | cell = tf_ct.rnn.MultiRNNCell( 402 | [BasicLSTMCell(self.hidden_size,activation=self.activation) for _ in range(self.num_layers)]) 403 | return cell 404 | elif self.rnn_type == RNNType.LSTM_b: 405 | cell_fw = tf_ct.rnn.MultiRNNCell( 406 | [BasicLSTMCell(self.hidden_size,activation=self.activation) for _ in range(self.num_layers)]) 407 | cell_bw = tf_ct.rnn.MultiRNNCell( 408 | [BasicLSTMCell(self.hidden_size,activation=self.activation) for _ in range(self.num_layers)]) 409 | return (cell_fw,cell_bw) 410 | elif self.rnn_type == RNNType.NORM_GRU: 411 | cell = tf_ct.rnn.MultiRNNCell( 412 | [BasicGRUCell(self.hidden_size, activation=self.activation) for _ in range(self.num_layers)]) 413 | return cell 414 | elif self.rnn_type == RNNType.NORM_GRU_b: 415 | cell_fw = tf_ct.rnn.MultiRNNCell( 416 | [BasicGRUCell(self.hidden_size, activation=self.activation) for _ in range(self.num_layers)]) 417 | cell_bw = tf_ct.rnn.MultiRNNCell( 418 | [BasicGRUCell(self.hidden_size, activation=self.activation) for _ in range(self.num_layers)]) 419 | return (cell_fw, cell_bw) 420 | 421 | #初始化cell的状态 422 | def set_initial_states(self, cell): 423 | if self.rnn_type == RNNType.GRU or self.rnn_type == RNNType.LSTM or self.rnn_type == RNNType.NORM_GRU: 424 | self._initial_state = cell.zero_state(self.batch_size,tf.float32) 425 | elif self.rnn_type == RNNType.GRU_b or self.rnn_type == RNNType.LSTM_b or self.rnn_type == RNNType.NORM_GRU_b: 426 | (cell_fw, cell_bw) = cell 427 | self._initial_state_fw = cell_fw.zero_state(self.batch_size, tf.float32) 428 | self._initial_state_bw = cell_bw.zero_state(self.batch_size, tf.float32) 429 | 430 | def get_rnn_outputs(self,cell): 431 | if self.rnn_type == RNNType.LSTM or self.rnn_type == RNNType.GRU or self.rnn_type == RNNType.NORM_GRU: 432 | self._outputs,self._state = tf.nn.dynamic_rnn(cell,self._input_data,sequence_length=self._early_stop, 433 | initial_state=self.initial_state, 434 | time_major=True,dtype=tf.float32) 435 | if self.net_type == NetType.RNN_NVN: 436 | outputs = tf.transpose(self._outputs, perm=[1, 0, 2]) 437 | 438 | self._output = tf.reshape(tf.concat(axis=0, values=outputs), 439 | [self.exp_seq_len * self.batch_size, self.hidden_size]) 440 | 441 | #self._valid_output = self.get_valid_sequence(self._output, self.hidden_size) 442 | 443 | elif self.net_type == NetType.RNN_NV1: 444 | if self.rnn_type == RNNType.LSTM : 445 | state_h = self._state[-1] 446 | self._output = state_h[-1] 447 | elif self.rnn_type == RNNType.GRU or self.rnn_type == RNNType.NORM_GRU: 448 | self._output = self._state[-1] 449 | 450 | 451 | elif self.rnn_type == RNNType.LSTM_b or self.rnn_type == RNNType.GRU_b or self.rnn_type == RNNType.NORM_GRU_b: 452 | (cell_fw, cell_bw) = cell 453 | self._outputs, self._state = tf.nn.bidirectional_dynamic_rnn(cell_fw, cell_bw, self._input_data, 454 | sequence_length=self._early_stop, 455 | initial_state_fw=self._initial_state_fw, 456 | initial_state_bw=self._initial_state_bw, 457 | time_major=True, dtype=tf.float32) 458 | 459 | if self.net_type == NetType.RNN_NVN: 460 | 461 | output_fw, output_bw = self._outputs 462 | output_fw = tf.transpose(output_fw, perm=[1, 0, 2]) 463 | output_bw = tf.transpose(output_bw, perm=[1, 
0, 2]) 464 | outputs = tf.concat(axis=2, values=[output_fw, output_bw]) 465 | # Concatenates tensors along one dimension. 466 | # this will flatten the dimension of the matrix to [batch_size * num_steps, num_hidden_nodes] 467 | # However, this is not the true output sequence, since padding added a number of empty elements 468 | # Extra padding elements should be removed from the output sequence. 469 | # Here first concatenate all vessels into one long sequence, including paddings 470 | self._output = tf.reshape(tf.concat(axis=0, values=outputs), 471 | [self.exp_seq_len * self.batch_size, self.hidden_size * 2]) 472 | # Remove padding here 473 | #self._valid_output = self.get_valid_sequence(self._output, self.hidden_size * 2) 474 | elif self.net_type == NetType.RNN_NV1: 475 | state_fw, state_bw = self._state 476 | if self.rnn_type == RNNType.LSTM_b : 477 | state_fw_h = state_fw[-1] 478 | state_bw_h = state_bw[-1] 479 | self._output = tf.concat(axis=1, values=[state_fw_h[-1], state_bw_h[-1]]) 480 | elif self.rnn_type == RNNType.GRU_b or self.rnn_type==RNNType.NORM_GRU_b: 481 | self._output = tf.concat(axis=1,values=[state_fw[-1],state_bw[-1]]) 482 | 483 | def get_softmax_layer_output(self): 484 | if self.net_type == NetType.DNN: 485 | self._softmax_w = tf.get_variable("softmax_w", [self.hidden_size, self.num_classes]) 486 | elif self.net_type == NetType.RNN_NV1 or self.net_type == NetType.RNN_NVN: 487 | 488 | if self.rnn_type == RNNType.GRU or self.rnn_type == RNNType.LSTM or self.rnn_type == RNNType.NORM_GRU: 489 | self._softmax_w = tf.get_variable("softmax_w", [self.hidden_size , self.num_classes]) 490 | elif self.rnn_type == RNNType.GRU_b or self.rnn_type == RNNType.LSTM_b or self.rnn_type == RNNType.NORM_GRU_b: 491 | 492 | self._softmax_w = tf.get_variable("softmax_w", [self.hidden_size * 2, self.num_classes]) 493 | 494 | # softmax层的bias 495 | self._softmax_b = tf.get_variable("softmax_b", [self.num_classes], initializer=self.bias_initializer) 496 | 497 | self._predictions = tf.matmul(self._output, self._softmax_w) + self._softmax_b 498 | # 概率 499 | self._prob_predictions = tf.nn.softmax(self._predictions) 500 | # 获得每个数据最大的索引 501 | self._digit_predictions = tf.argmax(self._prob_predictions, axis=1,output_type=tf.int32) 502 | 503 | def add_l2_regulation(self): 504 | 505 | # 计算l2cost 506 | tv = tf.trainable_variables() 507 | 508 | # #tf_ct.layers.l2_regularizer() 509 | # 510 | self._l2_regularization_cost = self.l2_preparam*tf.reduce_sum([tf.nn.l2_loss(v) for v in tv]) 511 | # #总cost为 基础cost + l2cost 512 | self._cost = self._cost+self._l2_regularization_cost 513 | 514 | def maxout(self,input,num_units,output_size,ini_value,scope): 515 | outputs = None 516 | for i in range(num_units): 517 | with tf.variable_scope(str(i)): 518 | y = self.activation(linear._linear(input, output_size, True, ini_value, scope=scope)) 519 | if outputs is None: 520 | outputs = y 521 | else: 522 | outputs = tf.maximum(outputs, y) 523 | return outputs 524 | 525 | def get_valid_sequence(self, seq, feature_size): 526 | """remove padding from sequences""" 527 | if self.is_training: 528 | stop = self._early_stop 529 | elif self.is_validation: 530 | stop = self._early_stop 531 | else: 532 | stop = self._early_stop 533 | valid_sequence_list = [] 534 | for i in range(self.batch_size): 535 | if len(tf.Tensor.get_shape(seq)) == 2: 536 | sub_seq = tf.slice(seq, [self.exp_seq_len * i, 0], [stop[i], feature_size]) 537 | else: 538 | sub_seq = tf.slice(seq, [self.exp_seq_len * i], [stop[i]]) 539 | 
valid_sequence_list.append(sub_seq) 540 | valid_sequence = tf.concat(axis=0, values=valid_sequence_list) 541 | return valid_sequence 542 | 543 | def getTensorShape(this, tensor): 544 | return tf.Tensor.get_shape(tensor) 545 | 546 | @property 547 | def embeding_weights(self): 548 | return self._embeding_weights 549 | 550 | @property 551 | def embeded_result(self): 552 | return self._embeded_result 553 | 554 | @property 555 | def digit_predictions(self): 556 | return self._digit_predictions 557 | 558 | @property 559 | def confusion_matrix(self): 560 | return self._confusion_matrix 561 | 562 | @property 563 | def prob_predictions(self): 564 | return self._prob_predictions 565 | 566 | @property 567 | def input_data(self): 568 | return self._input_data 569 | 570 | @property 571 | def weight_sequence_loss(self): 572 | return self._weight_sequence_loss 573 | 574 | @property 575 | def targets(self): 576 | return self._targets 577 | 578 | @property 579 | def predictions(self): 580 | return self._predictions 581 | 582 | @property 583 | def early_stop(self): 584 | return self._early_stop 585 | 586 | @property 587 | def initial_state(self): 588 | if self.rnn_type == RNNType.GRU or self.rnn_type == RNNType.LSTM: 589 | return self._initial_state 590 | elif self.rnn_type == RNNType.GRU_b or self.rnn_type == RNNType.LSTM_b: 591 | return (self._initial_state_fw ,self._initial_state_bw) 592 | 593 | @property 594 | def cost(self): 595 | return self._cost 596 | 597 | @property 598 | def accuracy(self): 599 | return self._accuracy 600 | 601 | @property 602 | def train_op(self): 603 | return self._train_op 604 | 605 | @property 606 | def final_state(self): 607 | return self._final_state 608 | 609 | -------------------------------------------------------------------------------- /trajectoryNet.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | import pandas as pd 4 | import config 5 | from config import TrainingConfig 6 | from model import Model 7 | from config import NetType 8 | from config import RNNType 9 | from tensorflow.python.ops.math_ops import tanh 10 | from log import Log 11 | from myThread import MyThread 12 | import random 13 | from tensorflow.contrib import layers 14 | from data_funs import Data 15 | import util 16 | import param 17 | from param import DirName 18 | from sklearn.metrics import confusion_matrix 19 | from util import get_net_type 20 | from util import get_rnn_type 21 | 22 | 23 | conf = config.Config("data/config.json") 24 | 25 | if conf.use_tfrecord: 26 | log_path,data_path,LOGGER = util.init_environment(get_net_type(conf.net_type),get_rnn_type(conf.rnn_type)) 27 | task = conf.task 28 | tfrecords_data_path = conf.tfrecord_path 29 | len_features = conf.num_features * conf.discretization_width 30 | else: 31 | log_path = "./logdir/shiyan/rnn_nvn/" 32 | data_path = "./data/tfrecord9_data/rnn_nvn/" 33 | task = conf.task 34 | net_name = str(NetType(conf.net_type)).split(".")[-1] 35 | nn_name = str(RNNType(conf.rnn_type)).split(".")[-1] 36 | LOGGER = Log(log_path, "_" + net_name + "_" + nn_name) 37 | len_features = conf.num_features * conf.discretization_width 38 | 39 | #从npy加载数据 40 | def loadData_rnn_nvn(): 41 | x_file = 'data/x_mobility.npy' 42 | y_file = 'data/y_mobility.npy' 43 | mmsi_file = 'data/mmsi_mobility.npy' 44 | 45 | 46 | #加载数据 47 | print("加载数据中......") 48 | x = np.load(x_file) 49 | y = np.load(y_file) 50 | mmsi = np.load(mmsi_file) 51 | print("加载完毕......") 52 | 53 | #x中数据格式如下 54 | # shape = 
[总序列个数,序列长度,特征数] 55 | #y中数据格式如下 56 | #shape= [总序列个数,序列长度] 57 | #mmsi数据格式如下 58 | #shape = [2,总序列个数] 59 | #mmsi[0] 中储存着用户编号 60 | #mmsi[1] 中储存着有效序列长度(因为是padding之后的切割,所以用户的一个序列会出现不满足序列长度的数据,故记录有效的序列长度, 61 | (x,y,mmsi) = Data.reorganizeSeq(x, y, mmsi, conf.exp_seq_len) 62 | 63 | #序列的总个数 64 | num_examples = x.shape[0] 65 | #用户编号的不重复列表 66 | unique_mmsi = np.unique(mmsi[0]) 67 | #分类个数 68 | num_classes = conf.num_classes 69 | 70 | #test_and_val = random.sample(range(23),6) 71 | 72 | #测试集 73 | #test_vessel = test_and_val[0:5] 74 | test_vessel = conf.test_id 75 | #验证集 76 | #val_vessel = test_and_val[5:6] 77 | val_vessel = conf.val_id 78 | 79 | #分割数据,将数据分割成 训练集,测试集,验证集,返回这些数据集的索引 80 | #test_index 的格式 81 | #test_vessel = [0,1] 即前两名用户的索引 则test_index = [0,1,2,3,4,5,6......] 82 | (train_index, test_index, valid_index) = Data.splitDataset(mmsi[0], test_vessel, val_vessel) 83 | 84 | #提前停止也即有效序列 85 | early_stop = mmsi[1] 86 | x = x.transpose([1, 0, 2]) 87 | 88 | np.random.shuffle(train_index) 89 | 90 | 91 | # X_train shape = [序列长度,训练集序列总个数,特征] 92 | X_train = x[:, train_index, :] 93 | y_train = y[train_index, :] 94 | stop_train = early_stop[train_index] 95 | 96 | np.random.shuffle(test_index) 97 | 98 | X_test = x[:, test_index, :] 99 | y_test = y[test_index, :] 100 | stop_test = early_stop[test_index] 101 | 102 | np.random.shuffle(valid_index) 103 | 104 | X_valid = x[:, valid_index, :] 105 | y_valid = y[valid_index, :] 106 | stop_valid = early_stop[valid_index] 107 | 108 | train_data = (X_train, y_train, stop_train) 109 | test_data = (X_test, y_test, stop_test) 110 | val_data = (X_valid, y_valid, stop_valid) 111 | 112 | #获得训练集,测试集,验证集的序列长度数组 113 | #eg train_seq_len value = [250,250,250,55,250,250,250......] 114 | train_seq_len = mmsi[1][train_index] 115 | test_seq_len = mmsi[1][test_index] 116 | valid_seq_len = mmsi[1][valid_index] 117 | 118 | train_config = config.TrainingConfig(True, False,False, conf.batch_size,len_features=x.shape[2],rnn_type=RNNType.GRU_b) 119 | train_config.train_seq_len = train_seq_len 120 | 121 | test_config = config.TrainingConfig(False,False,True,len(test_index),len_features=x.shape[2],rnn_type=RNNType.GRU_b) 122 | test_config.test_seq_len = test_seq_len 123 | 124 | valid_config = config.TrainingConfig(False, True, False, len(valid_index),len_features=x.shape[2],rnn_type=RNNType.GRU_b) 125 | valid_config.val_seq_len = valid_seq_len 126 | 127 | return (train_data,test_data,val_data,train_config,test_config,valid_config) 128 | 129 | #加载rnn_nv1数据从npy文件 130 | def load_data_rnn_nv1(classes): 131 | # 分训练集与测试集 验证集 8:1:1 132 | train_data_all = None 133 | train_label_all = None 134 | train_early_all = None 135 | valid_data_all = None 136 | valid_label_all = None 137 | valid_early_all = None 138 | test_data_all = None 139 | test_label_all = None 140 | test_early_all = None 141 | features_arr_list = [] 142 | index_arr_list = [] 143 | label_arr_list = [] 144 | data_file_name_exp = data_path +"transportation_mode" 145 | for i in range(classes): 146 | print("加载" + str(i)) 147 | # data_file = data_file_name +str(i) +".npy" 148 | index_df = pd.DataFrame(pd.read_csv(data_file_name_exp +"_"+ str(i) + "_seg_index.csv")) 149 | features_arr = np.load(data_file_name_exp + str(i) + ".npy") 150 | print(features_arr.shape) 151 | features_arr = features_arr[:, 0:len_features] 152 | index_arr = np.array(index_df.iloc[:, [1, 2]].T) 153 | # index shape = [2,总个数] 154 | # 第一维是第几段轨迹 第二维是在固定长度为exp_seq_len中的实际长度 155 | # data shape =[seq_nums,exp_seq_len,feature_len] 切出相等的数据长度 不足的padding 156 | (data, index_arr) = 
Data.slice_seq(features_arr, index_arr, conf.exp_seq_len) 157 | #切割后删除features_arr index 158 | del features_arr 159 | del index_df 160 | label_arr = np.zeros(shape=[index_arr.shape[1]], dtype=np.int32) 161 | label_arr[:] = i 162 | # features_arr_list.append(data) 163 | # index_arr_list.append(index) 164 | # label_arr_list.append(label) 165 | #划分训练集,验证集,测试集 166 | print("划分训练集,验证集,测试集 " + str(i)) 167 | seq_nums = index_arr.shape[1] 168 | # 控制变量 169 | np.random.seed(2) 170 | index_perm = np.random.permutation(range(seq_nums)) 171 | train_count = int(np.floor(seq_nums * 0.8)) 172 | valid_count = int(np.floor(seq_nums * 0.9)) 173 | test_count = seq_nums 174 | train_index = index_perm[0:train_count] 175 | valid_index = index_perm[train_count + 1:valid_count] 176 | test_index = index_perm[valid_count + 1:seq_nums] 177 | 178 | # train_set valid_set test_set 179 | train_data = data[train_index, :, :] 180 | train_label = label_arr[train_index] 181 | train_early = index_arr[1, train_index] 182 | 183 | valid_data = data[valid_index, :, :] 184 | valid_label = label_arr[valid_index] 185 | valid_early = index_arr[1, valid_index] 186 | 187 | test_data = data[test_index, :, :] 188 | test_label = label_arr[test_index] 189 | test_early = index_arr[1, test_index] 190 | 191 | #删除读取到的data. 192 | del data 193 | del label_arr 194 | del index_arr 195 | 196 | if train_data_all is None: 197 | train_data_all = train_data 198 | train_label_all = train_label 199 | train_early_all = train_early 200 | 201 | valid_data_all = valid_data 202 | valid_label_all = valid_label 203 | valid_early_all = valid_early 204 | 205 | test_data_all = test_data 206 | test_label_all = test_label 207 | test_early_all = test_early 208 | else: 209 | train_data_all = np.concatenate((train_data_all, train_data), axis=0) 210 | train_label_all = np.concatenate((train_label_all, train_label), axis=0) 211 | train_early_all = np.concatenate((train_early_all, train_early), axis=0) 212 | 213 | valid_data_all = np.concatenate((valid_data_all, valid_data), axis=0) 214 | valid_label_all = np.concatenate((valid_label_all, valid_label), axis=0) 215 | valid_early_all = np.concatenate((valid_early_all, valid_early), axis=0) 216 | 217 | test_data_all = np.concatenate((test_data_all, test_data), axis=0) 218 | test_label_all = np.concatenate((test_label_all, test_label), axis=0) 219 | test_early_all = np.concatenate((test_early_all, test_early), axis=0) 220 | #打乱数据 221 | np.random.seed(1) 222 | train_perm = np.random.permutation(range(train_early_all.shape[0])) 223 | np.random.seed(1) 224 | valid_perm = np.random.permutation(range(valid_early_all.shape[0])) 225 | np.random.seed(1) 226 | test_perm = np.random.permutation(range(test_early_all.shape[0])) 227 | 228 | #shape=[序列长度,总个数,特征长度] TimeMajor 229 | train_data_all = np.transpose(train_data_all, [1, 0, 2]) 230 | valid_data_all = np.transpose(valid_data_all, [1, 0, 2]) 231 | test_data_all = np.transpose(test_data_all, [1, 0, 2]) 232 | 233 | # train_data_all = train_data_all[:, train_perm, :] 234 | # train_label_all = train_label_all[train_perm] 235 | # train_early_all = train_early_all[train_perm] 236 | 237 | valid_data_all = valid_data_all[:, valid_perm, :] 238 | valid_label_all = valid_label_all[valid_perm] 239 | valid_early_all = valid_early_all[valid_perm] 240 | 241 | test_data_all = test_data_all[:, test_perm, :] 242 | test_label_all = test_label_all[test_perm] 243 | test_early_all = test_early_all[test_perm] 244 | 245 | train_set = (train_data_all, train_label_all, train_early_all) 246 | valid_set = 
(valid_data_all, valid_label_all, valid_early_all)
247 |     test_set = (test_data_all, test_label_all, test_early_all)
248 |     return train_set,valid_set,test_set
249 | 
250 | def load_data_rnn_nv1_other(classes):
251 |     # split into training / validation / test sets at an 8:1:1 ratio
252 |     train_data_all = None
253 |     train_label_all = None
254 |     train_early_all = None
255 |     valid_data_all = None
256 |     valid_label_all = None
257 |     valid_early_all = None
258 |     test_data_all = None
259 |     test_label_all = None
260 |     test_early_all = None
261 |     data_file_name_exp = data_path + "transportation_mode"
262 |     for i in range(classes):
263 |         data = np.load(data_path + "slice_label" + str(i) + "_" + str(conf.exp_seq_len) + ".npy")
264 |         index_arr = np.load(data_path + "slice_index" + str(i) + ".npy")
265 | 
266 |         # after slicing, the raw features_arr / index are no longer needed
267 |         label_arr = np.zeros(shape=[index_arr.shape[1]], dtype=np.int32)
268 |         label_arr[:] = i
269 |         # features_arr_list.append(data)
270 |         # index_arr_list.append(index)
271 |         # label_arr_list.append(label)
272 |         # partition into training, validation and test sets
273 |         print("partitioning train/valid/test sets for class " + str(i))
274 |         seq_nums = index_arr.shape[1]
275 |         # fixed seed so the split is reproducible across runs
276 |         np.random.seed(2)
277 |         index_perm = np.random.permutation(range(seq_nums))
278 |         train_count = int(np.floor(seq_nums * 0.8))
279 |         valid_count = int(np.floor(seq_nums * 0.9))
280 |         test_count = seq_nums
281 |         train_index = index_perm[0:train_count]
282 |         valid_index = index_perm[train_count + 1:valid_count]
283 |         test_index = index_perm[valid_count + 1:seq_nums]
284 | 
285 |         # train_set valid_set test_set
286 |         train_data = data[train_index, :, :]
287 |         train_label = label_arr[train_index]
288 |         train_early = index_arr[1, train_index]
289 | 
290 |         valid_data = data[valid_index, :, :]
291 |         valid_label = label_arr[valid_index]
292 |         valid_early = index_arr[1, valid_index]
293 | 
294 |         test_data = data[test_index, :, :]
295 |         test_label = label_arr[test_index]
296 |         test_early = index_arr[1, test_index]
297 | 
298 |         # release the per-class arrays that have just been consumed
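        # Editorial note on the split a few lines above: np.random.seed(2) fixes the permutation, so the
        # 8:1:1 split is reproducible and identical for every class. Also note that the "+ 1" offsets in
        # index_perm[train_count + 1:valid_count] and index_perm[valid_count + 1:seq_nums] skip one
        # sequence at each boundary, so two sequences per class are assigned to no split at all. A minimal
        # sketch of a gap-free split (assuming the same 8:1:1 ratio is what was intended):
        #
        #     train_index = index_perm[:train_count]
        #     valid_index = index_perm[train_count:valid_count]
        #     test_index  = index_perm[valid_count:]
        #
        # The original indexing is left unchanged above to preserve the published behaviour.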
299 | del data 300 | del label_arr 301 | del index_arr 302 | 303 | if train_data_all is None: 304 | train_data_all = train_data 305 | train_label_all = train_label 306 | train_early_all = train_early 307 | 308 | valid_data_all = valid_data 309 | valid_label_all = valid_label 310 | valid_early_all = valid_early 311 | 312 | test_data_all = test_data 313 | test_label_all = test_label 314 | test_early_all = test_early 315 | else: 316 | train_data_all = np.concatenate((train_data_all, train_data), axis=0) 317 | train_label_all = np.concatenate((train_label_all, train_label), axis=0) 318 | train_early_all = np.concatenate((train_early_all, train_early), axis=0) 319 | 320 | valid_data_all = np.concatenate((valid_data_all, valid_data), axis=0) 321 | valid_label_all = np.concatenate((valid_label_all, valid_label), axis=0) 322 | valid_early_all = np.concatenate((valid_early_all, valid_early), axis=0) 323 | 324 | test_data_all = np.concatenate((test_data_all, test_data), axis=0) 325 | test_label_all = np.concatenate((test_label_all, test_label), axis=0) 326 | test_early_all = np.concatenate((test_early_all, test_early), axis=0) 327 | # 打乱数据 328 | np.random.seed(1) 329 | train_perm = np.random.permutation(range(train_early_all.shape[0])) 330 | np.random.seed(1) 331 | valid_perm = np.random.permutation(range(valid_early_all.shape[0])) 332 | np.random.seed(1) 333 | test_perm = np.random.permutation(range(test_early_all.shape[0])) 334 | 335 | # shape=[序列长度,总个数,特征长度] TimeMajor 336 | train_data_all = np.transpose(train_data_all, [1, 0, 2]) 337 | valid_data_all = np.transpose(valid_data_all, [1, 0, 2]) 338 | test_data_all = np.transpose(test_data_all, [1, 0, 2]) 339 | 340 | # train_data_all = train_data_all[:, train_perm, :] 341 | # train_label_all = train_label_all[train_perm] 342 | # train_early_all = train_early_all[train_perm] 343 | 344 | valid_data_all = valid_data_all[:, valid_perm, :] 345 | valid_label_all = valid_label_all[valid_perm] 346 | valid_early_all = valid_early_all[valid_perm] 347 | 348 | test_data_all = test_data_all[:, test_perm, :] 349 | test_label_all = test_label_all[test_perm] 350 | test_early_all = test_early_all[test_perm] 351 | 352 | train_set = (train_data_all, train_label_all, train_early_all) 353 | valid_set = (valid_data_all, valid_label_all, valid_early_all) 354 | test_set = (test_data_all, test_label_all, test_early_all) 355 | return train_set, valid_set, test_set 356 | 357 | #直接读取整个data npz noTranspose 358 | def load_data_rnn_nv1_quick(classes): 359 | data_dir = "G:/all_data/" 360 | train_data_set_name = data_dir + "train_data_set.npz" 361 | valid_data_set_name = data_dir + "valid_data_set.npz" 362 | test_data_set_name = data_dir + "test_data_set.npz" 363 | train_data_set = np.load(train_data_set_name) 364 | valid_data_set = np.load(valid_data_set_name) 365 | test_data_set = np.load(test_data_set_name) 366 | 367 | return train_data_set,valid_data_set,test_data_set 368 | 369 | def evaluate_model(sess, minibatch): 370 | # test and validate model 371 | #if conf.test_mode: 372 | # run_batch(sess, mtest, test_data, tf.no_op(), minibatch) 373 | 374 | result_train = run_batch(sess,train_model,train_data,tf.no_op(),minibatch) 375 | result_test = run_batch(sess,test_model,test_data,tf.no_op(),minibatch) 376 | result_valid = run_batch(sess,valid_model,valid_data,tf.no_op(),minibatch) 377 | 378 | #t_train = MyThread(run_batch, (sess, train_model, train_data, tf.no_op(), minibatch)) 379 | #t_test = MyThread(run_batch, (sess, test_model, test_data, tf.no_op(), minibatch)) 380 | #t_val = 
MyThread(run_batch, (sess, valid_model, valid_data, tf.no_op(), minibatch)) 381 | 382 | #t_train.start() 383 | #t_test.start() 384 | #t_val.start() 385 | 386 | #t_train.join() 387 | #result_train = t_train.get_result() 388 | #t_test.join() 389 | #result_test = t_test.get_result() 390 | #t_val.join() 391 | #result_val = t_val.get_result() 392 | 393 | print("Train cost {0:0.3f}, Acc {1:0.3f}".format( 394 | result_train[0], result_train[1])) 395 | print("Valid cost {0:0.3f}, Acc {1:0.3f}".format( 396 | result_valid[0], result_valid[1])) 397 | print("Test cost {0:0.3f}, Acc {1:0.3f}".format( 398 | result_test[0], result_test[1])) 399 | 400 | return result_train + result_test + result_valid 401 | 402 | # 403 | def evaluate_model_all(sess,epoch): 404 | result_train = run_batch_all(sess, train_model, train_data, tf.no_op(), epoch) 405 | result_valid = run_batch_all(sess, valid_model, valid_data, tf.no_op(), epoch) 406 | result_test = run_batch_all(sess, test_model, test_data, tf.no_op(), epoch) 407 | 408 | 409 | LOGGER.summary_log(result_train+result_valid+result_test,epoch) 410 | 411 | print("Train cost {0:0.3f}, Acc {1:0.3f}".format( 412 | result_train[0], result_train[1])) 413 | print("Valid cost {0:0.3f}, Acc {1:0.3f}".format( 414 | result_valid[0], result_valid[1])) 415 | print("Test cost {0:0.3f}, Acc {1:0.3f}".format( 416 | result_test[0], result_test[1])) 417 | 418 | return result_train + result_test + result_valid 419 | 420 | #npz文件方式 421 | def evaluate_model_quick(sess,epoch): 422 | print("开始测试训练集") 423 | result_train = run_batch_quick(sess, train_model, train_data, tf.no_op(), epoch) 424 | print("开始测试验证集") 425 | result_valid = run_batch_quick(sess, valid_model, valid_data, tf.no_op(), epoch) 426 | print("开始测试测试集") 427 | result_test = run_batch_quick(sess, test_model, test_data, tf.no_op(), epoch) 428 | 429 | LOGGER.summary_log(result_train + result_valid + result_test, epoch) 430 | 431 | print("Train cost {0:0.3f}, Acc {1:0.3f}".format( 432 | result_train[0], result_train[1])) 433 | print("Valid cost {0:0.3f}, Acc {1:0.3f}".format( 434 | result_valid[0], result_valid[1])) 435 | print("Test cost {0:0.3f}, Acc {1:0.3f}".format( 436 | result_test[0], result_test[1])) 437 | 438 | return result_train + result_test + result_valid 439 | 440 | #队列版 未完成 441 | def evaluate_from_tfrecords(iter): 442 | with tf.Session() as sess: 443 | sess.run(tf.global_variables_initializer()) 444 | sess.run(tf.local_variables_initializer()) 445 | coord = tf.train.Coordinator() 446 | threads = tf.train.start_queue_runners(sess=sess, coord=coord) 447 | print(len(threads)) 448 | train_cost, train_acc = run_batch_from_tfrecords(sess, coord, train_model, tf.no_op()) 449 | 450 | coord.request_stop() 451 | coord.join(threads) 452 | 453 | with tf.Session() as sess: 454 | sess.run(tf.global_variables_initializer()) 455 | sess.run(tf.local_variables_initializer()) 456 | coord = tf.train.Coordinator() 457 | threads = tf.train.start_queue_runners(sess=sess, coord=coord) 458 | 459 | valid_cost, valid_acc = run_batch_from_tfrecords(sess, coord, valid_model, tf.no_op()) 460 | 461 | coord.request_stop() 462 | coord.join(threads) 463 | 464 | with tf.Session() as sess: 465 | sess.run(tf.global_variables_initializer()) 466 | sess.run(tf.local_variables_initializer()) 467 | coord = tf.train.Coordinator() 468 | threads = tf.train.start_queue_runners(sess=sess, coord=coord) 469 | 470 | test_cost, test_acc = run_batch_from_tfrecords(sess, coord, test_model, tf.no_op()) 471 | 472 | coord.request_stop() 473 | coord.join(threads) 474 | 
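    # Editorial note: each of the three tf.Session blocks above runs tf.global_variables_initializer(),
    # so this evaluation path measures freshly initialised weights rather than the weights learned during
    # training, unless a checkpoint is restored first; that is presumably why the function is marked as
    # unfinished. A minimal sketch of one way to reuse the trained weights (assuming the checkpoint the
    # training loops write under data_path + task):
    #
    #     saver = tf.train.Saver()
    #     with tf.Session() as sess:
    #         sess.run(tf.local_variables_initializer())
    #         saver.restore(sess, data_path + task)   # load trained variables instead of re-initialising
    #         ...                                     # then start the queue runners and evaluate
    #
    # The tf.data based evaluate_from_tfrecord_dataset() below avoids the issue by reusing the caller's session.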
475 | LOGGER.summary_log((train_cost, train_acc, valid_cost, valid_acc, test_cost, test_acc),iter) 476 | 477 | print("Train cost {0:0.3f}, Acc {1:0.3f}".format( 478 | train_cost, train_acc)) 479 | print("Valid cost {0:0.3f}, Acc {1:0.3f}".format( 480 | valid_cost, valid_acc)) 481 | print("Test cost {0:0.3f}, Acc {1:0.3f}".format( 482 | test_cost, test_acc)) 483 | 484 | #dataset版 485 | def evaluate_from_tfrecord_dataset(net_type,sess, model,next_element,eval_op,epoch): 486 | 487 | cost_list = [] 488 | acc_list = [] 489 | confus_list = [] 490 | count = 0 491 | try: 492 | while True: 493 | input, early, label = sess.run(next_element) 494 | 495 | if net_type == NetType.RNN_NV1: 496 | 497 | if input.shape[0] < conf.batch_size: 498 | print(input.shape) 499 | break 500 | input = np.transpose(input, [1, 0, 2]) 501 | batch_size = input.shape[1] 502 | 503 | cost, acc, confus_mat = sess.run(fetches=[model.cost, model.accuracy, model.confusion_matrix], 504 | feed_dict={model.input_data: input, 505 | model.early_stop: early, 506 | model.targets: label}) 507 | 508 | elif net_type == NetType.RNN_NVN: 509 | if input.shape[0] < conf.batch_size: 510 | print(input.shape) 511 | break 512 | new_label = np.zeros([conf.batch_size, conf.exp_seq_len], np.int32) 513 | for batch in range(conf.batch_size): 514 | new_label[batch, 0:early[batch]] = label[batch] 515 | new_label[batch, early[batch]:] = 0 516 | label = new_label 517 | input = np.transpose(input, [1, 0, 2]) 518 | batch_size = input.shape[1] 519 | 520 | weight_sequence_loss = np.zeros([conf.batch_size, conf.exp_seq_len], np.float32) 521 | for k in range(conf.batch_size): 522 | weight_sequence_loss[k, 0:early[k]] = 1 523 | 524 | cost,digit_predictions = sess.run(fetches = [model.cost,model.digit_predictions],feed_dict={ 525 | model.input_data:input, 526 | model.early_stop:early, 527 | model.weight_sequence_loss:weight_sequence_loss, 528 | model.targets:label 529 | }) 530 | 531 | 532 | batch_acc_list = [] 533 | confus_mat_list = [] 534 | for k in range(conf.batch_size): 535 | start = k*conf.exp_seq_len 536 | end = k*conf.exp_seq_len + early[k] 537 | seq_acc = np.equal(digit_predictions[start:end],label[k,0:early[k]]) 538 | seq_acc = seq_acc.astype(np.float32) 539 | batch_acc_list.append(np.mean(seq_acc)) 540 | confus = confusion_matrix(label[k,0:early[k]],digit_predictions[start:end],labels = [0,1,2,3]) 541 | confus_mat_list.append(confus) 542 | acc = sum(batch_acc_list)/conf.batch_size 543 | confus_mat = sum(confus_mat_list) 544 | elif net_type == NetType.DNN or net_type == NetType.DNN_MAXOUT: 545 | list_input = [] 546 | list_label = [] 547 | for batch in range(input.shape[0]): 548 | list_input.append(input[batch, 0:early[batch], :]) 549 | new_label = np.zeros([early[batch]], np.int32) 550 | new_label[:] = label[batch] 551 | list_label.append(new_label) 552 | input = np.concatenate(tuple(list_input), axis=0) 553 | label = np.concatenate(tuple(list_label), axis=0) 554 | batch_size = input.shape[0] 555 | cost, acc, confus_mat = sess.run(fetches=[model.cost, model.accuracy, model.confusion_matrix], 556 | feed_dict={model.input_data: input, 557 | model.early_stop: early, 558 | model.targets: label}) 559 | 560 | #print(input.shape) 561 | 562 | confus_list.append(confus_mat) 563 | cost_list.append(cost) 564 | acc_list.append(acc) 565 | count += 1 566 | 567 | except tf.errors.OutOfRangeError: 568 | 569 | print("超出界限!!!") 570 | 571 | if model.is_training: 572 | LOGGER.training_log("训练集:\n") 573 | elif model.is_validation: 574 | LOGGER.training_log("验证集:\n") 575 | 
else: 576 | LOGGER.training_log("测试集:\n") 577 | LOGGER.training_log(str(sum(confus_list))) 578 | print( count) 579 | return sum(cost_list) / len(cost_list), sum(acc_list) / len(acc_list) 580 | 581 | #minbatch训练方法 582 | def run_batch(session, model, data, eval_op, minibatch): 583 | # 准备数据 584 | x, y, e_stop = data 585 | epoch_size = x.shape[1] // model.batch_size 586 | 587 | # 记录结果 588 | costs = [] 589 | correct = [] 590 | 591 | for batch in range(epoch_size): 592 | x_batch = x[:, batch * model.batch_size: (batch + 1) * model.batch_size, :] 593 | y_batch = y[batch * model.batch_size: (batch + 1) * model.batch_size] 594 | e_batch = e_stop[batch * model.batch_size: (batch + 1) * model.batch_size] 595 | 596 | temp_dict = {model.input_data: x_batch} 597 | temp_dict.update({model.targets: y_batch}) 598 | temp_dict.update({model.early_stop: e_batch}) 599 | 600 | 601 | if model.is_training and eval_op == model.train_op: 602 | #如果是训练模式,且op正常 则正常训练 603 | print("开始训练第 %d 个batch" % batch) 604 | _, cost, accuracy = session.run([eval_op, model.cost, model.accuracy], 605 | feed_dict=temp_dict) 606 | 607 | if minibatch % conf.evaluate_freq == 0: 608 | result = evaluate_model(session, minibatch) #评估模型,返回结果 609 | LOGGER.summary_log(result, minibatch) 610 | minibatch += 1 611 | 612 | 613 | else: 614 | cost, confusion, accuracy, _ = session.run([model.cost, model.confusion_matrix, model.accuracy, eval_op], 615 | feed_dict=temp_dict) 616 | 617 | if model.net_type == NetType.RNN_NVN: 618 | # keep results for this minibatch 619 | costs.append(cost) 620 | correct.append(accuracy * sum(e_batch)) 621 | 622 | # print test confusion matrix 623 | if not model.is_training and not model.is_validation: 624 | 625 | LOGGER.training_log(str(minibatch) + "测试集的混淆矩阵") 626 | LOGGER.training_log(str(confusion)) 627 | # output predictions in test mode 628 | # if conf.test_mode: 629 | # pred = session.run([m._prob_predictions], feed_dict=temp_dict) 630 | # pred = np.array(pred) 631 | # np.set_printoptions(threshold=np.nan) 632 | # # results = np.column_stack((tar, pred)) 633 | # # np.savetxt("results/prediction.result", pred)#, fmt='%.3f') 634 | # #print("output target and predictions to file prediction.csv") 635 | # #exit() 636 | 637 | #计算平均精度与损失 638 | if batch == epoch_size - 1: 639 | accuracy = sum(correct) / float(sum(e_stop)) 640 | return (sum(costs) / float(epoch_size), accuracy) 641 | elif model.net_type == NetType.RNN_NV1: 642 | costs.append(cost) 643 | correct.append(accuracy) 644 | 645 | # print test confusion matrix 646 | if not model.is_training and not model.is_validation: 647 | LOGGER.training_log(str(minibatch) + "测试集的混淆矩阵") 648 | LOGGER.training_log(str(confusion)) 649 | # output predictions in test mode 650 | # if conf.test_mode: 651 | # pred = session.run([m._prob_predictions], feed_dict=temp_dict) 652 | # pred = np.array(pred) 653 | # np.set_printoptions(threshold=np.nan) 654 | # # results = np.column_stack((tar, pred)) 655 | # # np.savetxt("results/prediction.result", pred)#, fmt='%.3f') 656 | # #print("output target and predictions to file prediction.csv") 657 | # #exit() 658 | 659 | # 计算平均精度与损失 660 | if batch == epoch_size - 1: 661 | cost_mean = (sum(costs) )/ float(epoch_size) 662 | accuracy_mean = sum(correct) / float(epoch_size) 663 | return (cost_mean,accuracy_mean) 664 | 665 | # training: keep track of minibatch number 666 | return (minibatch) 667 | 668 | def run_batch_all(session,model,data,eval_op,epoch): 669 | x, y, e_stop = data 670 | epoch_size = x.shape[1] // model.batch_size 671 | shuffle_perm = 
None 672 | if model.is_training: 673 | shuffle_perm = np.random.permutation(range(e_stop.shape[0])) 674 | 675 | 676 | # 记录结果 677 | costs = [] 678 | correct = [] 679 | for batch in range(epoch_size): 680 | 681 | if model.is_training: 682 | batch_perm = shuffle_perm[batch * model.batch_size: (batch + 1) * model.batch_size] 683 | x_batch = x[:, batch_perm, :] 684 | y_batch = y[batch_perm] 685 | e_batch = e_stop[batch_perm] 686 | else: 687 | x_batch = x[:, batch * model.batch_size: (batch + 1) * model.batch_size, :] 688 | y_batch = y[batch * model.batch_size: (batch + 1) * model.batch_size] 689 | e_batch = e_stop[batch * model.batch_size: (batch + 1) * model.batch_size] 690 | 691 | temp_dict = {model.input_data: x_batch} 692 | temp_dict.update({model.targets: y_batch}) 693 | temp_dict.update({model.early_stop: e_batch}) 694 | 695 | if model.is_training and eval_op == model.train_op: 696 | _= session.run([eval_op],feed_dict=temp_dict) 697 | 698 | else: 699 | cost, confusion, accuracy, _ = session.run([model.cost, model.confusion_matrix, model.accuracy, eval_op],feed_dict=temp_dict) 700 | 701 | if model.is_test: 702 | LOGGER.training_log(str(epoch) + "测试集的混淆矩阵") 703 | LOGGER.training_log(str(confusion)) 704 | elif model.is_validation: 705 | LOGGER.training_log(str(epoch) + "验证集的混淆矩阵") 706 | LOGGER.training_log(str(confusion)) 707 | 708 | 709 | if model.net_type == NetType.RNN_NVN: 710 | # keep results for this minibatch 711 | costs.append(cost) 712 | correct.append(accuracy * sum(e_batch)) 713 | 714 | #计算平均精度与损失 715 | if batch == epoch_size - 1: 716 | accuracy = sum(correct) / float(sum(e_stop)) 717 | return (sum(costs) / float(epoch_size), accuracy) 718 | elif model.net_type == NetType.RNN_NV1: 719 | costs.append(cost) 720 | correct.append(accuracy) 721 | 722 | # 计算平均精度与损失 723 | if batch == epoch_size - 1: 724 | cost_mean = sum(costs) / float(epoch_size) 725 | accuracy_mean = sum(correct) / float(epoch_size) 726 | return (cost_mean, accuracy_mean) 727 | 728 | #quick 代表所有数据在npz文件里 729 | def run_batch_quick(session,model,data,eval_op,epoch): 730 | 731 | if model.is_training: 732 | x = data["train_data"] 733 | y = data["train_label"] 734 | e_stop = data["train_early_stop"] 735 | elif model.is_validation: 736 | x = data["valid_data"] 737 | y = data["valid_label"] 738 | e_stop = data["valid_early_stop"] 739 | else: 740 | x = data["test_data"] 741 | y = data["test_label"] 742 | e_stop = data["test_early_stop"] 743 | 744 | epoch_size = x.shape[0] // model.batch_size 745 | shuffle_perm = None 746 | if model.is_training: 747 | shuffle_perm = np.random.permutation(range(e_stop.shape[0])) 748 | 749 | # 记录结果 750 | costs = [] 751 | correct = [] 752 | for batch in range(epoch_size): 753 | 754 | if model.is_training: 755 | batch_perm = shuffle_perm[batch * model.batch_size: (batch + 1) * model.batch_size] 756 | x_batch = x[batch_perm,:, :] 757 | y_batch = y[batch_perm] 758 | e_batch = e_stop[batch_perm] 759 | else: 760 | x_batch = x[batch * model.batch_size: (batch + 1) * model.batch_size,: , :] 761 | y_batch = y[batch * model.batch_size: (batch + 1) * model.batch_size] 762 | e_batch = e_stop[batch * model.batch_size: (batch + 1) * model.batch_size] 763 | 764 | x_batch = np.transpose(x_batch,[1,0,2]) 765 | 766 | temp_dict = {model.input_data: x_batch} 767 | temp_dict.update({model.targets: y_batch}) 768 | temp_dict.update({model.early_stop: e_batch}) 769 | 770 | if model.is_training and eval_op == model.train_op: 771 | _ = session.run([eval_op], feed_dict=temp_dict) 772 | 773 | else: 774 | cost, confusion, 
accuracy, _ = session.run([model.cost, model.confusion_matrix, model.accuracy, eval_op], 775 | feed_dict=temp_dict) 776 | 777 | if model.is_test: 778 | LOGGER.training_log(str(epoch) + "测试集的混淆矩阵") 779 | LOGGER.training_log(str(confusion)) 780 | elif model.is_validation: 781 | LOGGER.training_log(str(epoch) + "验证集的混淆矩阵") 782 | LOGGER.training_log(str(confusion)) 783 | 784 | if model.net_type == NetType.RNN_NVN: 785 | # keep results for this minibatch 786 | costs.append(cost) 787 | correct.append(accuracy * sum(e_batch)) 788 | 789 | # 计算平均精度与损失 790 | if batch == epoch_size - 1: 791 | accuracy = sum(correct) / float(sum(e_stop)) 792 | return (sum(costs) / float(epoch_size), accuracy) 793 | elif model.net_type == NetType.RNN_NV1: 794 | costs.append(cost) 795 | correct.append(accuracy) 796 | 797 | # 计算平均精度与损失 798 | if batch == epoch_size - 1: 799 | cost_mean = sum(costs) / float(epoch_size) 800 | accuracy_mean = sum(correct) / float(epoch_size) 801 | return (cost_mean, accuracy_mean) 802 | 803 | print("训练完毕 " + str(epoch)) 804 | 805 | #队列版未完成 806 | def run_batch_from_tfrecords(sess, coord, model, eval_op): 807 | if model.is_training and eval_op == model.train_op: 808 | count = 1 809 | iter = 0 810 | while not coord.should_stop(): 811 | if count % conf.evaluate_freq != 0: 812 | _ = sess.run(model.train_op) 813 | 814 | else: 815 | coord.request_stop() 816 | print("第%d次测试精度" % (iter)) 817 | evaluate_from_tfrecords(iter) 818 | coord.clear_stop() 819 | iter += 1 820 | count+=1 821 | 822 | else: 823 | accuracy_list = [] 824 | cost_list = [] 825 | try: 826 | while not coord.should_stop(): 827 | cost, accuracy = sess.run([model.cost, model.accuracy]) 828 | print(cost,accuracy) 829 | accuracy_list.append(accuracy) 830 | cost_list.append(cost) 831 | except tf.errors.OutOfRangeError: 832 | print("测试完成") 833 | acc_mean = sum(accuracy_list) / len(accuracy_list) 834 | cost_mean = sum(cost_list) / len(cost_list) 835 | return cost_mean, acc_mean 836 | 837 | #nvn model 838 | def rnn_nvn_model(): 839 | #1 处理数据 840 | #2 设置模型 841 | #3 训练模型 842 | #4 测试模型 843 | 844 | global train_data 845 | global test_data 846 | global val_data 847 | #x shape = [序列长度,总的序列个数,特征长度] 848 | #y shape = [总的序列个数,1} 849 | #early_stop shape = [总的序列个数] [250,250,250,250,50,.........] 850 | #train_index 训练集的索引 [10,11,12,13,......] 
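    # Editorial restatement of the shapes sketched in the comments above, matching what
    # loadData_rnn_nvn() / Data.reorganizeSeq() actually return (time-major layout):
    #
    #     x           : [exp_seq_len, num_sequences, num_features]
    #     y           : [num_sequences, exp_seq_len]   one label per timestep (N-vs-N setup)
    #     early_stop  : [num_sequences]                valid (unpadded) length of each sequence,
    #                                                  e.g. [250, 250, 250, 250, 50, ...]
    #     train_index : indices of the training users' sequences, e.g. [10, 11, 12, 13, ...]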
851 | train_data, test_data, val_data, train_config, test_config, valid_config = loadData_rnn_nvn() 852 | 853 | minibatch = 0 854 | 855 | with tf.Session() as sess: 856 | tf.set_random_seed(0) 857 | 858 | #变量初始化 859 | initializer = tf.random_uniform_initializer(0,0.001) 860 | #正则化 861 | regularizer = layers.l2_regularizer(conf.l2_preparam) 862 | 863 | with tf.variable_scope("model",reuse=False,initializer=initializer,dtype=tf.float32): #,regularizer = regularizer): 864 | global train_model 865 | train_model = Model(conf,train_config) 866 | with tf.variable_scope("model",reuse=True,initializer=initializer,dtype=tf.float32): #,regularizer = regularizer): 867 | global test_model 868 | test_model = Model(conf,test_config) 869 | with tf.variable_scope("model",reuse=True,initializer=initializer,dtype=tf.float32): #,regularizer = regularizer): 870 | global valid_model 871 | valid_model = Model(conf,valid_config) 872 | 873 | saver = None 874 | if conf.checkpoint or conf.restore: 875 | saver = tf.train.Saver() 876 | 877 | if conf.tensorboard: 878 | global writer 879 | writer = tf.summary.FileWriter(log_path, sess.graph) 880 | 881 | if not conf.restore: 882 | tf.global_variables_initializer().run() # initialize all variables in the model 883 | else: 884 | saver.restore(sess, data_path + task) 885 | print("装载变量......") 886 | 887 | for i in range(conf.num_epochs): 888 | print("第 {0}次epoch".format(i)) 889 | minibatch = run_batch(sess,train_model,train_data,train_model.train_op,minibatch) 890 | if (i+1)%10 == 0: 891 | saver.save(sess,data_path+task) 892 | 893 | if conf.checkpoint: 894 | save_path = saver.save(sess,data_path+task) 895 | 896 | #nv1 model 897 | def rnn_nv1_model(is_quick): 898 | global train_data 899 | global test_data 900 | global valid_data 901 | if is_quick: 902 | train_data,valid_data,test_data=load_data_rnn_nv1_quick(conf.num_classes) 903 | else: 904 | train_data, valid_data, test_data = load_data_rnn_nv1(conf.num_classes) 905 | #print("数据加载完毕......") 906 | train_conf = None 907 | valid_conf = None 908 | test_conf = None 909 | 910 | if conf.rnn_type == "lstm_b": 911 | train_conf = TrainingConfig(True,False,False,conf.batch_size,len_features,net_type=NetType.RNN_NV1,rnn_type=RNNType.LSTM_b) 912 | valid_conf = TrainingConfig(False,True,False,conf.batch_size,len_features,net_type=NetType.RNN_NV1,rnn_type=RNNType.LSTM_b) 913 | test_conf = TrainingConfig(False,False,True,conf.batch_size,len_features,net_type=NetType.RNN_NV1,rnn_type=RNNType.LSTM_b) 914 | elif conf.rnn_type == "gru_b": 915 | train_conf = TrainingConfig(True, False, False, conf.batch_size, len_features, net_type=NetType.RNN_NV1, 916 | rnn_type=RNNType.GRU_b) 917 | valid_conf = TrainingConfig(False, True, False, conf.batch_size, len_features, net_type=NetType.RNN_NV1, 918 | rnn_type=RNNType.GRU_b) 919 | test_conf = TrainingConfig(False, False, True, conf.batch_size, len_features, net_type=NetType.RNN_NV1, 920 | rnn_type=RNNType.GRU_b) 921 | elif conf.rnn_type == "gru": 922 | train_conf = TrainingConfig(True, False, False, conf.batch_size, len_features, net_type=NetType.RNN_NV1, 923 | rnn_type=RNNType.GRU) 924 | valid_conf = TrainingConfig(False, True, False, conf.batch_size, len_features, net_type=NetType.RNN_NV1, 925 | rnn_type=RNNType.GRU) 926 | test_conf = TrainingConfig(False, False, True, conf.batch_size, len_features, net_type=NetType.RNN_NV1, 927 | rnn_type=RNNType.GRU) 928 | 929 | config = tf.ConfigProto() 930 | config.gpu_options.allow_growth = True 931 | #config.gpu_options.per_process_gpu_memory_fraction = 0.7 # 
占用GPU90%的显存 932 | initializer = tf.random_uniform_initializer(0, conf.init_scale) 933 | 934 | with tf.variable_scope("model", reuse=False, initializer=initializer): 935 | global train_model 936 | train_model = Model(conf, train_conf) 937 | with tf.variable_scope("model", reuse=True, initializer=initializer): 938 | global test_model 939 | test_model = Model(conf, test_conf) 940 | with tf.variable_scope("model", reuse=True, initializer=initializer): 941 | global valid_model 942 | valid_model = Model(conf, valid_conf) 943 | 944 | minibatch = 0 945 | with tf.Session(config=config) as sess: 946 | 947 | saver = None 948 | if conf.checkpoint or conf.restore: 949 | saver = tf.train.Saver() 950 | 951 | if conf.tensorboard: 952 | global writer 953 | writer = tf.summary.FileWriter(log_path, sess.graph) 954 | 955 | if not conf.restore: 956 | tf.global_variables_initializer().run() # initialize all variables in the model 957 | else: 958 | saver.restore(sess, data_path + task) 959 | print("装载变量......") 960 | 961 | LOGGER.training_log(str(conf.__dict__)) 962 | LOGGER.training_log("activation = tanh") 963 | 964 | if is_quick: 965 | 966 | for i in range(conf.num_epochs): 967 | print("第 {0}次epoch".format(i)) 968 | #minibatch = run_batch(sess, train_model, train_data, train_model.train_op, minibatch) 969 | run_batch_quick(sess,train_model,train_data,train_model.train_op,i) 970 | evaluate_model_quick(sess,i) 971 | else: 972 | for i in range(conf.num_epochs): 973 | print("第 {0}次epoch".format(i)) 974 | #minibatch = run_batch(sess, train_model, train_data, train_model.train_op, minibatch) 975 | run_batch_all(sess,train_model,train_data,train_model.train_op,i) 976 | evaluate_model_all(sess,i) 977 | 978 | #队列版 未完善 979 | def rnn_nv1_model_tfrecord(): 980 | global train_data 981 | global test_data 982 | global valid_data 983 | # train_data,valid_data,test_data=load_data_rnn_nv1_quick(conf.num_classes) 984 | # print("数据加载完毕......") 985 | train_conf = None 986 | valid_conf = None 987 | test_conf = None 988 | 989 | if conf.rnn_type == "lstm_b": 990 | train_conf = TrainingConfig(True, False, False, conf.batch_size, len_features, net_type=NetType.RNN_NV1, 991 | rnn_type=RNNType.LSTM_b) 992 | valid_conf = TrainingConfig(False, True, False, conf.batch_size, len_features, net_type=NetType.RNN_NV1, 993 | rnn_type=RNNType.LSTM_b) 994 | test_conf = TrainingConfig(False, False, True, conf.batch_size, len_features, net_type=NetType.RNN_NV1, 995 | rnn_type=RNNType.LSTM_b) 996 | elif conf.rnn_type == "gru_b": 997 | train_conf = TrainingConfig(True, False, False, conf.batch_size, len_features, net_type=NetType.RNN_NV1, 998 | rnn_type=RNNType.GRU_b) 999 | valid_conf = TrainingConfig(False, True, False, conf.batch_size, len_features, net_type=NetType.RNN_NV1, 1000 | rnn_type=RNNType.GRU_b) 1001 | test_conf = TrainingConfig(False, False, True, conf.batch_size, len_features, net_type=NetType.RNN_NV1, 1002 | rnn_type=RNNType.GRU_b) 1003 | elif conf.rnn_type == "gru": 1004 | train_conf = TrainingConfig(True, False, False, conf.batch_size, len_features, net_type=NetType.RNN_NV1, 1005 | rnn_type=RNNType.GRU) 1006 | valid_conf = TrainingConfig(False, True, False, conf.batch_size, len_features, net_type=NetType.RNN_NV1, 1007 | rnn_type=RNNType.GRU) 1008 | test_conf = TrainingConfig(False, False, True, conf.batch_size, len_features, net_type=NetType.RNN_NV1, 1009 | rnn_type=RNNType.GRU) 1010 | 1011 | config = tf.ConfigProto() 1012 | config.gpu_options.allow_growth = True 1013 | # config.gpu_options.per_process_gpu_memory_fraction = 0.7 # 
占用GPU90%的显存 1014 | initializer = tf.random_uniform_initializer(0, conf.init_scale) 1015 | 1016 | with tf.variable_scope("model", reuse=False, initializer=initializer): 1017 | global train_model 1018 | train_model = Model(conf, train_conf) 1019 | 1020 | # with tf.variable_scope("model", reuse=True, initializer=initializer): 1021 | # global test_model 1022 | # test_model = Model(conf, test_conf) 1023 | # 1024 | # with tf.variable_scope("model", reuse=True, initializer=initializer): 1025 | # global valid_model 1026 | # valid_model = Model(conf, valid_conf) 1027 | 1028 | 1029 | train_filenames = np.array(util.search_file("interval_[1-5]_label_[0-3]_train.tfrecords", tfrecords_data_path)) 1030 | valid_filenames = np.array(util.search_file("interval_[1-5]_label_[0-3]_valid.tfrecords", tfrecords_data_path)) 1031 | test_filenames = np.array(util.search_file("interval_[1-5]_label_[0-3]_test.tfrecords", tfrecords_data_path)) 1032 | 1033 | minibatch = 0 1034 | with tf.Session(config=config) as sess: 1035 | 1036 | saver = None 1037 | if conf.checkpoint or conf.restore: 1038 | saver = tf.train.Saver() 1039 | 1040 | if conf.tensorboard: 1041 | global writer 1042 | writer = tf.summary.FileWriter(log_path, sess.graph) 1043 | 1044 | sess.run(tf.local_variables_initializer()) 1045 | if not conf.restore: 1046 | sess.run(tf.global_variables_initializer()) # initialize all variables in the model 1047 | else: 1048 | saver.restore(sess, data_path + task) 1049 | print("装载变量......") 1050 | 1051 | LOGGER.training_log(str(conf.__dict__)) 1052 | 1053 | coord = tf.train.Coordinator() 1054 | threads = tf.train.start_queue_runners(sess=sess, coord=coord,start=True) 1055 | #print("a") 1056 | for i in range(10000): 1057 | _ = sess.run(train_model.train_op) 1058 | 1059 | #run_batch_from_tfrecords(sess, coord, train_model, train_model.train_op) 1060 | 1061 | coord.request_stop() 1062 | coord.join(threads) 1063 | 1064 | def init_model_config(batch_size,len_features,net_type,rnn_type): 1065 | train_conf = TrainingConfig(True, False, False, batch_size, len_features, net_type,rnn_type) 1066 | valid_conf = TrainingConfig(False, True, False, batch_size, len_features, net_type,rnn_type) 1067 | test_conf = TrainingConfig(False, False, True, batch_size, len_features, net_type,rnn_type) 1068 | 1069 | return train_conf,valid_conf,test_conf 1070 | 1071 | #dataset版 1072 | def model_tfrecord_dataset(net_type,rnn_type): 1073 | 1074 | #初始文件路径等等 1075 | #init_environment(net_type) 1076 | 1077 | train_conf,valid_conf,test_conf = init_model_config(conf.batch_size,len_features,net_type,rnn_type) 1078 | 1079 | config = tf.ConfigProto() 1080 | config.gpu_options.allow_growth = True 1081 | #config.gpu_options.per_process_gpu_memory_fraction = 0.25 # 占用GPU90%的显存 1082 | initializer = tf.random_uniform_initializer(0, conf.init_scale) 1083 | 1084 | with tf.variable_scope("model", reuse=False, initializer=initializer): 1085 | global train_model 1086 | train_model = Model(conf, train_conf) 1087 | 1088 | with tf.variable_scope("model", reuse=True, initializer=initializer): 1089 | global test_model 1090 | test_model = Model(conf, test_conf) 1091 | 1092 | with tf.variable_scope("model", reuse=True, initializer=initializer): 1093 | global valid_model 1094 | valid_model = Model(conf, valid_conf) 1095 | 1096 | LOGGER.training_log(str(conf.__dict__)) 1097 | LOGGER.training_log(str(train_conf.activation)) 1098 | 1099 | train_data_set = make_dataset_from_tfrecord_file(param.train_file_pattern,conf.batch_size,True,1) 1100 | train_data_iterator = 
train_data_set.make_initializable_iterator() 1101 | train_next_element = train_data_iterator.get_next() 1102 | 1103 | train_data_no_op_set = make_dataset_from_tfrecord_file(param.train_file_pattern, conf.batch_size, False, 1) 1104 | train_data_no_op_iterator = train_data_no_op_set.make_initializable_iterator() 1105 | train_no_op_next_element = train_data_no_op_iterator.get_next() 1106 | 1107 | 1108 | valid_data_set = make_dataset_from_tfrecord_file(param.valid_file_pattern, conf.batch_size, False,1) 1109 | valid_data_iterator = valid_data_set.make_initializable_iterator() 1110 | valid_next_element = valid_data_iterator.get_next() 1111 | 1112 | test_data_set = make_dataset_from_tfrecord_file(param.test_file_pattern, conf.batch_size, False,1) 1113 | test_data_iterator = test_data_set.make_initializable_iterator() 1114 | test_next_element = test_data_iterator.get_next() 1115 | 1116 | saver = tf.train.Saver() 1117 | 1118 | with tf.Session(config=config) as sess: 1119 | 1120 | if conf.restore: 1121 | saver.restore(sess,data_path + task + str(net_type.value)+str(rnn_type.value)) 1122 | else: 1123 | sess.run(tf.global_variables_initializer()) 1124 | sess.run(tf.local_variables_initializer()) 1125 | 1126 | for epoch in range(conf.num_epochs): 1127 | sess.run(train_data_iterator.initializer) 1128 | i = 0 1129 | try: 1130 | while True: 1131 | input, early, label = sess.run(fetches=train_next_element) 1132 | # 通过session每次从数据集中取值 1133 | #RNN_NV1网络 1134 | #print(input.shape) 1135 | if net_type == NetType.RNN_NV1: 1136 | 1137 | if input.shape[0] < conf.batch_size: 1138 | print(input.shape) 1139 | break 1140 | input = np.transpose(input, [1, 0, 2]) 1141 | 1142 | sess.run(fetches=train_model.train_op, feed_dict={train_model.input_data: input, 1143 | train_model.early_stop: early, 1144 | train_model.targets: label}) 1145 | 1146 | #rnn_nvn网络 1147 | elif net_type == NetType.RNN_NVN: 1148 | if input.shape[0] < conf.batch_size: 1149 | break 1150 | new_label = np.zeros([conf.batch_size,conf.exp_seq_len],np.int32) 1151 | for batch in range(conf.batch_size): 1152 | new_label[batch,0:early[batch]] = label[batch] 1153 | new_label[batch,early[batch]:] = 0 1154 | label = new_label 1155 | input = np.transpose(input, [1, 0, 2]) 1156 | weight_sequence_loss = np.zeros([conf.batch_size,conf.exp_seq_len],np.float32) 1157 | for k in range(conf.batch_size): 1158 | weight_sequence_loss[k,0:early[k]] = 1 1159 | 1160 | sess.run(fetches=train_model.train_op, feed_dict={ train_model.input_data:input, 1161 | train_model.early_stop:early, 1162 | train_model.targets:label, 1163 | train_model.weight_sequence_loss:weight_sequence_loss}) 1164 | 1165 | #dnn 网络 1166 | elif net_type == NetType.DNN or net_type == NetType.DNN_MAXOUT: 1167 | list_input = [] 1168 | list_label = [] 1169 | for batch in range(input.shape[0]): 1170 | list_input.append(input[batch,0:early[batch],:]) 1171 | new_label = np.zeros([early[batch]],np.int32) 1172 | new_label[:] = label[batch] 1173 | list_label.append(new_label) 1174 | input = np.concatenate(tuple(list_input),axis=0) 1175 | label = np.concatenate(tuple(list_label),axis=0) 1176 | 1177 | sess.run(fetches=train_model.train_op, feed_dict={train_model.input_data:input, 1178 | train_model.early_stop:early, 1179 | train_model.targets:label}) 1180 | print("训练集第%d个batch" %(i)) 1181 | if i % 100 == 0 and i >0: 1182 | train_cost = 0 1183 | train_acc = 0 1184 | valid_cost = 0 1185 | valid_acc = 0 1186 | #sess.run(train_data_no_op_iterator.initializer) 1187 | #train_cost, train_acc = 
evaluate_from_tfrecord_dataset(net_type, sess, train_model,train_no_op_next_element, tf.no_op(), i / 100) 1188 | #sess.run(valid_data_iterator.initializer) 1189 | #valid_cost,valid_acc = evaluate_from_tfrecord_dataset(net_type,sess,valid_model,valid_next_element,tf.no_op(),i/100) 1190 | sess.run(test_data_iterator.initializer) 1191 | test_cost,test_acc = evaluate_from_tfrecord_dataset(net_type,sess,test_model,test_next_element,tf.no_op(),i/100) 1192 | print("训练集cost:%f,acc:%f" %(train_cost,train_acc)) 1193 | print("验证集cost:%f,acc:%f" %(valid_cost,valid_acc)) 1194 | print("测试集cost:%f,acc:%f" %(test_cost,test_acc)) 1195 | LOGGER.summary_log((train_cost,train_acc,valid_cost,valid_acc,test_cost,test_acc),i) 1196 | i = i + 1 1197 | except tf.errors.OutOfRangeError: 1198 | print("第%d epoch end!" % (epoch)) 1199 | print("共 %d 个batch" %(i)) 1200 | print("第%d epoch end!" % (epoch)) 1201 | save_path = saver.save(sess, data_path + task + str(net_type.value)+str(rnn_type.value)) 1202 | if save_path is not None: 1203 | LOGGER.training_log(str(save_path)) 1204 | 1205 | #dataset 解析函数 1206 | def __parse_function_9_features(example_proto): 1207 | feature = param.feature 1208 | features = tf.parse_single_example(example_proto,feature) 1209 | speed_sec = tf.reshape(tf.decode_raw(features[param.SPEED_SEC], tf.int64), [conf.exp_seq_len, param.WIDTH]) 1210 | avg_speed = tf.reshape(tf.decode_raw(features[param.AVG_SPEED], tf.int64), [conf.exp_seq_len, param.WIDTH]) 1211 | std_speed = tf.reshape(tf.decode_raw(features[param.STD_SPEED], tf.int64), [conf.exp_seq_len, param.WIDTH]) 1212 | acc_sec = tf.reshape(tf.decode_raw(features[param.ACC_SEC], tf.int64), [conf.exp_seq_len, param.WIDTH]) 1213 | mean_acc = tf.reshape(tf.decode_raw(features[param.MEAN_ACC], tf.int64), [conf.exp_seq_len, param.WIDTH]) 1214 | std_acc = tf.reshape(tf.decode_raw(features[param.STD_ACC], tf.int64), [conf.exp_seq_len, param.WIDTH]) 1215 | head = tf.reshape(tf.decode_raw(features[param.HEAD], tf.int64), [conf.exp_seq_len, param.WIDTH]) 1216 | head_mean = tf.reshape(tf.decode_raw(features[param.HEAD_MEAN], tf.int64), [conf.exp_seq_len, param.WIDTH]) 1217 | std_head = tf.reshape(tf.decode_raw(features[param.STD_HEAD], tf.int64), [conf.exp_seq_len, param.WIDTH]) 1218 | early = tf.cast(features[param.EARLY], tf.int32) 1219 | label = tf.cast(features[param.LABEL], tf.int32) 1220 | 1221 | seq = tf.concat([speed_sec, avg_speed, std_speed, acc_sec, mean_acc, std_acc,head,head_mean,std_head], axis=1) 1222 | seq_float32 = tf.cast(seq, tf.float32) 1223 | 1224 | return seq_float32,early,label 1225 | 1226 | def __parse_function_3_features(example_proto): 1227 | feature = param.feature 1228 | features = tf.parse_single_example(example_proto, feature) 1229 | speed_sec = tf.reshape(tf.decode_raw(features[param.SPEED_SEC], tf.int64), [conf.exp_seq_len, param.WIDTH]) 1230 | avg_speed = tf.reshape(tf.decode_raw(features[param.AVG_SPEED], tf.int64), [conf.exp_seq_len, param.WIDTH]) 1231 | std_speed = tf.reshape(tf.decode_raw(features[param.STD_SPEED], tf.int64), [conf.exp_seq_len, param.WIDTH]) 1232 | # acc_sec = tf.reshape(tf.decode_raw(features[param.ACC_SEC], tf.int64), [conf.exp_seq_len, param.WIDTH]) 1233 | # mean_acc = tf.reshape(tf.decode_raw(features[param.MEAN_ACC], tf.int64), [conf.exp_seq_len, param.WIDTH]) 1234 | # std_acc = tf.reshape(tf.decode_raw(features[param.STD_ACC], tf.int64), [conf.exp_seq_len, param.WIDTH]) 1235 | # head = tf.reshape(tf.decode_raw(features[param.HEAD], tf.int64), [conf.exp_seq_len, param.WIDTH]) 1236 | # head_mean = 
tf.reshape(tf.decode_raw(features[param.HEAD_MEAN], tf.int64), [conf.exp_seq_len, param.WIDTH]) 1237 | # std_head = tf.reshape(tf.decode_raw(features[param.STD_HEAD], tf.int64), [conf.exp_seq_len, param.WIDTH]) 1238 | early = tf.cast(features[param.EARLY], tf.int32) 1239 | label = tf.cast(features[param.LABEL], tf.int32) 1240 | 1241 | seq = tf.concat([speed_sec, avg_speed, std_speed], axis=1) 1242 | seq_float32 = tf.cast(seq, tf.float32) 1243 | 1244 | return seq_float32, early, label 1245 | 1246 | def __parse_function_6_features(example_proto): 1247 | feature = param.feature 1248 | features = tf.parse_single_example(example_proto, feature) 1249 | speed_sec = tf.reshape(tf.decode_raw(features[param.SPEED_SEC], tf.int64), [conf.exp_seq_len, conf.discretization_width]) 1250 | avg_speed = tf.reshape(tf.decode_raw(features[param.AVG_SPEED], tf.int64), [conf.exp_seq_len, conf.discretization_width]) 1251 | std_speed = tf.reshape(tf.decode_raw(features[param.STD_SPEED], tf.int64), [conf.exp_seq_len, conf.discretization_width]) 1252 | acc_sec = tf.reshape(tf.decode_raw(features[param.ACC_SEC], tf.int64), [conf.exp_seq_len, conf.discretization_width]) 1253 | mean_acc = tf.reshape(tf.decode_raw(features[param.MEAN_ACC], tf.int64), [conf.exp_seq_len, conf.discretization_width]) 1254 | std_acc = tf.reshape(tf.decode_raw(features[param.STD_ACC], tf.int64), [conf.exp_seq_len, conf.discretization_width]) 1255 | # head = tf.reshape(tf.decode_raw(features[param.HEAD], tf.int64), [conf.exp_seq_len, param.WIDTH]) 1256 | # head_mean = tf.reshape(tf.decode_raw(features[param.HEAD_MEAN], tf.int64), [conf.exp_seq_len, param.WIDTH]) 1257 | # std_head = tf.reshape(tf.decode_raw(features[param.STD_HEAD], tf.int64), [conf.exp_seq_len, param.WIDTH]) 1258 | early = tf.cast(features[param.EARLY], tf.int32) 1259 | label = tf.cast(features[param.LABEL], tf.int32) 1260 | 1261 | seq = tf.concat([speed_sec, avg_speed, std_speed,acc_sec,mean_acc,std_acc], axis=1) 1262 | seq_float32 = tf.cast(seq, tf.float32) 1263 | 1264 | return seq_float32, early, label 1265 | 1266 | def __parse_function_12_features(example_proto): 1267 | feature = param.feature 1268 | features = tf.parse_single_example(example_proto, feature) 1269 | speed_sec = tf.reshape(tf.decode_raw(features[param.SPEED_SEC], tf.int64), 1270 | [conf.exp_seq_len, conf.discretization_width]) 1271 | avg_speed = tf.reshape(tf.decode_raw(features[param.AVG_SPEED], tf.int64), 1272 | [conf.exp_seq_len, conf.discretization_width]) 1273 | std_speed = tf.reshape(tf.decode_raw(features[param.STD_SPEED], tf.int64), 1274 | [conf.exp_seq_len, conf.discretization_width]) 1275 | acc_sec = tf.reshape(tf.decode_raw(features[param.ACC_SEC], tf.int64), 1276 | [conf.exp_seq_len, conf.discretization_width]) 1277 | mean_acc = tf.reshape(tf.decode_raw(features[param.MEAN_ACC], tf.int64), 1278 | [conf.exp_seq_len, conf.discretization_width]) 1279 | std_acc = tf.reshape(tf.decode_raw(features[param.STD_ACC], tf.int64), 1280 | [conf.exp_seq_len, conf.discretization_width]) 1281 | head = tf.reshape(tf.decode_raw(features[param.HEAD], tf.int64), [conf.exp_seq_len, param.WIDTH]) 1282 | head_mean = tf.reshape(tf.decode_raw(features[param.HEAD_MEAN], tf.int64), [conf.exp_seq_len, param.WIDTH]) 1283 | std_head = tf.reshape(tf.decode_raw(features[param.STD_HEAD], tf.int64), [conf.exp_seq_len, param.WIDTH]) 1284 | 1285 | max_speed = tf.reshape(tf.decode_raw(features[param.MAX_SPEED], tf.int64), 1286 | [conf.exp_seq_len, conf.discretization_width]) 1287 | max_acc = 
tf.reshape(tf.decode_raw(features[param.MAX_ACC], tf.int64), 1288 | [conf.exp_seq_len, conf.discretization_width]) 1289 | max_head = tf.reshape(tf.decode_raw(features[param.MAX_HEAD], tf.int64), 1290 | [conf.exp_seq_len, conf.discretization_width]) 1291 | 1292 | early = tf.cast(features[param.EARLY], tf.int32) 1293 | label = tf.cast(features[param.LABEL], tf.int32) 1294 | 1295 | seq = tf.concat([speed_sec, avg_speed, std_speed, acc_sec, mean_acc, std_acc,head,head_mean,std_head,max_speed,max_acc,max_head], axis=1) 1296 | seq_float32 = tf.cast(seq, tf.float32) 1297 | 1298 | return seq_float32, early, label 1299 | 1300 | #创建dataset 1301 | def make_dataset_from_tfrecord_file(file_name_pattern,batch_size=32,is_shuffle=True,repeat = 1): 1302 | filenames = util.search_file(file_name_pattern, tfrecords_data_path) 1303 | filenames = np.array(filenames) 1304 | perm = np.random.permutation(len(filenames)) 1305 | dataset = tf.data.TFRecordDataset(filenames[perm]) 1306 | if conf.num_features == 9: 1307 | dataset = dataset.map(__parse_function_9_features) 1308 | elif conf.num_features == 6: 1309 | dataset = dataset.map(__parse_function_6_features) 1310 | elif conf.num_features == 3: 1311 | dataset = dataset.map(__parse_function_3_features) 1312 | elif conf.num_features == 12: 1313 | dataset = dataset.map(__parse_function_12_features) 1314 | if is_shuffle: 1315 | dataset = dataset.shuffle(100000) 1316 | dataset = dataset.batch(batch_size) 1317 | dataset = dataset.repeat(repeat) 1318 | 1319 | return dataset 1320 | 1321 | #从npy获取数据 切割数据为预期长度的npy文件 1322 | def slice_seq(classes): 1323 | # 分训练集与测试集 验证集 8:1:1 1324 | train_data_all = None 1325 | train_label_all = None 1326 | train_early_all = None 1327 | valid_data_all = None 1328 | valid_label_all = None 1329 | valid_early_all = None 1330 | test_data_all = None 1331 | test_label_all = None 1332 | test_early_all = None 1333 | features_arr_list = [] 1334 | index_arr_list = [] 1335 | label_arr_list = [] 1336 | data_file_name_exp = data_path +"transportation_mode" 1337 | for i in range(classes): 1338 | print("加载" + str(i)) 1339 | # data_file = data_file_name +str(i) +".npy" 1340 | index_df = pd.DataFrame(pd.read_csv(data_file_name_exp +"_"+ str(i) + "_seg_index.csv")) 1341 | features_arr = np.load(data_file_name_exp + str(i) + ".npy") 1342 | features_arr = features_arr[:, 0:len_features] 1343 | index_arr = np.array(index_df.iloc[:, [1, 2]].T) 1344 | # index shape = [2,总个数] 1345 | # 第一维是第几段轨迹 第二维是在固定长度为exp_seq_len中的实际长度 1346 | # data shape =[seq_nums,exp_seq_len,feature_len] 切出相等的数据长度 不足的padding 1347 | (data, index_arr) = Data.slice_seq(features_arr, index_arr, conf.exp_seq_len) 1348 | 1349 | np.save(data_path+"slice_label" + str(i)+"_"+str(conf.exp_seq_len)+".npy",data) 1350 | np.save(data_path+"slice_index"+str(i)+".npy",index_arr) 1351 | 1352 | #分割数据集,合并数据 并写为npz文件 1353 | def partition_data_set(classes): 1354 | out_data_path = "G:/all_data/" 1355 | # 分训练集与测试集 验证集 8:1:1 1356 | train_data_all = None 1357 | train_label_all = None 1358 | train_early_all = None 1359 | valid_data_all = None 1360 | valid_label_all = None 1361 | valid_early_all = None 1362 | test_data_all = None 1363 | test_label_all = None 1364 | test_early_all = None 1365 | data_file_name_exp = data_path + "transportation_mode" 1366 | for i in range(classes): 1367 | data = np.load(data_path + "slice_label" + str(i) + "_" + str(conf.exp_seq_len) + ".npy") 1368 | index_arr = np.load(data_path + "slice_index" + str(i) + ".npy") 1369 | 1370 | # 切割后删除features_arr index 1371 | label_arr = 
np.zeros(shape=[index_arr.shape[1]], dtype=np.int32) 1372 | label_arr[:] = i 1373 | # features_arr_list.append(data) 1374 | # index_arr_list.append(index) 1375 | # label_arr_list.append(label) 1376 | # 划分训练集,验证集,测试集 1377 | print("划分训练集,验证集,测试集 " + str(i)) 1378 | seq_nums = index_arr.shape[1] 1379 | # 控制变量 1380 | np.random.seed(2) 1381 | index_perm = np.random.permutation(range(seq_nums)) 1382 | train_count = int(np.floor(seq_nums * 0.8)) 1383 | valid_count = int(np.floor(seq_nums * 0.9)) 1384 | test_count = seq_nums 1385 | train_index = index_perm[0:train_count] 1386 | valid_index = index_perm[train_count + 1:valid_count] 1387 | test_index = index_perm[valid_count + 1:seq_nums] 1388 | 1389 | # train_set valid_set test_set 1390 | train_data = data[train_index, :, :] 1391 | train_label = label_arr[train_index] 1392 | train_early = index_arr[1, train_index] 1393 | 1394 | valid_data = data[valid_index, :, :] 1395 | valid_label = label_arr[valid_index] 1396 | valid_early = index_arr[1, valid_index] 1397 | 1398 | test_data = data[test_index, :, :] 1399 | test_label = label_arr[test_index] 1400 | test_early = index_arr[1, test_index] 1401 | 1402 | # 删除读取到的data. 1403 | del data 1404 | del label_arr 1405 | del index_arr 1406 | 1407 | print("连接") 1408 | if train_data_all is None: 1409 | train_data_all = train_data 1410 | train_label_all = train_label 1411 | train_early_all = train_early 1412 | 1413 | valid_data_all = valid_data 1414 | valid_label_all = valid_label 1415 | valid_early_all = valid_early 1416 | 1417 | test_data_all = test_data 1418 | test_label_all = test_label 1419 | test_early_all = test_early 1420 | else: 1421 | train_data_all = np.concatenate((train_data_all, train_data), axis=0) 1422 | train_label_all = np.concatenate((train_label_all, train_label), axis=0) 1423 | train_early_all = np.concatenate((train_early_all, train_early), axis=0) 1424 | 1425 | valid_data_all = np.concatenate((valid_data_all, valid_data), axis=0) 1426 | valid_label_all = np.concatenate((valid_label_all, valid_label), axis=0) 1427 | valid_early_all = np.concatenate((valid_early_all, valid_early), axis=0) 1428 | 1429 | test_data_all = np.concatenate((test_data_all, test_data), axis=0) 1430 | test_label_all = np.concatenate((test_label_all, test_label), axis=0) 1431 | test_early_all = np.concatenate((test_early_all, test_early), axis=0) 1432 | 1433 | np.savez(out_data_path+"valid_data_set.npz",valid_data=valid_data_all,valid_label = valid_label_all,valid_early_stop = valid_early_all) 1434 | del valid_label_all 1435 | del valid_data_all 1436 | del valid_early_all 1437 | np.savez(out_data_path+"test_data_set.npz",test_data=test_data_all,test_label = test_label_all,test_early_stop = test_early_all) 1438 | del test_label_all 1439 | del test_early_all 1440 | del test_data_all 1441 | np.savez(out_data_path + "train_data_set.npz", train_data=train_data_all, train_label=train_label_all,train_early_stop=train_early_all) 1442 | 1443 | def main(): 1444 | model_tfrecord_dataset(get_net_type(conf.net_type),get_rnn_type(conf.rnn_type)) 1445 | #rnn_nv1_model(False) 1446 | 1447 | if __name__ == "__main__": 1448 | #slice_seq(4) 1449 | main() 1450 | #partition_data_set(4) 1451 | -------------------------------------------------------------------------------- /data_funs.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | import tensorflow as tf 3 | import sys 4 | import csv 5 | 6 | import numpy as np 7 | import math 8 | import sklearn.preprocessing 9 | import 
os 10 | import time 11 | import pandas as pd 12 | import util 13 | from param import WIDTH 14 | from param import FeatureName 15 | import config 16 | import param 17 | from param import FENWEI_MAX 18 | from param import FILTER_K 19 | 20 | 21 | class Data: 22 | @staticmethod 23 | def splitDataset(mmsi, tr_mmsi, vl_tmmsi): 24 | test_index = Data.get_match_index(mmsi, tr_mmsi) 25 | val_index = Data.get_match_index(mmsi, vl_tmmsi) 26 | train_index = np.delete(np.array(range(len(mmsi))), np.concatenate([test_index, val_index])) 27 | return (train_index, test_index, val_index) 28 | 29 | @staticmethod 30 | def randomSplitDataset(mmsi, train_perc=0.5, val_perc=0.1): 31 | mmsi = np.array(mmsi) 32 | seq_len = mmsi.shape[0] 33 | test_perc = 1 - train_perc - val_perc 34 | rdn_perm = np.random.permutation(seq_len) 35 | train_index = rdn_perm[0:int(seq_len * train_perc)] 36 | test_index = rdn_perm[int(seq_len * train_perc): int(seq_len * (train_perc + test_perc))] 37 | val_index = rdn_perm[int(seq_len * (train_perc + test_perc)): seq_len] 38 | return (train_index, test_index, val_index) 39 | 40 | @staticmethod 41 | def get_match_index(mmsi, target): 42 | unique_mmsi = np.unique(mmsi) 43 | result = np.concatenate([np.where(mmsi == unique_mmsi[i]) for i in target], axis=1)[0] 44 | return result 45 | 46 | @staticmethod 47 | def upsample(data, cls, times): 48 | (X_train, y_train, stop_train) = data 49 | labels = [set(i) for i in y_train] 50 | samples = [cls in i for i in labels] 51 | sample_index = np.where(samples)[0] 52 | sample_x = np.repeat(X_train[:, sample_index, :], times - 1, axis=1) 53 | sample_y = np.repeat(y_train[sample_index, :], times - 1, axis=0) 54 | sample_stop = np.repeat(stop_train[sample_index], times - 1, axis=0) 55 | X_train = np.concatenate((X_train, sample_x), axis=1) 56 | y_train = np.vstack((y_train, sample_y)) 57 | stop_train = np.hstack((stop_train, sample_stop)) 58 | return (X_train, y_train, stop_train) 59 | 60 | # cut sequence into smaller sequences specified by the conf 61 | # 将序列切成指定长度的 62 | @staticmethod 63 | def reorganizeSeq(x, y, mmsi, exp_seq_len): 64 | num_features = x.shape[2] 65 | # 总共可以切出的序列个数 66 | num_total_seq = int(sum([math.ceil(i) for i in mmsi[1] / exp_seq_len])) 67 | new_data = np.zeros((num_total_seq, exp_seq_len, num_features)) 68 | new_label = np.zeros((num_total_seq, exp_seq_len)) 69 | # 0行存放编号 1行存放序列长度 70 | new_mmsi = np.zeros((2, num_total_seq)).astype(int) 71 | count = 0 72 | for v in range(len(mmsi[0])): # iterate each vessel 73 | # 每个用户的数据 74 | # print v 75 | vessel_data = x[v] 76 | vessel_lab = y[v] 77 | # 用户编号 78 | vessel_mmsi = mmsi[0][v] 79 | # print(mmsi[0][v]) 80 | # get full sequences first 81 | # 各个用户能切出的序列个数 82 | num_full_seq = mmsi[1][v] // exp_seq_len 83 | if num_full_seq: 84 | # full_seq的shape为当前用户的(总个数,序列长度,特征) 85 | full_seq = vessel_data[0:num_full_seq * exp_seq_len].reshape((num_full_seq, exp_seq_len, num_features)) 86 | full_lab = vessel_lab[0:num_full_seq * exp_seq_len].reshape((num_full_seq, exp_seq_len)) 87 | new_data[count:(count + num_full_seq)] = full_seq 88 | new_label[count:(count + num_full_seq)] = full_lab 89 | new_mmsi[0][count:(count + num_full_seq)] = vessel_mmsi 90 | new_mmsi[1][count:(count + num_full_seq)] = exp_seq_len 91 | count += num_full_seq 92 | 93 | # 序列切片多出来的长度保存起来 94 | remain_seq = np.zeros((exp_seq_len, num_features)) 95 | remain_seq[0:(mmsi[1][v] - num_full_seq * exp_seq_len)] = vessel_data[num_full_seq * exp_seq_len:mmsi[1][v]] 96 | remain_lab = np.zeros(exp_seq_len) 97 | remain_lab[0:(mmsi[1][v] - 
num_full_seq * exp_seq_len)] = vessel_lab[num_full_seq * exp_seq_len:mmsi[1][v]] 98 | new_data[count] = remain_seq 99 | new_label[count] = remain_lab 100 | new_mmsi[0][count] = vessel_mmsi 101 | new_mmsi[1][count] = mmsi[1][v] - num_full_seq * exp_seq_len 102 | count += 1 103 | return (new_data, new_label, new_mmsi) 104 | 105 | #处理原始数据,提取经纬度,时间,与标签对应 106 | @staticmethod 107 | def sovle_row_data(interval): 108 | datadir = "G:/新建文件夹/Geolife Trajectories 1.3/Data/" 109 | 110 | valiable_user_data = open("./data/have_label_user.txt","r") 111 | user_list = valiable_user_data.readlines() 112 | for i in user_list: 113 | user_id = i[0:3] 114 | label_txt_name = datadir + user_id+"/labels.txt" 115 | label_file = open(label_txt_name,"r") 116 | #label文件 数据还是字符串 117 | list_label = label_file.readlines()[1:] 118 | #label_list 数据是label数组 119 | label_list = [] 120 | for i in list_label: 121 | l = i[0:len(i)-1].split("\t") 122 | label_list.append(l) 123 | 124 | plt_path = datadir + user_id + "/Trajectory" 125 | list_plt_name = os.listdir(plt_path) 126 | 127 | user_data = datadir + user_id + "/userdata_interval_"+str(interval)+".csv" 128 | user_data_file = open(user_data,"w") 129 | 130 | label_time_index = 0 131 | 132 | #循环处理所有plt文件 133 | i = 0 134 | while(i < len(list_plt_name)): 135 | 136 | is_finish = False 137 | plt_name = list_plt_name[i] 138 | print("处理", plt_name) 139 | 140 | plt_file_name = plt_path + "/" + plt_name 141 | #plt_time_str = plt_name[0:4] + "/" +plt_name[4:6] + "/" +plt_name[6:8] +" " + plt_name[8:10] +":"+plt_name[10:12]+":"+plt_name[12:14] 142 | #plt_time = time.strptime(plt_time_str,'%Y/%m/%d %H:%M:%S') 143 | #if plt_time 144 | plt_file = open(plt_file_name,"r") 145 | data = plt_file.readlines() 146 | data = data[6:len(data)] 147 | 148 | #plt文件的起始时间 149 | plt_start_time_str = data[0] 150 | plt_end_time_str = data[-1] 151 | plt_start_time_list = plt_start_time_str[0:len(plt_start_time_str)-1].split(",") 152 | plt_start_time = time.strptime(plt_start_time_list[-2] + " " + plt_start_time_list[-1],'%Y-%m-%d %H:%M:%S') 153 | plt_end_time_list = plt_end_time_str[0:len(plt_end_time_str)-1].split(",") 154 | plt_end_time = time.strptime(plt_end_time_list[-2] + " " + plt_end_time_list[-1],'%Y-%m-%d %H:%M:%S') 155 | 156 | #label 当前起始时间 157 | label_start_time = time.strptime(label_list[label_time_index][0], '%Y/%m/%d %H:%M:%S') 158 | label_end_time = time.strptime(label_list[label_time_index][1], '%Y/%m/%d %H:%M:%S') 159 | 160 | #如果plt_end_time < 当前label_start_time 处理下一个plt文件 161 | if plt_end_time <= label_start_time: 162 | i+=1 163 | continue 164 | elif plt_start_time >= label_end_time : 165 | #重复此次循环 166 | i-=1 167 | label_time_index += 1 168 | if label_time_index > len(label_list)-1: 169 | is_finish = True 170 | else: 171 | #处理plt文件中的内容 172 | print("处理有标签的文件",plt_name) 173 | 174 | last_time = None 175 | k = 0 176 | while(k < len(data)): 177 | line = data[k] 178 | line_time_list = line[0:len(line)-1].split(",") 179 | line_time = time.strptime(line_time_list[-2] + " " + line_time_list[-1],'%Y-%m-%d %H:%M:%S') 180 | #print(line_time,label_start_time,label_end_time) 181 | 182 | if line_time >= label_start_time and line_time <= label_end_time: 183 | if k == 0: 184 | last_time = line_time 185 | else: 186 | if line_time == last_time: 187 | last_time = line_time 188 | k+=1 189 | continue 190 | result_line = user_id +"," + line[0:len(line)-1] + "," + label_list[label_time_index][-1] + "," +str(label_time_index) 191 | user_data_file.write(result_line + "\n") 192 | last_time = line_time 193 | k+=interval 
194 | elif line_time >label_end_time: 195 | 196 | label_time_index += 1 197 | if label_time_index > len(label_list)-1: 198 | is_finish = True 199 | break 200 | label_start_time = time.strptime(label_list[label_time_index][0], '%Y/%m/%d %H:%M:%S') 201 | label_end_time = time.strptime(label_list[label_time_index][1], '%Y/%m/%d %H:%M:%S') 202 | elif line_time offset+1): 295 | #加速度 296 | #a = (v1-v0)/t 297 | feature_arr[ii- offset][1] = (feature_arr[ii- offset][0] - feature_arr[ii-1-offset][0]) / t 298 | 299 | fangweijiao[ii-offset] = util.jwd2angle(group.loc[ii,"lat"],group.loc[ii,"lon"],group.loc[ii-1,"lat"],group.loc[ii-1,"lon"]) 300 | 301 | #方向转换 正数代表作,负数代表右 302 | for k in range(2,len(fangweijiao)): 303 | if fangweijiao[k] - fangweijiao[k-1] <= 180: 304 | feature_arr[k][6] = fangweijiao[k] - fangweijiao[k-1] 305 | else: 306 | feature_arr[k][6] = -(360 - (fangweijiao[k] - fangweijiao[k-1])) 307 | 308 | #0 放的是速度 1放的是加速度 309 | avg_speed = np.mean(feature_arr[2:,0],axis=0) 310 | acc_mean = np.mean(feature_arr[2:,1],axis=0) 311 | std_speed = np.std(feature_arr[2:,0],axis=0) 312 | std_acc = np.std(feature_arr[2:,1],axis=0) 313 | head_mean = np.mean(np.abs(feature_arr[2:,6]),axis=0) 314 | std_head = np.std(feature_arr[2:,6],axis=0) 315 | feature_arr[2:,2] = std_speed 316 | feature_arr[2:,3] = avg_speed 317 | feature_arr[2:,4] = acc_mean 318 | feature_arr[2:,5] = std_acc 319 | feature_arr[2:,7] = head_mean 320 | feature_arr[2:,8] = std_head 321 | feature_arr = feature_arr[2:,:] 322 | 323 | #print(feature_arr) 324 | result = pd.DataFrame(columns=result_col_name) 325 | #result["user_id"] = group["user_id"][2:len(group.index)] 326 | start = group.index[0] + 2 327 | end = group.index[-1] 328 | result["user_id"] = group.loc[start:end,"user_id"] 329 | result["lat"] = group.loc[start:end,"lat"] 330 | result["lon"] = group.loc[start:end,"lon"] 331 | #print(result.info(),length,feature_arr.shape) 332 | result["speed_sec"] = feature_arr[:,0] 333 | result["acc_sec"] = feature_arr[:,1] 334 | result["std_speed"] = feature_arr[:,2] 335 | result["avg_speed"] = feature_arr[:,3] 336 | result["mean_acc"] = feature_arr[:,4] 337 | result["std_acc"] = feature_arr[:,5] 338 | result["head"] = feature_arr[:,6] 339 | result["head_mean"] = feature_arr[:,7] 340 | result["std_head"] = feature_arr[:,8] 341 | result["date"] = group.loc[start:end,"date"] 342 | result["time"] = group.loc[start:end,"time"] 343 | result["label"] = util.switch_mode(group.loc[start,"label"]) 344 | result["seg_label"] = user_id +" " + str(group.loc[start,"label_count"]) 345 | #一组label最终结果dataframe 346 | result_df = result_df.append(result) 347 | 348 | result_df.index = range(0,result_df.shape[0]) 349 | #result_df.to_csv(datadir + user_id + "/user_features.csv", index=False) 350 | result_df.to_csv(datadir + user_id +"/user_features_interval_"+str(interval) +".csv",index=False,mode="w+") 351 | user_data_file.close() 352 | 353 | # 计算特征 354 | @staticmethod 355 | def caculate_feature_12(interval_list): 356 | datadir = "G:/新建文件夹/Geolife Trajectories 1.3/Data/" 357 | feature_num = 12 358 | valiable_user_data = open("./data/have_label_user.txt", "r") 359 | user_list = valiable_user_data.readlines() 360 | for interval in interval_list: 361 | print("处理%d" % (interval)) 362 | for user in user_list: 363 | user_id = user[0:3] 364 | user_data_name = datadir + user_id + "/userdata_interval_" + str(interval) + ".csv" 365 | # user_data_name = datadir + user_id + "/userdata.csv" 366 | print("开始处理", user_id) 367 | user_data_file = open(user_data_name, "r") 368 | 369 | 
# user_data_file = np.loadtxt(user_data_name,dtype=np.str,delimiter=",") 370 | # label_list = user_data_file[:,-1] 371 | # label_list = label_list.astype(int) 372 | # label_unique,label_index,label_count = np.unique(label_list, return_counts=True, return_index=True) 373 | # #print(label_unique,label_index,label_count) 374 | # 375 | # 376 | # for i in range(1): 377 | # #一个label要使用的数组 378 | # #result = np.empty(shape=[label_count[i],feature_num],dtype=np.str_) 379 | # #一个label的索引在一个用户文件中 380 | # start = label_index[i] 381 | # end = label_index[i] + label_count[i] 382 | # #一个label索引对应的原始数据 383 | # data = user_data_file[start:end,:] 384 | # #经纬度 以及时间 385 | # lat_lon_time = data[:,[1,2,5]] 386 | # #将user_id,经纬度赋值给结果数组 387 | # #result[:,0:3] = data[:,0:3] 388 | # 389 | # #计算特征 速度 加速度 开始点没有速度,第一个点没有加速度, 所以最后数组比原始数组少两个点 390 | # for i in range(1,len(lat_lon_time)): 391 | # dis = util.jwd2dis(lat_lon_time[i][0],lat_lon_time[i][1],lat_lon_time[i-1][0],lat_lon_time[i-1][1]) 392 | # t = util.timestamp2second(lat_lon_time[i],lat_lon_time[i-1]) 393 | # 394 | # print(lat_lon_time) 395 | 396 | # #user_data = user_data_file.readlines() 397 | # 列名 398 | col_name = ["user_id", "lat", "lon", "non-use", "alt", "timestamp", "date", "time", "label", 399 | "label_count"] 400 | # 原始数据 401 | raw_data_df = pd.DataFrame(pd.read_csv(user_data_file, header=None, names=col_name)) 402 | # 结果列名 403 | result_col_name = ["user_id", "lat", "lon", "speed_sec", "acc_sec", "std_speed", "avg_speed", 404 | "mean_acc", "std_acc", "head", "head_mean", "std_head","max_speed","max_acc","max_head", "date", "time", "label", 405 | "seg_label"] 406 | # 结果数据 407 | result_df = pd.DataFrame(columns=result_col_name) 408 | 409 | # 通过标签分组轨迹 410 | label_gp = raw_data_df.groupby(by=col_name[-1]) 411 | 412 | for label_count, group in label_gp: 413 | # print(group) 414 | # print(len(group.index)) 415 | # temp_result = pd.DataFrame(columns = result_col_name) 416 | # 特征数组 417 | # print("label_count",label_count) 418 | if (group.index[-1] - group.index[0]) < 2: 419 | print("丢弃本组数据") 420 | continue 421 | feature_arr = np.zeros(shape=[group.index[-1] - group.index[0] + 1, feature_num], 422 | dtype=np.float64) 423 | fangweijiao = np.zeros(shape=[group.index[-1] - group.index[0] + 1], dtype=np.float64) 424 | # print(group) 425 | # print(len(group.index)) 426 | offset = group.index[0] 427 | for ii in group.index[1:]: 428 | # row_result = pd.Series(index=result_col_name) 429 | dis = util.jwd2dis(group.loc[ii, "lat"], group.loc[ii, "lon"], group.loc[ii - 1, "lat"], 430 | group.loc[ii - 1, "lon"]) 431 | t = util.timestamp2second(group.loc[ii, "timestamp"], group.loc[ii - 1, "timestamp"]) 432 | # 速度 433 | feature_arr[ii - offset][0] = dis / t 434 | if (ii > offset + 1): 435 | # 加速度 436 | # a = (v1-v0)/t 437 | feature_arr[ii - offset][1] = (feature_arr[ii - offset][0] - 438 | feature_arr[ii - 1 - offset][0]) / t 439 | 440 | fangweijiao[ii - offset] = util.jwd2angle(group.loc[ii, "lat"], group.loc[ii, "lon"], 441 | group.loc[ii - 1, "lat"], 442 | group.loc[ii - 1, "lon"]) 443 | 444 | # 方向转换 正数代表作,负数代表右 445 | #print(fangweijiao) 446 | for k in range(2, len(fangweijiao)): 447 | #print(fangweijiao[k],fangweijiao[k-1]) 448 | #print(fangweijiao[k] - fangweijiao[k-1]) 449 | if fangweijiao[k] >= fangweijiao[k-1]: 450 | 451 | if fangweijiao[k] - fangweijiao[k - 1] <= 180: 452 | feature_arr[k][6] = fangweijiao[k] - fangweijiao[k - 1] 453 | else: 454 | feature_arr[k][6] = -(360 - (fangweijiao[k] - fangweijiao[k - 1])) 455 | else: 456 | if fangweijiao[k-1] - fangweijiao[k] 
<=180: 457 | feature_arr[k][6] = fangweijiao[k-1] - fangweijiao[k] 458 | else: 459 | feature_arr[k][6] = -(360 - (fangweijiao[k-1] - fangweijiao[k])) 460 | 461 | 462 | # 0 放的是速度 1放的是加速度 463 | avg_speed = np.mean(feature_arr[2:, 0], axis=0) 464 | acc_mean = np.mean(feature_arr[2:, 1], axis=0) 465 | std_speed = np.std(feature_arr[2:, 0], axis=0) 466 | std_acc = np.std(feature_arr[2:, 1], axis=0) 467 | head_mean = np.mean(np.abs(feature_arr[2:, 6]), axis=0) 468 | std_head = np.std(feature_arr[2:, 6], axis=0) 469 | max_speed = np.max(np.abs(feature_arr[2:,0]),axis=0) 470 | max_acc = np.max(np.abs(feature_arr[2:,1]),axis=0) 471 | max_head = np.max(np.abs(feature_arr[2:,6]),axis=0) 472 | #print(feature_arr[2:,6]) 473 | feature_arr[2:, 2] = std_speed 474 | feature_arr[2:, 3] = avg_speed 475 | feature_arr[2:, 4] = acc_mean 476 | feature_arr[2:, 5] = std_acc 477 | feature_arr[2:, 7] = head_mean 478 | feature_arr[2:, 8] = std_head 479 | feature_arr[2:,9] = max_speed 480 | feature_arr[2:, 10]= max_acc 481 | feature_arr[2:,11] = max_head 482 | feature_arr = feature_arr[2:, :] 483 | 484 | # print(feature_arr) 485 | result = pd.DataFrame(columns=result_col_name) 486 | # result["user_id"] = group["user_id"][2:len(group.index)] 487 | start = group.index[0] + 2 488 | end = group.index[-1] 489 | result["user_id"] = group.loc[start:end, "user_id"] 490 | result["lat"] = group.loc[start:end, "lat"] 491 | result["lon"] = group.loc[start:end, "lon"] 492 | # print(result.info(),length,feature_arr.shape) 493 | result["speed_sec"] = feature_arr[:, 0] 494 | result["acc_sec"] = feature_arr[:, 1] 495 | result["std_speed"] = feature_arr[:, 2] 496 | result["avg_speed"] = feature_arr[:, 3] 497 | result["mean_acc"] = feature_arr[:, 4] 498 | result["std_acc"] = feature_arr[:, 5] 499 | result["head"] = feature_arr[:, 6] 500 | result["head_mean"] = feature_arr[:, 7] 501 | result["std_head"] = feature_arr[:, 8] 502 | result["max_speed"] = feature_arr[:,9] 503 | result["max_acc"] = feature_arr[:,10] 504 | result["max_head"] = feature_arr[:,11] 505 | result["date"] = group.loc[start:end, "date"] 506 | result["time"] = group.loc[start:end, "time"] 507 | result["label"] = util.switch_mode(group.loc[start, "label"]) 508 | result["seg_label"] = user_id + " " + str(group.loc[start, "label_count"]) 509 | # 一组label最终结果dataframe 510 | result_df = result_df.append(result) 511 | 512 | result_df.index = range(0, result_df.shape[0]) 513 | # result_df.to_csv(datadir + user_id + "/user_features.csv", index=False) 514 | result_df.to_csv(datadir + user_id + "/user_features_interval_" + str(interval) + ".csv", 515 | index=False, mode="w+") 516 | user_data_file.close() 517 | 518 | @staticmethod 519 | def caculate_feature_max_min(): 520 | datadir = "G:/新建文件夹/Geolife Trajectories 1.3/Data/" 521 | feature_num = 10 522 | valiable_user_data = open("./data/have_label_user.txt", "r") 523 | user_list = valiable_user_data.readlines() 524 | for user in user_list: 525 | user_id = user[0:3] 526 | user_feature_name = datadir + user_id + "/user_features.csv" 527 | user_feature_file = open(user_feature_name,"r") 528 | user_feature_df = pd.DataFrame(pd.read_csv(user_feature_file)) 529 | 530 | user_feature_max_min_name = datadir + user_id +"/user_features_max_min.csv" 531 | label_group = user_feature_df.groupby(by="label") 532 | 533 | #result = np.zeros(shape=[10,len(label_group)+1]) 534 | result_df = pd.DataFrame(columns=["speed_sec","acc_sec","std_speed","avg_speed","mean_acc","max_or_min","label"]) 535 | 536 | print(user_id) 537 | 538 | for name,group in 
label_group: 539 | #print(type(group)) 540 | #series_max = group.iloc[:,[3,4,5,6,7]].idxmax() 541 | #series_min = group.iloc[:,[3,4,5,6,7]].idxmin() 542 | max = group.iloc[:,[3,4,5,6,7,-2]].max() 543 | min = group.iloc[:,[3,4,5,6,7,-2]].min() 544 | max["max_or_min"] = "max" 545 | min["max_or_min"] = "min" 546 | #max_list = max.tolist() 547 | #max_list.append("max") 548 | df_max = pd.DataFrame(max) 549 | df_max = df_max.T 550 | df_min = pd.DataFrame(min) 551 | df_min = df_min.T 552 | result_df = result_df.append(df_max) 553 | result_df = result_df.append(df_min) 554 | # df.append(pd.DataFrame(max)) 555 | #dict = max.to_dict() 556 | #max.to_csv(user_feature_max_name,mode= "a+",index =True) 557 | #min.to_csv(user_feature_min_name,mode = "a+",index = True) 558 | # print(name) 559 | # print(group.describe()) 560 | # print(group.iloc[:,[3,4,5,6,7]].quantile(0.95)) 561 | # #print(group.loc[237777,"speed_sec"]) 562 | # #print(series_max[[0,1]]) 563 | # #print(type(list(series_max.index))) 564 | # #print(group.iloc[series_max,series_max.index]) 565 | # max_list = [] 566 | # min_list = [] 567 | # for i in range(len(series_max)): 568 | # #print(series_max[i]) 569 | # #print(series_max.index[i]) 570 | # #print(series_max.iloc[i]) 571 | # max_list.append(group.loc[series_max.iloc[i],series_max.index[i]]) 572 | # min_list.append(group.loc[series_min.iloc[i],series_min.index[i]]) 573 | # 574 | # print(max_list,min_list) 575 | 576 | #print(result_df) 577 | result_df.to_csv(user_feature_max_min_name,index=False) 578 | user_feature_file.close() 579 | 580 | valiable_user_data.close() 581 | 582 | @staticmethod 583 | def caculate_all_max_min(): 584 | datadir = "G:/新建文件夹/Geolife Trajectories 1.3/Data/" 585 | feature_num = 10 586 | valiable_user_data = open("./data/have_label_user.txt", "r") 587 | user_list = valiable_user_data.readlines() 588 | col_name = ["speed_sec", "acc_sec", "std_speed", "avg_speed", "mean_acc", "max_or_min", "label"] 589 | df = pd.DataFrame() 590 | #status = open(datadir+"status.csv","w+") 591 | 592 | 593 | for user in user_list: 594 | user_id = user[0:3] 595 | # user_features_max_min_name = datadir + user_id + "/user_features_max_min.csv" 596 | # user_features_max_min_file = open(user_features_max_min_name,"r") 597 | # # 原始数据 598 | # raw_data_df = pd.DataFrame(pd.read_csv(user_features_max_min_file)) 599 | # max_min_df = max_min_df.append(raw_data_df) 600 | # 601 | # user_features_max_min_file.close() 602 | user_feature_file_name = datadir + user_id +"/user_features.csv" 603 | user_feature_file = open(user_feature_file_name,"r") 604 | raw_data_df = pd.DataFrame(pd.read_csv(user_feature_file)) 605 | df = df.append(raw_data_df) 606 | 607 | df_label_groups = df.groupby("label") 608 | 609 | 610 | result_df = pd.DataFrame() 611 | for name,group in df_label_groups: 612 | df_gp_desc = group.iloc[:,[3,4,5,6,7]].describe() 613 | baifenwei_95 = group.iloc[:,[3,4,5,6,7]].quantile(0.95) 614 | baifenwei_96 = group.iloc[:,[3,4,5,6,7]].quantile(0.96) 615 | baifenwei_97 = group.iloc[:, [3, 4, 5, 6, 7]].quantile(0.97) 616 | baifenwei_98 = group.iloc[:, [3, 4, 5, 6, 7]].quantile(0.98) 617 | baifenwei_99 = group.iloc[:, [3, 4, 5, 6, 7]].quantile(0.99) 618 | #result_df = result_df.append(df_gp_desc) 619 | #print(name,"\n",baifenwei_95,baifenwei_96,baifenwei_97,baifenwei_98,baifenwei_99) 620 | file_name_99 = datadir + "baifenwei_99" + ".csv" 621 | file_name_98 = datadir + "baifenwei_98" + ".csv" 622 | file_name_97 = datadir + "baifenwei_97" + ".csv" 623 | file_name_96 = datadir + "baifenwei_96" + ".csv" 624 
| file_name_95 = datadir + "baifenwei_95" + ".csv" 625 | baifenwei_99.to_csv(file_name_99,mode = "a+") 626 | baifenwei_98.to_csv(file_name_98,mode = "a+") 627 | baifenwei_97.to_csv(file_name_97,mode = "a+") 628 | baifenwei_96.to_csv(file_name_96,mode = "a+") 629 | baifenwei_95.to_csv(file_name_95,mode = "a+") 630 | file_name = datadir+"status_label_" +str(name) + ".csv" 631 | df_gp_desc.to_csv(file_name,index=True,mode = "w+") 632 | 633 | 634 | #print(result_df) 635 | #result_df.to_csv(datadir+"status.csv",mode="w+") 636 | # max_min_groups = max_min_df.groupby(by = "max_or_min") 637 | # 638 | # max_group = max_min_groups.get_group(name="max") 639 | # min_group = max_min_groups.get_group(name="min") 640 | # 641 | # label_max_groups = max_group.groupby(by="label") 642 | # label_min_groups = min_group.groupby(by= "label") 643 | # 644 | # for name,group in label_max_groups: 645 | # df_desc = group.describe() 646 | # baifenwei_75 = df_desc.loc["75%"] 647 | # baifenwei_25 = df_desc.loc["25%"] 648 | # delta_Q = baifenwei_75 - baifenwei_25 649 | # max = baifenwei_75 + delta_Q*1.5 650 | # print(max) 651 | #for name,group in label_min_groups: 652 | # print(name,group.describe()) 653 | 654 | 655 | valiable_user_data.close() 656 | 657 | @staticmethod 658 | def features_status(interval_list): 659 | datadir = "G:/新建文件夹/Geolife Trajectories 1.3/Data/" 660 | out_path = "G:/新建文件夹/Geolife Trajectories 1.3/gps_en_discrezation/features_status/" 661 | valiable_user_data = open("./data/have_label_user.txt", "r") 662 | user_list = valiable_user_data.readlines() 663 | 664 | for interval in interval_list: 665 | print("处理%d" %(interval)) 666 | users_df = pd.DataFrame() 667 | for user in user_list: 668 | user_id = user[0:3] 669 | # user_features_max_min_name = datadir + user_id + "/user_features_max_min.csv" 670 | # user_features_max_min_file = open(user_features_max_min_name,"r") 671 | # # 原始数据 672 | # raw_data_df = pd.DataFrame(pd.read_csv(user_features_max_min_file)) 673 | # max_min_df = max_min_df.append(raw_data_df) 674 | # 675 | # user_features_max_min_file.close() 676 | user_feature_file_name = datadir + user_id +"/user_features_interval_" + str(interval)+".csv" 677 | user_feature_file = open(user_feature_file_name,"r") 678 | raw_data_df = pd.DataFrame(pd.read_csv(user_feature_file)) 679 | users_df = users_df.append(raw_data_df) 680 | 681 | users_df.reset_index(drop=True) 682 | 683 | pd.DataFrame(users_df[param.SPEED_SEC].describe()).to_csv(out_path+"before_" +param.SPEED_SEC + ".csv") 684 | pd.DataFrame(users_df[param.AVG_SPEED].describe()).to_csv(out_path+"before_" +param.AVG_SPEED + ".csv") 685 | pd.DataFrame(users_df[param.STD_SPEED].describe()).to_csv(out_path+"before_" +param.STD_SPEED + ".csv") 686 | pd.DataFrame(users_df[param.ACC_SEC].describe()).to_csv(out_path+"before_" +param.ACC_SEC + ".csv") 687 | pd.DataFrame(users_df[param.MEAN_ACC].describe()).to_csv(out_path+"before_" +param.MEAN_ACC + ".csv") 688 | pd.DataFrame(users_df[param.STD_ACC].describe()).to_csv(out_path+"before_" +param.STD_ACC + ".csv") 689 | pd.DataFrame(users_df[param.HEAD].describe()).to_csv(out_path+"before_" +param.HEAD + ".csv") 690 | pd.DataFrame(users_df[param.HEAD_MEAN].describe()).to_csv(out_path+"before_" +param.HEAD_MEAN + ".csv") 691 | pd.DataFrame(users_df[param.STD_HEAD].describe()).to_csv(out_path+"before_" +param.STD_HEAD + ".csv") 692 | 693 | 694 | 695 | speed_sec = pd.DataFrame(Data.filter_box_quantile(users_df["speed_sec"], FILTER_K)).describe() 696 | acc_sec = 
pd.DataFrame(Data.filter_box_quantile(users_df["acc_sec"], FILTER_K)).describe() 697 | avg_speed = pd.DataFrame(Data.filter_box_quantile(users_df["avg_speed"], FILTER_K)).describe() 698 | std_speed = pd.DataFrame(Data.filter_box_quantile(users_df["std_speed"], FILTER_K)).describe() 699 | mean_acc = pd.DataFrame(Data.filter_box_quantile(users_df["mean_acc"], FILTER_K)).describe() 700 | std_acc = pd.DataFrame(Data.filter_box_quantile(users_df["std_acc"], FILTER_K)).describe() 701 | head = pd.DataFrame(Data.filter_box_quantile(users_df["head"], FILTER_K)).describe() 702 | head_mean = pd.DataFrame(Data.filter_box_quantile(users_df["head_mean"], FILTER_K)).describe() 703 | std_head = pd.DataFrame(Data.filter_box_quantile(users_df["std_head"], FILTER_K)).describe() 704 | 705 | pd.DataFrame(speed_sec).to_csv(out_path+"after_"+param.SPEED_SEC +".csv") 706 | pd.DataFrame(avg_speed).to_csv(out_path+"after_"+param.AVG_SPEED +".csv") 707 | pd.DataFrame(std_speed).to_csv(out_path+"after_"+param.STD_SPEED +".csv") 708 | pd.DataFrame(acc_sec).to_csv(out_path+"after_"+param.ACC_SEC +".csv") 709 | pd.DataFrame(mean_acc).to_csv(out_path+"after_"+param.MEAN_ACC +".csv") 710 | pd.DataFrame(std_acc).to_csv(out_path+"after_"+param.STD_ACC +".csv") 711 | pd.DataFrame(head).to_csv(out_path+"after_"+param.HEAD +".csv") 712 | pd.DataFrame(head_mean).to_csv(out_path+"after_"+param.HEAD_MEAN +".csv") 713 | pd.DataFrame(std_head).to_csv(out_path+"after_"+param.STD_HEAD +".csv") 714 | 715 | #离散化 716 | @staticmethod 717 | def discretization(interval_list): 718 | datadir = "G:/新建文件夹/Geolife Trajectories 1.3/Data/" 719 | out_path = "G:/新建文件夹/Geolife Trajectories 1.3/gps_en_discrezation/features_95_15/" 720 | feature_num = 9 721 | valiable_user_data = open("./data/have_label_user.txt", "r") 722 | user_list = valiable_user_data.readlines() 723 | #col_name = ["speed_sec", "acc_sec", "std_speed", "avg_speed", "mean_acc", "max_or_min", "label"] 724 | #所有数据 725 | 726 | # status = open(datadir+"status.csv","w+") 727 | for interval in interval_list: 728 | print("处理%d" %(interval)) 729 | users_df = pd.DataFrame() 730 | for user in user_list: 731 | user_id = user[0:3] 732 | # user_features_max_min_name = datadir + user_id + "/user_features_max_min.csv" 733 | # user_features_max_min_file = open(user_features_max_min_name,"r") 734 | # # 原始数据 735 | # raw_data_df = pd.DataFrame(pd.read_csv(user_features_max_min_file)) 736 | # max_min_df = max_min_df.append(raw_data_df) 737 | # 738 | # user_features_max_min_file.close() 739 | user_feature_file_name = datadir + user_id +"/user_features_interval_" + str(interval)+".csv" 740 | user_feature_file = open(user_feature_file_name,"r") 741 | raw_data_df = pd.DataFrame(pd.read_csv(user_feature_file)) 742 | users_df = users_df.append(raw_data_df) 743 | 744 | users_df.reset_index(drop=True) 745 | # print("离散化") 746 | # 747 | # file = open(out_path+"status"+str(interval)+".txt",mode="w+") 748 | # file.write("interval_%d \n"%(interval)) 749 | # for i in [0,0.95,0.96,0.97,0.98,0.99]: 750 | # file.write("%s %f %f\n" % (param.SPEED_SEC,i,users_df[param.SPEED_SEC].quantile(i))) 751 | # file.write("%s %f %f\n" % (param.AVG_SPEED,i,users_df[param.AVG_SPEED].quantile(i))) 752 | # file.write("%s %f %f\n" % (param.STD_SPEED,i,users_df[param.STD_SPEED].quantile(i))) 753 | # file.write("%s %f %f\n" % (param.ACC_SEC,i,users_df[param.ACC_SEC].quantile(i))) 754 | # file.write("%s %f %f\n" % (param.MEAN_ACC,i,users_df[param.MEAN_ACC].quantile(i))) 755 | # file.write("%s %f %f\n" % 
(param.STD_ACC,i,users_df[param.STD_ACC].quantile(i))) 756 | # file.write("\n") 757 | # 758 | # file.close() 759 | speed_sec = pd.DataFrame(Data.equal_width(users_df["speed_sec"],WIDTH)) 760 | acc_sec = pd.DataFrame(Data.equal_width(users_df["acc_sec"],WIDTH)) 761 | avg_speed = pd.DataFrame(Data.equal_width(users_df["avg_speed"],WIDTH)) 762 | std_speed = pd.DataFrame(Data.equal_width(users_df["std_speed"],WIDTH)) 763 | mean_acc = pd.DataFrame(Data.equal_width(users_df["mean_acc"],WIDTH)) 764 | std_acc = pd.DataFrame(Data.equal_width(users_df["std_acc"],WIDTH)) 765 | head = pd.DataFrame(Data.equal_width(users_df["head"],WIDTH)) 766 | head_mean = pd.DataFrame(Data.equal_width(users_df["head_mean"],WIDTH)) 767 | std_head = pd.DataFrame(Data.equal_width(users_df["std_head"],WIDTH)) 768 | 769 | print("连接矩阵") 770 | #features_en = np.concatenate((speed_sec,avg_speed,std_speed,acc_sec,mean_acc,std_acc),axis=1) 771 | result_df = pd.concat([speed_sec,avg_speed,std_speed,acc_sec,mean_acc,std_acc,head,head_mean,std_head],axis=1) 772 | 773 | #result_df = pd.DataFrame(features_en) 774 | result_df["label"] = users_df["label"].values 775 | result_df["seg_label"] = users_df["seg_label"].values 776 | #col_name = result_df.columns.tolist() 777 | #col_name.insert(col_name.index(0),"user_id") 778 | #result_df.reindex(columns=col_name) 779 | result_df["user_id"] = users_df["user_id"].values 780 | #result_df columns =[userid(1),speed_sec(width),avg_speed(width),std_speed(width),acc_sec(width),mean_acc(width),label(1),seg_label(1)] 781 | 782 | #result_file = open(datadir+"user_features_data_en.csv",mode="w+") 783 | result_df.to_csv(out_path+"user_features_data_en_1_interval_"+str(interval)+".csv",mode="w+",header=True,index=False) 784 | 785 | valiable_user_data.close() 786 | 787 | @staticmethod 788 | def discretization_12(interval_list): 789 | datadir = "G:/新建文件夹/Geolife Trajectories 1.3/Data/" 790 | out_path = "G:/新建文件夹/Geolife Trajectories 1.3/gps_en_discrezation/features_12_95_30/" 791 | feature_num = 12 792 | valiable_user_data = open("./data/have_label_user.txt", "r") 793 | user_list = valiable_user_data.readlines() 794 | # col_name = ["speed_sec", "acc_sec", "std_speed", "avg_speed", "mean_acc", "max_or_min", "label"] 795 | # 所有数据 796 | 797 | # status = open(datadir+"status.csv","w+") 798 | for interval in interval_list: 799 | print("处理%d" % (interval)) 800 | users_df = pd.DataFrame() 801 | for user in user_list: 802 | user_id = user[0:3] 803 | # user_features_max_min_name = datadir + user_id + "/user_features_max_min.csv" 804 | # user_features_max_min_file = open(user_features_max_min_name,"r") 805 | # # 原始数据 806 | # raw_data_df = pd.DataFrame(pd.read_csv(user_features_max_min_file)) 807 | # max_min_df = max_min_df.append(raw_data_df) 808 | # 809 | # user_features_max_min_file.close() 810 | user_feature_file_name = datadir + user_id + "/user_features_interval_" + str(interval)+ ".csv" 811 | user_feature_file = open(user_feature_file_name, "r") 812 | raw_data_df = pd.DataFrame(pd.read_csv(user_feature_file)) 813 | users_df = users_df.append(raw_data_df) 814 | 815 | users_df.reset_index(drop=True) 816 | # print("离散化") 817 | # 818 | # file = open(out_path+"status"+str(interval)+".txt",mode="w+") 819 | # file.write("interval_%d \n"%(interval)) 820 | # for i in [0,0.95,0.96,0.97,0.98,0.99]: 821 | # file.write("%s %f %f\n" % (param.SPEED_SEC,i,users_df[param.SPEED_SEC].quantile(i))) 822 | # file.write("%s %f %f\n" % (param.AVG_SPEED,i,users_df[param.AVG_SPEED].quantile(i))) 823 | # file.write("%s %f %f\n" % 
(param.STD_SPEED,i,users_df[param.STD_SPEED].quantile(i))) 824 | # file.write("%s %f %f\n" % (param.ACC_SEC,i,users_df[param.ACC_SEC].quantile(i))) 825 | # file.write("%s %f %f\n" % (param.MEAN_ACC,i,users_df[param.MEAN_ACC].quantile(i))) 826 | # file.write("%s %f %f\n" % (param.STD_ACC,i,users_df[param.STD_ACC].quantile(i))) 827 | # file.write("\n") 828 | # 829 | # file.close() 830 | speed_sec = pd.DataFrame(Data.equal_width(users_df["speed_sec"], WIDTH)) 831 | acc_sec = pd.DataFrame(Data.equal_width(users_df["acc_sec"], WIDTH)) 832 | avg_speed = pd.DataFrame(Data.equal_width(users_df["avg_speed"], WIDTH)) 833 | std_speed = pd.DataFrame(Data.equal_width(users_df["std_speed"], WIDTH)) 834 | mean_acc = pd.DataFrame(Data.equal_width(users_df["mean_acc"], WIDTH)) 835 | std_acc = pd.DataFrame(Data.equal_width(users_df["std_acc"], WIDTH)) 836 | head = pd.DataFrame(Data.equal_width(users_df["head"], WIDTH)) 837 | head_mean = pd.DataFrame(Data.equal_width(users_df["head_mean"], WIDTH)) 838 | std_head = pd.DataFrame(Data.equal_width(users_df["std_head"], WIDTH)) 839 | max_speed = pd.DataFrame(Data.equal_width(users_df["max_speed"], WIDTH)) 840 | max_acc = pd.DataFrame(Data.equal_width(users_df["max_acc"], WIDTH)) 841 | max_head = pd.DataFrame(Data.equal_width(users_df["max_head"], WIDTH)) 842 | 843 | print("连接矩阵") 844 | # features_en = np.concatenate((speed_sec,avg_speed,std_speed,acc_sec,mean_acc,std_acc),axis=1) 845 | result_df = pd.concat( 846 | [speed_sec, avg_speed, std_speed, acc_sec, mean_acc, std_acc, head, head_mean, std_head,max_speed,max_acc,max_head], axis=1) 847 | 848 | # result_df = pd.DataFrame(features_en) 849 | result_df["label"] = users_df["label"].values 850 | result_df["seg_label"] = users_df["seg_label"].values 851 | # col_name = result_df.columns.tolist() 852 | # col_name.insert(col_name.index(0),"user_id") 853 | # result_df.reindex(columns=col_name) 854 | result_df["user_id"] = users_df["user_id"].values 855 | # result_df columns =[userid(1),speed_sec(width),avg_speed(width),std_speed(width),acc_sec(width),mean_acc(width),label(1),seg_label(1)] 856 | 857 | # result_file = open(datadir+"user_features_data_en.csv",mode="w+") 858 | result_df.to_csv(out_path + "user_features_data_en_1_interval_" + str(interval) + ".csv", mode="w+", 859 | header=True, index=False) 860 | 861 | valiable_user_data.close() 862 | 863 | #盒状过滤 864 | @staticmethod 865 | def filter_box_quantile(x,k): 866 | print(x.name) 867 | #不同的特征不同过滤 868 | min = 0 869 | max = 0 870 | if x.name == param.SPEED_SEC or x.name == param.AVG_SPEED \ 871 | or x.name == param.STD_SPEED or x.name == param.MEAN_ACC or x.name == param.STD_ACC\ 872 | or x.name == param.HEAD_MEAN or x.name == param.STD_HEAD: 873 | min = x.quantile(0) 874 | max = x.quantile(FENWEI_MAX) 875 | elif x.name == param.ACC_SEC or x.name == param.HEAD: 876 | min = x.quantile(0.01) 877 | max = x.quantile(FENWEI_MAX) 878 | n = len(x.index) 879 | y = np.array(x.values) 880 | 881 | for i in range(k+1,n-k): 882 | 883 | if y[i] >min and y[i] max: 888 | y[i] = max 889 | if y[i] < min: 890 | y[i] = min 891 | series_y = pd.Series(data=y) 892 | 893 | return series_y 894 | 895 | #等宽离散 896 | @staticmethod 897 | def equal_width(x,width): 898 | x = Data.filter_box_quantile(x,10) 899 | 900 | min = x.min() 901 | max = x.max() 902 | interval = (max - min + 0.001)/width 903 | x_arr = np.array(x.values) 904 | x_arr = (x_arr - min) / interval 905 | x_arr = np.floor(x_arr).astype(np.int64) 906 | x_result = np.zeros(shape=[len(x_arr),width],dtype=np.int32) 907 | for i in 
range(len(x_arr)): 908 | x_result[i][x_arr[i]] = 1 909 | 910 | return x_result 911 | 912 | #制作npy文件 913 | @staticmethod 914 | def create_npy(interval): 915 | datadir = "G:/新建文件夹/Geolife Trajectories 1.3/Data/" 916 | self_data_dir = "./data/transportation_feature_en_1_interval_2/" 917 | user_data_file_name = datadir + "user_features_data_en_1_interval_"+str(interval)+".csv" 918 | user_data_file = open(user_data_file_name, "r") 919 | user_data_df = pd.DataFrame(pd.read_csv(user_data_file)) 920 | classes = 4 921 | #0-99 特征one-hot编码后数据 100 label 101 seg_label 102 user_id 922 | user_data_label_groups = user_data_df.groupby(by="label") 923 | 924 | for name,group in user_data_label_groups: 925 | #if int(name) < 7: 926 | # continue 927 | print("处理label ",name) 928 | mode_file_name = self_data_dir + "transportation_mode" + str(name) +".npy" 929 | features_arr = np.array(group.iloc[:,0:100]) 930 | seg_label_arr = np.array(group.iloc[:,-2]) 931 | seg_label_unique,seg_label_index,seg_label_count = np.unique(seg_label_arr,return_index=True,return_counts=True) 932 | index_file_name = self_data_dir + "transportation_mode_" + str(name) +"_seg_index.csv" 933 | index_df = pd.DataFrame() 934 | index_df["seg_label_unique"] = seg_label_unique 935 | index_df["seg_label_index"] = seg_label_index.astype(np.int32) 936 | index_df["seg_label_count"] = seg_label_count.astype(np.int32) 937 | index_df = index_df.sort_values(by="seg_label_index") 938 | 939 | index_df.to_csv(index_file_name,mode="w+",index=False) 940 | del index_df 941 | del seg_label_arr 942 | np.save(mode_file_name,features_arr) 943 | 944 | 945 | 946 | user_data_file.close() 947 | 948 | #user_data_df_classes_4 = user_data_df[user_data_df["label"]<4] 949 | #data_classes_4_groups = user_data_df_classes_4.groupby(by="label") 950 | #for name,group in data_classes_4_groups: 951 | 952 | #切割序列为指定长度 953 | @staticmethod 954 | def slice_seq(x,index,exp_seq_len): 955 | #index 第一维是索引,第二维是长度 956 | 957 | #特征长度 958 | features_len = x.shape[1] 959 | #每一段可以切出的序列个数 960 | seq_num_list = np.array([math.ceil(i) for i in (index[1]/exp_seq_len)]) 961 | #总序列个数 962 | num_total_seq = int(sum(seq_num_list)) 963 | #结果矩阵 964 | new_data = np.zeros(shape=[num_total_seq,exp_seq_len,features_len],dtype=np.float64) 965 | #new_label = np.zeros(shape=[num_total_seq,exp_seq_len]) 966 | new_index = np.zeros(shape=[2,num_total_seq],dtype=np.int64) 967 | 968 | count = 0 969 | for i in range(len(seq_num_list)): 970 | #该段轨迹的长度 971 | seg_len = index[1][i] 972 | #索引开始 973 | seg_start = index[0][i] 974 | seg_end = seg_start + seg_len 975 | #二维数组 976 | seg_data = x[seg_start:seg_end] 977 | 978 | num_full_seq = seg_len // exp_seq_len 979 | if num_full_seq: 980 | full_seq = seg_data[0:num_full_seq * exp_seq_len].reshape((num_full_seq, exp_seq_len, features_len)) 981 | new_data[count:(count + num_full_seq)] = full_seq 982 | #new_label[count:(count + num_full_seq)] = full_lab 983 | new_index[0][count:(count + num_full_seq)] = i 984 | new_index[1][count:(count + num_full_seq)] = exp_seq_len 985 | count += num_full_seq 986 | #如果序列没有对齐 987 | if num_full_seq 5: 1230 | # break 1231 | file_group_count = 0 1232 | 1233 | print("处理label"+str(label_name)) 1234 | train_writer = tf.python_io.TFRecordWriter( 1235 | "G:/all_data/tfrecords/interval_" + str(interval)+"_label_"+str(label_name) + "_train_0.tfrecords") 1236 | valid_writer = tf.python_io.TFRecordWriter( 1237 | "G:/all_data/tfrecords/interval_" + str(interval) +"_label_"+str(label_name)+ "_valid_0.tfrecords") 1238 | test_writer = 
tf.python_io.TFRecordWriter( 1239 | "G:/all_data/tfrecords/interval_" + str(interval)+"_label_"+str(label_name) + "_test_0.tfrecords") 1240 | seg_groups = label_group.groupby(by="seg_label") 1241 | count = 0 1242 | for seg_name,seg_group in seg_groups: 1243 | #seg_group 存放每段的轨迹点的特征,每个特征长30 1244 | speed_sec = np.array(seg_group.iloc[:,0:1*WIDTH]) 1245 | avg_speed = np.array(seg_group.iloc[:1*WIDTH:2*WIDTH]) 1246 | std_speed = np.array(seg_group.iloc[:,2*WIDTH:3*WIDTH]) 1247 | acc_sec = np.array(seg_group.iloc[:, 3 * WIDTH:4 * WIDTH]) 1248 | mean_acc = np.array(seg_group.iloc[:, 4 * WIDTH:5 * WIDTH]) 1249 | std_acc = np.array(seg_group.iloc[:, 5 * WIDTH:6 * WIDTH]) 1250 | feature = { 1251 | FeatureName.SPEED_SEC.value : Data._bytes_feature(speed_sec.tobytes()), 1252 | FeatureName.AVG_SPEED.value : Data._bytes_feature(avg_speed.tobytes()), 1253 | FeatureName.STD_SPEED.value : Data._bytes_feature(std_speed.tobytes()), 1254 | FeatureName.ACC_SEC.value : Data._bytes_feature(acc_sec.tobytes()), 1255 | FeatureName.MEAN_ACC.value : Data._bytes_feature(mean_acc.tobytes()), 1256 | FeatureName.STD_ACC.value : Data._bytes_feature(std_acc.tobytes()), 1257 | "label":Data._int64_feature(label_name) 1258 | } 1259 | example = tf.train.Example(features = tf.train.Features(feature = feature)) 1260 | 1261 | if count % 1000 == 0 and count > 0: 1262 | train_writer.close() 1263 | valid_writer.close() 1264 | test_writer.close() 1265 | sys.stdout.flush() 1266 | file_group_count += 1 1267 | 1268 | train_writer = tf.python_io.TFRecordWriter( 1269 | "G:/all_data/tfrecords/interval_" + str(interval) + "_label_" + str( 1270 | label_name) + "_train_" + str(file_group_count)+ ".tfrecords") 1271 | valid_writer = tf.python_io.TFRecordWriter( 1272 | "G:/all_data/tfrecords/interval_" + str(interval) + "_label_" + str( 1273 | label_name) + "_valid_" + str(file_group_count)+ ".tfrecords") 1274 | test_writer = tf.python_io.TFRecordWriter( 1275 | "G:/all_data/tfrecords/interval_" + str(interval) + "_label_" + str( 1276 | label_name) + "_test_" + str(file_group_count)+ ".tfrecords") 1277 | 1278 | 1279 | t = count % 10 1280 | if t >=0 and t <8 : 1281 | train_writer.write(example.SerializeToString()) 1282 | elif t == 8: 1283 | valid_writer.write(example.SerializeToString()) 1284 | else: 1285 | test_writer.write(example.SerializeToString()) 1286 | 1287 | count += 1 1288 | k+=1 1289 | train_writer.close() 1290 | valid_writer.close() 1291 | test_writer.close() 1292 | sys.stdout.flush() 1293 | 1294 | #补零 规定长度 未分开 1295 | @staticmethod 1296 | def pad_seqs(x,exp_seq_len): 1297 | seq_nums = int(np.ceil(x.shape[0]/exp_seq_len)) 1298 | seq_len = x.shape[0] 1299 | early = np.zeros([seq_nums],dtype=np.int64) 1300 | 1301 | remain_len = seq_len % exp_seq_len 1302 | if remain_len != 0: 1303 | x_pad = np.pad(x,[[0,exp_seq_len-remain_len],[0,0]],"constant",constant_values=0) 1304 | early[0:seq_nums-1] = exp_seq_len 1305 | early[-1] = remain_len 1306 | return x_pad,early 1307 | else: 1308 | early[:] = exp_seq_len 1309 | return x,early 1310 | 1311 | #未完成 1312 | @staticmethod 1313 | def pad_slice_seqs(x,exp_seq_len): 1314 | #未完成 1315 | seq_nums = int(np.ceil(x.shape[0] / exp_seq_len)) 1316 | seq_len = x.shape[0] 1317 | early = np.zeros([seq_nums], dtype=np.int32) 1318 | remain_len = seq_len % exp_seq_len 1319 | full_seq_nums = seq_len // exp_seq_len 1320 | pass 1321 | 1322 | #制作规定长度的tfrecord 1323 | @staticmethod 1324 | def make_tfrecord_seq(interval_list,exp_seq_len): 1325 | data_dir = "G:/新建文件夹/Geolife Trajectories 1.3/gps_en_discrezation/" 1326 | 
out_path = "G:/all_data/tfrecords/" 1327 | for interval in interval_list: 1328 | print("处理" + str(interval)) 1329 | # train_writer = tf.python_io.TFRecordWriter("G:/all_data/tfrecords/interval_"+str(interval)+"_train.tfrecords") 1330 | # valid_writer = tf.python_io.TFRecordWriter("G:/all_data/tfrecords/interval_"+str(interval)+"_valid.tfrecords") 1331 | # test_writer = tf.python_io.TFRecordWriter("G:/all_data/tfrecords/interval_"+str(interval)+"_test.tfrecords") 1332 | data_file_name = data_dir + "user_features_data_en_1_interval_" + str(interval) + ".csv" 1333 | data_file = open(data_file_name, mode="r") 1334 | data_df = pd.DataFrame(pd.read_csv(data_file)) 1335 | 1336 | data_label_groups = data_df.groupby(by="label") 1337 | k = 0 1338 | for label_name, label_group in data_label_groups: 1339 | 1340 | # if k < 7: 1341 | # k+=1 1342 | # continue 1343 | if k > 3: 1344 | return 1345 | file_group_count = 0 1346 | print("处理label" + str(label_name)) 1347 | train_writer = tf.python_io.TFRecordWriter( 1348 | out_path + "interval_"+str(interval) + "_label_" + str( 1349 | label_name) + "_train_0.tfrecords") 1350 | valid_writer = tf.python_io.TFRecordWriter( 1351 | out_path + "interval_"+ str(interval) + "_label_" + str( 1352 | label_name) + "_valid_0.tfrecords") 1353 | test_writer = tf.python_io.TFRecordWriter( 1354 | out_path + "interval_"+ str(interval) + "_label_" + str(label_name) + "_test_0.tfrecords") 1355 | seg_groups = label_group.groupby(by="seg_label") 1356 | count = 0 1357 | for seg_name, seg_group in seg_groups: 1358 | # seg_group 存放每段的轨迹点的特征,每个特征长30 1359 | speed_sec = np.array(seg_group.iloc[:, 0 : 1 * WIDTH],dtype=np.int64) 1360 | avg_speed = np.array(seg_group.iloc[:, 1* WIDTH : 2 * WIDTH],dtype=np.int64) 1361 | std_speed = np.array(seg_group.iloc[:, 2* WIDTH : 3 * WIDTH],dtype=np.int64) 1362 | acc_sec = np.array(seg_group.iloc[:, 3* WIDTH : 4 * WIDTH],dtype=np.int64) 1363 | mean_acc = np.array(seg_group.iloc[:, 4* WIDTH : 5 * WIDTH],dtype=np.int64) 1364 | std_acc = np.array(seg_group.iloc[:, 5* WIDTH : 6 * WIDTH],dtype=np.int64) 1365 | 1366 | speed_sec_pad,speed_sec_early = Data.pad_seqs(speed_sec,exp_seq_len) 1367 | avg_speed_pad,avg_speed_early = Data.pad_seqs(avg_speed,exp_seq_len) 1368 | std_speed_pad,std_speed_early = Data.pad_seqs(std_speed,exp_seq_len) 1369 | acc_sec_pad,acc_sec_early = Data.pad_seqs(acc_sec,exp_seq_len) 1370 | mean_acc_pad,mean_acc_early = Data.pad_seqs(mean_acc,exp_seq_len) 1371 | std_acc_pad,std_acc_early = Data.pad_seqs(std_acc,exp_seq_len) 1372 | 1373 | label = np.zeros(speed_sec_early.shape,np.int64) 1374 | label[:] = int(label_name) 1375 | 1376 | for i in range(len(speed_sec_early)): 1377 | start = i*exp_seq_len 1378 | end = (i+1)*exp_seq_len 1379 | 1380 | feature = { 1381 | param.SPEED_SEC: Data._bytes_feature(speed_sec_pad[start:end].tobytes()), 1382 | param.AVG_SPEED: Data._bytes_feature(avg_speed_pad[start:end].tobytes()), 1383 | param.STD_SPEED: Data._bytes_feature(std_speed_pad[start:end].tobytes()), 1384 | param.ACC_SEC: Data._bytes_feature(acc_sec_pad[start:end].tobytes()), 1385 | param.MEAN_ACC: Data._bytes_feature(mean_acc_pad[start:end].tobytes()), 1386 | param.STD_ACC: Data._bytes_feature(std_acc_pad[start:end].tobytes()), 1387 | param.EARLY:Data._int64_feature(speed_sec_early[i]), 1388 | param.LABEL: Data._int64_feature(label[i]) 1389 | } 1390 | example = tf.train.Example(features=tf.train.Features(feature=feature)) 1391 | 1392 | if count % 1000 == 0 and count > 0: 1393 | print("1000") 1394 | train_writer.close() 1395 | valid_writer.close() 
1396 | test_writer.close() 1397 | #sys.stdout.flush() 1398 | file_group_count += 1 1399 | 1400 | train_writer = tf.python_io.TFRecordWriter( 1401 | out_path + "interval_" + str(interval) + "_label_" + str( 1402 | label_name) + "_train_" + str(file_group_count) + ".tfrecords") 1403 | valid_writer = tf.python_io.TFRecordWriter( 1404 | out_path + "interval_" + str(interval) + "_label_" + str( 1405 | label_name) + "_valid_" + str(file_group_count) + ".tfrecords") 1406 | test_writer = tf.python_io.TFRecordWriter( 1407 | out_path + "interval_" + str(interval) + "_label_" + str( 1408 | label_name) + "_test_" + str(file_group_count) + ".tfrecords") 1409 | 1410 | t = count % 10 1411 | if t >= 0 and t < 8: 1412 | train_writer.write(example.SerializeToString()) 1413 | elif t == 8: 1414 | valid_writer.write(example.SerializeToString()) 1415 | else: 1416 | test_writer.write(example.SerializeToString()) 1417 | 1418 | count += 1 1419 | 1420 | print(count) 1421 | k += 1 1422 | train_writer.close() 1423 | valid_writer.close() 1424 | test_writer.close() 1425 | sys.stdout.flush() 1426 | 1427 | #制作规定长度的tfrecord 1428 | @staticmethod 1429 | def make_tfrecord_seq_shuffle(interval_list,exp_seq_len,dirname): 1430 | data_dir = "G:/新建文件夹/Geolife Trajectories 1.3/gps_en_discrezation/features_12_95_30/" 1431 | out_path = "G:/all_data/"+dirname 1432 | for interval in interval_list: 1433 | print("处理" + str(interval)) 1434 | # train_writer = tf.python_io.TFRecordWriter("G:/all_data/tfrecords/interval_"+str(interval)+"_train.tfrecords") 1435 | # valid_writer = tf.python_io.TFRecordWriter("G:/all_data/tfrecords/interval_"+str(interval)+"_valid.tfrecords") 1436 | # test_writer = tf.python_io.TFRecordWriter("G:/all_data/tfrecords/interval_"+str(interval)+"_test.tfrecords") 1437 | data_file_name = data_dir + "user_features_data_en_1_interval_" + str(interval) + ".csv" 1438 | data_file = open(data_file_name, mode="r") 1439 | data_df = pd.DataFrame(pd.read_csv(data_file)) 1440 | 1441 | data_label_groups = data_df.groupby(by="label") 1442 | k = 0 1443 | #for label_name, label_group in data_label_groups: 1444 | 1445 | # if k < 7: 1446 | # k+=1 1447 | # continue 1448 | # if k > 3: 1449 | # return 1450 | file_group_count = 0 1451 | train_writer = tf.python_io.TFRecordWriter( 1452 | out_path + "interval_"+str(interval) + "_train_0.tfrecords") 1453 | valid_writer = tf.python_io.TFRecordWriter( 1454 | out_path + "interval_"+ str(interval) + "_valid_0.tfrecords") 1455 | test_writer = tf.python_io.TFRecordWriter( 1456 | out_path + "interval_"+ str(interval) + "_test_0.tfrecords") 1457 | seg_groups = data_df.groupby(by="seg_label") 1458 | count = 0 1459 | for seg_name, seg_group in seg_groups: 1460 | if int(seg_group.iloc[0,-3]) > 3: 1461 | continue 1462 | 1463 | # seg_group 存放每段的轨迹点的特征,每个特征长30 1464 | speed_sec = np.array(seg_group.iloc[:, 0 : 1 * WIDTH],dtype=np.int64) 1465 | avg_speed = np.array(seg_group.iloc[:, 1* WIDTH : 2 * WIDTH],dtype=np.int64) 1466 | std_speed = np.array(seg_group.iloc[:, 2* WIDTH : 3 * WIDTH],dtype=np.int64) 1467 | acc_sec = np.array(seg_group.iloc[:, 3* WIDTH : 4 * WIDTH],dtype=np.int64) 1468 | mean_acc = np.array(seg_group.iloc[:, 4* WIDTH : 5 * WIDTH],dtype=np.int64) 1469 | std_acc = np.array(seg_group.iloc[:, 5* WIDTH : 6 * WIDTH],dtype=np.int64) 1470 | head = np.array(seg_group.iloc[:, 6* WIDTH : 7 * WIDTH],dtype=np.int64) 1471 | head_mean = np.array(seg_group.iloc[:, 7* WIDTH : 8 * WIDTH],dtype=np.int64) 1472 | std_head = np.array(seg_group.iloc[:, 8* WIDTH : 9 * WIDTH],dtype=np.int64) 1473 | max_speed = 
np.array(seg_group.iloc[:, 9* WIDTH : 10 * WIDTH],dtype=np.int64) 1474 | max_acc = np.array(seg_group.iloc[:, 10* WIDTH : 11 * WIDTH],dtype=np.int64) 1475 | max_head = np.array(seg_group.iloc[:, 11* WIDTH : 12 * WIDTH],dtype=np.int64) 1476 | 1477 | 1478 | speed_sec_pad,speed_sec_early = Data.pad_seqs(speed_sec,exp_seq_len) 1479 | avg_speed_pad,avg_speed_early = Data.pad_seqs(avg_speed,exp_seq_len) 1480 | std_speed_pad,std_speed_early = Data.pad_seqs(std_speed,exp_seq_len) 1481 | acc_sec_pad,acc_sec_early = Data.pad_seqs(acc_sec,exp_seq_len) 1482 | mean_acc_pad,mean_acc_early = Data.pad_seqs(mean_acc,exp_seq_len) 1483 | std_acc_pad,std_acc_early = Data.pad_seqs(std_acc,exp_seq_len) 1484 | head_pad,head_early = Data.pad_seqs(head,exp_seq_len) 1485 | head_mean_pad,head_mean_early = Data.pad_seqs(head_mean,exp_seq_len) 1486 | std_head_pad,std_head_early = Data.pad_seqs(std_head,exp_seq_len) 1487 | max_speed_pad,max_speed_early = Data.pad_seqs(max_speed,exp_seq_len) 1488 | max_acc_pad,max_acc_early = Data.pad_seqs(max_acc,exp_seq_len) 1489 | max_head_pad,max_head_early = Data.pad_seqs(max_head,exp_seq_len) 1490 | 1491 | label = np.zeros(speed_sec_early.shape,np.int64) 1492 | #print(int(seg_group.iloc[0,-3])) 1493 | label[:] = int(seg_group.iloc[0,-3]) 1494 | 1495 | for i in range(len(speed_sec_early)): 1496 | start = i*exp_seq_len 1497 | end = (i+1)*exp_seq_len 1498 | 1499 | feature = { 1500 | param.SPEED_SEC: Data._bytes_feature(speed_sec_pad[start:end].tobytes()), 1501 | param.AVG_SPEED: Data._bytes_feature(avg_speed_pad[start:end].tobytes()), 1502 | param.STD_SPEED: Data._bytes_feature(std_speed_pad[start:end].tobytes()), 1503 | param.ACC_SEC: Data._bytes_feature(acc_sec_pad[start:end].tobytes()), 1504 | param.MEAN_ACC: Data._bytes_feature(mean_acc_pad[start:end].tobytes()), 1505 | param.STD_ACC: Data._bytes_feature(std_acc_pad[start:end].tobytes()), 1506 | param.HEAD: Data._bytes_feature(head_pad[start:end].tobytes()), 1507 | param.HEAD_MEAN: Data._bytes_feature(head_mean_pad[start:end].tobytes()), 1508 | param.STD_HEAD: Data._bytes_feature(std_head_pad[start:end].tobytes()), 1509 | param.MAX_SPEED: Data._bytes_feature(max_speed_pad[start:end].tobytes()), 1510 | param.MAX_ACC: Data._bytes_feature(max_acc_pad[start:end].tobytes()), 1511 | param.MAX_HEAD: Data._bytes_feature(max_head_pad[start:end].tobytes()), 1512 | param.EARLY: Data._int64_feature(std_head_early[i]), 1513 | param.LABEL: Data._int64_feature(label[i]) 1514 | } 1515 | example = tf.train.Example(features=tf.train.Features(feature=feature)) 1516 | 1517 | if count % 1000 == 0 and count > 0: 1518 | print("1000") 1519 | train_writer.close() 1520 | valid_writer.close() 1521 | test_writer.close() 1522 | #sys.stdout.flush() 1523 | file_group_count += 1 1524 | 1525 | train_writer = tf.python_io.TFRecordWriter( 1526 | out_path + "interval_" + str(interval) + "_train_" + str(file_group_count) + ".tfrecords") 1527 | valid_writer = tf.python_io.TFRecordWriter( 1528 | out_path + "interval_" + str(interval) + "_valid_" + str(file_group_count) + ".tfrecords") 1529 | test_writer = tf.python_io.TFRecordWriter( 1530 | out_path + "interval_" + str(interval) + "_test_" + str(file_group_count) + ".tfrecords") 1531 | 1532 | t = count % 10 1533 | if t >= 0 and t < 8: 1534 | train_writer.write(example.SerializeToString()) 1535 | elif t == 8: 1536 | valid_writer.write(example.SerializeToString()) 1537 | else: 1538 | test_writer.write(example.SerializeToString()) 1539 | 1540 | count += 1 1541 | 1542 | print(count) 1543 | k += 1 1544 | 
train_writer.close() 1545 | valid_writer.close() 1546 | test_writer.close() 1547 | sys.stdout.flush() 1548 | 1549 | #未完成 1550 | @staticmethod 1551 | def tf_slice_seq(input,exp_len_seq,has_early): 1552 | 1553 | shape = tf.shape(input) 1554 | full_seq_nums = tf.floordiv(shape[0],exp_len_seq) 1555 | # ?tf.zeros() 1556 | result_list = [] 1557 | early_stop = [] 1558 | zero_constant = tf.zeros_like(full_seq_nums) 1559 | is_zero = tf.equal(full_seq_nums,zero_constant) 1560 | if not is_zero : 1561 | for i in range(full_seq_nums): 1562 | result_list.append(tf.slice(input,[i*exp_len_seq,WIDTH],[exp_len_seq,WIDTH])) 1563 | if has_early: 1564 | early_stop.append(exp_len_seq) 1565 | 1566 | remain_length = shape[0] - full_seq_nums*exp_len_seq 1567 | remain = tf.slice(input,[full_seq_nums*exp_len_seq,WIDTH],[shape[0]-remain_length,WIDTH]) 1568 | remain_padding = tf.pad(remain,[[0,exp_len_seq - remain_length],[0,0]]) 1569 | result_list.append(remain_padding) 1570 | if has_early: 1571 | early_stop.append(remain_length) 1572 | 1573 | if has_early: 1574 | e = np.reshape(np.array(early_stop,np.int32),[len(early_stop),1]) 1575 | 1576 | earlys = tf.Constant(e) 1577 | return result_list,earlys 1578 | 1579 | return result_list 1580 | 1581 | #未完成 1582 | @staticmethod 1583 | def tf_slice_examples(features,feature_name_list,label_name,has_early,exp_seq_len): 1584 | 1585 | #feature_sliced_list [[speed_sec_tensor*full_seq_num]] 1586 | feature_sliced_list = [] 1587 | seqs = None 1588 | early_seqs = None 1589 | for feature_name in feature_name_list: 1590 | feature = tf.decode_raw(features[feature_name],tf.int32) 1591 | if has_early: 1592 | seqs,early_seqs = Data.tf_slice_seq(feature,exp_seq_len,has_early) 1593 | else: 1594 | seqs = Data.tf_slice_seq(feature,exp_seq_len, has_early) 1595 | feature_sliced_list.append(seqs) 1596 | 1597 | if len(feature_sliced_list)>0: 1598 | seq_nums = len(feature_sliced_list[0]) 1599 | else: 1600 | return 1601 | 1602 | features_seqs = None 1603 | for i in range(seq_nums): 1604 | single_seq = None 1605 | for j in range(len[feature_name_list]): 1606 | if single_seq is None: 1607 | single_seq = feature_sliced_list[i][j] 1608 | else: 1609 | single_seq = tf.concat([single_seq,feature_sliced_list[i]][j],axis=1) 1610 | 1611 | single_seq = tf.expand_dims(single_seq,axis=0) 1612 | if features_seqs is None: 1613 | features_seqs = single_seq 1614 | else: 1615 | features_seqs = tf.concat([features_seqs,single_seq],axis=0) 1616 | 1617 | label = tf.cast(features[label_name], tf.int32) 1618 | label_arr = np.zeros(shape=[seq_nums,1],dtype=tf.int32) 1619 | label_arr[:] = label 1620 | label_seqs = tf.Constant(label_arr,dtype=tf.int32) 1621 | if has_early: 1622 | return features_seqs,early_seqs,label_seqs 1623 | else: 1624 | return features_seqs,label_seqs 1625 | 1626 | if __name__ == "__main__": 1627 | #Data.sovle_row_data(5) 1628 | #Data.caculate_feature([1,2,3,4,5]) 1629 | #Data.caculate_feature_max_min() 1630 | #Data.caculate_all_max_min() 1631 | #Data.discretization([1,2,3,4,5]) 1632 | #Data.create_npy(2) 1633 | #Data.expand_data_npy(4,100) 1634 | #Data.create_all_data_npy(4,100) 1635 | #Data.concat_data(4,100) 1636 | #Data.make_tfrecord([5]) 1637 | #Data.make_tfrecord_seq_shuffle([1,2,3,4,5],50,"tfrecords_95_15/") 1638 | #Data.features_status([5]) 1639 | #Data.caculate_feature_12([1,2,3,4,5]) 1640 | #Data.discretization_12([1,2,3,4]) 1641 | Data.make_tfrecord_seq_shuffle([4], 50, "tfrecords_95_30_12/") --------------------------------------------------------------------------------
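The per-point features built in Data.caculate_feature / caculate_feature_12 are speed from consecutive fixes, acceleration from consecutive speeds, and a signed heading change, with the first two points of each segment dropped because they carry no valid speed or acceleration. A minimal standalone sketch of that computation follows; haversine_m and bearing_deg are illustrative stand-ins for util.jwd2dis and util.jwd2angle, which are defined elsewhere in the repository.

import math
import numpy as np

def haversine_m(lat1, lon1, lat2, lon2):
    # great-circle distance in metres (stand-in for util.jwd2dis)
    r = 6371000.0
    p1, p2 = math.radians(lat1), math.radians(lat2)
    dp = math.radians(lat2 - lat1)
    dl = math.radians(lon2 - lon1)
    a = math.sin(dp / 2) ** 2 + math.cos(p1) * math.cos(p2) * math.sin(dl / 2) ** 2
    return 2 * r * math.asin(math.sqrt(a))

def bearing_deg(lat1, lon1, lat2, lon2):
    # initial bearing from point 1 to point 2 in [0, 360) (stand-in for util.jwd2angle)
    p1, p2 = math.radians(lat1), math.radians(lat2)
    dl = math.radians(lon2 - lon1)
    y = math.sin(dl) * math.cos(p2)
    x = math.cos(p1) * math.sin(p2) - math.sin(p1) * math.cos(p2) * math.cos(dl)
    return (math.degrees(math.atan2(y, x)) + 360.0) % 360.0

def point_features(lat, lon, t):
    # lat, lon, t are 1-D arrays of equal length; t is a timestamp in seconds
    n = len(t)
    speed = np.zeros(n)
    acc = np.zeros(n)
    head_change = np.zeros(n)
    bearing = np.zeros(n)
    for i in range(1, n):
        dt = max(t[i] - t[i - 1], 1e-6)
        speed[i] = haversine_m(lat[i - 1], lon[i - 1], lat[i], lon[i]) / dt
        bearing[i] = bearing_deg(lat[i - 1], lon[i - 1], lat[i], lon[i])
        if i > 1:
            acc[i] = (speed[i] - speed[i - 1]) / dt
            # fold the turn angle into [-180, 180]; the sign encodes turn direction
            diff = bearing[i] - bearing[i - 1]
            head_change[i] = (diff + 180.0) % 360.0 - 180.0
    # drop the first two points, mirroring feature_arr[2:, :] in the code above
    return speed[2:], acc[2:], head_change[2:]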
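filter_box_quantile and equal_width together clip each feature to a quantile range and then one-hot encode it into WIDTH equal-width bins. A compact sketch of the same idea, assuming a pandas Series input and treating the clipping quantiles (0 and 0.95) as configurable stand-ins for FENWEI_MAX:

import numpy as np
import pandas as pd

def clip_to_quantiles(x, q_low=0.0, q_high=0.95):
    # clip extreme values to the chosen quantiles (the role of filter_box_quantile)
    lo, hi = x.quantile(q_low), x.quantile(q_high)
    return x.clip(lower=lo, upper=hi)

def equal_width_onehot(x, width):
    # map each value to one of `width` equal-width bins and one-hot encode the bin index
    x = clip_to_quantiles(x)
    lo, hi = x.min(), x.max()
    step = (hi - lo + 1e-3) / width          # +1e-3 keeps the maximum inside the last bin
    bins = np.floor((x.values - lo) / step).astype(np.int64)
    onehot = np.zeros((len(bins), width), dtype=np.int32)
    onehot[np.arange(len(bins)), bins] = 1
    return onehot

# usage sketch: 30-bin encoding of a speed column
# speed_onehot = equal_width_onehot(users_df["speed_sec"], WIDTH)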
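reorganizeSeq and slice_seq both cut a variable-length trajectory into fixed-length windows, zero-padding the final partial window and recording each window's true length. A condensed sketch under those assumptions:

import numpy as np

def slice_fixed_length(x, exp_seq_len):
    # cut a (steps, features) array into full exp_seq_len windows plus one zero-padded tail,
    # returning the windows and the true length of each (as in reorganizeSeq / slice_seq)
    steps, n_feat = x.shape
    n_full = steps // exp_seq_len
    n_seq = n_full + (1 if steps % exp_seq_len else 0)
    out = np.zeros((n_seq, exp_seq_len, n_feat), dtype=x.dtype)
    lengths = np.full(n_seq, exp_seq_len, dtype=np.int64)
    if n_full:
        out[:n_full] = x[:n_full * exp_seq_len].reshape(n_full, exp_seq_len, n_feat)
    if steps % exp_seq_len:
        tail = steps - n_full * exp_seq_len
        out[-1, :tail] = x[n_full * exp_seq_len:]
        lengths[-1] = tail
    return out, lengths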
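The TFRecord export (make_tfrecord_seq / make_tfrecord_seq_shuffle) pads every segment to a multiple of exp_seq_len, stores the true chunk length as an early-stop value, and routes serialized examples to train/valid/test files with a count % 10 rule (8/1/1). A stripped-down sketch of that pipeline using the same TF 1.x tf.python_io API as the code above; the feature key names and output prefix are illustrative:

import numpy as np
import tensorflow as tf

def pad_to_multiple(x, exp_seq_len):
    # pad a (steps, width) matrix with zero rows so its length is a multiple of exp_seq_len,
    # and return the true length of each exp_seq_len-sized chunk (the "early stop" values)
    n_chunks = int(np.ceil(x.shape[0] / exp_seq_len))
    early = np.full(n_chunks, exp_seq_len, dtype=np.int64)
    remainder = x.shape[0] % exp_seq_len
    if remainder:
        x = np.pad(x, [[0, exp_seq_len - remainder], [0, 0]], "constant")
        early[-1] = remainder
    return x, early

def _bytes_feature(value):
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))

def _int64_feature(value):
    return tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))

def write_segments(segments, labels, exp_seq_len, out_prefix):
    # segments: list of (steps, width) int64 arrays; labels: one integer label per segment
    writers = {name: tf.python_io.TFRecordWriter(out_prefix + "_" + name + ".tfrecords")
               for name in ("train", "valid", "test")}
    count = 0
    for seg, label in zip(segments, labels):
        padded, early = pad_to_multiple(seg, exp_seq_len)
        for i in range(len(early)):
            chunk = padded[i * exp_seq_len:(i + 1) * exp_seq_len]
            example = tf.train.Example(features=tf.train.Features(feature={
                "feature": _bytes_feature(chunk.tobytes()),   # illustrative key names
                "early": _int64_feature(int(early[i])),
                "label": _int64_feature(int(label)),
            }))
            t = count % 10                      # 8/1/1 split by running example count
            dest = "train" if t < 8 else ("valid" if t == 8 else "test")
            writers[dest].write(example.SerializeToString())
            count += 1
    for w in writers.values():
        w.close()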