├── README.md ├── ci.py ├── uril_connectUser.py ├── uril_entropy.py ├── code3_runEpoch.py ├── trainAutoEncoder.py ├── doTrainTest.py ├── uril_tools.py ├── code0_parameter.py ├── code1_data.py ├── code2_model.py ├── uril_cmu_statistic.py ├── uril_oneHotEncoder.py └── uril_assistment2009.py /README.md: -------------------------------------------------------------------------------- 1 | ### Environment 2 | Python 3.5 3 | TensorFlow 0.10 4 | pyprind (pip install pyprind) 5 | 6 | ### Run code 7 | Set the hyperparameters in code0_parameter.py, then run: 8 | python doTrainTest.py 9 | 10 | 11 | ### Disclaimer 12 | Unless stated otherwise, all software is provided free of charge. As well, all software is provided on an "as is" basis without warranty of any kind, express or implied. Under no circumstances and under no legal theory, whether in tort, contract, or otherwise, shall Liang Zhang be liable to you or to any other person for any indirect, special, incidental, or consequential damages of any character including, without limitation, damages for loss of goodwill, work stoppage, computer failure or malfunction, or for any and all other damages or losses. If you do not agree with these terms, then you are advised to not use the software. 13 | -------------------------------------------------------------------------------- /ci.py: -------------------------------------------------------------------------------- 1 | ''' Unless stated otherwise, all software is provided free of charge. 2 | As well, all software is provided on an "as is" basis without warranty 3 | of any kind, express or implied. Under no circumstances and under no legal 4 | theory, whether in tort, contract, or otherwise, shall Liang Zhang be liable 5 | to you or to any other person for any indirect, special, incidental, 6 | or consequential damages of any character including, without limitation, 7 | damages for loss of goodwill, work stoppage, computer failure or malfunction, 8 | or for any and all other damages or losses.
If you do not agree with these terms, 9 | then you are advised to not use the software.''' 10 | 11 | import pandas as pd 12 | import numpy as np 13 | import scipy.stats as st 14 | import scipy as sp 15 | import scipy.stats 16 | 17 | def mean_confidence_interval(data, confidence=0.95): 18 | a = 1.0*np.array(data) 19 | n = len(a) 20 | m, se = np.mean(a), scipy.stats.sem(a) 21 | h = se * sp.stats.t._ppf((1+confidence)/2., n-1) 22 | return m,h 23 | 24 | assistments_list = [ 25 | './result/assistment2009/result_01-25-17:04.csv', #baseline 26 | './result/assistment2009/result_02-05-11:51.csv', #baseline + t/c 27 | './result/assistment2009/result_02-05-21:28.csv', #baseline + t/c [ae] 28 | './result/assistment2009/result_02-06-02:10.csv', #baseline + t/c + t + a + f 29 | './result/assistment2009/result_02-06-09:21.csv', #baseline + t/c + t + a + f [ae] 30 | './result/assistment2009/result_02-08-10:57.csv', #baseline + t/c + t/s + t + a + f [ae] 31 | ] 32 | 33 | cmu_list = [ 34 | './result/cmu_stat_f2011/result_01-24-23:23.csv', #baseline 35 | './result/cmu_stat_f2011/result_01-25-09:35.csv', #baseline + t/c 36 | './result/cmu_stat_f2011/result_01-29-17:35.csv', #baseline + t/c [ae] 37 | './result/cmu_stat_f2011/result_02-06-21:45.csv', #baseline + t/c + t + a + f 38 | './result/cmu_stat_f2011/result_02-07-08:44.csv', #baseline + t/c + t + a + f [ae] 39 | './result/cmu_stat_f2011/result_02-07-23:44.csv', #baseline + t/c + t/s + t + a + f [ae] 40 | ] 41 | 42 | for name_list in [assistments_list,cmu_list]: 43 | print ("=="*25) 44 | for idx,name in enumerate(name_list): 45 | print ("\n","-"*5,idx,"\t",name,"-"*5,) 46 | data = pd.read_csv(name) 47 | data = data[(data['cv']!='average') & (data['type']!='train')] 48 | data = data[data['epoch']==8] 49 | 50 | aucs = data['auc'] 51 | print ("auc mean and 95ci %2.3f\t%2.3f"%mean_confidence_interval(aucs)) 52 | 53 | r2s = data['r2'] 54 | print ("r2 mean and 95ci %2.3f\t%2.3f"%mean_confidence_interval(r2s)) 55 | 56 | 57 | -------------------------------------------------------------------------------- /uril_connectUser.py: -------------------------------------------------------------------------------- 1 | ''' Unless stated otherwise, all software is provided free of charge. 2 | As well, all software is provided on an "as is" basis without warranty 3 | of any kind, express or implied. Under no circumstances and under no legal 4 | theory, whether in tort, contract, or otherwise, shall Liang Zhang be liable 5 | to you or to any other person for any indirect, special, incidental, 6 | or consequential damages of any character including, without limitation, 7 | damages for loss of goodwill, work stoppage, computer failure or malfunction, 8 | or for any and all other damages or losses. 
If you do not agree with these terms, 9 | then you are advised to not use the software.''' 10 | 11 | import numpy as np 12 | import pandas as pd 13 | import pyprind as pp 14 | import sys 15 | 16 | import numpy as np 17 | import pandas as pd 18 | import pyprind as pp 19 | import sys 20 | import uril_tools as aux 21 | import code0_parameter as code0 22 | 23 | 24 | def counter(a): 25 | a = list(a) 26 | unique, counts = np.unique(a, return_counts=True) 27 | return unique, counts 28 | 29 | 30 | def getUserQuesNumIndexList(dataList): 31 | a = list(dataList) 32 | target = np.empty((0, 3)) 33 | size = len(a) 34 | temp = [a[0], 1, 0] 35 | for i in range(1, size): 36 | if a[i] == a[i - 1]: 37 | temp[1] += 1 38 | else: 39 | target = np.vstack((target, temp)) 40 | temp = [a[i], 1, i] 41 | return np.vstack((target, temp)) 42 | 43 | 44 | def connectUser(data, connected_file_name): 45 | print("==> load data successful") 46 | u, c = counter(data['user_id']) 47 | # UserNumberDict = dict(zip(u, c)) 48 | 49 | userQuesNumIndexList = getUserQuesNumIndexList(data['user_id']) 50 | newdata = pd.DataFrame() 51 | 52 | print('==> begin concatenate dataset') 53 | for i in pp.prog_percent(range(len(u)), stream=sys.stdout): 54 | for k in range(len(userQuesNumIndexList)): 55 | if userQuesNumIndexList[k, 0] == u[i]: 56 | temp = data.iloc[ 57 | int(userQuesNumIndexList[k, 2]):int(userQuesNumIndexList[k, 2] + userQuesNumIndexList[k, 1])] 58 | newdata = newdata.append(temp) 59 | 60 | newdata.reset_index(drop=True) 61 | newdata.to_csv(connected_file_name, index=False) 62 | 63 | print('==> before connect\t', aux.stastic_SecNumber_UserNumber_SkillNumber(data, code0.DatasetParameter())) 64 | print('==> after connect\t', aux.stastic_SecNumber_UserNumber_SkillNumber(newdata, code0.DatasetParameter())) 65 | 66 | return newdata 67 | 68 | 69 | if __name__ == "__main__": 70 | filename = './data/assistment2009/skill_builder_data_corrected.csv' 71 | newfile = './data/assistment2009/connect_dataset_small.csv' 72 | 73 | data = pd.read_csv(filename, encoding='latin-1', error_bad_lines=False, index_col=False) 74 | data = data[0:50000] 75 | connectUser(data, newfile) 76 | -------------------------------------------------------------------------------- /uril_entropy.py: -------------------------------------------------------------------------------- 1 | ''' Unless stated otherwise, all software is provided free of charge. 2 | As well, all software is provided on an "as is" basis without warranty 3 | of any kind, express or implied. Under no circumstances and under no legal 4 | theory, whether in tort, contract, or otherwise, shall Liang Zhang be liable 5 | to you or to any other person for any indirect, special, incidental, 6 | or consequential damages of any character including, without limitation, 7 | damages for loss of goodwill, work stoppage, computer failure or malfunction, 8 | or for any and all other damages or losses. 
If you do not agree with these terms, 9 | then you are advised to not use the software.''' 10 | 11 | import pandas as pd 12 | import numpy as np 13 | 14 | 15 | class EntropyCls(): 16 | def __init__(self): 17 | pass 18 | 19 | def _entropy_single_variable(self, X): 20 | X = list(X) 21 | probs = [list(X).count(c) * 1.0 / len(X) for c in set(X)] 22 | return np.sum([-p * np.log2(p) for p in probs]) 23 | 24 | def _joint_entropy_two_variables(self, X, Y): 25 | """ 26 | :param X: 27 | :param Y: 28 | :return: the conditional entropy H(X|Y) (not the joint entropy, despite the method name) 29 | """ 30 | data = pd.DataFrame({'X': list(X), 'Y': list(Y)}) 31 | joint_entropy = 0 32 | for y_item in set(data['Y']): 33 | temp_data = data[data['Y'] == y_item] 34 | temp_list = temp_data['X'] 35 | entropy = self._entropy_single_variable(temp_list) 36 | p_y = len(temp_data) / len(data) 37 | joint_entropy += p_y * entropy 38 | return joint_entropy 39 | 40 | def _get_information_grain(self, X, Y): 41 | """ 42 | :param X: 43 | :param Y: 44 | :return: information gain IG(X|Y) = H(X) - H(X|Y) 45 | """ 46 | return self._entropy_single_variable(X) - self._joint_entropy_two_variables(X, Y) 47 | 48 | def _get_sym_uncertity(self, X, Y): 49 | """ 50 | :param X: 51 | :param Y: 52 | :return: symmetric uncertainty SU(X,Y) = 2[IG(X|Y)/(H(X)+H(Y))] 53 | """ 54 | return 2 * ( 55 | self._get_information_grain(X, Y) / (self._entropy_single_variable(X) + self._entropy_single_variable(Y))) 56 | 57 | def get_sym_uncertity_matrix(self, data): 58 | """ 59 | :param data: pandas DataFrame 60 | :return: 61 | """ 62 | name_list = list(data) 63 | result = pd.DataFrame(index=name_list, columns=name_list) 64 | for c_idx, c_item in enumerate(name_list): 65 | for r_idx, r_item in enumerate(name_list): 66 | if c_idx > r_idx: 67 | result.loc[r_item, c_item] = self._get_sym_uncertity(data[r_item], data[c_item]) 68 | 69 | return result 70 | 71 | def get_coeff(self, data): 72 | name_list = list(data) 73 | result = pd.DataFrame(index=name_list, columns=name_list) 74 | for c_idx, c_item in enumerate(name_list): 75 | for r_idx, r_item in enumerate(name_list): 76 | if c_idx > r_idx: 77 | result.loc[r_item, c_item] = abs(np.corrcoef(data[r_item], data[c_item])[0][1]) 78 | return result 79 | 80 | def print_assistment2009(): 81 | data = pd.read_csv( 82 | "./data/assistment2009/attempt_level correct attempt_level time_level correct first_action correct first_action time_level correct skill_id correct time_level correct_large_.csv") 83 | 84 | temp_data = data[ 85 | ['correct', 'first_action', 'time_level', 'attempt_level', 'attempt_level correct', 'first_action correct', 86 | 'time_level correct','attempt_level time_level correct','first_action time_level correct']] 87 | r1 = EntropyCls().get_sym_uncertity_matrix(temp_data) 88 | r2 = EntropyCls().get_coeff(temp_data) 89 | 90 | result = pd.concat([r1, r2]) 91 | result.to_csv('./result/assistment2009/correlationship_add_3.csv') 92 | print(result) 93 | 94 | def print_cmu(): 95 | data = pd.read_csv( 96 | "./data/cmu_stat_f2011/skill_id correct skill_id time_level time_level correct_large_.csv") 97 | 98 | temp_data = data[ 99 | ['correct', 'first_action', 'time_level', 'attempt_level', "time_level correct","skill_id time_level","skill_id correct"]] 100 | r1 = EntropyCls().get_sym_uncertity_matrix(temp_data) 101 | r2 = EntropyCls().get_coeff(temp_data) 102 | 103 | result = pd.concat([r1, r2]) 104 | result.to_csv('./result/cmu_stat_f2011/correlationship_add_3.csv') 105 | print(result) 106 | 107 | if __name__ == "__main__": 108 | #print_assistment2009() 109 | print_cmu() 110 | 111 |
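# --- Editor's addition (not part of the original repository): a minimal usage sketch for
# EntropyCls. It relies only on the class and imports defined above; the tiny DataFrame and
# its column names ('correct', 'time_level') are invented for illustration. The upper
# triangle of the first matrix holds SU(X,Y) = 2*IG(X|Y)/(H(X)+H(Y)); the second holds the
# absolute Pearson correlation. The function is not called automatically; run it by hand.
def _demo_symmetric_uncertainty():
    demo = pd.DataFrame({'correct':    [0, 1, 1, 0, 1, 1],
                         'time_level': [2, 1, 1, 2, 1, 3]})
    ec = EntropyCls()
    print(ec.get_sym_uncertity_matrix(demo))  # symmetric uncertainty between column pairs
    print(ec.get_coeff(demo))                 # |Pearson r| between column pairs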
-------------------------------------------------------------------------------- /code3_runEpoch.py: -------------------------------------------------------------------------------- 1 | ''' Unless stated otherwise, all software is provided free of charge. 2 | As well, all software is provided on an "as is" basis without warranty 3 | of any kind, express or implied. Under no circumstances and under no legal 4 | theory, whether in tort, contract, or otherwise, shall Liang Zhang be liable 5 | to you or to any other person for any indirect, special, incidental, 6 | or consequential damages of any character including, without limitation, 7 | damages for loss of goodwill, work stoppage, computer failure or malfunction, 8 | or for any and all other damages or losses. If you do not agree with these terms, 9 | then you are advised to not use the software.''' 10 | 11 | from uril_tools import * 12 | from sklearn.metrics import mean_squared_error 13 | from sklearn.metrics import r2_score 14 | from sklearn import metrics 15 | from math import sqrt 16 | import numpy as np 17 | import pyprind 18 | np.set_printoptions(threshold=np.inf) 19 | 20 | def get_evaluate_result(actual_labels, pred_prob): 21 | rmse = sqrt(mean_squared_error(actual_labels, pred_prob)) 22 | fpr, tpr, thresholds = metrics.roc_curve(actual_labels, pred_prob, pos_label=1) 23 | auc = metrics.auc(fpr, tpr) 24 | r2 = r2_score(actual_labels, pred_prob) 25 | return rmse,auc,r2 26 | 27 | def run_epoch(session, m, students, eval_op, verbose=False): 28 | pred_prob = [] 29 | actual_labels = [] # use for whole comparasion 30 | 31 | skill_id_origin_list = [] 32 | target_id_origin_list = [] 33 | iteration = int(len(students)/m.batch_size) 34 | 35 | for i_iter in pyprind.prog_percent(range(iteration)): 36 | #bar.update(m.batch_size) 37 | x = np.zeros((m.batch_size, m.num_steps, m.seq_width)) 38 | 39 | target_id = np.array([],dtype=np.int32) 40 | skill_id_origin = np.array([],dtype=np.int32) 41 | target_id_origin = np.array([],dtype=np.int32) 42 | target_correctness = [] # use for just a batch 43 | 44 | #load data for a batch 45 | # tuple formate 46 | # 0: user_id 47 | # 1: record_numb 48 | # 2: data 49 | # 3: Target_Id 50 | # 4: correctness 51 | for i_batch in range(m.batch_size): 52 | student = students[i_iter*m.batch_size+i_batch] 53 | record_num = student[1] 54 | #record_content_pd = student[2].reset_index(drop=True) 55 | record_content = student[2].as_matrix() 56 | temp_skill_id_list = list(student[2]['skill_id']) 57 | skill_id = student[3] 58 | correctness = student[4] 59 | 60 | # construct data for training: 61 | # data ~ x 62 | # target_id ~ skill_id 63 | # target_correctness ~ correctness 64 | for i_recordNumb in range(record_num): 65 | if(i_recordNumb predict_prob shape\t",np.shape(pred_prob),'\tactual_labels\t',np.shape(actual_labels),'\ttarget_id_list\t',np.shape(target_id_origin_list)) 112 | #print (target_id_origin_list[1:100]) 113 | intra_skill_actual = [] 114 | intra_skill_pred = [] 115 | 116 | inter_skill_actual = [] 117 | inter_skill_pred = [] 118 | 119 | for idx in np.arange(len(target_id_origin_list)): 120 | if skill_id_origin_list[idx]==target_id_origin_list[idx]: 121 | intra_skill_actual.append(actual_labels[idx]) 122 | intra_skill_pred.append(pred_prob[idx]) 123 | else: 124 | inter_skill_actual.append(actual_labels[idx]) 125 | inter_skill_pred.append(pred_prob[idx]) 126 | 127 | inter_rmse,inter_auc,inter_r2 = get_evaluate_result(inter_skill_actual, inter_skill_pred) 128 | intra_rmse,intra_auc,intra_r2 = 
get_evaluate_result(intra_skill_actual, intra_skill_pred) 129 | 130 | return rmse, auc, r2,inter_rmse,inter_auc,inter_r2,intra_rmse,intra_auc,intra_r2 131 | 132 | if __name__=="__main__": 133 | pass 134 | 135 | -------------------------------------------------------------------------------- /trainAutoEncoder.py: -------------------------------------------------------------------------------- 1 | ''' Unless stated otherwise, all software is provided free of charge. 2 | As well, all software is provided on an "as is" basis without warranty 3 | of any kind, express or implied. Under no circumstances and under no legal 4 | theory, whether in tort, contract, or otherwise, shall Liang Zhang be liable 5 | to you or to any other person for any indirect, special, incidental, 6 | or consequential damages of any character including, without limitation, 7 | damages for loss of goodwill, work stoppage, computer failure or malfunction, 8 | or for any and all other damages or losses. If you do not agree with these terms, 9 | then you are advised to not use the software.''' 10 | 11 | import tensorflow as tf 12 | import math 13 | import code0_parameter as code0 14 | import code1_data as code1 15 | from uril_oneHotEncoder import ONEHOTENCODERINPUT 16 | import pyprind, sys,os 17 | import numpy as np 18 | 19 | 20 | class SIMPLEAUTOENCODER(object): 21 | def __init__(self, SAEconfig, dp): 22 | self.inputs = tf.placeholder(tf.float32, [SAEconfig.batch_size, SAEconfig.num_steps, SAEconfig.seq_width]) 23 | ohe = ONEHOTENCODERINPUT(SAEconfig, dp, self.inputs) 24 | #self.mask = tf.placeholder(tf.float32, [SAEconfig.batch_size*SAEconfig.num_steps, SAEconfig.seq_width]) 25 | ############################################################################################################# 26 | #featureslist = [ohe.getSkillCorrectCrossFeature(),ohe.getCrossFeatureAll()] 27 | featureslist = [ohe.getSkillCorrectCrossFeature(), ohe.getCrossFeatureAll()]#, ohe.getCategoryFeatureInputs()] 28 | ############################################################################################################# 29 | x_tmp = tf.concat(2, featureslist) 30 | self.dp = dp 31 | self.x = x = tf.reshape(x_tmp, [SAEconfig.batch_size * SAEconfig.num_steps, -1]) 32 | self.dimensions = dimensions = [int(x.get_shape()[-1]), code0.TARGETSIZE] 33 | #xp = self.mask*x 34 | print("n_input\t", str(dimensions[0]), "\tn_output\t", str(dimensions[1])) 35 | W_init_max = 4 * np.sqrt(6. 
/ (dimensions[0] + dimensions[1])) 36 | W_init = tf.random_uniform(shape=dimensions, minval=-W_init_max,maxval=W_init_max) 37 | self.WE = WE = tf.Variable(W_init) 38 | self.bE = bE = tf.Variable(tf.zeros([dimensions[-1]])) 39 | if code0.AUTOENCODER_ACT == 'tanh': 40 | transfer_function = tf.nn.tanh 41 | elif code0.AUTOENCODER_ACT == 'sigmoid': 42 | transfer_function = tf.nn.sigmoid 43 | featureVector = transfer_function(tf.matmul(x, WE) + bE) 44 | 45 | self.WD = WD = tf.transpose(WE) 46 | #self.WD = WD = tf.Variable(tf.random_normal([self.dimensions[1],self.dimensions[0],], stddev=0.35)) 47 | self.bD = bD = tf.Variable(tf.zeros([dimensions[0]])) 48 | y = transfer_function(tf.matmul(featureVector, WD) + bD) 49 | #self.learning_rate = tf.placeholder(tf.float32,1) 50 | self.cost = cost = tf.reduce_sum(tf.square(y - x)) 51 | #self.optimizer = tf.train.GradientDescentOptimizer(SAEconfig.learning_rate).minimize(cost) 52 | self.optimizer = tf.train.AdamOptimizer(SAEconfig.learning_rate).minimize(cost) 53 | 54 | self.avgcost = tf.div(cost, tf.to_float(dimensions[0])) 55 | 56 | def saveWeights(self, sess): 57 | weigthpath = './weights/'+str(self.dp.dataSetType)+'/weights_' + str(self.dimensions[0]) + '_' + str(self.dimensions[1]) + '.csv' 58 | baispath = './weights/'+str(self.dp.dataSetType)+'/bias_' + str(self.dimensions[0]) + '_' + str(self.dimensions[1]) + '.csv' 59 | 60 | if os.path.exists(weigthpath): 61 | os.remove(weigthpath) 62 | if os.path.exists(baispath): 63 | os.remove(baispath) 64 | 65 | wt = self.WE.eval(sess) 66 | np.savetxt(weigthpath, wt) 67 | bs = self.bE.eval(sess) 68 | np.savetxt(baispath, bs) 69 | print("==> save weights to \t", os.path.dirname(weigthpath)) 70 | 71 | 72 | def run_ae_epoch(sess, model, data, TrainConfig): 73 | batch_number = int(len(data) / (TrainConfig.batch_size * TrainConfig.num_steps)) 74 | learning_rate = TrainConfig.learning_rate 75 | for i in pyprind.prog_percent(range(batch_number), stream=sys.stdout): 76 | x = np.zeros((TrainConfig.batch_size, TrainConfig.num_steps, TrainConfig.seq_width)) 77 | kindex = i * (TrainConfig.batch_size * TrainConfig.num_steps) 78 | for ip in range(TrainConfig.batch_size): 79 | for j in range(TrainConfig.num_steps): 80 | x[ip, j, :] = data.iloc[kindex] 81 | kindex += 1 82 | #mask_np = np.random.binomial(1, 1 - TrainConfig.corruption_level, [TrainConfig.batch_size * TrainConfig.num_steps,TrainConfig.seq_width]) 83 | learning_rate = learning_rate*TrainConfig.lr_decay 84 | if learning_rate<=TrainConfig.min_lr: 85 | learning_rate = TrainConfig.min_lr 86 | _ = sess.run(model.optimizer, feed_dict={model.inputs: x}) 87 | avgcost = sess.run(model.avgcost, feed_dict={model.inputs: x}) 88 | return avgcost 89 | 90 | 91 | def trainAEWeights(): 92 | if not code0.BASELINE: 93 | dp = code0.DatasetParameter() 94 | dataset, labels = code1.load_data(dp) 95 | 96 | dp.skill_num = len(dataset['skill_id'].unique()) + 1 97 | dp.skill_set = list(dataset['skill_id'].unique()) 98 | dp.columns_max, dp.columns_numb, dp.columnsName_to_index = code1.get_columns_info(dataset) 99 | dp.seq_width = len(dp.columnsName_to_index) 100 | 101 | 102 | SAEconfig = code0.SAEParamsConfig() 103 | SAEconfig.num_steps = 30 104 | SAEconfig.seq_width = dp.seq_width 105 | 106 | g = tf.Graph() 107 | with g.as_default(): 108 | model_autoencoder = SIMPLEAUTOENCODER(SAEconfig, dp) 109 | initializer = tf.random_uniform_initializer(-SAEconfig.init_scale, SAEconfig.init_scale) 110 | 111 | with tf.Session(graph=g) as sess: 112 | tf.initialize_all_variables().run() 113 | 114 | for i 
in range(SAEconfig.max_max_epoch): 115 | p = run_ae_epoch(sess, model_autoencoder, dataset, SAEconfig) 116 | print(str(i)+"/"+str(SAEconfig.max_max_epoch)+" epoch,avgcost ", str(p)) 117 | model_autoencoder.saveWeights(sess) 118 | else: 119 | print("BASELINE model, don't need train weights") 120 | 121 | if __name__ == "__main__": 122 | trainAEWeights() 123 | -------------------------------------------------------------------------------- /doTrainTest.py: -------------------------------------------------------------------------------- 1 | """ Code of deep knowledge tracing-assistment 2014-2015 dataset 2 | Reference: 3 | 1. https://github.com/siyuanzhao/2016-EDM/ 4 | 2. https://www.tensorflow.org/versions/0.6.0/tutorials/recurrent/index.html 5 | 3. https://github.com/tensorflow/tensorflow/blob/master/tensorflow/models/rnn/ptb/ptb_word_lm.py 6 | 4. https://github.com/Cospel/rbm-ae-tf 7 | 8 | Run code: 9 | 1. only set the hyperparameter in code0_params.py 10 | 2. train your autoencoder parameters 11 | python trainWeights.py 12 | 3. python doAll.py 13 | 14 | Environment: 15 | 1. ubuntu 14.04 16 | 2. python3 17 | 3. tensorflow : 0.10 18 | 4. cuda 7.5 19 | 5. GPU GTX1070 (8G) 20 | 6. CPU i5-6600k 21 | 7. RAM: 16G 22 | """ 23 | 24 | ''' 25 | Unless stated otherwise, all software is provided free of charge. 26 | As well, all software is provided on an "as is" basis without warranty 27 | of any kind, express or implied. Under no circumstances and under no legal 28 | theory, whether in tort, contract, or otherwise, shall Liang Zhang be liable 29 | to you or to any other person for any indirect, special, incidental, 30 | or consequential damages of any character including, without limitation, 31 | damages for loss of goodwill, work stoppage, computer failure or malfunction, 32 | or for any and all other damages or losses. 
If you do not agree with these terms, 33 | then you are advised to not use the software.''' 34 | 35 | from __future__ import print_function 36 | 37 | import code0_parameter as code0 38 | import code1_data as code1 39 | import code2_model as code2 40 | import code3_runEpoch as code3 41 | import uril_tools as aux 42 | import tensorflow as tf 43 | import numpy as np 44 | import pandas as pd 45 | import datetime 46 | from trainAutoEncoder import trainAEWeights 47 | 48 | np.set_printoptions(threshold=np.inf) 49 | 50 | 51 | def main(unused_args): 52 | aux.check_directories() 53 | 54 | if not code0.BASELINE and code0.AUTOENCODER_LABEL: 55 | trainAEWeights() 56 | 57 | dp = code0.DatasetParameter() 58 | dataset, labels = code1.load_data(dp) 59 | tuple_data = code1.convert_data_labels_to_tuples(dataset, labels) 60 | 61 | skill_num = len(dataset['skill_id'].unique()) + 1 62 | dp.skill_num = skill_num 63 | dp.skill_set = list(dataset['skill_id'].unique()) 64 | dp.columns_max, dp.columns_numb, dp.columnsName_to_index = code1.get_columns_info(dataset) 65 | dp.seq_width = len(dp.columnsName_to_index) 66 | 67 | print("-" * 50, "\ndp.columns_max\n", dp.columns_max, "\n") 68 | print("-" * 50, "\ndp.columns_numb\n", dp.columns_numb, "\n") 69 | print("-" * 50, "\ndp.columnsName_to_index\n", dp.columnsName_to_index, "\n") 70 | 71 | config = code0.ModelParamsConfig(dp) 72 | eval_config = code0.ModelParamsConfig(dp) 73 | 74 | if dp.dataSetType == 'kdd': 75 | config.num_steps = 1500 76 | elif dp.dataSetType == 'cmu_stat_f2011': 77 | config.num_steps = 1500 78 | else: 79 | config.num_steps = aux.get_num_step(dataset) 80 | 81 | eval_config.num_steps = config.num_steps 82 | eval_config.batch_size = 2 83 | 84 | config.skill_num = skill_num 85 | eval_config.skill_num = config.skill_num 86 | 87 | name_list = ['cv', 'epoch', 'type', 'rmse', 'auc', 'r2', 'inter_rmse', 'inter_auc', 'inter_r2', 'intra_rmse', 88 | 'intra_auc', 'intra_r2'] 89 | result_data = pd.DataFrame(columns=name_list) 90 | CVname = ['c1', 'c2', 'c3', 'c4', 'c5'] 91 | size = len(tuple_data) 92 | 93 | # write all the records to log file 94 | aux.printConfigration(config=config, dp=dp, train_numb=int(size * 0.8), test_numb=int(size * 0.2)) 95 | aux.logwrite(["==> model_continues_columns\n" + ','.join(dp.model_continues_columns)], dp, True) 96 | aux.logwrite(["==> model_category_columns\n" + ','.join(dp.model_category_columns)], dp, True) 97 | str_cross_columns_list = ['-'.join(i) for i in dp.model_cross_columns] 98 | str_cross_columns = ','.join(str_cross_columns_list) 99 | aux.logwrite(["==> model_cross_columns\n" + str_cross_columns], dp, True) 100 | 101 | for index, cv_num_name in enumerate(CVname): 102 | aux.logwrite(["\nCross-validation: \t" + str(index + 1) + "/5"], dp, prt=True) 103 | timeStampe = datetime.datetime.now().strftime("%m-%d-%H:%M") 104 | aux.logwrite(["\ntime:\t" + timeStampe], dp) 105 | 106 | train_tuple_rows = tuple_data[:int(index * 0.2 * size)] + tuple_data[int((index + 1) * 0.2 * size):] 107 | test_tuple_rows = tuple_data[int(index * 0.2 * size): int((index + 1) * 0.2 * size)] 108 | 109 | with tf.Graph().as_default(), tf.Session() as session: 110 | initializer = tf.random_uniform_initializer(-config.init_scale, config.init_scale) 111 | # training model 112 | print("\n==> Load Training model") 113 | with tf.variable_scope("model", reuse=None, initializer=initializer): 114 | m = code2.Model(is_training=True, config=config, dp=dp) 115 | # testing model 116 | print("\n==> Load Testing model") 117 | with tf.variable_scope("model", 
reuse=True, initializer=initializer): 118 | mtest = code2.Model(is_training=False, config=eval_config, dp=dp) 119 | 120 | tf.initialize_all_variables().run() 121 | 122 | print("==> begin to run epoch...") 123 | for i in range(config.max_max_epoch): 124 | lr_decay = config.lr_decay ** max(i - config.max_epoch, 0) 125 | m.assign_lr(session, config.learning_rate * lr_decay) 126 | 127 | rt = session.run(m.lr) 128 | rmse, auc, r2, inter_rmse, inter_auc, inter_r2, intra_rmse, intra_auc, intra_r2 = code3.run_epoch( 129 | session, m, train_tuple_rows, m.train_op, verbose=True) 130 | 131 | aux.print_result(dp, cv_num_name, i, rt, rmse, auc, r2, inter_rmse, inter_auc, inter_r2, intra_rmse, 132 | intra_auc, intra_r2, 'train') 133 | 134 | result_data = result_data.append(pd.Series( 135 | [cv_num_name, i, 'train', rmse, auc, r2, inter_rmse, inter_auc, inter_r2, intra_rmse, intra_auc, 136 | intra_r2], index=name_list), ignore_index=True) 137 | 138 | display = 5 139 | if ((i + 1) % display == 0): 140 | print('BEGIN', "-" * 80) 141 | rmse, auc, r2, inter_rmse, inter_auc, inter_r2, intra_rmse, intra_auc, intra_r2 = code3.run_epoch( 142 | session, mtest, test_tuple_rows, tf.no_op()) 143 | aux.print_result(dp, cv_num_name, i, rt, rmse, auc, r2, inter_rmse, inter_auc, inter_r2, intra_rmse, 144 | intra_auc, intra_r2, 'test', display) 145 | print('END--', "-" * 80) 146 | 147 | result_data = result_data.append(pd.Series( 148 | [cv_num_name, (i + 1) / display, 'test', rmse, auc, r2, inter_rmse, inter_auc, inter_r2, 149 | intra_rmse, intra_auc, intra_r2], index=name_list), ignore_index=True) 150 | 151 | #print ("-*"*50,"\n",result_data) 152 | 153 | print("==> Finsih! whole process, save result and print\t" + dp.currentTime) 154 | 155 | temp_data = result_data[result_data['type'] == 'test'] 156 | for idx in set(temp_data['epoch']): 157 | tp = temp_data[temp_data['epoch'] == idx] 158 | result_data = result_data.append(pd.Series( 159 | ['average', idx, 'test_mean', tp['rmse'].mean(), tp['auc'].mean(), tp['r2'].mean(), tp['inter_rmse'].mean(), 160 | tp['inter_auc'].mean(), tp['inter_r2'].mean(), tp['intra_rmse'].mean(), tp['intra_auc'].mean(), 161 | tp['intra_r2'].mean()], index=name_list), ignore_index=True) 162 | 163 | print(result_data[result_data['cv']=='average']) 164 | result_data.to_csv('./result/'+code0.DATASETTYPE+'/result_'+timeStampe+'.csv') 165 | print('==> save to ./result/'+code0.DATASETTYPE+'/result_'+timeStampe+'.csv') 166 | 167 | 168 | if __name__ == "__main__": 169 | tf.app.run() 170 | -------------------------------------------------------------------------------- /uril_tools.py: -------------------------------------------------------------------------------- 1 | ''' Unless stated otherwise, all software is provided free of charge. 2 | As well, all software is provided on an "as is" basis without warranty 3 | of any kind, express or implied. Under no circumstances and under no legal 4 | theory, whether in tort, contract, or otherwise, shall Liang Zhang be liable 5 | to you or to any other person for any indirect, special, incidental, 6 | or consequential damages of any character including, without limitation, 7 | damages for loss of goodwill, work stoppage, computer failure or malfunction, 8 | or for any and all other damages or losses. 
If you do not agree with these terms, 9 | then you are advised to not use the software.''' 10 | 11 | import numpy as np 12 | import os, sys, csv 13 | from code0_parameter import DATASETSIZE, CELLTYPE 14 | import tensorflow as tf 15 | import pandas as pd 16 | import pylab as pl 17 | 18 | 19 | def print_result(dp, cv_num_name, i, rt, rmse, auc, r2, inter_rmse, inter_auc, inter_r2, intra_rmse, intra_auc, 20 | intra_r2, run_type, display=5): 21 | if run_type == 'train': 22 | result = "==> %s cross-valuation: Train Epoch: %d \trate: %.3f \tRMSE: %.3f \tAUC: %.3f \tR2: %.3f" % ( 23 | cv_num_name, i + 1, rt, rmse, auc, r2) 24 | else: 25 | result = "==> %s cross-valuation: Test Epoch: %d \t rmse: %.3f \t auc: %.3f \t r2: %.3f" % ( 26 | cv_num_name, (i + 1) / display, rmse, auc, r2) 27 | 28 | print(result) 29 | logwrite(result, dp, False) 30 | 31 | inter_result = "==> inter_skill\t RMSE: %.3f \tAUC: %.3f \tR2: %.3f" % (inter_rmse, inter_auc, inter_r2) 32 | print(inter_result) 33 | logwrite(inter_result, dp, False) 34 | 35 | intra_result = "==> intra_skill\t RMSE: %.3f \tAUC: %.3f \tR2: %.3f" % (intra_rmse, intra_auc, intra_r2) 36 | print(intra_result) 37 | logwrite(intra_result, dp, False) 38 | 39 | 40 | def check_directories(): 41 | par_dir = ['result', 'data','weights'] 42 | datasets_dir = ['assistment2009', 'kdd', 'cmu_stat_f2011'] 43 | 44 | print('==> check directories') 45 | 46 | for p_item in par_dir: 47 | if not os.path.exists('./' + p_item): 48 | os.mkdir('./' + p_item) 49 | print('==> create directory ./' + p_item) 50 | else: 51 | print('==> directory: ./' + p_item + ' exists') 52 | for c_item in datasets_dir: 53 | if not os.path.exists('./' + p_item + '/' + c_item): 54 | os.mkdir('./' + p_item + '/' + c_item) 55 | print('==> create directory: ./' + p_item + '/' + c_item) 56 | else: 57 | print('==> directory ./' + p_item + '/' + c_item + ' exists') 58 | 59 | 60 | def counter(a): 61 | a = list(a) 62 | unique, counts = np.unique(a, return_counts=True) 63 | return unique, counts 64 | 65 | 66 | def create_column_dict_and_set(data, columnName, dp): 67 | setName = os.path.dirname(dp.csv_file_name) + "/" + columnName + "_set_" + str(dp.dataSetSize) + ".csv" 68 | dictName = os.path.dirname(dp.csv_file_name) + "/" + columnName + "_dict_" + str(dp.dataSetSize) + ".csv" 69 | column_ct = data[columnName] 70 | column_set_original = list(column_ct.unique()) 71 | size = len(column_set_original) 72 | column_dict = {value: key + 1 for key, value in enumerate(column_set_original)} 73 | column_dict[0] = 0 74 | column_set = [i + 1 for i in range(size)] 75 | 76 | with open(setName, 'w') as f: 77 | w = csv.writer(f) 78 | w.writerow(column_set) 79 | print('==> save ', setName) 80 | with open(dictName, 'w') as f: 81 | w = csv.writer(f) 82 | for key, val in column_dict.items(): 83 | w.writerow([key, val]) 84 | print('==> save ', dictName) 85 | return column_set, column_dict 86 | 87 | 88 | def stastic_SecNumber_UserNumber_SkillNumber(data, dp): 89 | secNumber = len(getUserQuesNumList(data['user_id'])) 90 | userNumber = len(data['user_id'].unique()) 91 | skillNumber = len(data['skill_id'].unique()) 92 | 93 | secNumberStr = "SecNumber {:>10}\n".format(secNumber) 94 | userNumberStr = "userNumber {:>10}\n".format(userNumber) 95 | skillNumberStr = "skillNumber {:>10}\n".format(skillNumber) 96 | 97 | logwrite([secNumberStr, userNumberStr, skillNumberStr], dp, True) 98 | return secNumber, userNumber, skillNumber 99 | 100 | 101 | def mean_normalization(X_train, X_test): 102 | data = np.concatenate((X_train, X_test), 
axis=0) 103 | mean = data.mean(axis=0) 104 | std = data.std(axis=0) 105 | return (X_train - mean) / std, (X_test - mean) / std 106 | 107 | 108 | def xavier_init(fan_in, fan_out, function): 109 | if function is tf.nn.sigmoid: 110 | low = -4.0 * np.sqrt(6.0 / (fan_in + fan_out)) 111 | high = 4.0 * np.sqrt(6.0 / (fan_in + fan_out)) 112 | return tf.random_uniform((fan_in, fan_out), minval=low, maxval=high, dtype=tf.float32) 113 | elif function is tf.nn.tanh: 114 | low = -1 * np.sqrt(6.0 / (fan_in + fan_out)) 115 | high = 1 * np.sqrt(6.0 / (fan_in + fan_out)) 116 | return tf.random_uniform((fan_in, fan_out), minval=low, maxval=high, dtype=tf.float32) 117 | 118 | 119 | def getUserQuesNumList(dataList): 120 | a = list(dataList) 121 | target = np.empty((0, 2)) 122 | size = len(a) 123 | temp = [a[0], 1] 124 | for i in range(1, size): 125 | if a[i] == a[i - 1]: 126 | temp[1] += 1 127 | else: 128 | target = np.vstack((target, temp)) 129 | temp = [a[i], 1] 130 | return np.vstack((target, temp)) 131 | 132 | 133 | def connectStringfromList(klist): 134 | if type(klist) != list: 135 | raise ValueError("only convert list") 136 | tmp = '' 137 | for i, v in enumerate(klist): 138 | if i == 0: 139 | tmp = klist[i] 140 | else: 141 | tmp = tmp + " " + klist[i] 142 | return tmp 143 | 144 | 145 | def unique_rows(a): 146 | a = np.ascontiguousarray(a) 147 | unique_a = np.unique(a.view([('', a.dtype)] * a.shape[1])) 148 | return unique_a.view(a.dtype).reshape((unique_a.shape[0], a.shape[1])) 149 | 150 | 151 | def get_num_step(dataset): 152 | u, c = counter(dataset['user_id']) 153 | return max(c) 154 | 155 | 156 | def logwrite(strList, dp, prt=False): 157 | logfileName = "result/" + str(dp.dataSetType) + "/log_" + str(dp.dataSetType) + "_" + str( 158 | dp.currentTime) + "_" + str(CELLTYPE) + "_" + str(DATASETSIZE) + ".txt" 159 | 160 | for item in strList: 161 | with open(logfileName, "a") as myfile: 162 | myfile.write(str(item)) 163 | if prt: 164 | print(item) 165 | 166 | 167 | def printConfigration(config, dp, train_numb, test_numb): 168 | l1 = "\n" + "-" * 15 + " Configuration " + "-" * 15 169 | l11 = "DataSet {:>10}".format(dp.dataSetType) 170 | l2 = "RNN layers {:>10}".format(config.num_layer) 171 | l3 = "cell type {:>10}".format(config.cell_type) 172 | l4 = "hidden_size {:>10}".format(config.hidden_size) 173 | 174 | if config.num_layer == 2: 175 | l41 = "hidden_size2 {:>10}".format(config.hidden_size_2) 176 | logwrite([l1, l11, l2, l3, l4, l41], dp=dp, prt=True) 177 | else: 178 | logwrite([l1, l11, l2, l3, l4], dp=dp, prt=True) 179 | l5 = "keep_prob {:>10}".format(config.keep_prob) 180 | l6 = "num_steps {:>10}".format(config.num_steps) 181 | l7 = "seq_width {:>10}".format(len(dp.columnsName_to_index)) 182 | l8 = "skill_num {:>10}".format(config.skill_num) 183 | l9 = "skill_id_one_hot {:>10}".format(dp.columns_max['skill_id'] + 1) 184 | l10 = "max_max_epoch {:>10}".format(config.max_max_epoch) 185 | l11 = "batch_size {:>10}".format(config.batch_size) 186 | l12 = "train student number{:>10}".format(train_numb) 187 | l13 = "test student number {:>10}".format(test_numb) 188 | l14 = "-" * 20 + " End " + "-" * 20 + "\n" 189 | logwrite([l5, l6, l7, l8, l9, l10, l11, l12, l13, l14], dp=dp, prt=True) 190 | 191 | 192 | def saveResult(dp, auc_train, rmse_train, r2_train, auc_test, rmse_test, r2_test, mean_result): 193 | print("==> save the result\t", str(dp.currentTime)) 194 | auc_train.to_csv("result/" + str(dp.dataSetType) + "/auc_train_" + str(dp.currentTime) + ".csv") 195 | rmse_train.to_csv("result/rmse_train_" + 
str(dp.currentTime) + ".csv") 196 | r2_train.to_csv("result/" + str(dp.dataSetType) + "/r2_train_" + str(dp.currentTime) + ".csv") 197 | 198 | auc_test.to_csv("result/" + str(dp.dataSetType) + "/auc_test_" + str(dp.currentTime) + ".csv") 199 | rmse_test.to_csv("result/" + str(dp.dataSetType) + "/rmse_test_" + str(dp.currentTime) + ".csv") 200 | r2_test.to_csv("result/" + str(dp.dataSetType) + "/r2_test_" + str(dp.currentTime) + ".csv") 201 | 202 | mean_result.to_csv("result/" + str(dp.dataSetType) + "/Mean_" + str(dp.currentTime) + ".csv") 203 | 204 | 205 | def draw_hist_graph(data_list, title, bins): 206 | pl.hist(data_list, bins=bins) 207 | pl.xlabel(title) 208 | pl.show() 209 | 210 | 211 | if __name__ == "__main__": 212 | pass 213 | -------------------------------------------------------------------------------- /code0_parameter.py: -------------------------------------------------------------------------------- 1 | ''' Unless stated otherwise, all software is provided free of charge. 2 | As well, all software is provided on an "as is" basis without warranty 3 | of any kind, express or implied. Under no circumstances and under no legal 4 | theory, whether in tort, contract, or otherwise, shall Liang Zhang be liable 5 | to you or to any other person for any indirect, special, incidental, 6 | or consequential damages of any character including, without limitation, 7 | damages for loss of goodwill, work stoppage, computer failure or malfunction, 8 | or for any and all other damages or losses. If you do not agree with these terms, 9 | then you are advised to not use the software.''' 10 | 11 | import tensorflow as tf 12 | import datetime 13 | 14 | # Hyperparameter for all kinds of file 15 | DATASETTYPE = 'assistment2009' # 'assistment2009'|'cmu_stat_f2011' 16 | if DATASETTYPE == 'cmu_stat_f2011': 17 | TARGETSIZE = 250 18 | elif DATASETTYPE == 'assistment2009': 19 | TARGETSIZE = 1000 20 | AUTOENCODER_ACT = 'tanh' # tanh, sigmoid 21 | CONNECT_DATASET_2009 = True 22 | 23 | DATASETSIZE = "large" # 'large | small' 24 | RNN_layer_number = 1 # '1|2' 25 | CELLTYPE = "LSTM" # "RNN | LSTM | GRU" 26 | 27 | BASELINE = True 28 | AUTOENCODER_LABEL = False 29 | 30 | 31 | class DatasetParameter(object): 32 | def __init__(self, data_type=DATASETTYPE): 33 | if data_type == DATASETTYPE: 34 | self.dataSetType = DATASETTYPE # "assistment2009 | cmu_stat_f2011 " 35 | else: 36 | self.dataSetType = data_type 37 | self.dataSetSize = DATASETSIZE # 'small |large' 38 | 39 | if self.dataSetType == "assistment2009": 40 | self.csv_file_name = "./data/assistment2009/skill_builder_data_corrected.csv" 41 | if CONNECT_DATASET_2009: 42 | self.processedFileName = "./data/assistment2009/processded_" + str(self.dataSetSize) + "_connected.csv" 43 | else: 44 | self.processedFileName = "./data/assistment2009/processded_" + str( 45 | self.dataSetSize) + "_nonconnected.csv" 46 | 47 | self.filtedColumnNameList = ['skill_id', 'user_id', 'original', 'correct', 'attempt_count', 'time', 48 | 'hint_count', 'problem_id', 'first_action', 'template_id', 'opportunity'] 49 | self.connect_dataset = CONNECT_DATASET_2009 50 | self.connect_file_name = "./data/assistment2009/connected_" + str(self.dataSetSize) + ".csv" 51 | self.time_z_level = 'skill_id' 52 | self.time_threshold = 400 53 | self.time_interval = 0.05 54 | self.attemp_max = 10 55 | self.correct_boundary_list = [0.5, 0.7] 56 | self.time_boundary_list = [-0.8, -0.6, 0] 57 | 58 | elif self.dataSetType == "cmu_stat_f2011": 59 | self.csv_file_name = "./data/cmu_stat_f2011/cmu.txt" 60 | 
self.filtedColumnNameList = ['time', 'correct', 'skill_id', 'step_id', 'problem_id', 'user_id', 61 | 'Level (Unit)', 'Level (Module)',"first_action", "attempt_level"] 62 | 63 | elif self.dataSetType == "kdd": 64 | self.csv_file_name = "data/kdd/algebra_2005_2006_train.txt" 65 | self.processedFileName = "data/kdd/processded_" + str(self.dataSetSize) + ".csv" 66 | self.filtedColumnNameList = ['skill_id', 'user_id', 'correct', 'time', 'hint_count', 'problem_view'] 67 | # 'step_id','unit_id','problem_id','incorrect','correct_num','opportunity' 68 | else: 69 | raise ValueError("check DATASETTYPE") 70 | 71 | self.currentTime = datetime.datetime.now().strftime("%m-%d-%H:%M") 72 | 73 | if self.dataSetType == "assistment2009": 74 | ##config 75 | self.dataset_columns_for_cross_feature = [['skill_id', 'correct'], ['first_action', 'correct'], 76 | ['time_level', 'correct'], ['attempt_level', 'correct'], 77 | ['first_action', 'time_level', 'correct'],['skill_id', 'time_level'], 78 | ['attempt_level', 'time_level', 'correct'], ] 79 | self.model_continues_columns = ["time", "hint_count", "attempt_count"] 80 | self.model_category_columns = ["first_action", "time_level", "attempt_level"] 81 | self.model_cross_columns = [['skill_id', 'time_level'],['time_level', 'correct']] # "the continues data columns needed to consider" 82 | elif self.dataSetType == 'cmu_stat_f2011': 83 | self.dataset_columns_for_cross_feature = [['skill_id', 'correct'], ['skill_id', 'time_level'],['time_level', 'correct']] 84 | self.model_continues_columns = ["time"] 85 | self.model_category_columns = ["first_action", "time_level", "attempt_level"] 86 | self.model_cross_columns = [['time_level', 'correct']] # "the continues data columns needed to consider" 87 | 88 | elif self.dataSetType == 'kdd': 89 | self.dataset_columns_for_cross_feature = [['skill_id', 'correct'], ['time_level', 'correct']] 90 | self.model_continues_columns = ["time", "hint_count", "problem_view"] 91 | self.model_category_columns = ["time", "hint_count", "problem_view"] 92 | self.model_cross_columns = [['time_level', 'correct']] # "the continues data columns needed to consider" 93 | 94 | if [['skill_id', 'correct']] in self.model_cross_columns: 95 | self.model_cross_columns.remove(['skill_id', 'correct']) 96 | elif [['correct', 'skill_id']] in self.model_cross_columns: 97 | self.model_cross_columns.remove(['correct', 'skill_id']) 98 | 99 | self.dataset_columns_for_cross_feature = self.__sortList(self.dataset_columns_for_cross_feature) 100 | self.model_cross_columns = self.__sortList(self.model_cross_columns) 101 | for items in self.model_cross_columns: 102 | if items not in self.dataset_columns_for_cross_feature: 103 | raise ValueError('model_cross_columns must in dataset_columns_for_cross_feature') 104 | for item in items: 105 | if item not in self.filtedColumnNameList + ['skill_id'] + ['time_level'] + ['attempt_level']: 106 | raise ValueError(item, " not in filtedColumnNameList") 107 | # need to change value 108 | self.columnsName_to_index = {} 109 | self.columns_max = {} 110 | self.columns_numb = {} 111 | self.seq_width = 0 112 | self.skill_num = 0 113 | 114 | def __sortList(self, listName): 115 | return sorted(listName) 116 | 117 | def convertCrossCoumnsToNameList(self, Flag=True): 118 | if Flag: 119 | mcu = self.dataset_columns_for_cross_feature 120 | else: 121 | mcu = self.model_cross_columns 122 | crossFeatureNameList = [] 123 | if len(mcu) != 0: 124 | for index_ccl, crossColumnsList in enumerate(mcu): 125 | crossFeatureName = '' 126 | if 
len(set(crossColumnsList)) <= 1: 127 | raise ValueError("need two different feature at least ") 128 | 129 | for index_cc, crossColumn in enumerate(crossColumnsList): 130 | if index_cc == 0: 131 | crossFeatureName = crossColumn 132 | else: 133 | crossFeatureName = crossFeatureName + " " + crossColumn 134 | crossFeatureNameList.append(crossFeatureName) 135 | return crossFeatureNameList 136 | 137 | 138 | class autoencoderParameter(object): 139 | def __init__(self): 140 | self.epoch_rbm = 10 141 | self.epoch_autoencoder = 10 142 | self.batch_size = 50 143 | self.num_steps = 100 144 | 145 | 146 | class SAEParamsConfig(object): 147 | def __init__(self): 148 | self.learning_rate = 0.005 149 | self.min_lr = 0.0001 150 | self.lr_decay = 0.98 151 | self.layer_num = 1 152 | self.init_scale = 0.05 153 | self.target_size = TARGETSIZE 154 | self.max_max_epoch = 5 155 | self.display_step = 1 156 | 157 | self.batch_size = 300 158 | self.num_steps = 0 # need to resign value of time stampes 159 | self.seq_width = 0 # need to resign value 160 | 161 | 162 | # Parameter for RNN 163 | class ModelParamsConfig(object): 164 | def __init__(self, dp): 165 | self.num_steps = 0 # need to resign value of time stampes 166 | self.skill_num = 0 # need to resign value of skill number 167 | self.seq_width = 0 # need to resign value 168 | if dp.dataSetType == 'kdd': 169 | self.batch_size = 5 170 | elif dp.dataSetType == 'cmu_stat_f2011': 171 | self.batch_size = 10 172 | else: 173 | self.batch_size = 30 174 | self.max_max_epoch = 40 175 | self.num_layer = RNN_layer_number 176 | self.cell_type = CELLTYPE # "RNN | LSTM | GRU" 177 | self.hidden_size = 200 178 | self.hidden_size_2 = 150 179 | 180 | self.init_scale = 0.05 181 | self.learning_rate = 0.05 182 | self.max_grad_norm = 4 183 | self.max_epoch = 5 184 | self.keep_prob = 0.6 185 | self.lr_decay = 0.9 186 | self.momentum = 0.95 187 | self.min_lr = 0.0001 188 | 189 | 190 | if __name__ == "__main__": 191 | param_ass = DatasetParameter() 192 | print(param_ass.convertCrossCoumnsToNameList()) 193 | -------------------------------------------------------------------------------- /code1_data.py: -------------------------------------------------------------------------------- 1 | ''' Unless stated otherwise, all software is provided free of charge. 2 | As well, all software is provided on an "as is" basis without warranty 3 | of any kind, express or implied. Under no circumstances and under no legal 4 | theory, whether in tort, contract, or otherwise, shall Liang Zhang be liable 5 | to you or to any other person for any indirect, special, incidental, 6 | or consequential damages of any character including, without limitation, 7 | damages for loss of goodwill, work stoppage, computer failure or malfunction, 8 | or for any and all other damages or losses. 
If you do not agree with these terms, 9 | then you are advised to not use the software.''' 10 | 11 | import math 12 | import os 13 | import pyprind 14 | import random 15 | import sys 16 | 17 | import numpy as np 18 | import pandas as pd 19 | 20 | import code0_parameter as code0 21 | import uril_assistment2009 22 | import uril_cmu_statistic 23 | import uril_tools as aux 24 | 25 | 26 | 27 | def create_label_and_delete_last_one(dp): 28 | dataFileName = os.path.dirname(dp.csv_file_name) + "/dataset_" + str(dp.dataSetSize) + ".csv" 29 | labelFileName = os.path.dirname(dp.csv_file_name) + "/labels_" + str(dp.dataSetSize) + ".csv" 30 | 31 | if os.path.exists(dataFileName) and os.path.exists(labelFileName): 32 | dataset = pd.read_csv(dataFileName) 33 | labels = pd.read_csv(labelFileName) 34 | print('==> ', dataFileName, " exists,load directly") 35 | print('==> ', labelFileName, " exists,load directly") 36 | return dataset, labels 37 | 38 | if dp.dataSetType == "assistment2009": 39 | data = uril_assistment2009.read_asssistment2009_data_from_csv(dp) 40 | elif dp.dataSetType == "kdd": 41 | data = uril_kdd.read_kdd_data_from_csv(dp) 42 | elif dp.dataSetType == "cmu_stat_f2011": 43 | data = uril_cmu_statistic.read_data_from_csv() 44 | 45 | userID_Quest_number_matrix = aux.getUserQuesNumList(data['user_id']) # user_id: number of questions 46 | print("==> creat skill_id+label, last record of every user is deleted") 47 | print("==> delete user whose problem number is less than 2") 48 | row_size = len(data); 49 | index = 0 50 | kindex = 0 51 | dataset = pd.DataFrame() 52 | labels = pd.DataFrame() 53 | 54 | bar = pyprind.ProgPercent(row_size, stream=sys.stdout) 55 | while (index < row_size): 56 | id_number = userID_Quest_number_matrix[kindex, 1] 57 | if id_number > 2: 58 | dataTemp = data.loc[index:index + id_number - 2] 59 | labeTemp = pd.DataFrame({'user_id': int(data.loc[index, 'user_id']), 60 | 'label_skill_id': data.loc[index + 1:index + id_number - 1, "skill_id"], 61 | 'label_correct': data.loc[index + 1:index + id_number - 1, "correct"]}) 62 | assert len(dataTemp) == len(labeTemp) 63 | dataset = dataset.append(dataTemp) 64 | labels = labels.append(labeTemp) 65 | del dataTemp, labeTemp 66 | bar.update(id_number) 67 | index += id_number 68 | kindex += 1 69 | dataset = dataset.reset_index(drop=True) 70 | labels = labels.reset_index(drop=True) 71 | 72 | if os.path.exists(dataFileName): os.remove(dataFileName) 73 | if os.path.exists(labelFileName): os.remove(labelFileName) 74 | dataset.to_csv(dataFileName, index=False) 75 | labels.to_csv(labelFileName, index=False) 76 | print("==> save ", dataFileName) 77 | print("==> save ", labelFileName) 78 | 79 | assert len(dataset) == len(labels), "dateset size\t" + str(len(dataset)) + "\tlabels size\t" + str(len(labels)) 80 | return dataset, labels 81 | 82 | 83 | def convert_data_labels_to_tuples(dataset, labels): 84 | index = 0 85 | kindex = 0 86 | tuple_rows = [] 87 | userID_Quest_number_matrix = aux.getUserQuesNumList(dataset['user_id']) 88 | print("==> convert data and labels to tuples") 89 | # tuple formate 90 | # 0: user_id 91 | # 1: record_numb 92 | # 2: data 93 | # 3: Target_Id 94 | # 4: correctness 95 | dataset_size = len(dataset) 96 | bar = pyprind.ProgPercent(dataset_size, stream=sys.stdout) 97 | while index < dataset_size: 98 | numb = int(userID_Quest_number_matrix[kindex, 1]) 99 | assert int(userID_Quest_number_matrix[kindex, 0]) == int(dataset.loc[index, "user_id"]) 100 | tup = (dataset.loc[index, "user_id"], numb, dataset.iloc[index:index + numb], 101 
| list(labels.loc[index:index + numb - 1, "label_skill_id"]), 102 | # the input is a list but not pd.DataFrame, don't need to reset the index. 103 | list(labels.loc[index:index + numb - 1, "label_correct"])) 104 | # pd.DataFrame, loc and iloc cut differentsize! 105 | tuple_rows.append(tup) 106 | index += numb 107 | kindex += 1 108 | bar.update(numb) 109 | random.shuffle(tuple_rows) 110 | return tuple_rows 111 | 112 | 113 | def get_columns_info(dataset): 114 | columns_max = {} 115 | columns_numb = {} 116 | columnsName_to_index = {} 117 | for i, column_name in enumerate(dataset.columns): 118 | try: 119 | columns_max[column_name] = max(dataset[column_name]) 120 | columns_numb[column_name] = len(dataset[column_name].unique()) 121 | columnsName_to_index[column_name] = i 122 | except: 123 | print(dataset.columns) 124 | print(np.shape(dataset)) 125 | print(dataset[column_name]) 126 | raise ValueError(column_name) 127 | 128 | return columns_max, columns_numb, columnsName_to_index 129 | 130 | 131 | def add_cross_feature_to_dataset(dataset, dp): 132 | if len(dp.dataset_columns_for_cross_feature) == 0: 133 | print("==> no need to add cross feature to dataset") 134 | return dataset 135 | else: 136 | print("==> add cross feature to dataset") 137 | columns_max, columns_numb, _ = get_columns_info(dataset) 138 | d_size = len(dataset) 139 | for item in dp.dataset_columns_for_cross_feature: 140 | print("==> add", aux.connectStringfromList(item)) 141 | temp = [] 142 | for i in pyprind.prog_percent(range(d_size), stream=sys.stdout, title=item): 143 | if len(item) == 2: 144 | 145 | value = dataset.loc[i, item[0]] + dataset.loc[i, item[1]] * (columns_max[item[0]] + 1) 146 | #print(" dataset.loc[i, item[0]]\t", dataset.loc[i, item[0]], "\tdataset.loc[i, item[1]]\t", 147 | # dataset.loc[i, item[1]], "\t(columns_max[item[0]] + 1)\t",(columns_max[item[0]] + 1), 148 | # "\tvalue\t", value) 149 | elif len(item) == 3: 150 | value = dataset.loc[i, item[0]] + dataset.loc[i, item[1]] * (columns_max[item[0]] + 1) + \ 151 | dataset.loc[i, item[2]] * (columns_max[item[0]] + 1) * (columns_max[item[1]] + 1) 152 | else: 153 | raise ValueError('cross features only support 3 at most') 154 | temp.append(value) 155 | dataset[aux.connectStringfromList(item)] = temp 156 | return dataset 157 | 158 | 159 | # only for assistment 2009 and 2014 data 160 | def normalization_continues_data(data): 161 | print('==> normalize continues data') 162 | columns_name_list = ["attempt_count", "time", "hint_count"] 163 | data = data.reset_index(drop=True) 164 | 165 | size = len(data) 166 | for column_name in columns_name_list: 167 | if column_name == "time": 168 | bins = [-1, 60, 300, 1200, 3600, 60000000] 169 | data[column_name] = pd.cut(data[column_name], bins, labels=False) 170 | tmpList = [] 171 | 172 | for i in pyprind.prog_percent(range(size), stream=sys.stdout, title=column_name): 173 | try: 174 | tmp = int(data.loc[i, column_name]) 175 | except: 176 | tmp = 0 177 | # raise ValueError(str(data.loc[i, column_name])+"_"+str(i)) 178 | tmpList.append(math.log((tmp + 2), 6)) 179 | data['time_normal'] = tmpList 180 | elif column_name == "attempt_count": 181 | bins = [-10, 1, 20, 100, 40000] 182 | data[column_name] = pd.cut(data[column_name], bins, labels=False) 183 | data[column_name] += 1 184 | tmpList = [] 185 | 186 | for i in pyprind.prog_percent(range(size), stream=sys.stdout, title=column_name): 187 | # print ("attempt_count\t",str(i)) 188 | tmp = int(data.loc[i, column_name]) 189 | tmpList.append(math.log((tmp + 1), 5)) 190 | 
data['attempt_count_normal'] = tmpList 191 | elif column_name == "hint_count": 192 | bins = [-1, 0, 2, 4, 3000] 193 | data[column_name] = pd.cut(data[column_name], bins, labels=False) 194 | data[column_name] += 1 195 | tmpList = [] 196 | for i in pyprind.prog_percent(range(size), stream=sys.stdout, title=column_name): 197 | try: 198 | tmp = int(data.loc[i, column_name]) 199 | except: 200 | tmp = 0 201 | tmpList.append(math.log((tmp + 1), 5)) 202 | data['hint_count_normal'] = tmpList 203 | else: 204 | raise ValueError("check your continus_columns parameter!") 205 | return data 206 | 207 | 208 | def load_data(dp): 209 | if len(dp.dataset_columns_for_cross_feature) == 0: 210 | dataFileName = os.path.dirname(dp.csv_file_name) + "/dataset_" + str(dp.dataSetSize) + ".csv" 211 | else: 212 | tmp = aux.connectStringfromList(dp.convertCrossCoumnsToNameList()) 213 | dataFileName = os.path.dirname(dp.csv_file_name) + '/' + tmp + "_" + str(dp.dataSetSize) + '_' + ".csv" 214 | labelFileName = os.path.dirname(dp.csv_file_name) + "/labels_" + str(dp.dataSetSize) + ".csv" 215 | 216 | if os.path.exists(dataFileName) and os.path.exists(labelFileName): 217 | data = pd.read_csv(dataFileName) 218 | labels = pd.read_csv(labelFileName) 219 | return data, labels 220 | else: 221 | data, labels = create_label_and_delete_last_one(dp) 222 | dataset_with_crossFeatures = add_cross_feature_to_dataset(data, dp) 223 | dataset_with_crossFeatures.to_csv(dataFileName, index=False) 224 | print("==> save ", dataFileName) 225 | return dataset_with_crossFeatures, labels 226 | 227 | 228 | if __name__ == "__main__": 229 | dp = code0.DatasetParameter() 230 | load_data(dp) 231 | -------------------------------------------------------------------------------- /code2_model.py: -------------------------------------------------------------------------------- 1 | ''' Unless stated otherwise, all software is provided free of charge. 2 | As well, all software is provided on an "as is" basis without warranty 3 | of any kind, express or implied. Under no circumstances and under no legal 4 | theory, whether in tort, contract, or otherwise, shall Liang Zhang be liable 5 | to you or to any other person for any indirect, special, incidental, 6 | or consequential damages of any character including, without limitation, 7 | damages for loss of goodwill, work stoppage, computer failure or malfunction, 8 | or for any and all other damages or losses. 
If you do not agree with these terms, 9 | then you are advised to not use the software.''' 10 | 11 | import tensorflow as tf 12 | import numpy as np 13 | from code0_parameter import AUTOENCODER_ACT, BASELINE, TARGETSIZE, AUTOENCODER_LABEL 14 | from tensorflow.python.ops import rnn_cell 15 | from tensorflow.python.ops.rnn_cell import LSTMCell, BasicRNNCell, GRUCell, DropoutWrapper 16 | from uril_oneHotEncoder import ONEHOTENCODERINPUT 17 | 18 | 19 | class Model(object): 20 | def __init__(self, is_training, config, dp): 21 | self._batch_size = batch_size = config.batch_size 22 | 23 | self._min_lr = config.min_lr 24 | self.hidden_size = hidden_size = config.hidden_size 25 | self.hidden_size_2 = hidden_size_2 = config.hidden_size_2 26 | self.skill_set = dp.skill_set 27 | self.num_steps = num_steps = config.num_steps 28 | self.skill_num = skill_numb = config.skill_num 29 | self.seq_width = seq_width = len(dp.columnsName_to_index) 30 | self.skill_num = skill_num = dp.skill_num 31 | 32 | # load data 33 | self.inputs = tf.placeholder(tf.float32, [batch_size, num_steps, seq_width]) 34 | self.inputs_wide_skill_correct = tf.placeholder(tf.int32, [batch_size, num_steps]) 35 | self._target_id = tf.placeholder(tf.int32, [None]) 36 | self._target_correctness = target_correctness = tf.placeholder(tf.float32, [None]) 37 | 38 | ohe = ONEHOTENCODERINPUT(config, dp, self.inputs) 39 | 40 | # load features 41 | if not BASELINE: 42 | if AUTOENCODER_LABEL: 43 | ########################################################################################################### 44 | featurelist = [ohe.getSkillCorrectCrossFeature(),ohe.getCrossFeatureAll()]#,ohe.getCategoryFeatureInputs()] 45 | # ohe.getCategoryFeatureInputs()], # ohe.getContinuesFeatureInputs()] 46 | ########################################################################################################### 47 | tmp_v = tf.concat(2, featurelist) 48 | tmp_vs = tf.reshape(tmp_v, [-1, int(tmp_v.get_shape()[-1])]) 49 | 50 | if AUTOENCODER_ACT == 'tanh': 51 | transfer_function = tf.nn.tanh 52 | elif AUTOENCODER_ACT == 'sigmoid': 53 | transfer_function = tf.nn.sigmoid 54 | 55 | path = './weights/' + dp.dataSetType + '/weights_' + str(tmp_vs.get_shape()[-1]) + '_' + str( 56 | TARGETSIZE) + '.csv' 57 | autoencoderweights = tf.constant(np.loadtxt(path), dtype=tf.float32) 58 | path = './weights/' + dp.dataSetType + '/bias_' + str(tmp_vs.get_shape()[-1]) + '_' + str( 59 | TARGETSIZE) + '.csv' 60 | autoencoderBias = tf.constant(np.loadtxt(path), dtype=tf.float32) 61 | tmp_vs = transfer_function(tf.matmul(tmp_vs, autoencoderweights) + autoencoderBias) 62 | else: 63 | ########################################################################################################### 64 | featurelist = [ohe.getSkillCorrectCrossFeature(), ohe.getCrossFeatureAll(), ohe.getCategoryFeatureInputs()] 65 | # featurelist = [ohe.getSkillCorrectCrossFeature(), ohe.getCrossFeatureAll()] 66 | ########################################################################################################### 67 | tmp_v = tf.concat(2, featurelist) 68 | print("==> [Tensor Shape] Final Shape\t", tmp_v.get_shape()) 69 | tmp_vs = tf.reshape(tmp_v, [-1, int(tmp_v.get_shape()[-1])]) 70 | else: 71 | tmp_v = ohe.getSkillCorrectCrossFeature() 72 | tmp_vs = tf.reshape(tmp_v, [-1, int(tmp_v.get_shape()[-1])]) 73 | input_RNN = tf.reshape(tmp_vs, [batch_size, num_steps, -1]) 74 | 75 | cell = self.getCell(is_training=is_training, dp=dp, config=config) 76 | self._initial_state = cell.zero_state(batch_size, tf.float32) 
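# --- Editor's note (descriptive comments only, no functional change): the block below
# unrolls the recurrent cell by hand over num_steps. The same cell weights are reused at
# every step (reuse_variables() after the first step); each step consumes one time slice of
# input_RNN and its output is collected. The per-step outputs are concatenated, projected
# through softmax_w/softmax_b to one logit per skill, and flattened so that tf.gather can
# pick out the logit of the skill attempted at the next step (target_id); a sigmoid
# cross-entropy loss is then taken against target_correctness.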
77 | 
78 |         outputs = []
79 |         state = self._initial_state
80 | 
81 |         with tf.variable_scope(config.cell_type):
82 |             for time_step in range(num_steps):
83 |                 if time_step > 0: tf.get_variable_scope().reuse_variables()
84 |                 (cell_output, state) = cell(input_RNN[:, time_step, :], state)
85 |                 outputs.append(cell_output)
86 |         self._final_state = state  # keep the last state so the final_state property below is defined
87 |         if config.num_layer == 1:
88 |             size_rnn_out = hidden_size
89 |         elif config.num_layer == 2:
90 |             size_rnn_out = hidden_size_2
91 |         else:
92 |             raise ValueError("only 1-2 layers are supported, check your layer number!")
93 | 
94 |         output_RNN = tf.reshape(tf.concat(1, outputs), [-1, size_rnn_out])
95 |         softmax_w = tf.get_variable("softmax_w", [size_rnn_out, skill_numb])
96 |         softmax_b = tf.get_variable("softmax_b", [skill_numb])
97 | 
98 |         logits = tf.matmul(output_RNN, softmax_w) + softmax_b
99 | 
100 |         # pick out the logits that correspond to the queried target ids
101 |         self.logits = logits = tf.reshape(logits, [-1])
102 |         self.selected_logits = selected_logits = tf.gather(logits, self.target_id)
103 | 
104 |         # make prediction
105 |         self._pred = self._pred_values = tf.sigmoid(selected_logits)
106 | 
107 |         loss = tf.reduce_sum(tf.nn.sigmoid_cross_entropy_with_logits(selected_logits, target_correctness))
108 |         self._cost = loss
109 | 
110 |         if not is_training:
111 |             return
112 | 
113 |         self._lr = tf.Variable(0.0, trainable=False)
114 |         tvars = tf.trainable_variables()
115 |         grads, _ = tf.clip_by_global_norm(tf.gradients(loss, tvars), config.max_grad_norm)
116 |         optimizer = tf.train.GradientDescentOptimizer(self.lr)
117 | 
118 |         self._train_op = optimizer.apply_gradients(zip(grads, tvars))
119 | 
120 |     def assign_lr(self, session, lr_value):
121 |         if lr_value > self.min_lr:
122 |             session.run(tf.assign(self._lr, lr_value))
123 |         else:
124 |             session.run(tf.assign(self._lr, self.min_lr))
125 | 
126 |     def getCell(self, is_training, dp, config):
127 |         # build the recurrent cell (LSTM / RNN / GRU, 1 or 2 layers)
128 |         if is_training:
129 |             print("==> Construct ", config.cell_type, " graph for training")
130 |         else:
131 |             print("==> Construct ", config.cell_type, " graph for testing")
132 | 
133 |         if config.cell_type == "LSTM":
134 |             if config.num_layer == 1:
135 |                 basicCell = LSTMCell(config.hidden_size, forget_bias=0.0, state_is_tuple=True)
136 |             elif config.num_layer == 2:
137 |                 basicCell = LSTMCell(config.hidden_size, forget_bias=0.0, state_is_tuple=True)
138 |                 basicCell_2 = LSTMCell(config.hidden_size_2, forget_bias=0.0, state_is_tuple=True)
139 |             else:
140 |                 raise ValueError("config.num_layer should be 1 or 2")
141 |         elif config.cell_type == "RNN":
142 |             if config.num_layer == 1:
143 |                 basicCell = BasicRNNCell(config.hidden_size)
144 |             elif config.num_layer == 2:
145 |                 basicCell = BasicRNNCell(config.hidden_size)
146 |                 basicCell_2 = BasicRNNCell(config.hidden_size_2)
147 |             else:
148 |                 raise ValueError("config.num_layer should be 1 or 2")
149 |         elif config.cell_type == "GRU":
150 |             if config.num_layer == 1:
151 |                 basicCell = GRUCell(config.hidden_size)  # GRUCell takes no forget_bias/state_is_tuple arguments
152 |             elif config.num_layer == 2:
153 |                 basicCell = GRUCell(config.hidden_size)
154 |                 basicCell_2 = GRUCell(config.hidden_size_2)
155 |             else:
156 |                 raise ValueError("config.num_layer should be 1 or 2")
157 |         else:
158 |             raise ValueError("cell type should be one of GRU, LSTM, RNN")
159 | 
160 |         # wrap the cells with dropout on inputs/outputs during training
161 |         if is_training and config.keep_prob < 1:
162 |             if config.num_layer == 1:
163 |                 basicCell = DropoutWrapper(basicCell, input_keep_prob=config.keep_prob,
164 |                                            output_keep_prob=config.keep_prob)
165 | elif config.num_layer == 2: 166 | basicCell = DropoutWrapper(basicCell, input_keep_prob=config.keep_prob, 167 | output_keep_prob=config.keep_prob) 168 | basicCell_2 = DropoutWrapper(basicCell_2, input_keep_prob=config.keep_prob, 169 | output_keep_prob=config.keep_prob) 170 | else: 171 | pass 172 | 173 | if config.num_layer == 1: 174 | cell = rnn_cell.MultiRNNCell([basicCell], state_is_tuple=True) 175 | elif config.num_layer == 2: 176 | cell = rnn_cell.MultiRNNCell([basicCell, basicCell_2], state_is_tuple=True) 177 | 178 | return cell 179 | 180 | @property 181 | def batch_size(self): 182 | return self._batch_size 183 | 184 | @property 185 | def min_lr(self): 186 | return self._min_lr 187 | 188 | @property 189 | def auc(self): 190 | return self._auc 191 | 192 | @property 193 | def pred(self): 194 | return self._pred 195 | 196 | @property 197 | def target_id(self): 198 | return self._target_id 199 | 200 | @property 201 | def target_correctness(self): 202 | return self._target_correctness 203 | 204 | @property 205 | def initial_state(self): 206 | return self._initial_state 207 | 208 | @property 209 | def pred_values(self): 210 | return self._pred_values 211 | 212 | @property 213 | def cost(self): 214 | return self._cost 215 | 216 | @property 217 | def final_state(self): 218 | return self._final_state 219 | 220 | @property 221 | def lr(self): 222 | return self._lr 223 | 224 | @property 225 | def train_op(self): 226 | return self._train_op 227 | 228 | 229 | if __name__ == "__main__": 230 | pass 231 | -------------------------------------------------------------------------------- /uril_cmu_statistic.py: -------------------------------------------------------------------------------- 1 | ''' Unless stated otherwise, all software is provided free of charge. 2 | As well, all software is provided on an "as is" basis without warranty 3 | of any kind, express or implied. Under no circumstances and under no legal 4 | theory, whether in tort, contract, or otherwise, shall Liang Zhang be liable 5 | to you or to any other person for any indirect, special, incidental, 6 | or consequential damages of any character including, without limitation, 7 | damages for loss of goodwill, work stoppage, computer failure or malfunction, 8 | or for any and all other damages or losses. 
If you do not agree with these terms, 9 | then you are advised to not use the software.''' 10 | 11 | import pandas as pd 12 | import numpy as np 13 | import sys, pyprind 14 | import uril_tools as aux 15 | import code1_data as code1 16 | import code0_parameter as code0 17 | import os 18 | import pyprind as pp 19 | import matplotlib.pyplot as plt 20 | import datetime 21 | 22 | 23 | def read_data_from_csv(): 24 | processedFileName = './data/cmu_stat_f2011/processded_data.csv' 25 | raw_data_txt = "./data/cmu_stat_f2011/cmu.txt" 26 | 27 | if os.path.exists(processedFileName): 28 | data = pd.read_csv(processedFileName) 29 | print("==> read ", processedFileName, " directly") 30 | 31 | else: 32 | if os.path.exists(raw_data_txt): 33 | data = pd.read_csv(raw_data_txt, sep=" ", delimiter='\t') 34 | print(data.columns) 35 | data.rename(columns={'Duration (sec)': 'time', 'Outcome': 'correct', 36 | 'KC (F2011)': 'skill_id', 'Problem Name': 'problem_id', 'Step Name': 'step_id', 37 | 'Anon Student Id': 'user_id',"Student Response Type":"first_action",'Attempt At Step':"attempt_level"}, inplace=True) 38 | 39 | data = data.fillna(-1) 40 | 41 | filer_data = data[code0.DatasetParameter('cmu_stat_f2011').filtedColumnNameList] 42 | filer_data = filer_data[(filer_data['correct'] != -1) & (filer_data['correct'] != 'HINT') & ( 43 | filer_data['skill_id'] != '-1') & (filer_data['time'] != '.')] 44 | 45 | filer_data['correct'].replace({'CORRECT': 1, 'INCORRECT': 0}, inplace=True) 46 | 47 | # change str to integar 48 | for feature in ['skill_id', 'step_id', 'problem_id', 'user_id', 'Level (Unit)', 'Level (Module)','first_action','attempt_level']: 49 | print("==> BEGIN ", feature) 50 | temp_set = set(list(filer_data[feature])) 51 | temp_dict = {key: value+1 for value, key in enumerate(temp_set)} 52 | filer_data[feature].replace(temp_dict, inplace=True) 53 | print("==> END ", feature) 54 | 55 | #print ("==> first_action",set(filer_data['first_action'])) 56 | #print ("==> attempt_level",set(filer_data['attempt_level'])) 57 | data = attempt_process(filer_data) 58 | data = time_basic_process(data) 59 | data = time_add_level_process(data) 60 | data.to_csv(processedFileName, index=False) 61 | 62 | else: 63 | raise ('No data file exists!') 64 | return data 65 | 66 | def attempt_process(data): 67 | temp_list = list(data['attempt_level']) 68 | new_list = [] 69 | 70 | for i in range(len(temp_list)): 71 | if temp_list[i]==1: 72 | new_list.append(0) 73 | elif temp_list[i]<=5 and temp_list[i]>1: 74 | new_list.append(1) 75 | elif temp_list[i]>5: 76 | new_list.append(2) 77 | else: 78 | new_list.append(3) 79 | data['attempt_level'] = new_list 80 | return data 81 | 82 | def test_data(): 83 | data = read_data_from_csv() 84 | 85 | k1 = [] 86 | k2 = [] 87 | for item in data.columns: 88 | num = len(set(data[item])) 89 | print("****%10d--%s" % (num, item)) 90 | k2.append(item) 91 | if num < 10: 92 | print("-" * 10, item, "-" * 10, "\n", np.unique(data[item]), "\n", "--" * 15) 93 | 94 | print('--' * 30) 95 | print("more than 1 elements\n", k2) 96 | 97 | print(np.shape(data)) 98 | 99 | 100 | def time_basic_process(data): 101 | # -1-transfer time to 'integar' from 'str' 102 | # -2-remove outlier records 103 | old_time_list = list(data['time']) 104 | new_time_list = [] 105 | for i in old_time_list: 106 | kp = int(float(i)) 107 | if kp > 150: kp = 150 108 | new_time_list.append(kp) 109 | data['time'] = new_time_list 110 | 111 | # -3-transfer to z-score 112 | time_z_level = 'skill_id' 113 | print('==> preprocerss time to z-score based on ', 
time_z_level) 114 | time_z_id_set = np.unique(data[time_z_level]) 115 | std_dict = {} 116 | mean_dict = {} 117 | for itme_id in pp.prog_percent(time_z_id_set, stream=sys.stdout, title='==> extract mean and std of time'): 118 | temp_data = data[data[time_z_level] == itme_id] 119 | temp_list = list(temp_data['time']) 120 | # print ('-- problem_id ',problem_id,' -- ',len(temp_list),' --') 121 | std_dict[itme_id] = np.std(temp_list, axis=0) 122 | mean_dict[itme_id] = np.mean(temp_list, axis=0) 123 | 124 | assert len(std_dict) == len(mean_dict) 125 | 126 | data = data.reset_index(drop=True) 127 | 128 | for id in pp.prog_percent(range(len(data)), stream=sys.stdout, title='==> cast time to z-score'): 129 | data.loc[id, 'time'] = (data.loc[id, 'time'] - mean_dict[data.loc[id, time_z_level]]) / ( 130 | std_dict[data.loc[id, time_z_level]] * 1.0) 131 | 132 | return data 133 | 134 | 135 | def temp(data): 136 | # -1-transfer time to 'integar' from 'str' 137 | old_time_list = list(data['time']) 138 | new_time_list = [] 139 | for i in old_time_list: 140 | new_time_list.append(int(float(i))) 141 | data['time'] = new_time_list 142 | 143 | plt.hist(new_time_list, bins=np.arange(min(new_time_list), max(new_time_list), )) 144 | plt.show() 145 | 146 | 147 | def time_add_level_process(data): 148 | time_interval = 0.025 149 | boundary_list = [0.5, 0.7] 150 | data = data.reset_index(drop=True) 151 | bins = np.arange(min(data['time']), max(data['time']), time_interval * 2) 152 | 153 | correct_mean_list = [] 154 | correct_std_list = [] 155 | correct_num_list = [] 156 | for item_index in pp.prog_percent(range(len(bins)), stream=sys.stdout, title='==> get correctness'): 157 | up_bin = bins[item_index] + time_interval 158 | down_bin = bins[item_index] - time_interval 159 | 160 | temp_data = data[(data['time'] >= down_bin) & (data['time'] < up_bin)] 161 | temp_correct_list = list(temp_data['correct']) 162 | 163 | """ 164 | if up_bin<=-1: 165 | print ("---"*20) 166 | print ("*\t",down_bin) 167 | print ("*\t",up_bin) 168 | print (temp_correct_list) 169 | #print (temp_data) 170 | print ("---"*20) 171 | """ 172 | 173 | correct_num_list.append(len(temp_correct_list)) 174 | if (len(temp_correct_list) != 0): 175 | if np.mean(temp_correct_list, axis=0) > 1: 176 | print("******\t", np.mean(temp_correct_list, axis=0), "\t", temp_correct_list) 177 | correct_mean_list.append(np.mean(temp_correct_list, axis=0)) 178 | correct_std_list.append(np.std(temp_correct_list, axis=0)) 179 | else: 180 | correct_mean_list.append(0) 181 | correct_std_list.append(0) 182 | 183 | # plot the relationship 184 | fig, axs = plt.subplots(nrows=2, ncols=1, sharex=True) 185 | ax = axs[0] 186 | ax.plot(bins, correct_mean_list, "r.") 187 | ax.set_title('correctness') 188 | 189 | for nmber in boundary_list: 190 | ax.axhline(y=nmber, xmin=0, xmax=1, c="red", linewidth=0.5, zorder=0) 191 | 192 | ax = axs[1] 193 | ax.plot(bins, correct_num_list, "b--") 194 | ax.set_title("time z score distribution") 195 | 196 | ax.set_xlim([-2, 4]) 197 | plt.savefig('./result/cmu_stat_f2011/time_distribution_correctness_' + str( 198 | datetime.datetime.now().strftime("%Y-%m-%d-%H-%M")) + '.png') 199 | #plt.show() 200 | 201 | # add a colum according to correctness boundary 202 | time_level_list = [] 203 | temp_list = list(data['time']) 204 | bd = [-1.2, -0.7, 0.75] 205 | 206 | # 0 ~ time < -1.2 207 | # 1 ~ -1.2 < time < -0.7 208 | # 2 ~ -0.7 < time < 0.75 209 | # 3 ~ 0.75 < time 210 | for idx in range(len(temp_list)): 211 | if temp_list[idx] <= bd[0]: 212 | 
time_level_list.append(0) 213 | elif (bd[0] < temp_list[idx] and temp_list[idx] <= bd[1]): 214 | time_level_list.append(1) 215 | elif (bd[1] < temp_list[idx] and temp_list[idx] <= bd[2]): 216 | time_level_list.append(2) 217 | elif (temp_list[idx] > bd[2]): 218 | time_level_list.append(3) 219 | else: 220 | raise Exception("Error in time division") 221 | print("==> add time_level") 222 | data['time_level'] = time_level_list 223 | return data 224 | 225 | def read_data_from_csv2(): 226 | processedFileName = './data/cmu_stat_f2011/test_data.csv' 227 | raw_data_txt = "./data/cmu_stat_f2011/cmu.txt" 228 | 229 | if os.path.exists(processedFileName): 230 | data = pd.read_csv(processedFileName) 231 | print("==> read ", processedFileName, " directly") 232 | 233 | else: 234 | if os.path.exists(raw_data_txt): 235 | data = pd.read_csv(raw_data_txt, sep=" ", delimiter='\t') 236 | print(data.columns) 237 | data.rename(columns={'Duration (sec)': 'time', 'Outcome': 'correct', 238 | 'KC (F2011)': 'skill_id', 'Problem Name': 'problem_id', 'Step Name': 'step_id', 239 | 'Anon Student Id': 'user_id',"Student Response Type":"first_action",'Attempt At Step':"attempt_level"}, inplace=True) 240 | 241 | data = data.fillna(-1) 242 | 243 | filer_data = data[code0.DatasetParameter('cmu_stat_f2011').filtedColumnNameList] 244 | filer_data = filer_data[(filer_data['correct'] != -1) & (filer_data['correct'] != 'HINT') & ( 245 | filer_data['skill_id'] != '-1') & (filer_data['time'] != '.')] 246 | 247 | filer_data['correct'].replace({'CORRECT': 1, 'INCORRECT': 0}, inplace=True) 248 | 249 | # change str to integar 250 | for feature in ['skill_id', 'step_id', 'problem_id', 'user_id', 'Level (Unit)', 'Level (Module)','first_action','attempt_level']: 251 | print("==> BEGIN ", feature) 252 | temp_set = set(list(filer_data[feature])) 253 | temp_dict = {key: value+1 for value, key in enumerate(temp_set)} 254 | filer_data[feature].replace(temp_dict, inplace=True) 255 | print("==> END ", feature) 256 | 257 | print ("==> first_action",set(filer_data['first_action'])) 258 | print ("==> attempt_level",set(filer_data['attempt_level'])) 259 | data.to_csv(processedFileName,index=False) 260 | else: 261 | raise ('No data file exists!') 262 | return data 263 | 264 | def test(): 265 | processedFileName = './data/cmu_stat_f2011/test_data.csv' 266 | data = pd.read_csv(processedFileName) 267 | plt.hist(list(data['attempt_level']),np.arange(min(data['attempt_level']), max(data['attempt_level']), 1)) 268 | plt.show() 269 | 270 | if __name__ == '__main__': 271 | data = read_data_from_csv() 272 | 273 | -------------------------------------------------------------------------------- /uril_oneHotEncoder.py: -------------------------------------------------------------------------------- 1 | ''' Unless stated otherwise, all software is provided free of charge. 2 | As well, all software is provided on an "as is" basis without warranty 3 | of any kind, express or implied. Under no circumstances and under no legal 4 | theory, whether in tort, contract, or otherwise, shall Liang Zhang be liable 5 | to you or to any other person for any indirect, special, incidental, 6 | or consequential damages of any character including, without limitation, 7 | damages for loss of goodwill, work stoppage, computer failure or malfunction, 8 | or for any and all other damages or losses. 
If you do not agree with these terms, 9 | then you are advised to not use the software.''' 10 | 11 | import tensorflow as tf 12 | import numpy as np 13 | import code0_parameter as code0 14 | import code1_data as code1 15 | 16 | 17 | class ONEHOTENCODERINPUT(object): 18 | def __init__(self, ap, dp, inputs,printControl=True): 19 | self.batch_size = batch_size = ap.batch_size 20 | self.num_steps = num_steps = ap.num_steps 21 | self.seq_width = seq_width = len(dp.columnsName_to_index) 22 | self.skill_num = dp.skill_num 23 | self.dp = dp 24 | self.ap = ap 25 | self.model_continues_columns = dp.model_continues_columns 26 | self.model_category_columns = dp.model_category_columns 27 | self.model_cross_columns = dp.model_cross_columns 28 | self.inputs = inputs 29 | self.printControl=printControl 30 | 31 | if dp.dataSetType == "assistment2009": 32 | width_deep_width_dict = {"skill_id": dp.columns_max['skill_id'] + 1, 33 | "correct": dp.columns_max['correct'] + 1, 34 | "time_level": dp.columns_max['time_level'] + 1, 35 | "attempt_level": dp.columns_max['attempt_level'] + 1, 36 | "first_action": dp.columns_max['first_action'] + 1} 37 | 38 | self.data_first_action = tf.to_int32( 39 | tf.slice(self.inputs, [0, 0, dp.columnsName_to_index['first_action']], [-1, -1, 1])) 40 | self.data_first_action_process = tf.to_float(tf.squeeze( 41 | tf.one_hot(indices=self.data_first_action, depth=width_deep_width_dict['first_action'], on_value=1.0, 42 | off_value=0.0, axis=-1))) 43 | 44 | 45 | self.data_time_level = tf.to_int32( 46 | tf.slice(self.inputs, [0, 0, dp.columnsName_to_index['time_level']], [-1, -1, 1])) 47 | self.data_time_level_process = tf.to_float(tf.squeeze( 48 | tf.one_hot(indices=self.data_time_level, depth=width_deep_width_dict['time_level'], on_value=1.0, 49 | off_value=0.0, axis=-1))) 50 | 51 | self.data_attempt_level = tf.to_int32( 52 | tf.slice(self.inputs, [0, 0, dp.columnsName_to_index['attempt_level']], [-1, -1, 1])) 53 | self.data_attempt_level_process = tf.to_float(tf.squeeze( 54 | tf.one_hot(indices=self.data_attempt_level, depth=width_deep_width_dict['attempt_level'], on_value=1.0, 55 | off_value=0.0, axis=-1))) 56 | 57 | elif dp.dataSetType == "cmu_stat_f2011": # kdd 58 | width_deep_width_dict = {"skill_id": dp.columns_max['skill_id'] + 1, 59 | "correct": dp.columns_max['correct'] + 1, 60 | "time_level": dp.columns_max['time_level'] + 1, 61 | "attempt_level": dp.columns_max['attempt_level'] + 1, 62 | "first_action": dp.columns_max['first_action'] + 1 63 | } 64 | self.data_time_level = tf.to_int32( 65 | tf.slice(self.inputs, [0, 0, dp.columnsName_to_index['time_level']], [-1, -1, 1])) 66 | self.data_time_level_process = tf.to_float(tf.squeeze( 67 | tf.one_hot(indices=self.data_time_level, depth=width_deep_width_dict['time_level'], on_value=1.0, 68 | off_value=0.0, axis=-1))) 69 | 70 | self.data_first_action = tf.to_int32( 71 | tf.slice(self.inputs, [0, 0, dp.columnsName_to_index['first_action']], [-1, -1, 1])) 72 | self.data_first_action_process = tf.to_float(tf.squeeze( 73 | tf.one_hot(indices=self.data_first_action, depth=width_deep_width_dict['first_action'], on_value=1.0, 74 | off_value=0.0, axis=-1))) 75 | 76 | self.data_attempt_level = tf.to_int32( 77 | tf.slice(self.inputs, [0, 0, dp.columnsName_to_index['attempt_level']], [-1, -1, 1])) 78 | self.data_attempt_level_process = tf.to_float(tf.squeeze( 79 | tf.one_hot(indices=self.data_attempt_level, depth=width_deep_width_dict['attempt_level'], on_value=1.0, 80 | off_value=0.0, axis=-1))) 81 | 82 | elif dp.dataSetType == "kdd": # kdd 
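            # The "kdd" branch below repeats the slice -> one_hot pattern used for the
            # datasets above, but only the time_level column is encoded here (this
            # dataset has no first_action / attempt_level features).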
83 | width_deep_width_dict = {"skill_id": dp.columns_max['skill_id'] + 1, 84 | "correct": dp.columns_max['correct'] + 1, 85 | "time_level": dp.columns_max['time_level'] + 1 86 | } 87 | self.data_time_level = tf.to_int32( 88 | tf.slice(self.inputs, [0, 0, dp.columnsName_to_index['time_level']], [-1, -1, 1])) 89 | self.data_time_level_process = tf.to_float(tf.squeeze( 90 | tf.one_hot(indices=self.data_time_level, depth=width_deep_width_dict['time_level'], on_value=1.0, 91 | off_value=0.0, axis=-1))) 92 | 93 | self.data_skill_id = tf.to_int32( 94 | tf.slice(self.inputs, [0, 0, dp.columnsName_to_index['skill_id']], [-1, -1, 1])) 95 | self.data_skill_id_process = tf.to_float(tf.squeeze( 96 | tf.one_hot(indices=self.data_skill_id, depth=width_deep_width_dict['skill_id'], on_value=1.0, off_value=0.0, 97 | axis=-1))) 98 | self.data_correct = tf.slice(self.inputs, [0, 0, dp.columnsName_to_index['correct']], [-1, -1, 1]) 99 | 100 | 101 | def getSkillCorrectMerge(self): 102 | featureList = [self.data_skill_id_process, self.data_correct] 103 | TensorskillCorrect = tf.concat(2, featureList) 104 | if self.printControl: print("==> [Tensor Shape] skill_id and correct merge formate\t", TensorskillCorrect.get_shape()) 105 | return TensorskillCorrect 106 | 107 | def getSkillCorrectCrossFeature(self): 108 | TensorCrossFeatures = self._getCrossFeature(['skill_id correct']) 109 | if self.printControl: print("==> [Tensor Shape] skill_id and correct cross feature\t", TensorCrossFeatures.get_shape()) 110 | return TensorCrossFeatures 111 | 112 | def getContinuesFeatureInputs(self): 113 | featureList = [] 114 | for columnName in set(self.model_continues_columns): 115 | if columnName == 'time': 116 | featureList.append(self.data_time_normal) 117 | elif columnName == 'attempt_count': 118 | featureList.append(self.data_attempt_count_normal) 119 | elif columnName == 'hint_count': 120 | featureList.append(self.data_hint_count_normal) 121 | elif columnName == 'problem_view': 122 | featureList.append(self.data_problem_view_normal) 123 | elif columnName in ['skill_id', 'correct']: 124 | pass 125 | else: 126 | raise ValueError('only support time、attempt_count、hint_count') 127 | 128 | TensorContinuesFeature = tf.concat(2, featureList) 129 | if self.printControl: print("==> [Tensor Shape] continues features\t", TensorContinuesFeature.get_shape()) 130 | return TensorContinuesFeature 131 | 132 | def getCategoryFeatureInputs(self): 133 | featureList = [] 134 | for columnName in set(self.model_category_columns): 135 | if columnName == 'first_action': 136 | featureList.append(self.data_first_action_process) 137 | elif columnName == 'time_level': 138 | featureList.append(self.data_time_level_process) 139 | elif columnName == 'attempt_level': 140 | featureList.append(self.data_attempt_level_process) 141 | elif columnName in ['skill_id', 'correct']: 142 | pass 143 | else: 144 | raise ValueError('Check your model_category_columns configuration') 145 | 146 | TensorCategoryFeature = tf.concat(2, featureList) 147 | if self.printControl: print("==> [Tensor Shape] category features\t", TensorCategoryFeature.get_shape()) 148 | return TensorCategoryFeature 149 | 150 | def getCrossFeatureAll(self): 151 | crossFeatureNameList = self.dp.convertCrossCoumnsToNameList(Flag=False) 152 | TensorCrossFeatures = self._getCrossFeature(crossFeatureNameList) 153 | if self.printControl: print("==> [Tensor Shape] Cross Feature whole\t", TensorCrossFeatures.get_shape()) 154 | return TensorCrossFeatures 155 | 156 | def _getCrossFeature(self, 
crossFeatureNameList): 157 | if crossFeatureNameList == ['skill_id correct'] or crossFeatureNameList == ['correct skill_id']: 158 | crossFeatureNameList = ['skill_id correct'] 159 | 160 | wide_length = 0 161 | for i, crossFeatureName in enumerate(crossFeatureNameList): # crossFeatureName is a string'correct first_response_time' 162 | depthValue = int(self.dp.columns_max[crossFeatureName] + 1) 163 | wide_length += depthValue 164 | 165 | tmp_value = tf.to_int32( 166 | tf.slice(self.inputs, [0, 0, self.dp.columnsName_to_index[crossFeatureName]], [-1, -1, 1])) 167 | tmp_value_ohe = tf.to_float( 168 | tf.squeeze(tf.one_hot(indices=tmp_value, depth=depthValue, on_value=1.0, off_value=0.0, axis=-1))) 169 | if self.printControl: print("==> [Tensor Shape] Cross Feature--", crossFeatureName, " width\t", depthValue) 170 | 171 | if i == 0: 172 | TensorCrossFeatures = tmp_value_ohe 173 | else: 174 | TensorCrossFeatures = tf.concat(2, [TensorCrossFeatures, tmp_value_ohe]) 175 | # if no cross features, the return value is null 176 | return TensorCrossFeatures 177 | 178 | def get_init_value_for_train_weights(self): 179 | featureslist = [self.getSkillCorrectCrossFeature(), self.getCrossFeatureAll(), self.getCategoryFeatureInputs(), 180 | self.getContinuesFeatureInputs()] 181 | x_tmp = tf.concat(2, featureslist) 182 | x = tf.reshape(x_tmp, [self.batch_size * self.num_steps, -1]) 183 | return x 184 | 185 | 186 | if __name__ == "__main__": 187 | dp = code0.DatasetParameter() 188 | ap = code0.autoencoderParameter() 189 | 190 | dataset, labels = code1.load_data(dp) 191 | # tuple_data = code1.convert_data_labels_to_tuples(dataset, labels) 192 | 193 | skill_num = len(dataset['skill_id'].unique()) + 1 # 0 for unlisted skill_id 194 | dp.skill_num = skill_num 195 | dp.skill_set = list(dataset['skill_id'].unique()) 196 | dp.columns_max, dp.columns_numb, dp.columnsName_to_index = code1.get_columns_info(dataset) 197 | dp.seq_width = len(dp.columnsName_to_index) 198 | 199 | print("columns_max\n", dp.columns_max) 200 | print("columns_numb\n", dp.columns_numb) 201 | print("columnsName_to_index\n", dp.columnsName_to_index) 202 | 203 | data = np.random.randint(low=0,high=2, size=()) 204 | g =tf.Graph() 205 | with g.as_default(): 206 | inputs = tf.placeholder(tf.float32, [ap.batch_size, ap.num_steps, len(dp.columnsName_to_index)]) 207 | m = ONEHOTENCODERINPUT(ap=ap, dp=dp,inputs=inputs) 208 | 209 | with tf.Session(graph=g) as sess: 210 | m.getSkillCorrectMerge() 211 | m.getContinuesFeatureInputs() 212 | m.getCategoryFeatureInputs() 213 | print("-" * 60) 214 | m.getSkillCorrectCrossFeature() 215 | print("-" * 60) 216 | m.getCrossFeatureAll() 217 | -------------------------------------------------------------------------------- /uril_assistment2009.py: -------------------------------------------------------------------------------- 1 | ''' Unless stated otherwise, all software is provided free of charge. 2 | As well, all software is provided on an "as is" basis without warranty 3 | of any kind, express or implied. Under no circumstances and under no legal 4 | theory, whether in tort, contract, or otherwise, shall Liang Zhang be liable 5 | to you or to any other person for any indirect, special, incidental, 6 | or consequential damages of any character including, without limitation, 7 | damages for loss of goodwill, work stoppage, computer failure or malfunction, 8 | or for any and all other damages or losses. 
If you do not agree with these terms, 9 | then you are advised to not use the software.''' 10 | 11 | import pandas as pd 12 | import uril_tools as aux 13 | import code1_data as code1 14 | import code0_parameter as code0 15 | import pyprind, os 16 | import sys 17 | import uril_connectUser 18 | import numpy as np 19 | import pyprind as pp 20 | import pylab as pl 21 | import datetime 22 | import matplotlib.pyplot as plt 23 | 24 | 25 | def read_asssistment2009_data_from_csv(dp): 26 | # read process file directly if exists 27 | if os.path.exists(dp.processedFileName): 28 | print("==> read ", dp.processedFileName) 29 | data = pd.read_csv(dp.processedFileName) 30 | print(aux.stastic_SecNumber_UserNumber_SkillNumber(data, dp)) 31 | return data 32 | 33 | # processfile not exist, load connect data and process it 34 | if os.path.exists(dp.connect_file_name): 35 | print("==> read ", dp.connect_file_name) 36 | data = pd.read_csv(dp.connect_file_name) 37 | else: # read raw data and connect 38 | try: 39 | data = pd.read_csv(dp.csv_file_name, encoding='latin-1', error_bad_lines=False, index_col=False) 40 | if dp.csv_file_name == "./data/assistment2009/skill_builder_data_corrected.csv": 41 | data = data.loc[:338000] 42 | elif dp.csv_file_name == "./data/assistment2009/skill_builder_data.csv": 43 | data = data.loc[:450000] 44 | else: 45 | pass 46 | print("==> read ", dp.csv_file_name) 47 | except: 48 | raise NameError("can't load " + dp.csv_file_name + " pleace check your file") 49 | print('==> columns names\t', data.columns) 50 | 51 | data.rename(columns={'ms_first_response': 'time', 'hint_count': '_hint_count', 'hint_total': 'hint_count'}, 52 | inplace=True) 53 | 54 | data = data[dp.filtedColumnNameList].fillna(0) 55 | if dp.dataSetSize == "small": 56 | data = data[0:50000] 57 | print("==> run ", dp.dataSetSize, " dataset") 58 | 59 | data = data[data['original'] == 1] 60 | data = data.reset_index(drop=True) 61 | print("==> consider original==1, data shape\t", data.shape) 62 | 63 | data = uril_connectUser.connectUser(data, dp.connect_file_name) 64 | print("==> save ", dp.connect_file_name) 65 | 66 | ### data process 67 | # correct process 68 | print("==> remove records whose correct is not 1 or 0") 69 | data = data[(data['correct'] == 1) | (data['correct'] == 0)] 70 | data = data.reset_index(drop=True) 71 | 72 | # time process 73 | data = time_basic_process(data) 74 | data = time_add_level_process(data) 75 | data = data.reset_index(drop=True) 76 | 77 | # attempt process 78 | data = attempt_add_level_process(data) 79 | 80 | print("==> dataset column name\n", data.columns) 81 | print("==> dataset shape\t", data.shape) 82 | 83 | data.to_csv(dp.processedFileName, index=False) 84 | print("==> save file to ", dp.processedFileName) 85 | 86 | aux.stastic_SecNumber_UserNumber_SkillNumber(data, dp) 87 | return data 88 | 89 | 90 | def time_basic_process(data): 91 | # -1-transfer to second unit 92 | print("==> transfer time unit: millsecond to second") 93 | tempTimeList = list(data['time']) 94 | newTimeList = [int(x / 1000) for x in tempTimeList] 95 | data['time'] = newTimeList 96 | del newTimeList, tempTimeList 97 | 98 | # -2-remove outlier records 99 | print('==> delete outlier of time feature') 100 | print('==> length before delete\t', len(data)) 101 | data = data[(data['time'] <= code0.DatasetParameter().time_threshold) & (data['time'] > 0)] 102 | print('==> length after delete\t', len(data)) 103 | 104 | # -3-transfer to z-score 105 | time_z_level = code0.DatasetParameter().time_z_level 106 | print('==> 
preprocerss time to z-score based on ', time_z_level) 107 | time_z_id_set = np.unique(data[time_z_level]) 108 | std_dict = {} 109 | mean_dict = {} 110 | for itme_id in pp.prog_percent(time_z_id_set, stream=sys.stdout, title='==> extract mean and std of time'): 111 | temp_data = data[data[time_z_level] == itme_id] 112 | temp_list = list(temp_data['time']) 113 | # print ('-- problem_id ',problem_id,' -- ',len(temp_list),' --') 114 | std_dict[itme_id] = np.std(temp_list, axis=0) 115 | mean_dict[itme_id] = np.mean(temp_list, axis=0) 116 | 117 | assert len(std_dict) == len(mean_dict) 118 | 119 | data = data.reset_index(drop=True) 120 | for id in pp.prog_percent(range(len(data)), stream=sys.stdout, title='==> cast time to z-score'): 121 | data.loc[id, 'time'] = (data.loc[id, 'time'] - mean_dict[data.loc[id, time_z_level]]) / ( 122 | std_dict[data.loc[id, time_z_level]] * 1.0) 123 | 124 | data = data.fillna(0) 125 | 126 | """ 127 | plt.hist(list(data['time']), bins=np.arange(min(data['time']), max(data['time']), code0.DatasetParameter().time_interval*2)) 128 | plt.title("time z score distribution") 129 | plt.savefig('./result/assistment2009/time_distribution' + str(datetime.datetime.now().strftime("%Y-%m-%d-%H-%M")) + '.png') 130 | """ 131 | return data 132 | 133 | 134 | def time_add_level_process(data): 135 | data = data.reset_index(drop=True) 136 | bins = np.arange(min(data['time']), max(data['time']), code0.DatasetParameter().time_interval * 2) 137 | correct_mean_list = [] 138 | correct_std_list = [] 139 | correct_num_list = [] 140 | for item_index in pp.prog_percent(range(len(bins)), stream=sys.stdout, title='==> get correctness'): 141 | up_bin = bins[item_index] + code0.DatasetParameter().time_interval 142 | down_bin = bins[item_index] - code0.DatasetParameter().time_interval 143 | 144 | temp_data = data[data['time'] >= down_bin] 145 | temp_data = temp_data[temp_data['time'] < up_bin] 146 | 147 | temp_correct_list = list(temp_data['correct']) 148 | correct_num_list.append(len(temp_correct_list)) 149 | if (len(temp_correct_list) != 0): 150 | correct_mean_list.append(np.mean(temp_correct_list, axis=0)) 151 | correct_std_list.append(np.std(temp_correct_list, axis=0)) 152 | else: 153 | correct_mean_list.append(0) 154 | correct_std_list.append(0) 155 | 156 | # plot the relationship 157 | fig, axs = plt.subplots(nrows=2, ncols=1, sharex=True) 158 | ax = axs[0] 159 | ax.plot(bins, correct_mean_list) 160 | ax.set_title('correctness') 161 | boundary_list = code0.DatasetParameter().correct_boundary_list 162 | for nmber in boundary_list: 163 | ax.axhline(y=nmber, xmin=0, xmax=1, c="red", linewidth=0.5, zorder=0) 164 | 165 | ax = axs[1] 166 | ax.plot(bins, correct_num_list) 167 | ax.set_title("time z score distribution") 168 | 169 | ax.set_xlim([-2, 4]) 170 | plt.savefig('./result/assistment2009/time_distribution_correctness_' + str( 171 | datetime.datetime.now().strftime("%Y-%m-%d-%H-%M")) + '.png') 172 | # plt.show() 173 | 174 | # add a colum according to correctness boundary 175 | time_level_list = [] 176 | temp_list = list(data['time']) 177 | bd = code0.DatasetParameter().time_boundary_list 178 | # 0 ~ time <-0.8 179 | # 1 ~ -0.8 < time < -0.6 180 | # 2 ~ -0.6 < time < 0 181 | # 3 ~ 0 < time 182 | for idx in range(len(temp_list)): 183 | if temp_list[idx] <= bd[0]: 184 | time_level_list.append(0) 185 | elif (bd[0] < temp_list[idx] and temp_list[idx] <= bd[1]): 186 | time_level_list.append(1) 187 | elif (bd[1] < temp_list[idx] and temp_list[idx] <= bd[2]): 188 | time_level_list.append(2) 189 | elif 
(temp_list[idx] > bd[2]): 190 | time_level_list.append(3) 191 | else: 192 | raise Exception("Error in time division") 193 | 194 | data['time_level'] = time_level_list 195 | return data 196 | 197 | 198 | def attempt_add_level_process(data): 199 | """ 200 | based on correctness and attempt relationship 201 | 0 - attempt: 0 - 0 202 | 1 - attempt: 1 - 81.7% 203 | 2 - attempt: 2 - 204 | 3 - attempt: 0 - 0 205 | """ 206 | temp_list = [] 207 | 208 | for item in pp.prog_percent(list(data['attempt_count']), stream=sys.stdout, title='==> cast attmept to attempt_level'): 209 | if item == 0: 210 | temp = 0 211 | elif item == 1: 212 | temp = 1 213 | else: 214 | temp = 2 215 | 216 | temp_list.append(temp) 217 | data['attempt_level'] = temp_list 218 | return data 219 | 220 | 221 | def attempt_and_hint_process(data): 222 | print('==> remove records whose attempt_account is more than 15') 223 | data = data[data['attempt_count'] <= 15] 224 | data = data.reset_index(drop=True) 225 | 226 | problem_list = np.unique(data['problem_id']) 227 | attempt_dict = {} 228 | hint_dict = {} 229 | attempt_list = [] 230 | hint_list = [] 231 | for idx in pp.prog_percent(range(len(problem_list)), stream=sys.stdout, 232 | title='==> get attmept and hint max value at problem level'): 233 | temp_data = data[data['problem_id'] == problem_list[idx]] 234 | attempt_dict[problem_list[idx]] = max(temp_data['attempt_count']) 235 | attempt_list.append(max(temp_data['attempt_count'])) 236 | hint_dict[problem_list[idx]] = max(temp_data['hint_count']) 237 | hint_list.append(max(temp_data['hint_count'])) 238 | 239 | fig, axs = plt.subplots(nrows=2, ncols=1, sharex=False) 240 | ax = axs[0] 241 | ax.hist(attempt_list, bins=np.arange(0, 16, 1)) 242 | ax.set_title('max attempt distribution') 243 | ax.set_xlabel("attempt(max)") 244 | ax.set_ylabel("number") 245 | 246 | ax = axs[1] 247 | ax.hist(hint_list) 248 | ax.set_title("max hint distribution") 249 | ax.set_xlabel("hint(max)") 250 | ax.set_ylabel("number") 251 | 252 | plt.savefig( 253 | './result/assistment2009/attempt_hint_number_' + datetime.datetime.now().strftime("%Y-%m-%d-%H-%M") + '.png') 254 | 255 | for idx in pp.prog_percent(range(len(data)), stream=sys.stdout, 256 | title='==> cast attempt count and hint count to value/max'): 257 | if attempt_dict[data.loc[idx, 'problem_id']] == 0: 258 | data.loc[idx, 'attempt_count_level'] = -1 259 | else: 260 | data.loc[idx, 'attempt_count_level'] = data.loc[idx, 'attempt_count'] / ( 261 | attempt_dict[data.loc[idx, 'problem_id']] * 1.0) 262 | 263 | if hint_dict[data.loc[idx, 'problem_id']] == 0: 264 | data.loc[idx, 'hint_count_level'] = -1 265 | else: 266 | data.loc[idx, 'hint_count_level'] = data.loc[idx, 'hint_count'] / ( 267 | hint_dict[data.loc[idx, 'problem_id']] * 1.0) 268 | 269 | return data 270 | 271 | 272 | def attemp_hint_and_correctness_analysis(data): 273 | data = data.reset_index(drop=True) 274 | bins = np.concatenate([[-1], np.arange(0.0, 1.1, 0.1)]) 275 | 276 | for attri in ['hint_count_level', 'attempt_count_level']: 277 | correct_mean_list = [] 278 | correct_std_list = [] 279 | correct_num_list = [] 280 | 281 | for item_index in pp.prog_percent(range(len(bins)), stream=sys.stdout, 282 | title='==> get correctness according to ' + attri): 283 | up_bin = bins[item_index] + 0.05 284 | down_bin = bins[item_index] - 0.05 285 | 286 | temp_data = data[(data[attri] >= down_bin) & (data[attri] < up_bin)] 287 | temp_correct_list = list(temp_data['correct']) 288 | correct_num_list.append(len(temp_correct_list)) 289 | 290 | if 
(len(temp_correct_list) != 0): 291 | correct_mean_list.append(np.mean(temp_correct_list, axis=0)) 292 | correct_std_list.append(np.std(temp_correct_list, axis=0)) 293 | else: 294 | correct_mean_list.append(0) 295 | correct_std_list.append(0) 296 | 297 | fig, axs = plt.subplots(nrows=2, ncols=1, sharex=True) 298 | ax = axs[0] 299 | ax.plot(bins, correct_mean_list) 300 | ax.set_title('correctness ' + attri) 301 | 302 | boundary_list = code0.DatasetParameter().correct_boundary_list 303 | for nmber in boundary_list: 304 | ax.axhline(y=nmber, xmin=0, xmax=1, c="red", linewidth=0.5, zorder=0) 305 | 306 | ax = axs[1] 307 | ax.plot(bins, correct_num_list) 308 | ax.set_title(attri + " number distribution") 309 | ax.set_xlim([-1.1, 1.1]) 310 | plt.savefig('./result/assistment2009/' + attri + '_correctness_' + str( 311 | datetime.datetime.now().strftime("%Y-%m-%d-%H-%M")) + '.png') 312 | 313 | 314 | def attempt_correct_analysis(data): 315 | data = data[data['attempt_count'] <= code0.DatasetParameter().attemp_max] 316 | u, c = aux.counter(list(data['attempt_count'])) 317 | 318 | atempt_list = np.arange(code0.DatasetParameter().attemp_max + 1) 319 | correct_num_list = [] 320 | for item in atempt_list: 321 | temp_data = data[(data['attempt_count'] == item)] 322 | if len(temp_data) != 0: 323 | correct_num_list.append(sum(temp_data['correct']) * 1.0 / len(temp_data)) 324 | else: 325 | correct_num_list.append(0) 326 | print(u, "\n", c) 327 | print(atempt_list, "\n", correct_num_list) 328 | 329 | for a in correct_num_list: 330 | print("%.3f" % a) 331 | 332 | 333 | if __name__ == "__main__": 334 | dp = code0.DatasetParameter() 335 | data = read_asssistment2009_data_from_csv(dp) 336 | attempt_correct_analysis(data) 337 | """ 338 | data = pd.read_csv("./data/assistment2009/time_connect_data.csv") 339 | data = data[:30000] 340 | data = attempt_and_hint_process(data) 341 | attemp_hint_and_correctness_analysis(data) 342 | 343 | data = pd.read_csv(dp.connect_file_name) 344 | data = data[:10000] 345 | print(data[:10]) 346 | data = attempt_and_hint_process(data) 347 | print(data[:10]) 348 | 349 | data = pd.read_csv(dp.connect_file_name) 350 | data = data[:10000] 351 | print(data[:10]) 352 | data = time_process(data) 353 | print(data[:10]) 354 | data.to_csv('./data/assistment2009/kkk.csv') 355 | data = time_correctness_relation_analysis(data) 356 | print(data[:10]) 357 | data = pd.read_csv('./data/assistment2009/kkk.csv') 358 | """ 359 | --------------------------------------------------------------------------------
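Side note on the time preprocessing shared by uril_cmu_statistic.py and uril_assistment2009.py: both scripts convert 'time' to a per-group z-score (grouped by time_z_level, typically 'skill_id') by looping over every row with data.loc. The loop is correct but slow on large logs; the same transform can be written with a pandas groupby. The snippet below is only an illustrative sketch, not part of the pipeline, and the helper name time_to_zscore is made up for this example; it assumes a DataFrame with 'time' and 'skill_id' columns like the processed files above.

import pandas as pd

def time_to_zscore(data, group_col='skill_id'):
    # Same intent as time_basic_process: replace 'time' with its z-score computed
    # within each group_col group (per-skill mean and std).
    data = data.copy()
    grouped = data.groupby(group_col)['time']
    mean = grouped.transform('mean')
    std = grouped.transform(lambda s: s.std(ddof=0))  # ddof=0 to match np.std used in the scripts above
    data['time'] = (data['time'] - mean) / std
    # groups with zero variance produce NaN; fill with 0 as the ASSISTments version does
    return data.fillna(0)

For example, calling time_to_zscore(filer_data) after the string-to-integer conversion step would yield the same z-scored 'time' column without the per-row loop.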