├── README.md ├── ci.py ├── uril_connectUser.py ├── uril_entropy.py ├── code3_runEpoch.py ├── trainAutoEncoder.py ├── doTrainTest.py ├── uril_tools.py ├── code0_parameter.py ├── code1_data.py ├── code2_model.py ├── uril_cmu_statistic.py ├── uril_oneHotEncoder.py └── uril_assistment2009.py /README.md: -------------------------------------------------------------------------------- 1 | ### Environment 2 | Python 3.5 3 | TensorFlow 0.10 4 | pyprind (pip install pyprind) 5 | 6 | ### Run code 7 | Set the hyperparameters in code0_parameter.py, then run: 8 | python doTrainTest.py 9 | 10 | 11 | ### Disclaimer 12 | Unless stated otherwise, all software is provided free of charge. As well, all software is provided on an "as is" basis without warranty of any kind, express or implied. Under no circumstances and under no legal theory, whether in tort, contract, or otherwise, shall Liang Zhang be liable to you or to any other person for any indirect, special, incidental, or consequential damages of any character including, without limitation, damages for loss of goodwill, work stoppage, computer failure or malfunction, or for any and all other damages or losses. If you do not agree with these terms, then you are advised to not use the software. 13 | -------------------------------------------------------------------------------- /ci.py: -------------------------------------------------------------------------------- 1 | ''' Unless stated otherwise, all software is provided free of charge. 2 | As well, all software is provided on an "as is" basis without warranty 3 | of any kind, express or implied. Under no circumstances and under no legal 4 | theory, whether in tort, contract, or otherwise, shall Liang Zhang be liable 5 | to you or to any other person for any indirect, special, incidental, 6 | or consequential damages of any character including, without limitation, 7 | damages for loss of goodwill, work stoppage, computer failure or malfunction, 8 | or for any and all other damages or losses.
If you do not agree with these terms, 9 | then you are advised to not use the software.''' 10 | 11 | import pandas as pd 12 | import numpy as np 13 | import scipy.stats as st 14 | import scipy as sp 15 | import scipy.stats 16 | 17 | def mean_confidence_interval(data, confidence=0.95): 18 | a = 1.0*np.array(data) 19 | n = len(a) 20 | m, se = np.mean(a), scipy.stats.sem(a) 21 | h = se * sp.stats.t._ppf((1+confidence)/2., n-1) 22 | return m,h 23 | 24 | assistments_list = [ 25 | './result/assistment2009/result_01-25-17:04.csv', #baseline 26 | './result/assistment2009/result_02-05-11:51.csv', #baseline + t/c 27 | './result/assistment2009/result_02-05-21:28.csv', #baseline + t/c [ae] 28 | './result/assistment2009/result_02-06-02:10.csv', #baseline + t/c + t + a + f 29 | './result/assistment2009/result_02-06-09:21.csv', #baseline + t/c + t + a + f [ae] 30 | './result/assistment2009/result_02-08-10:57.csv', #baseline + t/c + t/s + t + a + f [ae] 31 | ] 32 | 33 | cmu_list = [ 34 | './result/cmu_stat_f2011/result_01-24-23:23.csv', #baseline 35 | './result/cmu_stat_f2011/result_01-25-09:35.csv', #baseline + t/c 36 | './result/cmu_stat_f2011/result_01-29-17:35.csv', #baseline + t/c [ae] 37 | './result/cmu_stat_f2011/result_02-06-21:45.csv', #baseline + t/c + t + a + f 38 | './result/cmu_stat_f2011/result_02-07-08:44.csv', #baseline + t/c + t + a + f [ae] 39 | './result/cmu_stat_f2011/result_02-07-23:44.csv', #baseline + t/c + t/s + t + a + f [ae] 40 | ] 41 | 42 | for name_list in [assistments_list,cmu_list]: 43 | print ("=="*25) 44 | for idx,name in enumerate(name_list): 45 | print ("\n","-"*5,idx,"\t",name,"-"*5,) 46 | data = pd.read_csv(name) 47 | data = data[(data['cv']!='average') & (data['type']!='train')] 48 | data = data[data['epoch']==8] 49 | 50 | aucs = data['auc'] 51 | print ("auc mean and 95ci %2.3f\t%2.3f"%mean_confidence_interval(aucs)) 52 | 53 | r2s = data['r2'] 54 | print ("r2 mean and 95ci %2.3f\t%2.3f"%mean_confidence_interval(r2s)) 55 | 56 | 57 | -------------------------------------------------------------------------------- /uril_connectUser.py: -------------------------------------------------------------------------------- 1 | ''' Unless stated otherwise, all software is provided free of charge. 2 | As well, all software is provided on an "as is" basis without warranty 3 | of any kind, express or implied. Under no circumstances and under no legal 4 | theory, whether in tort, contract, or otherwise, shall Liang Zhang be liable 5 | to you or to any other person for any indirect, special, incidental, 6 | or consequential damages of any character including, without limitation, 7 | damages for loss of goodwill, work stoppage, computer failure or malfunction, 8 | or for any and all other damages or losses. 
If you do not agree with these terms, 9 | then you are advised to not use the software.''' 10 | 11 | import numpy as np 12 | import pandas as pd 13 | import pyprind as pp 14 | import sys 15 | 16 | import numpy as np 17 | import pandas as pd 18 | import pyprind as pp 19 | import sys 20 | import uril_tools as aux 21 | import code0_parameter as code0 22 | 23 | 24 | def counter(a): 25 | a = list(a) 26 | unique, counts = np.unique(a, return_counts=True) 27 | return unique, counts 28 | 29 | 30 | def getUserQuesNumIndexList(dataList): 31 | a = list(dataList) 32 | target = np.empty((0, 3)) 33 | size = len(a) 34 | temp = [a[0], 1, 0] 35 | for i in range(1, size): 36 | if a[i] == a[i - 1]: 37 | temp[1] += 1 38 | else: 39 | target = np.vstack((target, temp)) 40 | temp = [a[i], 1, i] 41 | return np.vstack((target, temp)) 42 | 43 | 44 | def connectUser(data, connected_file_name): 45 | print("==> load data successful") 46 | u, c = counter(data['user_id']) 47 | # UserNumberDict = dict(zip(u, c)) 48 | 49 | userQuesNumIndexList = getUserQuesNumIndexList(data['user_id']) 50 | newdata = pd.DataFrame() 51 | 52 | print('==> begin concatenate dataset') 53 | for i in pp.prog_percent(range(len(u)), stream=sys.stdout): 54 | for k in range(len(userQuesNumIndexList)): 55 | if userQuesNumIndexList[k, 0] == u[i]: 56 | temp = data.iloc[ 57 | int(userQuesNumIndexList[k, 2]):int(userQuesNumIndexList[k, 2] + userQuesNumIndexList[k, 1])] 58 | newdata = newdata.append(temp) 59 | 60 | newdata.reset_index(drop=True) 61 | newdata.to_csv(connected_file_name, index=False) 62 | 63 | print('==> before connect\t', aux.stastic_SecNumber_UserNumber_SkillNumber(data, code0.DatasetParameter())) 64 | print('==> after connect\t', aux.stastic_SecNumber_UserNumber_SkillNumber(newdata, code0.DatasetParameter())) 65 | 66 | return newdata 67 | 68 | 69 | if __name__ == "__main__": 70 | filename = './data/assistment2009/skill_builder_data_corrected.csv' 71 | newfile = './data/assistment2009/connect_dataset_small.csv' 72 | 73 | data = pd.read_csv(filename, encoding='latin-1', error_bad_lines=False, index_col=False) 74 | data = data[0:50000] 75 | connectUser(data, newfile) 76 | -------------------------------------------------------------------------------- /uril_entropy.py: -------------------------------------------------------------------------------- 1 | ''' Unless stated otherwise, all software is provided free of charge. 2 | As well, all software is provided on an "as is" basis without warranty 3 | of any kind, express or implied. Under no circumstances and under no legal 4 | theory, whether in tort, contract, or otherwise, shall Liang Zhang be liable 5 | to you or to any other person for any indirect, special, incidental, 6 | or consequential damages of any character including, without limitation, 7 | damages for loss of goodwill, work stoppage, computer failure or malfunction, 8 | or for any and all other damages or losses. 
If you do not agree with these terms, 9 | then you are advised to not use the software.''' 10 | 11 | import pandas as pd 12 | import numpy as np 13 | 14 | 15 | class EntropyCls(): 16 | def __init__(self): 17 | pass 18 | 19 | def _entropy_single_variable(self, X): 20 | X = list(X) 21 | probs = [list(X).count(c) * 1.0 / len(X) for c in set(X)] 22 | return np.sum([-p * np.log2(p) for p in probs]) 23 | 24 | def _joint_entropy_two_variables(self, X, Y): 25 | """ 26 | :param X: 27 | :param Y: 28 | :return: the conditional entropy H(X|Y) (not the joint entropy, despite the method name) 29 | """ 30 | data = pd.DataFrame({'X': list(X), 'Y': list(Y)}) 31 | joint_entropy = 0 32 | for y_item in set(data['Y']): 33 | temp_data = data[data['Y'] == y_item] 34 | temp_list = temp_data['X'] 35 | entropy = self._entropy_single_variable(temp_list) 36 | p_y = len(temp_data) / len(data) 37 | joint_entropy += p_y * entropy 38 | return joint_entropy 39 | 40 | def _get_information_grain(self, X, Y): 41 | """ 42 | :param X: 43 | :param Y: 44 | :return: information gain IG(X|Y) = H(X) - H(X|Y) 45 | """ 46 | return self._entropy_single_variable(X) - self._joint_entropy_two_variables(X, Y) 47 | 48 | def _get_sym_uncertity(self, X, Y): 49 | """ 50 | :param X: 51 | :param Y: 52 | :return: symmetric uncertainty SU(X,Y) = 2[IG(X|Y)/(H(X)+H(Y))] 53 | """ 54 | return 2 * ( 55 | self._get_information_grain(X, Y) / (self._entropy_single_variable(X) + self._entropy_single_variable(Y))) 56 | 57 | def get_sym_uncertity_matrix(self, data): 58 | """ 59 | :param data: pandas DataFrame 60 | :return: 61 | """ 62 | name_list = list(data) 63 | result = pd.DataFrame(index=name_list, columns=name_list) 64 | for c_idx, c_item in enumerate(name_list): 65 | for r_idx, r_item in enumerate(name_list): 66 | if c_idx > r_idx: 67 | result.loc[r_item, c_item] = self._get_sym_uncertity(data[r_item], data[c_item]) 68 | 69 | return result 70 | 71 | def get_coeff(self, data): 72 | name_list = list(data) 73 | result = pd.DataFrame(index=name_list, columns=name_list) 74 | for c_idx, c_item in enumerate(name_list): 75 | for r_idx, r_item in enumerate(name_list): 76 | if c_idx > r_idx: 77 | result.loc[r_item, c_item] = abs(np.corrcoef(data[r_item], data[c_item])[0][1]) 78 | return result 79 | 80 | def print_assistment2009(): 81 | data = pd.read_csv( 82 | "./data/assistment2009/attempt_level correct attempt_level time_level correct first_action correct first_action time_level correct skill_id correct time_level correct_large_.csv") 83 | 84 | temp_data = data[ 85 | ['correct', 'first_action', 'time_level', 'attempt_level', 'attempt_level correct', 'first_action correct', 86 | 'time_level correct','attempt_level time_level correct','first_action time_level correct']] 87 | r1 = EntropyCls().get_sym_uncertity_matrix(temp_data) 88 | r2 = EntropyCls().get_coeff(temp_data) 89 | 90 | result = pd.concat([r1, r2]) 91 | result.to_csv('./result/assistment2009/correlationship_add_3.csv') 92 | print(result) 93 | 94 | def print_cmu(): 95 | data = pd.read_csv( 96 | "./data/cmu_stat_f2011/skill_id correct skill_id time_level time_level correct_large_.csv") 97 | 98 | temp_data = data[ 99 | ['correct', 'first_action', 'time_level', 'attempt_level', "time_level correct","skill_id time_level","skill_id correct"]] 100 | r1 = EntropyCls().get_sym_uncertity_matrix(temp_data) 101 | r2 = EntropyCls().get_coeff(temp_data) 102 | 103 | result = pd.concat([r1, r2]) 104 | result.to_csv('./result/cmu_stat_f2011/correlationship_add_3.csv') 105 | print(result) 106 | 107 | if __name__ == "__main__": 108 | #print_assistment2009() 109 | print_cmu() 110 | 111 |
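# --- Editor's addition (not part of the original repository): a minimal usage sketch for
# EntropyCls. It relies only on the class and imports defined above; the tiny DataFrame and
# its column names ('correct', 'time_level') are invented for illustration. The upper
# triangle of the first matrix holds SU(X,Y) = 2*IG(X|Y)/(H(X)+H(Y)); the second holds the
# absolute Pearson correlation. The function is not called automatically; run it by hand.
def _demo_symmetric_uncertainty():
    demo = pd.DataFrame({'correct':    [0, 1, 1, 0, 1, 1],
                         'time_level': [2, 1, 1, 2, 1, 3]})
    ec = EntropyCls()
    print(ec.get_sym_uncertity_matrix(demo))  # symmetric uncertainty between column pairs
    print(ec.get_coeff(demo))                 # |Pearson r| between column pairs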
-------------------------------------------------------------------------------- /code3_runEpoch.py: -------------------------------------------------------------------------------- 1 | ''' Unless stated otherwise, all software is provided free of charge. 2 | As well, all software is provided on an "as is" basis without warranty 3 | of any kind, express or implied. Under no circumstances and under no legal 4 | theory, whether in tort, contract, or otherwise, shall Liang Zhang be liable 5 | to you or to any other person for any indirect, special, incidental, 6 | or consequential damages of any character including, without limitation, 7 | damages for loss of goodwill, work stoppage, computer failure or malfunction, 8 | or for any and all other damages or losses. If you do not agree with these terms, 9 | then you are advised to not use the software.''' 10 | 11 | from uril_tools import * 12 | from sklearn.metrics import mean_squared_error 13 | from sklearn.metrics import r2_score 14 | from sklearn import metrics 15 | from math import sqrt 16 | import numpy as np 17 | import pyprind 18 | np.set_printoptions(threshold=np.inf) 19 | 20 | def get_evaluate_result(actual_labels, pred_prob): 21 | rmse = sqrt(mean_squared_error(actual_labels, pred_prob)) 22 | fpr, tpr, thresholds = metrics.roc_curve(actual_labels, pred_prob, pos_label=1) 23 | auc = metrics.auc(fpr, tpr) 24 | r2 = r2_score(actual_labels, pred_prob) 25 | return rmse,auc,r2 26 | 27 | def run_epoch(session, m, students, eval_op, verbose=False): 28 | pred_prob = [] 29 | actual_labels = [] # use for whole comparasion 30 | 31 | skill_id_origin_list = [] 32 | target_id_origin_list = [] 33 | iteration = int(len(students)/m.batch_size) 34 | 35 | for i_iter in pyprind.prog_percent(range(iteration)): 36 | #bar.update(m.batch_size) 37 | x = np.zeros((m.batch_size, m.num_steps, m.seq_width)) 38 | 39 | target_id = np.array([],dtype=np.int32) 40 | skill_id_origin = np.array([],dtype=np.int32) 41 | target_id_origin = np.array([],dtype=np.int32) 42 | target_correctness = [] # use for just a batch 43 | 44 | #load data for a batch 45 | # tuple formate 46 | # 0: user_id 47 | # 1: record_numb 48 | # 2: data 49 | # 3: Target_Id 50 | # 4: correctness 51 | for i_batch in range(m.batch_size): 52 | student = students[i_iter*m.batch_size+i_batch] 53 | record_num = student[1] 54 | #record_content_pd = student[2].reset_index(drop=True) 55 | record_content = student[2].as_matrix() 56 | temp_skill_id_list = list(student[2]['skill_id']) 57 | skill_id = student[3] 58 | correctness = student[4] 59 | 60 | # construct data for training: 61 | # data ~ x 62 | # target_id ~ skill_id 63 | # target_correctness ~ correctness 64 | for i_recordNumb in range(record_num): 65 | if(i_recordNumb predict_prob shape\t",np.shape(pred_prob),'\tactual_labels\t',np.shape(actual_labels),'\ttarget_id_list\t',np.shape(target_id_origin_list)) 112 | #print (target_id_origin_list[1:100]) 113 | intra_skill_actual = [] 114 | intra_skill_pred = [] 115 | 116 | inter_skill_actual = [] 117 | inter_skill_pred = [] 118 | 119 | for idx in np.arange(len(target_id_origin_list)): 120 | if skill_id_origin_list[idx]==target_id_origin_list[idx]: 121 | intra_skill_actual.append(actual_labels[idx]) 122 | intra_skill_pred.append(pred_prob[idx]) 123 | else: 124 | inter_skill_actual.append(actual_labels[idx]) 125 | inter_skill_pred.append(pred_prob[idx]) 126 | 127 | inter_rmse,inter_auc,inter_r2 = get_evaluate_result(inter_skill_actual, inter_skill_pred) 128 | intra_rmse,intra_auc,intra_r2 = 
get_evaluate_result(intra_skill_actual, intra_skill_pred) 129 | 130 | return rmse, auc, r2,inter_rmse,inter_auc,inter_r2,intra_rmse,intra_auc,intra_r2 131 | 132 | if __name__=="__main__": 133 | pass 134 | 135 | -------------------------------------------------------------------------------- /trainAutoEncoder.py: -------------------------------------------------------------------------------- 1 | ''' Unless stated otherwise, all software is provided free of charge. 2 | As well, all software is provided on an "as is" basis without warranty 3 | of any kind, express or implied. Under no circumstances and under no legal 4 | theory, whether in tort, contract, or otherwise, shall Liang Zhang be liable 5 | to you or to any other person for any indirect, special, incidental, 6 | or consequential damages of any character including, without limitation, 7 | damages for loss of goodwill, work stoppage, computer failure or malfunction, 8 | or for any and all other damages or losses. If you do not agree with these terms, 9 | then you are advised to not use the software.''' 10 | 11 | import tensorflow as tf 12 | import math 13 | import code0_parameter as code0 14 | import code1_data as code1 15 | from uril_oneHotEncoder import ONEHOTENCODERINPUT 16 | import pyprind, sys,os 17 | import numpy as np 18 | 19 | 20 | class SIMPLEAUTOENCODER(object): 21 | def __init__(self, SAEconfig, dp): 22 | self.inputs = tf.placeholder(tf.float32, [SAEconfig.batch_size, SAEconfig.num_steps, SAEconfig.seq_width]) 23 | ohe = ONEHOTENCODERINPUT(SAEconfig, dp, self.inputs) 24 | #self.mask = tf.placeholder(tf.float32, [SAEconfig.batch_size*SAEconfig.num_steps, SAEconfig.seq_width]) 25 | ############################################################################################################# 26 | #featureslist = [ohe.getSkillCorrectCrossFeature(),ohe.getCrossFeatureAll()] 27 | featureslist = [ohe.getSkillCorrectCrossFeature(), ohe.getCrossFeatureAll()]#, ohe.getCategoryFeatureInputs()] 28 | ############################################################################################################# 29 | x_tmp = tf.concat(2, featureslist) 30 | self.dp = dp 31 | self.x = x = tf.reshape(x_tmp, [SAEconfig.batch_size * SAEconfig.num_steps, -1]) 32 | self.dimensions = dimensions = [int(x.get_shape()[-1]), code0.TARGETSIZE] 33 | #xp = self.mask*x 34 | print("n_input\t", str(dimensions[0]), "\tn_output\t", str(dimensions[1])) 35 | W_init_max = 4 * np.sqrt(6. 
/ (dimensions[0] + dimensions[1])) 36 | W_init = tf.random_uniform(shape=dimensions, minval=-W_init_max,maxval=W_init_max) 37 | self.WE = WE = tf.Variable(W_init) 38 | self.bE = bE = tf.Variable(tf.zeros([dimensions[-1]])) 39 | if code0.AUTOENCODER_ACT == 'tanh': 40 | transfer_function = tf.nn.tanh 41 | elif code0.AUTOENCODER_ACT == 'sigmoid': 42 | transfer_function = tf.nn.sigmoid 43 | featureVector = transfer_function(tf.matmul(x, WE) + bE) 44 | 45 | self.WD = WD = tf.transpose(WE) 46 | #self.WD = WD = tf.Variable(tf.random_normal([self.dimensions[1],self.dimensions[0],], stddev=0.35)) 47 | self.bD = bD = tf.Variable(tf.zeros([dimensions[0]])) 48 | y = transfer_function(tf.matmul(featureVector, WD) + bD) 49 | #self.learning_rate = tf.placeholder(tf.float32,1) 50 | self.cost = cost = tf.reduce_sum(tf.square(y - x)) 51 | #self.optimizer = tf.train.GradientDescentOptimizer(SAEconfig.learning_rate).minimize(cost) 52 | self.optimizer = tf.train.AdamOptimizer(SAEconfig.learning_rate).minimize(cost) 53 | 54 | self.avgcost = tf.div(cost, tf.to_float(dimensions[0])) 55 | 56 | def saveWeights(self, sess): 57 | weigthpath = './weights/'+str(self.dp.dataSetType)+'/weights_' + str(self.dimensions[0]) + '_' + str(self.dimensions[1]) + '.csv' 58 | baispath = './weights/'+str(self.dp.dataSetType)+'/bias_' + str(self.dimensions[0]) + '_' + str(self.dimensions[1]) + '.csv' 59 | 60 | if os.path.exists(weigthpath): 61 | os.remove(weigthpath) 62 | if os.path.exists(baispath): 63 | os.remove(baispath) 64 | 65 | wt = self.WE.eval(sess) 66 | np.savetxt(weigthpath, wt) 67 | bs = self.bE.eval(sess) 68 | np.savetxt(baispath, bs) 69 | print("==> save weights to \t", os.path.dirname(weigthpath)) 70 | 71 | 72 | def run_ae_epoch(sess, model, data, TrainConfig): 73 | batch_number = int(len(data) / (TrainConfig.batch_size * TrainConfig.num_steps)) 74 | learning_rate = TrainConfig.learning_rate 75 | for i in pyprind.prog_percent(range(batch_number), stream=sys.stdout): 76 | x = np.zeros((TrainConfig.batch_size, TrainConfig.num_steps, TrainConfig.seq_width)) 77 | kindex = i * (TrainConfig.batch_size * TrainConfig.num_steps) 78 | for ip in range(TrainConfig.batch_size): 79 | for j in range(TrainConfig.num_steps): 80 | x[ip, j, :] = data.iloc[kindex] 81 | kindex += 1 82 | #mask_np = np.random.binomial(1, 1 - TrainConfig.corruption_level, [TrainConfig.batch_size * TrainConfig.num_steps,TrainConfig.seq_width]) 83 | learning_rate = learning_rate*TrainConfig.lr_decay 84 | if learning_rate<=TrainConfig.min_lr: 85 | learning_rate = TrainConfig.min_lr 86 | _ = sess.run(model.optimizer, feed_dict={model.inputs: x}) 87 | avgcost = sess.run(model.avgcost, feed_dict={model.inputs: x}) 88 | return avgcost 89 | 90 | 91 | def trainAEWeights(): 92 | if not code0.BASELINE: 93 | dp = code0.DatasetParameter() 94 | dataset, labels = code1.load_data(dp) 95 | 96 | dp.skill_num = len(dataset['skill_id'].unique()) + 1 97 | dp.skill_set = list(dataset['skill_id'].unique()) 98 | dp.columns_max, dp.columns_numb, dp.columnsName_to_index = code1.get_columns_info(dataset) 99 | dp.seq_width = len(dp.columnsName_to_index) 100 | 101 | 102 | SAEconfig = code0.SAEParamsConfig() 103 | SAEconfig.num_steps = 30 104 | SAEconfig.seq_width = dp.seq_width 105 | 106 | g = tf.Graph() 107 | with g.as_default(): 108 | model_autoencoder = SIMPLEAUTOENCODER(SAEconfig, dp) 109 | initializer = tf.random_uniform_initializer(-SAEconfig.init_scale, SAEconfig.init_scale) 110 | 111 | with tf.Session(graph=g) as sess: 112 | tf.initialize_all_variables().run() 113 | 114 | for i 
in range(SAEconfig.max_max_epoch): 115 | p = run_ae_epoch(sess, model_autoencoder, dataset, SAEconfig) 116 | print(str(i)+"/"+str(SAEconfig.max_max_epoch)+" epoch,avgcost ", str(p)) 117 | model_autoencoder.saveWeights(sess) 118 | else: 119 | print("BASELINE model, don't need train weights") 120 | 121 | if __name__ == "__main__": 122 | trainAEWeights() 123 | -------------------------------------------------------------------------------- /doTrainTest.py: -------------------------------------------------------------------------------- 1 | """ Code of deep knowledge tracing-assistment 2014-2015 dataset 2 | Reference: 3 | 1. https://github.com/siyuanzhao/2016-EDM/ 4 | 2. https://www.tensorflow.org/versions/0.6.0/tutorials/recurrent/index.html 5 | 3. https://github.com/tensorflow/tensorflow/blob/master/tensorflow/models/rnn/ptb/ptb_word_lm.py 6 | 4. https://github.com/Cospel/rbm-ae-tf 7 | 8 | Run code: 9 | 1. only set the hyperparameter in code0_params.py 10 | 2. train your autoencoder parameters 11 | python trainWeights.py 12 | 3. python doAll.py 13 | 14 | Environment: 15 | 1. ubuntu 14.04 16 | 2. python3 17 | 3. tensorflow : 0.10 18 | 4. cuda 7.5 19 | 5. GPU GTX1070 (8G) 20 | 6. CPU i5-6600k 21 | 7. RAM: 16G 22 | """ 23 | 24 | ''' 25 | Unless stated otherwise, all software is provided free of charge. 26 | As well, all software is provided on an "as is" basis without warranty 27 | of any kind, express or implied. Under no circumstances and under no legal 28 | theory, whether in tort, contract, or otherwise, shall Liang Zhang be liable 29 | to you or to any other person for any indirect, special, incidental, 30 | or consequential damages of any character including, without limitation, 31 | damages for loss of goodwill, work stoppage, computer failure or malfunction, 32 | or for any and all other damages or losses. 
If you do not agree with these terms, 33 | then you are advised to not use the software.''' 34 | 35 | from __future__ import print_function 36 | 37 | import code0_parameter as code0 38 | import code1_data as code1 39 | import code2_model as code2 40 | import code3_runEpoch as code3 41 | import uril_tools as aux 42 | import tensorflow as tf 43 | import numpy as np 44 | import pandas as pd 45 | import datetime 46 | from trainAutoEncoder import trainAEWeights 47 | 48 | np.set_printoptions(threshold=np.inf) 49 | 50 | 51 | def main(unused_args): 52 | aux.check_directories() 53 | 54 | if not code0.BASELINE and code0.AUTOENCODER_LABEL: 55 | trainAEWeights() 56 | 57 | dp = code0.DatasetParameter() 58 | dataset, labels = code1.load_data(dp) 59 | tuple_data = code1.convert_data_labels_to_tuples(dataset, labels) 60 | 61 | skill_num = len(dataset['skill_id'].unique()) + 1 62 | dp.skill_num = skill_num 63 | dp.skill_set = list(dataset['skill_id'].unique()) 64 | dp.columns_max, dp.columns_numb, dp.columnsName_to_index = code1.get_columns_info(dataset) 65 | dp.seq_width = len(dp.columnsName_to_index) 66 | 67 | print("-" * 50, "\ndp.columns_max\n", dp.columns_max, "\n") 68 | print("-" * 50, "\ndp.columns_numb\n", dp.columns_numb, "\n") 69 | print("-" * 50, "\ndp.columnsName_to_index\n", dp.columnsName_to_index, "\n") 70 | 71 | config = code0.ModelParamsConfig(dp) 72 | eval_config = code0.ModelParamsConfig(dp) 73 | 74 | if dp.dataSetType == 'kdd': 75 | config.num_steps = 1500 76 | elif dp.dataSetType == 'cmu_stat_f2011': 77 | config.num_steps = 1500 78 | else: 79 | config.num_steps = aux.get_num_step(dataset) 80 | 81 | eval_config.num_steps = config.num_steps 82 | eval_config.batch_size = 2 83 | 84 | config.skill_num = skill_num 85 | eval_config.skill_num = config.skill_num 86 | 87 | name_list = ['cv', 'epoch', 'type', 'rmse', 'auc', 'r2', 'inter_rmse', 'inter_auc', 'inter_r2', 'intra_rmse', 88 | 'intra_auc', 'intra_r2'] 89 | result_data = pd.DataFrame(columns=name_list) 90 | CVname = ['c1', 'c2', 'c3', 'c4', 'c5'] 91 | size = len(tuple_data) 92 | 93 | # write all the records to log file 94 | aux.printConfigration(config=config, dp=dp, train_numb=int(size * 0.8), test_numb=int(size * 0.2)) 95 | aux.logwrite(["==> model_continues_columns\n" + ','.join(dp.model_continues_columns)], dp, True) 96 | aux.logwrite(["==> model_category_columns\n" + ','.join(dp.model_category_columns)], dp, True) 97 | str_cross_columns_list = ['-'.join(i) for i in dp.model_cross_columns] 98 | str_cross_columns = ','.join(str_cross_columns_list) 99 | aux.logwrite(["==> model_cross_columns\n" + str_cross_columns], dp, True) 100 | 101 | for index, cv_num_name in enumerate(CVname): 102 | aux.logwrite(["\nCross-validation: \t" + str(index + 1) + "/5"], dp, prt=True) 103 | timeStampe = datetime.datetime.now().strftime("%m-%d-%H:%M") 104 | aux.logwrite(["\ntime:\t" + timeStampe], dp) 105 | 106 | train_tuple_rows = tuple_data[:int(index * 0.2 * size)] + tuple_data[int((index + 1) * 0.2 * size):] 107 | test_tuple_rows = tuple_data[int(index * 0.2 * size): int((index + 1) * 0.2 * size)] 108 | 109 | with tf.Graph().as_default(), tf.Session() as session: 110 | initializer = tf.random_uniform_initializer(-config.init_scale, config.init_scale) 111 | # training model 112 | print("\n==> Load Training model") 113 | with tf.variable_scope("model", reuse=None, initializer=initializer): 114 | m = code2.Model(is_training=True, config=config, dp=dp) 115 | # testing model 116 | print("\n==> Load Testing model") 117 | with tf.variable_scope("model", 
reuse=True, initializer=initializer): 118 | mtest = code2.Model(is_training=False, config=eval_config, dp=dp) 119 | 120 | tf.initialize_all_variables().run() 121 | 122 | print("==> begin to run epoch...") 123 | for i in range(config.max_max_epoch): 124 | lr_decay = config.lr_decay ** max(i - config.max_epoch, 0) 125 | m.assign_lr(session, config.learning_rate * lr_decay) 126 | 127 | rt = session.run(m.lr) 128 | rmse, auc, r2, inter_rmse, inter_auc, inter_r2, intra_rmse, intra_auc, intra_r2 = code3.run_epoch( 129 | session, m, train_tuple_rows, m.train_op, verbose=True) 130 | 131 | aux.print_result(dp, cv_num_name, i, rt, rmse, auc, r2, inter_rmse, inter_auc, inter_r2, intra_rmse, 132 | intra_auc, intra_r2, 'train') 133 | 134 | result_data = result_data.append(pd.Series( 135 | [cv_num_name, i, 'train', rmse, auc, r2, inter_rmse, inter_auc, inter_r2, intra_rmse, intra_auc, 136 | intra_r2], index=name_list), ignore_index=True) 137 | 138 | display = 5 139 | if ((i + 1) % display == 0): 140 | print('BEGIN', "-" * 80) 141 | rmse, auc, r2, inter_rmse, inter_auc, inter_r2, intra_rmse, intra_auc, intra_r2 = code3.run_epoch( 142 | session, mtest, test_tuple_rows, tf.no_op()) 143 | aux.print_result(dp, cv_num_name, i, rt, rmse, auc, r2, inter_rmse, inter_auc, inter_r2, intra_rmse, 144 | intra_auc, intra_r2, 'test', display) 145 | print('END--', "-" * 80) 146 | 147 | result_data = result_data.append(pd.Series( 148 | [cv_num_name, (i + 1) / display, 'test', rmse, auc, r2, inter_rmse, inter_auc, inter_r2, 149 | intra_rmse, intra_auc, intra_r2], index=name_list), ignore_index=True) 150 | 151 | #print ("-*"*50,"\n",result_data) 152 | 153 | print("==> Finsih! whole process, save result and print\t" + dp.currentTime) 154 | 155 | temp_data = result_data[result_data['type'] == 'test'] 156 | for idx in set(temp_data['epoch']): 157 | tp = temp_data[temp_data['epoch'] == idx] 158 | result_data = result_data.append(pd.Series( 159 | ['average', idx, 'test_mean', tp['rmse'].mean(), tp['auc'].mean(), tp['r2'].mean(), tp['inter_rmse'].mean(), 160 | tp['inter_auc'].mean(), tp['inter_r2'].mean(), tp['intra_rmse'].mean(), tp['intra_auc'].mean(), 161 | tp['intra_r2'].mean()], index=name_list), ignore_index=True) 162 | 163 | print(result_data[result_data['cv']=='average']) 164 | result_data.to_csv('./result/'+code0.DATASETTYPE+'/result_'+timeStampe+'.csv') 165 | print('==> save to ./result/'+code0.DATASETTYPE+'/result_'+timeStampe+'.csv') 166 | 167 | 168 | if __name__ == "__main__": 169 | tf.app.run() 170 | -------------------------------------------------------------------------------- /uril_tools.py: -------------------------------------------------------------------------------- 1 | ''' Unless stated otherwise, all software is provided free of charge. 2 | As well, all software is provided on an "as is" basis without warranty 3 | of any kind, express or implied. Under no circumstances and under no legal 4 | theory, whether in tort, contract, or otherwise, shall Liang Zhang be liable 5 | to you or to any other person for any indirect, special, incidental, 6 | or consequential damages of any character including, without limitation, 7 | damages for loss of goodwill, work stoppage, computer failure or malfunction, 8 | or for any and all other damages or losses. 
If you do not agree with these terms, 9 | then you are advised to not use the software.''' 10 | 11 | import numpy as np 12 | import os, sys, csv 13 | from code0_parameter import DATASETSIZE, CELLTYPE 14 | import tensorflow as tf 15 | import pandas as pd 16 | import pylab as pl 17 | 18 | 19 | def print_result(dp, cv_num_name, i, rt, rmse, auc, r2, inter_rmse, inter_auc, inter_r2, intra_rmse, intra_auc, 20 | intra_r2, run_type, display=5): 21 | if run_type == 'train': 22 | result = "==> %s cross-valuation: Train Epoch: %d \trate: %.3f \tRMSE: %.3f \tAUC: %.3f \tR2: %.3f" % ( 23 | cv_num_name, i + 1, rt, rmse, auc, r2) 24 | else: 25 | result = "==> %s cross-valuation: Test Epoch: %d \t rmse: %.3f \t auc: %.3f \t r2: %.3f" % ( 26 | cv_num_name, (i + 1) / display, rmse, auc, r2) 27 | 28 | print(result) 29 | logwrite(result, dp, False) 30 | 31 | inter_result = "==> inter_skill\t RMSE: %.3f \tAUC: %.3f \tR2: %.3f" % (inter_rmse, inter_auc, inter_r2) 32 | print(inter_result) 33 | logwrite(inter_result, dp, False) 34 | 35 | intra_result = "==> intra_skill\t RMSE: %.3f \tAUC: %.3f \tR2: %.3f" % (intra_rmse, intra_auc, intra_r2) 36 | print(intra_result) 37 | logwrite(intra_result, dp, False) 38 | 39 | 40 | def check_directories(): 41 | par_dir = ['result', 'data','weights'] 42 | datasets_dir = ['assistment2009', 'kdd', 'cmu_stat_f2011'] 43 | 44 | print('==> check directories') 45 | 46 | for p_item in par_dir: 47 | if not os.path.exists('./' + p_item): 48 | os.mkdir('./' + p_item) 49 | print('==> create directory ./' + p_item) 50 | else: 51 | print('==> directory: ./' + p_item + ' exists') 52 | for c_item in datasets_dir: 53 | if not os.path.exists('./' + p_item + '/' + c_item): 54 | os.mkdir('./' + p_item + '/' + c_item) 55 | print('==> create directory: ./' + p_item + '/' + c_item) 56 | else: 57 | print('==> directory ./' + p_item + '/' + c_item + ' exists') 58 | 59 | 60 | def counter(a): 61 | a = list(a) 62 | unique, counts = np.unique(a, return_counts=True) 63 | return unique, counts 64 | 65 | 66 | def create_column_dict_and_set(data, columnName, dp): 67 | setName = os.path.dirname(dp.csv_file_name) + "/" + columnName + "_set_" + str(dp.dataSetSize) + ".csv" 68 | dictName = os.path.dirname(dp.csv_file_name) + "/" + columnName + "_dict_" + str(dp.dataSetSize) + ".csv" 69 | column_ct = data[columnName] 70 | column_set_original = list(column_ct.unique()) 71 | size = len(column_set_original) 72 | column_dict = {value: key + 1 for key, value in enumerate(column_set_original)} 73 | column_dict[0] = 0 74 | column_set = [i + 1 for i in range(size)] 75 | 76 | with open(setName, 'w') as f: 77 | w = csv.writer(f) 78 | w.writerow(column_set) 79 | print('==> save ', setName) 80 | with open(dictName, 'w') as f: 81 | w = csv.writer(f) 82 | for key, val in column_dict.items(): 83 | w.writerow([key, val]) 84 | print('==> save ', dictName) 85 | return column_set, column_dict 86 | 87 | 88 | def stastic_SecNumber_UserNumber_SkillNumber(data, dp): 89 | secNumber = len(getUserQuesNumList(data['user_id'])) 90 | userNumber = len(data['user_id'].unique()) 91 | skillNumber = len(data['skill_id'].unique()) 92 | 93 | secNumberStr = "SecNumber {:>10}\n".format(secNumber) 94 | userNumberStr = "userNumber {:>10}\n".format(userNumber) 95 | skillNumberStr = "skillNumber {:>10}\n".format(skillNumber) 96 | 97 | logwrite([secNumberStr, userNumberStr, skillNumberStr], dp, True) 98 | return secNumber, userNumber, skillNumber 99 | 100 | 101 | def mean_normalization(X_train, X_test): 102 | data = np.concatenate((X_train, X_test), 
axis=0) 103 | mean = data.mean(axis=0) 104 | std = data.std(axis=0) 105 | return (X_train - mean) / std, (X_test - mean) / std 106 | 107 | 108 | def xavier_init(fan_in, fan_out, function): 109 | if function is tf.nn.sigmoid: 110 | low = -4.0 * np.sqrt(6.0 / (fan_in + fan_out)) 111 | high = 4.0 * np.sqrt(6.0 / (fan_in + fan_out)) 112 | return tf.random_uniform((fan_in, fan_out), minval=low, maxval=high, dtype=tf.float32) 113 | elif function is tf.nn.tanh: 114 | low = -1 * np.sqrt(6.0 / (fan_in + fan_out)) 115 | high = 1 * np.sqrt(6.0 / (fan_in + fan_out)) 116 | return tf.random_uniform((fan_in, fan_out), minval=low, maxval=high, dtype=tf.float32) 117 | 118 | 119 | def getUserQuesNumList(dataList): 120 | a = list(dataList) 121 | target = np.empty((0, 2)) 122 | size = len(a) 123 | temp = [a[0], 1] 124 | for i in range(1, size): 125 | if a[i] == a[i - 1]: 126 | temp[1] += 1 127 | else: 128 | target = np.vstack((target, temp)) 129 | temp = [a[i], 1] 130 | return np.vstack((target, temp)) 131 | 132 | 133 | def connectStringfromList(klist): 134 | if type(klist) != list: 135 | raise ValueError("only convert list") 136 | tmp = '' 137 | for i, v in enumerate(klist): 138 | if i == 0: 139 | tmp = klist[i] 140 | else: 141 | tmp = tmp + " " + klist[i] 142 | return tmp 143 | 144 | 145 | def unique_rows(a): 146 | a = np.ascontiguousarray(a) 147 | unique_a = np.unique(a.view([('', a.dtype)] * a.shape[1])) 148 | return unique_a.view(a.dtype).reshape((unique_a.shape[0], a.shape[1])) 149 | 150 | 151 | def get_num_step(dataset): 152 | u, c = counter(dataset['user_id']) 153 | return max(c) 154 | 155 | 156 | def logwrite(strList, dp, prt=False): 157 | logfileName = "result/" + str(dp.dataSetType) + "/log_" + str(dp.dataSetType) + "_" + str( 158 | dp.currentTime) + "_" + str(CELLTYPE) + "_" + str(DATASETSIZE) + ".txt" 159 | 160 | for item in strList: 161 | with open(logfileName, "a") as myfile: 162 | myfile.write(str(item)) 163 | if prt: 164 | print(item) 165 | 166 | 167 | def printConfigration(config, dp, train_numb, test_numb): 168 | l1 = "\n" + "-" * 15 + " Configuration " + "-" * 15 169 | l11 = "DataSet {:>10}".format(dp.dataSetType) 170 | l2 = "RNN layers {:>10}".format(config.num_layer) 171 | l3 = "cell type {:>10}".format(config.cell_type) 172 | l4 = "hidden_size {:>10}".format(config.hidden_size) 173 | 174 | if config.num_layer == 2: 175 | l41 = "hidden_size2 {:>10}".format(config.hidden_size_2) 176 | logwrite([l1, l11, l2, l3, l4, l41], dp=dp, prt=True) 177 | else: 178 | logwrite([l1, l11, l2, l3, l4], dp=dp, prt=True) 179 | l5 = "keep_prob {:>10}".format(config.keep_prob) 180 | l6 = "num_steps {:>10}".format(config.num_steps) 181 | l7 = "seq_width {:>10}".format(len(dp.columnsName_to_index)) 182 | l8 = "skill_num {:>10}".format(config.skill_num) 183 | l9 = "skill_id_one_hot {:>10}".format(dp.columns_max['skill_id'] + 1) 184 | l10 = "max_max_epoch {:>10}".format(config.max_max_epoch) 185 | l11 = "batch_size {:>10}".format(config.batch_size) 186 | l12 = "train student number{:>10}".format(train_numb) 187 | l13 = "test student number {:>10}".format(test_numb) 188 | l14 = "-" * 20 + " End " + "-" * 20 + "\n" 189 | logwrite([l5, l6, l7, l8, l9, l10, l11, l12, l13, l14], dp=dp, prt=True) 190 | 191 | 192 | def saveResult(dp, auc_train, rmse_train, r2_train, auc_test, rmse_test, r2_test, mean_result): 193 | print("==> save the result\t", str(dp.currentTime)) 194 | auc_train.to_csv("result/" + str(dp.dataSetType) + "/auc_train_" + str(dp.currentTime) + ".csv") 195 | rmse_train.to_csv("result/rmse_train_" + 
str(dp.currentTime) + ".csv") 196 | r2_train.to_csv("result/" + str(dp.dataSetType) + "/r2_train_" + str(dp.currentTime) + ".csv") 197 | 198 | auc_test.to_csv("result/" + str(dp.dataSetType) + "/auc_test_" + str(dp.currentTime) + ".csv") 199 | rmse_test.to_csv("result/" + str(dp.dataSetType) + "/rmse_test_" + str(dp.currentTime) + ".csv") 200 | r2_test.to_csv("result/" + str(dp.dataSetType) + "/r2_test_" + str(dp.currentTime) + ".csv") 201 | 202 | mean_result.to_csv("result/" + str(dp.dataSetType) + "/Mean_" + str(dp.currentTime) + ".csv") 203 | 204 | 205 | def draw_hist_graph(data_list, title, bins): 206 | pl.hist(data_list, bins=bins) 207 | pl.xlabel(title) 208 | pl.show() 209 | 210 | 211 | if __name__ == "__main__": 212 | pass 213 | -------------------------------------------------------------------------------- /code0_parameter.py: -------------------------------------------------------------------------------- 1 | ''' Unless stated otherwise, all software is provided free of charge. 2 | As well, all software is provided on an "as is" basis without warranty 3 | of any kind, express or implied. Under no circumstances and under no legal 4 | theory, whether in tort, contract, or otherwise, shall Liang Zhang be liable 5 | to you or to any other person for any indirect, special, incidental, 6 | or consequential damages of any character including, without limitation, 7 | damages for loss of goodwill, work stoppage, computer failure or malfunction, 8 | or for any and all other damages or losses. If you do not agree with these terms, 9 | then you are advised to not use the software.''' 10 | 11 | import tensorflow as tf 12 | import datetime 13 | 14 | # Hyperparameter for all kinds of file 15 | DATASETTYPE = 'assistment2009' # 'assistment2009'|'cmu_stat_f2011' 16 | if DATASETTYPE == 'cmu_stat_f2011': 17 | TARGETSIZE = 250 18 | elif DATASETTYPE == 'assistment2009': 19 | TARGETSIZE = 1000 20 | AUTOENCODER_ACT = 'tanh' # tanh, sigmoid 21 | CONNECT_DATASET_2009 = True 22 | 23 | DATASETSIZE = "large" # 'large | small' 24 | RNN_layer_number = 1 # '1|2' 25 | CELLTYPE = "LSTM" # "RNN | LSTM | GRU" 26 | 27 | BASELINE = True 28 | AUTOENCODER_LABEL = False 29 | 30 | 31 | class DatasetParameter(object): 32 | def __init__(self, data_type=DATASETTYPE): 33 | if data_type == DATASETTYPE: 34 | self.dataSetType = DATASETTYPE # "assistment2009 | cmu_stat_f2011 " 35 | else: 36 | self.dataSetType = data_type 37 | self.dataSetSize = DATASETSIZE # 'small |large' 38 | 39 | if self.dataSetType == "assistment2009": 40 | self.csv_file_name = "./data/assistment2009/skill_builder_data_corrected.csv" 41 | if CONNECT_DATASET_2009: 42 | self.processedFileName = "./data/assistment2009/processded_" + str(self.dataSetSize) + "_connected.csv" 43 | else: 44 | self.processedFileName = "./data/assistment2009/processded_" + str( 45 | self.dataSetSize) + "_nonconnected.csv" 46 | 47 | self.filtedColumnNameList = ['skill_id', 'user_id', 'original', 'correct', 'attempt_count', 'time', 48 | 'hint_count', 'problem_id', 'first_action', 'template_id', 'opportunity'] 49 | self.connect_dataset = CONNECT_DATASET_2009 50 | self.connect_file_name = "./data/assistment2009/connected_" + str(self.dataSetSize) + ".csv" 51 | self.time_z_level = 'skill_id' 52 | self.time_threshold = 400 53 | self.time_interval = 0.05 54 | self.attemp_max = 10 55 | self.correct_boundary_list = [0.5, 0.7] 56 | self.time_boundary_list = [-0.8, -0.6, 0] 57 | 58 | elif self.dataSetType == "cmu_stat_f2011": 59 | self.csv_file_name = "./data/cmu_stat_f2011/cmu.txt" 60 | 
self.filtedColumnNameList = ['time', 'correct', 'skill_id', 'step_id', 'problem_id', 'user_id', 61 | 'Level (Unit)', 'Level (Module)',"first_action", "attempt_level"] 62 | 63 | elif self.dataSetType == "kdd": 64 | self.csv_file_name = "data/kdd/algebra_2005_2006_train.txt" 65 | self.processedFileName = "data/kdd/processded_" + str(self.dataSetSize) + ".csv" 66 | self.filtedColumnNameList = ['skill_id', 'user_id', 'correct', 'time', 'hint_count', 'problem_view'] 67 | # 'step_id','unit_id','problem_id','incorrect','correct_num','opportunity' 68 | else: 69 | raise ValueError("check DATASETTYPE") 70 | 71 | self.currentTime = datetime.datetime.now().strftime("%m-%d-%H:%M") 72 | 73 | if self.dataSetType == "assistment2009": 74 | ##config 75 | self.dataset_columns_for_cross_feature = [['skill_id', 'correct'], ['first_action', 'correct'], 76 | ['time_level', 'correct'], ['attempt_level', 'correct'], 77 | ['first_action', 'time_level', 'correct'],['skill_id', 'time_level'], 78 | ['attempt_level', 'time_level', 'correct'], ] 79 | self.model_continues_columns = ["time", "hint_count", "attempt_count"] 80 | self.model_category_columns = ["first_action", "time_level", "attempt_level"] 81 | self.model_cross_columns = [['skill_id', 'time_level'],['time_level', 'correct']] # "the continues data columns needed to consider" 82 | elif self.dataSetType == 'cmu_stat_f2011': 83 | self.dataset_columns_for_cross_feature = [['skill_id', 'correct'], ['skill_id', 'time_level'],['time_level', 'correct']] 84 | self.model_continues_columns = ["time"] 85 | self.model_category_columns = ["first_action", "time_level", "attempt_level"] 86 | self.model_cross_columns = [['time_level', 'correct']] # "the continues data columns needed to consider" 87 | 88 | elif self.dataSetType == 'kdd': 89 | self.dataset_columns_for_cross_feature = [['skill_id', 'correct'], ['time_level', 'correct']] 90 | self.model_continues_columns = ["time", "hint_count", "problem_view"] 91 | self.model_category_columns = ["time", "hint_count", "problem_view"] 92 | self.model_cross_columns = [['time_level', 'correct']] # "the continues data columns needed to consider" 93 | 94 | if [['skill_id', 'correct']] in self.model_cross_columns: 95 | self.model_cross_columns.remove(['skill_id', 'correct']) 96 | elif [['correct', 'skill_id']] in self.model_cross_columns: 97 | self.model_cross_columns.remove(['correct', 'skill_id']) 98 | 99 | self.dataset_columns_for_cross_feature = self.__sortList(self.dataset_columns_for_cross_feature) 100 | self.model_cross_columns = self.__sortList(self.model_cross_columns) 101 | for items in self.model_cross_columns: 102 | if items not in self.dataset_columns_for_cross_feature: 103 | raise ValueError('model_cross_columns must in dataset_columns_for_cross_feature') 104 | for item in items: 105 | if item not in self.filtedColumnNameList + ['skill_id'] + ['time_level'] + ['attempt_level']: 106 | raise ValueError(item, " not in filtedColumnNameList") 107 | # need to change value 108 | self.columnsName_to_index = {} 109 | self.columns_max = {} 110 | self.columns_numb = {} 111 | self.seq_width = 0 112 | self.skill_num = 0 113 | 114 | def __sortList(self, listName): 115 | return sorted(listName) 116 | 117 | def convertCrossCoumnsToNameList(self, Flag=True): 118 | if Flag: 119 | mcu = self.dataset_columns_for_cross_feature 120 | else: 121 | mcu = self.model_cross_columns 122 | crossFeatureNameList = [] 123 | if len(mcu) != 0: 124 | for index_ccl, crossColumnsList in enumerate(mcu): 125 | crossFeatureName = '' 126 | if 
len(set(crossColumnsList)) <= 1: 127 | raise ValueError("need two different feature at least ") 128 | 129 | for index_cc, crossColumn in enumerate(crossColumnsList): 130 | if index_cc == 0: 131 | crossFeatureName = crossColumn 132 | else: 133 | crossFeatureName = crossFeatureName + " " + crossColumn 134 | crossFeatureNameList.append(crossFeatureName) 135 | return crossFeatureNameList 136 | 137 | 138 | class autoencoderParameter(object): 139 | def __init__(self): 140 | self.epoch_rbm = 10 141 | self.epoch_autoencoder = 10 142 | self.batch_size = 50 143 | self.num_steps = 100 144 | 145 | 146 | class SAEParamsConfig(object): 147 | def __init__(self): 148 | self.learning_rate = 0.005 149 | self.min_lr = 0.0001 150 | self.lr_decay = 0.98 151 | self.layer_num = 1 152 | self.init_scale = 0.05 153 | self.target_size = TARGETSIZE 154 | self.max_max_epoch = 5 155 | self.display_step = 1 156 | 157 | self.batch_size = 300 158 | self.num_steps = 0 # need to resign value of time stampes 159 | self.seq_width = 0 # need to resign value 160 | 161 | 162 | # Parameter for RNN 163 | class ModelParamsConfig(object): 164 | def __init__(self, dp): 165 | self.num_steps = 0 # need to resign value of time stampes 166 | self.skill_num = 0 # need to resign value of skill number 167 | self.seq_width = 0 # need to resign value 168 | if dp.dataSetType == 'kdd': 169 | self.batch_size = 5 170 | elif dp.dataSetType == 'cmu_stat_f2011': 171 | self.batch_size = 10 172 | else: 173 | self.batch_size = 30 174 | self.max_max_epoch = 40 175 | self.num_layer = RNN_layer_number 176 | self.cell_type = CELLTYPE # "RNN | LSTM | GRU" 177 | self.hidden_size = 200 178 | self.hidden_size_2 = 150 179 | 180 | self.init_scale = 0.05 181 | self.learning_rate = 0.05 182 | self.max_grad_norm = 4 183 | self.max_epoch = 5 184 | self.keep_prob = 0.6 185 | self.lr_decay = 0.9 186 | self.momentum = 0.95 187 | self.min_lr = 0.0001 188 | 189 | 190 | if __name__ == "__main__": 191 | param_ass = DatasetParameter() 192 | print(param_ass.convertCrossCoumnsToNameList()) 193 | -------------------------------------------------------------------------------- /code1_data.py: -------------------------------------------------------------------------------- 1 | ''' Unless stated otherwise, all software is provided free of charge. 2 | As well, all software is provided on an "as is" basis without warranty 3 | of any kind, express or implied. Under no circumstances and under no legal 4 | theory, whether in tort, contract, or otherwise, shall Liang Zhang be liable 5 | to you or to any other person for any indirect, special, incidental, 6 | or consequential damages of any character including, without limitation, 7 | damages for loss of goodwill, work stoppage, computer failure or malfunction, 8 | or for any and all other damages or losses. 
If you do not agree with these terms, 9 | then you are advised to not use the software.''' 10 | 11 | import math 12 | import os 13 | import pyprind 14 | import random 15 | import sys 16 | 17 | import numpy as np 18 | import pandas as pd 19 | 20 | import code0_parameter as code0 21 | import uril_assistment2009 22 | import uril_cmu_statistic 23 | import uril_tools as aux 24 | 25 | 26 | 27 | def create_label_and_delete_last_one(dp): 28 | dataFileName = os.path.dirname(dp.csv_file_name) + "/dataset_" + str(dp.dataSetSize) + ".csv" 29 | labelFileName = os.path.dirname(dp.csv_file_name) + "/labels_" + str(dp.dataSetSize) + ".csv" 30 | 31 | if os.path.exists(dataFileName) and os.path.exists(labelFileName): 32 | dataset = pd.read_csv(dataFileName) 33 | labels = pd.read_csv(labelFileName) 34 | print('==> ', dataFileName, " exists,load directly") 35 | print('==> ', labelFileName, " exists,load directly") 36 | return dataset, labels 37 | 38 | if dp.dataSetType == "assistment2009": 39 | data = uril_assistment2009.read_asssistment2009_data_from_csv(dp) 40 | elif dp.dataSetType == "kdd": 41 | data = uril_kdd.read_kdd_data_from_csv(dp) 42 | elif dp.dataSetType == "cmu_stat_f2011": 43 | data = uril_cmu_statistic.read_data_from_csv() 44 | 45 | userID_Quest_number_matrix = aux.getUserQuesNumList(data['user_id']) # user_id: number of questions 46 | print("==> creat skill_id+label, last record of every user is deleted") 47 | print("==> delete user whose problem number is less than 2") 48 | row_size = len(data); 49 | index = 0 50 | kindex = 0 51 | dataset = pd.DataFrame() 52 | labels = pd.DataFrame() 53 | 54 | bar = pyprind.ProgPercent(row_size, stream=sys.stdout) 55 | while (index < row_size): 56 | id_number = userID_Quest_number_matrix[kindex, 1] 57 | if id_number > 2: 58 | dataTemp = data.loc[index:index + id_number - 2] 59 | labeTemp = pd.DataFrame({'user_id': int(data.loc[index, 'user_id']), 60 | 'label_skill_id': data.loc[index + 1:index + id_number - 1, "skill_id"], 61 | 'label_correct': data.loc[index + 1:index + id_number - 1, "correct"]}) 62 | assert len(dataTemp) == len(labeTemp) 63 | dataset = dataset.append(dataTemp) 64 | labels = labels.append(labeTemp) 65 | del dataTemp, labeTemp 66 | bar.update(id_number) 67 | index += id_number 68 | kindex += 1 69 | dataset = dataset.reset_index(drop=True) 70 | labels = labels.reset_index(drop=True) 71 | 72 | if os.path.exists(dataFileName): os.remove(dataFileName) 73 | if os.path.exists(labelFileName): os.remove(labelFileName) 74 | dataset.to_csv(dataFileName, index=False) 75 | labels.to_csv(labelFileName, index=False) 76 | print("==> save ", dataFileName) 77 | print("==> save ", labelFileName) 78 | 79 | assert len(dataset) == len(labels), "dateset size\t" + str(len(dataset)) + "\tlabels size\t" + str(len(labels)) 80 | return dataset, labels 81 | 82 | 83 | def convert_data_labels_to_tuples(dataset, labels): 84 | index = 0 85 | kindex = 0 86 | tuple_rows = [] 87 | userID_Quest_number_matrix = aux.getUserQuesNumList(dataset['user_id']) 88 | print("==> convert data and labels to tuples") 89 | # tuple formate 90 | # 0: user_id 91 | # 1: record_numb 92 | # 2: data 93 | # 3: Target_Id 94 | # 4: correctness 95 | dataset_size = len(dataset) 96 | bar = pyprind.ProgPercent(dataset_size, stream=sys.stdout) 97 | while index < dataset_size: 98 | numb = int(userID_Quest_number_matrix[kindex, 1]) 99 | assert int(userID_Quest_number_matrix[kindex, 0]) == int(dataset.loc[index, "user_id"]) 100 | tup = (dataset.loc[index, "user_id"], numb, dataset.iloc[index:index + numb], 101 
| list(labels.loc[index:index + numb - 1, "label_skill_id"]), 102 | # the input is a list but not pd.DataFrame, don't need to reset the index. 103 | list(labels.loc[index:index + numb - 1, "label_correct"])) 104 | # pd.DataFrame, loc and iloc cut differentsize! 105 | tuple_rows.append(tup) 106 | index += numb 107 | kindex += 1 108 | bar.update(numb) 109 | random.shuffle(tuple_rows) 110 | return tuple_rows 111 | 112 | 113 | def get_columns_info(dataset): 114 | columns_max = {} 115 | columns_numb = {} 116 | columnsName_to_index = {} 117 | for i, column_name in enumerate(dataset.columns): 118 | try: 119 | columns_max[column_name] = max(dataset[column_name]) 120 | columns_numb[column_name] = len(dataset[column_name].unique()) 121 | columnsName_to_index[column_name] = i 122 | except: 123 | print(dataset.columns) 124 | print(np.shape(dataset)) 125 | print(dataset[column_name]) 126 | raise ValueError(column_name) 127 | 128 | return columns_max, columns_numb, columnsName_to_index 129 | 130 | 131 | def add_cross_feature_to_dataset(dataset, dp): 132 | if len(dp.dataset_columns_for_cross_feature) == 0: 133 | print("==> no need to add cross feature to dataset") 134 | return dataset 135 | else: 136 | print("==> add cross feature to dataset") 137 | columns_max, columns_numb, _ = get_columns_info(dataset) 138 | d_size = len(dataset) 139 | for item in dp.dataset_columns_for_cross_feature: 140 | print("==> add", aux.connectStringfromList(item)) 141 | temp = [] 142 | for i in pyprind.prog_percent(range(d_size), stream=sys.stdout, title=item): 143 | if len(item) == 2: 144 | 145 | value = dataset.loc[i, item[0]] + dataset.loc[i, item[1]] * (columns_max[item[0]] + 1) 146 | #print(" dataset.loc[i, item[0]]\t", dataset.loc[i, item[0]], "\tdataset.loc[i, item[1]]\t", 147 | # dataset.loc[i, item[1]], "\t(columns_max[item[0]] + 1)\t",(columns_max[item[0]] + 1), 148 | # "\tvalue\t", value) 149 | elif len(item) == 3: 150 | value = dataset.loc[i, item[0]] + dataset.loc[i, item[1]] * (columns_max[item[0]] + 1) + \ 151 | dataset.loc[i, item[2]] * (columns_max[item[0]] + 1) * (columns_max[item[1]] + 1) 152 | else: 153 | raise ValueError('cross features only support 3 at most') 154 | temp.append(value) 155 | dataset[aux.connectStringfromList(item)] = temp 156 | return dataset 157 | 158 | 159 | # only for assistment 2009 and 2014 data 160 | def normalization_continues_data(data): 161 | print('==> normalize continues data') 162 | columns_name_list = ["attempt_count", "time", "hint_count"] 163 | data = data.reset_index(drop=True) 164 | 165 | size = len(data) 166 | for column_name in columns_name_list: 167 | if column_name == "time": 168 | bins = [-1, 60, 300, 1200, 3600, 60000000] 169 | data[column_name] = pd.cut(data[column_name], bins, labels=False) 170 | tmpList = [] 171 | 172 | for i in pyprind.prog_percent(range(size), stream=sys.stdout, title=column_name): 173 | try: 174 | tmp = int(data.loc[i, column_name]) 175 | except: 176 | tmp = 0 177 | # raise ValueError(str(data.loc[i, column_name])+"_"+str(i)) 178 | tmpList.append(math.log((tmp + 2), 6)) 179 | data['time_normal'] = tmpList 180 | elif column_name == "attempt_count": 181 | bins = [-10, 1, 20, 100, 40000] 182 | data[column_name] = pd.cut(data[column_name], bins, labels=False) 183 | data[column_name] += 1 184 | tmpList = [] 185 | 186 | for i in pyprind.prog_percent(range(size), stream=sys.stdout, title=column_name): 187 | # print ("attempt_count\t",str(i)) 188 | tmp = int(data.loc[i, column_name]) 189 | tmpList.append(math.log((tmp + 1), 5)) 190 | 
data['attempt_count_normal'] = tmpList 191 | elif column_name == "hint_count": 192 | bins = [-1, 0, 2, 4, 3000] 193 | data[column_name] = pd.cut(data[column_name], bins, labels=False) 194 | data[column_name] += 1 195 | tmpList = [] 196 | for i in pyprind.prog_percent(range(size), stream=sys.stdout, title=column_name): 197 | try: 198 | tmp = int(data.loc[i, column_name]) 199 | except: 200 | tmp = 0 201 | tmpList.append(math.log((tmp + 1), 5)) 202 | data['hint_count_normal'] = tmpList 203 | else: 204 | raise ValueError("check your continus_columns parameter!") 205 | return data 206 | 207 | 208 | def load_data(dp): 209 | if len(dp.dataset_columns_for_cross_feature) == 0: 210 | dataFileName = os.path.dirname(dp.csv_file_name) + "/dataset_" + str(dp.dataSetSize) + ".csv" 211 | else: 212 | tmp = aux.connectStringfromList(dp.convertCrossCoumnsToNameList()) 213 | dataFileName = os.path.dirname(dp.csv_file_name) + '/' + tmp + "_" + str(dp.dataSetSize) + '_' + ".csv" 214 | labelFileName = os.path.dirname(dp.csv_file_name) + "/labels_" + str(dp.dataSetSize) + ".csv" 215 | 216 | if os.path.exists(dataFileName) and os.path.exists(labelFileName): 217 | data = pd.read_csv(dataFileName) 218 | labels = pd.read_csv(labelFileName) 219 | return data, labels 220 | else: 221 | data, labels = create_label_and_delete_last_one(dp) 222 | dataset_with_crossFeatures = add_cross_feature_to_dataset(data, dp) 223 | dataset_with_crossFeatures.to_csv(dataFileName, index=False) 224 | print("==> save ", dataFileName) 225 | return dataset_with_crossFeatures, labels 226 | 227 | 228 | if __name__ == "__main__": 229 | dp = code0.DatasetParameter() 230 | load_data(dp) 231 | -------------------------------------------------------------------------------- /code2_model.py: -------------------------------------------------------------------------------- 1 | ''' Unless stated otherwise, all software is provided free of charge. 2 | As well, all software is provided on an "as is" basis without warranty 3 | of any kind, express or implied. Under no circumstances and under no legal 4 | theory, whether in tort, contract, or otherwise, shall Liang Zhang be liable 5 | to you or to any other person for any indirect, special, incidental, 6 | or consequential damages of any character including, without limitation, 7 | damages for loss of goodwill, work stoppage, computer failure or malfunction, 8 | or for any and all other damages or losses. 
If you do not agree with these terms, 9 | then you are advised to not use the software.''' 10 | 11 | import tensorflow as tf 12 | import numpy as np 13 | from code0_parameter import AUTOENCODER_ACT, BASELINE, TARGETSIZE, AUTOENCODER_LABEL 14 | from tensorflow.python.ops import rnn_cell 15 | from tensorflow.python.ops.rnn_cell import LSTMCell, BasicRNNCell, GRUCell, DropoutWrapper 16 | from uril_oneHotEncoder import ONEHOTENCODERINPUT 17 | 18 | 19 | class Model(object): 20 | def __init__(self, is_training, config, dp): 21 | self._batch_size = batch_size = config.batch_size 22 | 23 | self._min_lr = config.min_lr 24 | self.hidden_size = hidden_size = config.hidden_size 25 | self.hidden_size_2 = hidden_size_2 = config.hidden_size_2 26 | self.skill_set = dp.skill_set 27 | self.num_steps = num_steps = config.num_steps 28 | self.skill_num = skill_numb = config.skill_num 29 | self.seq_width = seq_width = len(dp.columnsName_to_index) 30 | self.skill_num = skill_num = dp.skill_num 31 | 32 | # load data 33 | self.inputs = tf.placeholder(tf.float32, [batch_size, num_steps, seq_width]) 34 | self.inputs_wide_skill_correct = tf.placeholder(tf.int32, [batch_size, num_steps]) 35 | self._target_id = tf.placeholder(tf.int32, [None]) 36 | self._target_correctness = target_correctness = tf.placeholder(tf.float32, [None]) 37 | 38 | ohe = ONEHOTENCODERINPUT(config, dp, self.inputs) 39 | 40 | # load features 41 | if not BASELINE: 42 | if AUTOENCODER_LABEL: 43 | ########################################################################################################### 44 | featurelist = [ohe.getSkillCorrectCrossFeature(),ohe.getCrossFeatureAll()]#,ohe.getCategoryFeatureInputs()] 45 | # ohe.getCategoryFeatureInputs()], # ohe.getContinuesFeatureInputs()] 46 | ########################################################################################################### 47 | tmp_v = tf.concat(2, featurelist) 48 | tmp_vs = tf.reshape(tmp_v, [-1, int(tmp_v.get_shape()[-1])]) 49 | 50 | if AUTOENCODER_ACT == 'tanh': 51 | transfer_function = tf.nn.tanh 52 | elif AUTOENCODER_ACT == 'sigmoid': 53 | transfer_function = tf.nn.sigmoid 54 | 55 | path = './weights/' + dp.dataSetType + '/weights_' + str(tmp_vs.get_shape()[-1]) + '_' + str( 56 | TARGETSIZE) + '.csv' 57 | autoencoderweights = tf.constant(np.loadtxt(path), dtype=tf.float32) 58 | path = './weights/' + dp.dataSetType + '/bias_' + str(tmp_vs.get_shape()[-1]) + '_' + str( 59 | TARGETSIZE) + '.csv' 60 | autoencoderBias = tf.constant(np.loadtxt(path), dtype=tf.float32) 61 | tmp_vs = transfer_function(tf.matmul(tmp_vs, autoencoderweights) + autoencoderBias) 62 | else: 63 | ########################################################################################################### 64 | featurelist = [ohe.getSkillCorrectCrossFeature(), ohe.getCrossFeatureAll(), ohe.getCategoryFeatureInputs()] 65 | # featurelist = [ohe.getSkillCorrectCrossFeature(), ohe.getCrossFeatureAll()] 66 | ########################################################################################################### 67 | tmp_v = tf.concat(2, featurelist) 68 | print("==> [Tensor Shape] Final Shape\t", tmp_v.get_shape()) 69 | tmp_vs = tf.reshape(tmp_v, [-1, int(tmp_v.get_shape()[-1])]) 70 | else: 71 | tmp_v = ohe.getSkillCorrectCrossFeature() 72 | tmp_vs = tf.reshape(tmp_v, [-1, int(tmp_v.get_shape()[-1])]) 73 | input_RNN = tf.reshape(tmp_vs, [batch_size, num_steps, -1]) 74 | 75 | cell = self.getCell(is_training=is_training, dp=dp, config=config) 76 | self._initial_state = cell.zero_state(batch_size, tf.float32) 
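# --- Editor's note (descriptive comments only, no functional change): the block below
# unrolls the recurrent cell by hand over num_steps. The same cell weights are reused at
# every step (reuse_variables() after the first step); each step consumes one time slice of
# input_RNN and its output is collected. The per-step outputs are concatenated, projected
# through softmax_w/softmax_b to one logit per skill, and flattened so that tf.gather can
# pick out the logit of the skill attempted at the next step (target_id); a sigmoid
# cross-entropy loss is then taken against target_correctness.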
77 | 
78 |         outputs = []
79 |         state = self._initial_state
80 | 
81 |         with tf.variable_scope(config.cell_type):
82 |             for time_step in range(num_steps):
83 |                 if time_step > 0: tf.get_variable_scope().reuse_variables()
84 |                 (cell_output, state) = cell(input_RNN[:, time_step, :], state)
85 |                 outputs.append(cell_output)
86 |         self._final_state = state  # keep the last state so the final_state property below is defined
87 |         if config.num_layer == 1:
88 |             size_rnn_out = hidden_size
89 |         elif config.num_layer == 2:
90 |             size_rnn_out = hidden_size_2
91 |         else:
92 |             raise ValueError("only 1-2 layers are supported, check your layer number!")
93 | 
94 |         output_RNN = tf.reshape(tf.concat(1, outputs), [-1, size_rnn_out])
95 |         softmax_w = tf.get_variable("softmax_w", [size_rnn_out, skill_numb])
96 |         softmax_b = tf.get_variable("softmax_b", [skill_numb])
97 | 
98 |         logits = tf.matmul(output_RNN, softmax_w) + softmax_b
99 | 
100 |         # pick out the logits that correspond to the queried target ids
101 |         self.logits = logits = tf.reshape(logits, [-1])
102 |         self.selected_logits = selected_logits = tf.gather(logits, self.target_id)
103 | 
104 |         # make prediction
105 |         self._pred = self._pred_values = tf.sigmoid(selected_logits)
106 | 
107 |         loss = tf.reduce_sum(tf.nn.sigmoid_cross_entropy_with_logits(selected_logits, target_correctness))
108 |         self._cost = loss
109 | 
110 |         if not is_training:
111 |             return
112 | 
113 |         self._lr = tf.Variable(0.0, trainable=False)
114 |         tvars = tf.trainable_variables()
115 |         grads, _ = tf.clip_by_global_norm(tf.gradients(loss, tvars), config.max_grad_norm)
116 |         optimizer = tf.train.GradientDescentOptimizer(self.lr)
117 | 
118 |         self._train_op = optimizer.apply_gradients(zip(grads, tvars))
119 | 
120 |     def assign_lr(self, session, lr_value):
121 |         if lr_value > self.min_lr:
122 |             session.run(tf.assign(self._lr, lr_value))
123 |         else:
124 |             session.run(tf.assign(self._lr, self.min_lr))
125 | 
126 |     def getCell(self, is_training, dp, config):
127 |         # build the recurrent cell (LSTM / RNN / GRU, 1 or 2 layers)
128 |         if is_training:
129 |             print("==> Construct ", config.cell_type, " graph for training")
130 |         else:
131 |             print("==> Construct ", config.cell_type, " graph for testing")
132 | 
133 |         if config.cell_type == "LSTM":
134 |             if config.num_layer == 1:
135 |                 basicCell = LSTMCell(config.hidden_size, forget_bias=0.0, state_is_tuple=True)
136 |             elif config.num_layer == 2:
137 |                 basicCell = LSTMCell(config.hidden_size, forget_bias=0.0, state_is_tuple=True)
138 |                 basicCell_2 = LSTMCell(config.hidden_size_2, forget_bias=0.0, state_is_tuple=True)
139 |             else:
140 |                 raise ValueError("config.num_layer should be 1 or 2")
141 |         elif config.cell_type == "RNN":
142 |             if config.num_layer == 1:
143 |                 basicCell = BasicRNNCell(config.hidden_size)
144 |             elif config.num_layer == 2:
145 |                 basicCell = BasicRNNCell(config.hidden_size)
146 |                 basicCell_2 = BasicRNNCell(config.hidden_size_2)
147 |             else:
148 |                 raise ValueError("config.num_layer should be 1 or 2")
149 |         elif config.cell_type == "GRU":
150 |             if config.num_layer == 1:
151 |                 basicCell = GRUCell(config.hidden_size)  # GRUCell takes no forget_bias/state_is_tuple arguments
152 |             elif config.num_layer == 2:
153 |                 basicCell = GRUCell(config.hidden_size)
154 |                 basicCell_2 = GRUCell(config.hidden_size_2)
155 |             else:
156 |                 raise ValueError("config.num_layer should be 1 or 2")
157 |         else:
158 |             raise ValueError("cell type should be one of GRU, LSTM, RNN")
159 | 
160 |         # wrap the cells with dropout on inputs/outputs during training
161 |         if is_training and config.keep_prob < 1:
162 |             if config.num_layer == 1:
163 |                 basicCell = DropoutWrapper(basicCell, input_keep_prob=config.keep_prob,
164 |                                            output_keep_prob=config.keep_prob)
165 | elif config.num_layer == 2: 166 | basicCell = DropoutWrapper(basicCell, input_keep_prob=config.keep_prob, 167 | output_keep_prob=config.keep_prob) 168 | basicCell_2 = DropoutWrapper(basicCell_2, input_keep_prob=config.keep_prob, 169 | output_keep_prob=config.keep_prob) 170 | else: 171 | pass 172 | 173 | if config.num_layer == 1: 174 | cell = rnn_cell.MultiRNNCell([basicCell], state_is_tuple=True) 175 | elif config.num_layer == 2: 176 | cell = rnn_cell.MultiRNNCell([basicCell, basicCell_2], state_is_tuple=True) 177 | 178 | return cell 179 | 180 | @property 181 | def batch_size(self): 182 | return self._batch_size 183 | 184 | @property 185 | def min_lr(self): 186 | return self._min_lr 187 | 188 | @property 189 | def auc(self): 190 | return self._auc 191 | 192 | @property 193 | def pred(self): 194 | return self._pred 195 | 196 | @property 197 | def target_id(self): 198 | return self._target_id 199 | 200 | @property 201 | def target_correctness(self): 202 | return self._target_correctness 203 | 204 | @property 205 | def initial_state(self): 206 | return self._initial_state 207 | 208 | @property 209 | def pred_values(self): 210 | return self._pred_values 211 | 212 | @property 213 | def cost(self): 214 | return self._cost 215 | 216 | @property 217 | def final_state(self): 218 | return self._final_state 219 | 220 | @property 221 | def lr(self): 222 | return self._lr 223 | 224 | @property 225 | def train_op(self): 226 | return self._train_op 227 | 228 | 229 | if __name__ == "__main__": 230 | pass 231 | -------------------------------------------------------------------------------- /uril_cmu_statistic.py: -------------------------------------------------------------------------------- 1 | ''' Unless stated otherwise, all software is provided free of charge. 2 | As well, all software is provided on an "as is" basis without warranty 3 | of any kind, express or implied. Under no circumstances and under no legal 4 | theory, whether in tort, contract, or otherwise, shall Liang Zhang be liable 5 | to you or to any other person for any indirect, special, incidental, 6 | or consequential damages of any character including, without limitation, 7 | damages for loss of goodwill, work stoppage, computer failure or malfunction, 8 | or for any and all other damages or losses. 
If you do not agree with these terms, 9 | then you are advised to not use the software.''' 10 | 11 | import pandas as pd 12 | import numpy as np 13 | import sys, pyprind 14 | import uril_tools as aux 15 | import code1_data as code1 16 | import code0_parameter as code0 17 | import os 18 | import pyprind as pp 19 | import matplotlib.pyplot as plt 20 | import datetime 21 | 22 | 23 | def read_data_from_csv(): 24 | processedFileName = './data/cmu_stat_f2011/processded_data.csv' 25 | raw_data_txt = "./data/cmu_stat_f2011/cmu.txt" 26 | 27 | if os.path.exists(processedFileName): 28 | data = pd.read_csv(processedFileName) 29 | print("==> read ", processedFileName, " directly") 30 | 31 | else: 32 | if os.path.exists(raw_data_txt): 33 | data = pd.read_csv(raw_data_txt, sep=" ", delimiter='\t') 34 | print(data.columns) 35 | data.rename(columns={'Duration (sec)': 'time', 'Outcome': 'correct', 36 | 'KC (F2011)': 'skill_id', 'Problem Name': 'problem_id', 'Step Name': 'step_id', 37 | 'Anon Student Id': 'user_id',"Student Response Type":"first_action",'Attempt At Step':"attempt_level"}, inplace=True) 38 | 39 | data = data.fillna(-1) 40 | 41 | filer_data = data[code0.DatasetParameter('cmu_stat_f2011').filtedColumnNameList] 42 | filer_data = filer_data[(filer_data['correct'] != -1) & (filer_data['correct'] != 'HINT') & ( 43 | filer_data['skill_id'] != '-1') & (filer_data['time'] != '.')] 44 | 45 | filer_data['correct'].replace({'CORRECT': 1, 'INCORRECT': 0}, inplace=True) 46 | 47 | # change str to integar 48 | for feature in ['skill_id', 'step_id', 'problem_id', 'user_id', 'Level (Unit)', 'Level (Module)','first_action','attempt_level']: 49 | print("==> BEGIN ", feature) 50 | temp_set = set(list(filer_data[feature])) 51 | temp_dict = {key: value+1 for value, key in enumerate(temp_set)} 52 | filer_data[feature].replace(temp_dict, inplace=True) 53 | print("==> END ", feature) 54 | 55 | #print ("==> first_action",set(filer_data['first_action'])) 56 | #print ("==> attempt_level",set(filer_data['attempt_level'])) 57 | data = attempt_process(filer_data) 58 | data = time_basic_process(data) 59 | data = time_add_level_process(data) 60 | data.to_csv(processedFileName, index=False) 61 | 62 | else: 63 | raise ('No data file exists!') 64 | return data 65 | 66 | def attempt_process(data): 67 | temp_list = list(data['attempt_level']) 68 | new_list = [] 69 | 70 | for i in range(len(temp_list)): 71 | if temp_list[i]==1: 72 | new_list.append(0) 73 | elif temp_list[i]<=5 and temp_list[i]>1: 74 | new_list.append(1) 75 | elif temp_list[i]>5: 76 | new_list.append(2) 77 | else: 78 | new_list.append(3) 79 | data['attempt_level'] = new_list 80 | return data 81 | 82 | def test_data(): 83 | data = read_data_from_csv() 84 | 85 | k1 = [] 86 | k2 = [] 87 | for item in data.columns: 88 | num = len(set(data[item])) 89 | print("****%10d--%s" % (num, item)) 90 | k2.append(item) 91 | if num < 10: 92 | print("-" * 10, item, "-" * 10, "\n", np.unique(data[item]), "\n", "--" * 15) 93 | 94 | print('--' * 30) 95 | print("more than 1 elements\n", k2) 96 | 97 | print(np.shape(data)) 98 | 99 | 100 | def time_basic_process(data): 101 | # -1-transfer time to 'integar' from 'str' 102 | # -2-remove outlier records 103 | old_time_list = list(data['time']) 104 | new_time_list = [] 105 | for i in old_time_list: 106 | kp = int(float(i)) 107 | if kp > 150: kp = 150 108 | new_time_list.append(kp) 109 | data['time'] = new_time_list 110 | 111 | # -3-transfer to z-score 112 | time_z_level = 'skill_id' 113 | print('==> preprocerss time to z-score based on ', 
time_z_level) 114 | time_z_id_set = np.unique(data[time_z_level]) 115 | std_dict = {} 116 | mean_dict = {} 117 | for itme_id in pp.prog_percent(time_z_id_set, stream=sys.stdout, title='==> extract mean and std of time'): 118 | temp_data = data[data[time_z_level] == itme_id] 119 | temp_list = list(temp_data['time']) 120 | # print ('-- problem_id ',problem_id,' -- ',len(temp_list),' --') 121 | std_dict[itme_id] = np.std(temp_list, axis=0) 122 | mean_dict[itme_id] = np.mean(temp_list, axis=0) 123 | 124 | assert len(std_dict) == len(mean_dict) 125 | 126 | data = data.reset_index(drop=True) 127 | 128 | for id in pp.prog_percent(range(len(data)), stream=sys.stdout, title='==> cast time to z-score'): 129 | data.loc[id, 'time'] = (data.loc[id, 'time'] - mean_dict[data.loc[id, time_z_level]]) / ( 130 | std_dict[data.loc[id, time_z_level]] * 1.0) 131 | 132 | return data 133 | 134 | 135 | def temp(data): 136 | # -1-transfer time to 'integar' from 'str' 137 | old_time_list = list(data['time']) 138 | new_time_list = [] 139 | for i in old_time_list: 140 | new_time_list.append(int(float(i))) 141 | data['time'] = new_time_list 142 | 143 | plt.hist(new_time_list, bins=np.arange(min(new_time_list), max(new_time_list), )) 144 | plt.show() 145 | 146 | 147 | def time_add_level_process(data): 148 | time_interval = 0.025 149 | boundary_list = [0.5, 0.7] 150 | data = data.reset_index(drop=True) 151 | bins = np.arange(min(data['time']), max(data['time']), time_interval * 2) 152 | 153 | correct_mean_list = [] 154 | correct_std_list = [] 155 | correct_num_list = [] 156 | for item_index in pp.prog_percent(range(len(bins)), stream=sys.stdout, title='==> get correctness'): 157 | up_bin = bins[item_index] + time_interval 158 | down_bin = bins[item_index] - time_interval 159 | 160 | temp_data = data[(data['time'] >= down_bin) & (data['time'] < up_bin)] 161 | temp_correct_list = list(temp_data['correct']) 162 | 163 | """ 164 | if up_bin<=-1: 165 | print ("---"*20) 166 | print ("*\t",down_bin) 167 | print ("*\t",up_bin) 168 | print (temp_correct_list) 169 | #print (temp_data) 170 | print ("---"*20) 171 | """ 172 | 173 | correct_num_list.append(len(temp_correct_list)) 174 | if (len(temp_correct_list) != 0): 175 | if np.mean(temp_correct_list, axis=0) > 1: 176 | print("******\t", np.mean(temp_correct_list, axis=0), "\t", temp_correct_list) 177 | correct_mean_list.append(np.mean(temp_correct_list, axis=0)) 178 | correct_std_list.append(np.std(temp_correct_list, axis=0)) 179 | else: 180 | correct_mean_list.append(0) 181 | correct_std_list.append(0) 182 | 183 | # plot the relationship 184 | fig, axs = plt.subplots(nrows=2, ncols=1, sharex=True) 185 | ax = axs[0] 186 | ax.plot(bins, correct_mean_list, "r.") 187 | ax.set_title('correctness') 188 | 189 | for nmber in boundary_list: 190 | ax.axhline(y=nmber, xmin=0, xmax=1, c="red", linewidth=0.5, zorder=0) 191 | 192 | ax = axs[1] 193 | ax.plot(bins, correct_num_list, "b--") 194 | ax.set_title("time z score distribution") 195 | 196 | ax.set_xlim([-2, 4]) 197 | plt.savefig('./result/cmu_stat_f2011/time_distribution_correctness_' + str( 198 | datetime.datetime.now().strftime("%Y-%m-%d-%H-%M")) + '.png') 199 | #plt.show() 200 | 201 | # add a colum according to correctness boundary 202 | time_level_list = [] 203 | temp_list = list(data['time']) 204 | bd = [-1.2, -0.7, 0.75] 205 | 206 | # 0 ~ time < -1.2 207 | # 1 ~ -1.2 < time < -0.7 208 | # 2 ~ -0.7 < time < 0.75 209 | # 3 ~ 0.75 < time 210 | for idx in range(len(temp_list)): 211 | if temp_list[idx] <= bd[0]: 212 | 
time_level_list.append(0) 213 | elif (bd[0] < temp_list[idx] and temp_list[idx] <= bd[1]): 214 | time_level_list.append(1) 215 | elif (bd[1] < temp_list[idx] and temp_list[idx] <= bd[2]): 216 | time_level_list.append(2) 217 | elif (temp_list[idx] > bd[2]): 218 | time_level_list.append(3) 219 | else: 220 | raise Exception("Error in time division") 221 | print("==> add time_level") 222 | data['time_level'] = time_level_list 223 | return data 224 | 225 | def read_data_from_csv2(): 226 | processedFileName = './data/cmu_stat_f2011/test_data.csv' 227 | raw_data_txt = "./data/cmu_stat_f2011/cmu.txt" 228 | 229 | if os.path.exists(processedFileName): 230 | data = pd.read_csv(processedFileName) 231 | print("==> read ", processedFileName, " directly") 232 | 233 | else: 234 | if os.path.exists(raw_data_txt): 235 | data = pd.read_csv(raw_data_txt, sep=" ", delimiter='\t') 236 | print(data.columns) 237 | data.rename(columns={'Duration (sec)': 'time', 'Outcome': 'correct', 238 | 'KC (F2011)': 'skill_id', 'Problem Name': 'problem_id', 'Step Name': 'step_id', 239 | 'Anon Student Id': 'user_id',"Student Response Type":"first_action",'Attempt At Step':"attempt_level"}, inplace=True) 240 | 241 | data = data.fillna(-1) 242 | 243 | filer_data = data[code0.DatasetParameter('cmu_stat_f2011').filtedColumnNameList] 244 | filer_data = filer_data[(filer_data['correct'] != -1) & (filer_data['correct'] != 'HINT') & ( 245 | filer_data['skill_id'] != '-1') & (filer_data['time'] != '.')] 246 | 247 | filer_data['correct'].replace({'CORRECT': 1, 'INCORRECT': 0}, inplace=True) 248 | 249 | # change str to integar 250 | for feature in ['skill_id', 'step_id', 'problem_id', 'user_id', 'Level (Unit)', 'Level (Module)','first_action','attempt_level']: 251 | print("==> BEGIN ", feature) 252 | temp_set = set(list(filer_data[feature])) 253 | temp_dict = {key: value+1 for value, key in enumerate(temp_set)} 254 | filer_data[feature].replace(temp_dict, inplace=True) 255 | print("==> END ", feature) 256 | 257 | print ("==> first_action",set(filer_data['first_action'])) 258 | print ("==> attempt_level",set(filer_data['attempt_level'])) 259 | data.to_csv(processedFileName,index=False) 260 | else: 261 | raise ('No data file exists!') 262 | return data 263 | 264 | def test(): 265 | processedFileName = './data/cmu_stat_f2011/test_data.csv' 266 | data = pd.read_csv(processedFileName) 267 | plt.hist(list(data['attempt_level']),np.arange(min(data['attempt_level']), max(data['attempt_level']), 1)) 268 | plt.show() 269 | 270 | if __name__ == '__main__': 271 | data = read_data_from_csv() 272 | 273 | -------------------------------------------------------------------------------- /uril_oneHotEncoder.py: -------------------------------------------------------------------------------- 1 | ''' Unless stated otherwise, all software is provided free of charge. 2 | As well, all software is provided on an "as is" basis without warranty 3 | of any kind, express or implied. Under no circumstances and under no legal 4 | theory, whether in tort, contract, or otherwise, shall Liang Zhang be liable 5 | to you or to any other person for any indirect, special, incidental, 6 | or consequential damages of any character including, without limitation, 7 | damages for loss of goodwill, work stoppage, computer failure or malfunction, 8 | or for any and all other damages or losses. 
If you do not agree with these terms, 9 | then you are advised to not use the software.''' 10 | 11 | import tensorflow as tf 12 | import numpy as np 13 | import code0_parameter as code0 14 | import code1_data as code1 15 | 16 | 17 | class ONEHOTENCODERINPUT(object): 18 | def __init__(self, ap, dp, inputs,printControl=True): 19 | self.batch_size = batch_size = ap.batch_size 20 | self.num_steps = num_steps = ap.num_steps 21 | self.seq_width = seq_width = len(dp.columnsName_to_index) 22 | self.skill_num = dp.skill_num 23 | self.dp = dp 24 | self.ap = ap 25 | self.model_continues_columns = dp.model_continues_columns 26 | self.model_category_columns = dp.model_category_columns 27 | self.model_cross_columns = dp.model_cross_columns 28 | self.inputs = inputs 29 | self.printControl=printControl 30 | 31 | if dp.dataSetType == "assistment2009": 32 | width_deep_width_dict = {"skill_id": dp.columns_max['skill_id'] + 1, 33 | "correct": dp.columns_max['correct'] + 1, 34 | "time_level": dp.columns_max['time_level'] + 1, 35 | "attempt_level": dp.columns_max['attempt_level'] + 1, 36 | "first_action": dp.columns_max['first_action'] + 1} 37 | 38 | self.data_first_action = tf.to_int32( 39 | tf.slice(self.inputs, [0, 0, dp.columnsName_to_index['first_action']], [-1, -1, 1])) 40 | self.data_first_action_process = tf.to_float(tf.squeeze( 41 | tf.one_hot(indices=self.data_first_action, depth=width_deep_width_dict['first_action'], on_value=1.0, 42 | off_value=0.0, axis=-1))) 43 | 44 | 45 | self.data_time_level = tf.to_int32( 46 | tf.slice(self.inputs, [0, 0, dp.columnsName_to_index['time_level']], [-1, -1, 1])) 47 | self.data_time_level_process = tf.to_float(tf.squeeze( 48 | tf.one_hot(indices=self.data_time_level, depth=width_deep_width_dict['time_level'], on_value=1.0, 49 | off_value=0.0, axis=-1))) 50 | 51 | self.data_attempt_level = tf.to_int32( 52 | tf.slice(self.inputs, [0, 0, dp.columnsName_to_index['attempt_level']], [-1, -1, 1])) 53 | self.data_attempt_level_process = tf.to_float(tf.squeeze( 54 | tf.one_hot(indices=self.data_attempt_level, depth=width_deep_width_dict['attempt_level'], on_value=1.0, 55 | off_value=0.0, axis=-1))) 56 | 57 | elif dp.dataSetType == "cmu_stat_f2011": # kdd 58 | width_deep_width_dict = {"skill_id": dp.columns_max['skill_id'] + 1, 59 | "correct": dp.columns_max['correct'] + 1, 60 | "time_level": dp.columns_max['time_level'] + 1, 61 | "attempt_level": dp.columns_max['attempt_level'] + 1, 62 | "first_action": dp.columns_max['first_action'] + 1 63 | } 64 | self.data_time_level = tf.to_int32( 65 | tf.slice(self.inputs, [0, 0, dp.columnsName_to_index['time_level']], [-1, -1, 1])) 66 | self.data_time_level_process = tf.to_float(tf.squeeze( 67 | tf.one_hot(indices=self.data_time_level, depth=width_deep_width_dict['time_level'], on_value=1.0, 68 | off_value=0.0, axis=-1))) 69 | 70 | self.data_first_action = tf.to_int32( 71 | tf.slice(self.inputs, [0, 0, dp.columnsName_to_index['first_action']], [-1, -1, 1])) 72 | self.data_first_action_process = tf.to_float(tf.squeeze( 73 | tf.one_hot(indices=self.data_first_action, depth=width_deep_width_dict['first_action'], on_value=1.0, 74 | off_value=0.0, axis=-1))) 75 | 76 | self.data_attempt_level = tf.to_int32( 77 | tf.slice(self.inputs, [0, 0, dp.columnsName_to_index['attempt_level']], [-1, -1, 1])) 78 | self.data_attempt_level_process = tf.to_float(tf.squeeze( 79 | tf.one_hot(indices=self.data_attempt_level, depth=width_deep_width_dict['attempt_level'], on_value=1.0, 80 | off_value=0.0, axis=-1))) 81 | 82 | elif dp.dataSetType == "kdd": # kdd 
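            # The "kdd" branch below repeats the slice -> one_hot pattern used for the
            # datasets above, but only the time_level column is encoded here (this
            # dataset has no first_action / attempt_level features).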
83 | width_deep_width_dict = {"skill_id": dp.columns_max['skill_id'] + 1, 84 | "correct": dp.columns_max['correct'] + 1, 85 | "time_level": dp.columns_max['time_level'] + 1 86 | } 87 | self.data_time_level = tf.to_int32( 88 | tf.slice(self.inputs, [0, 0, dp.columnsName_to_index['time_level']], [-1, -1, 1])) 89 | self.data_time_level_process = tf.to_float(tf.squeeze( 90 | tf.one_hot(indices=self.data_time_level, depth=width_deep_width_dict['time_level'], on_value=1.0, 91 | off_value=0.0, axis=-1))) 92 | 93 | self.data_skill_id = tf.to_int32( 94 | tf.slice(self.inputs, [0, 0, dp.columnsName_to_index['skill_id']], [-1, -1, 1])) 95 | self.data_skill_id_process = tf.to_float(tf.squeeze( 96 | tf.one_hot(indices=self.data_skill_id, depth=width_deep_width_dict['skill_id'], on_value=1.0, off_value=0.0, 97 | axis=-1))) 98 | self.data_correct = tf.slice(self.inputs, [0, 0, dp.columnsName_to_index['correct']], [-1, -1, 1]) 99 | 100 | 101 | def getSkillCorrectMerge(self): 102 | featureList = [self.data_skill_id_process, self.data_correct] 103 | TensorskillCorrect = tf.concat(2, featureList) 104 | if self.printControl: print("==> [Tensor Shape] skill_id and correct merge formate\t", TensorskillCorrect.get_shape()) 105 | return TensorskillCorrect 106 | 107 | def getSkillCorrectCrossFeature(self): 108 | TensorCrossFeatures = self._getCrossFeature(['skill_id correct']) 109 | if self.printControl: print("==> [Tensor Shape] skill_id and correct cross feature\t", TensorCrossFeatures.get_shape()) 110 | return TensorCrossFeatures 111 | 112 | def getContinuesFeatureInputs(self): 113 | featureList = [] 114 | for columnName in set(self.model_continues_columns): 115 | if columnName == 'time': 116 | featureList.append(self.data_time_normal) 117 | elif columnName == 'attempt_count': 118 | featureList.append(self.data_attempt_count_normal) 119 | elif columnName == 'hint_count': 120 | featureList.append(self.data_hint_count_normal) 121 | elif columnName == 'problem_view': 122 | featureList.append(self.data_problem_view_normal) 123 | elif columnName in ['skill_id', 'correct']: 124 | pass 125 | else: 126 | raise ValueError('only support time、attempt_count、hint_count') 127 | 128 | TensorContinuesFeature = tf.concat(2, featureList) 129 | if self.printControl: print("==> [Tensor Shape] continues features\t", TensorContinuesFeature.get_shape()) 130 | return TensorContinuesFeature 131 | 132 | def getCategoryFeatureInputs(self): 133 | featureList = [] 134 | for columnName in set(self.model_category_columns): 135 | if columnName == 'first_action': 136 | featureList.append(self.data_first_action_process) 137 | elif columnName == 'time_level': 138 | featureList.append(self.data_time_level_process) 139 | elif columnName == 'attempt_level': 140 | featureList.append(self.data_attempt_level_process) 141 | elif columnName in ['skill_id', 'correct']: 142 | pass 143 | else: 144 | raise ValueError('Check your model_category_columns configuration') 145 | 146 | TensorCategoryFeature = tf.concat(2, featureList) 147 | if self.printControl: print("==> [Tensor Shape] category features\t", TensorCategoryFeature.get_shape()) 148 | return TensorCategoryFeature 149 | 150 | def getCrossFeatureAll(self): 151 | crossFeatureNameList = self.dp.convertCrossCoumnsToNameList(Flag=False) 152 | TensorCrossFeatures = self._getCrossFeature(crossFeatureNameList) 153 | if self.printControl: print("==> [Tensor Shape] Cross Feature whole\t", TensorCrossFeatures.get_shape()) 154 | return TensorCrossFeatures 155 | 156 | def _getCrossFeature(self, 
crossFeatureNameList): 157 | if crossFeatureNameList == ['skill_id correct'] or crossFeatureNameList == ['correct skill_id']: 158 | crossFeatureNameList = ['skill_id correct'] 159 | 160 | wide_length = 0 161 | for i, crossFeatureName in enumerate(crossFeatureNameList): # crossFeatureName is a string'correct first_response_time' 162 | depthValue = int(self.dp.columns_max[crossFeatureName] + 1) 163 | wide_length += depthValue 164 | 165 | tmp_value = tf.to_int32( 166 | tf.slice(self.inputs, [0, 0, self.dp.columnsName_to_index[crossFeatureName]], [-1, -1, 1])) 167 | tmp_value_ohe = tf.to_float( 168 | tf.squeeze(tf.one_hot(indices=tmp_value, depth=depthValue, on_value=1.0, off_value=0.0, axis=-1))) 169 | if self.printControl: print("==> [Tensor Shape] Cross Feature--", crossFeatureName, " width\t", depthValue) 170 | 171 | if i == 0: 172 | TensorCrossFeatures = tmp_value_ohe 173 | else: 174 | TensorCrossFeatures = tf.concat(2, [TensorCrossFeatures, tmp_value_ohe]) 175 | # if no cross features, the return value is null 176 | return TensorCrossFeatures 177 | 178 | def get_init_value_for_train_weights(self): 179 | featureslist = [self.getSkillCorrectCrossFeature(), self.getCrossFeatureAll(), self.getCategoryFeatureInputs(), 180 | self.getContinuesFeatureInputs()] 181 | x_tmp = tf.concat(2, featureslist) 182 | x = tf.reshape(x_tmp, [self.batch_size * self.num_steps, -1]) 183 | return x 184 | 185 | 186 | if __name__ == "__main__": 187 | dp = code0.DatasetParameter() 188 | ap = code0.autoencoderParameter() 189 | 190 | dataset, labels = code1.load_data(dp) 191 | # tuple_data = code1.convert_data_labels_to_tuples(dataset, labels) 192 | 193 | skill_num = len(dataset['skill_id'].unique()) + 1 # 0 for unlisted skill_id 194 | dp.skill_num = skill_num 195 | dp.skill_set = list(dataset['skill_id'].unique()) 196 | dp.columns_max, dp.columns_numb, dp.columnsName_to_index = code1.get_columns_info(dataset) 197 | dp.seq_width = len(dp.columnsName_to_index) 198 | 199 | print("columns_max\n", dp.columns_max) 200 | print("columns_numb\n", dp.columns_numb) 201 | print("columnsName_to_index\n", dp.columnsName_to_index) 202 | 203 | data = np.random.randint(low=0,high=2, size=()) 204 | g =tf.Graph() 205 | with g.as_default(): 206 | inputs = tf.placeholder(tf.float32, [ap.batch_size, ap.num_steps, len(dp.columnsName_to_index)]) 207 | m = ONEHOTENCODERINPUT(ap=ap, dp=dp,inputs=inputs) 208 | 209 | with tf.Session(graph=g) as sess: 210 | m.getSkillCorrectMerge() 211 | m.getContinuesFeatureInputs() 212 | m.getCategoryFeatureInputs() 213 | print("-" * 60) 214 | m.getSkillCorrectCrossFeature() 215 | print("-" * 60) 216 | m.getCrossFeatureAll() 217 | -------------------------------------------------------------------------------- /uril_assistment2009.py: -------------------------------------------------------------------------------- 1 | ''' Unless stated otherwise, all software is provided free of charge. 2 | As well, all software is provided on an "as is" basis without warranty 3 | of any kind, express or implied. Under no circumstances and under no legal 4 | theory, whether in tort, contract, or otherwise, shall Liang Zhang be liable 5 | to you or to any other person for any indirect, special, incidental, 6 | or consequential damages of any character including, without limitation, 7 | damages for loss of goodwill, work stoppage, computer failure or malfunction, 8 | or for any and all other damages or losses. 
If you do not agree with these terms, 9 | then you are advised to not use the software.''' 10 | 11 | import pandas as pd 12 | import uril_tools as aux 13 | import code1_data as code1 14 | import code0_parameter as code0 15 | import pyprind, os 16 | import sys 17 | import uril_connectUser 18 | import numpy as np 19 | import pyprind as pp 20 | import pylab as pl 21 | import datetime 22 | import matplotlib.pyplot as plt 23 | 24 | 25 | def read_asssistment2009_data_from_csv(dp): 26 | # read process file directly if exists 27 | if os.path.exists(dp.processedFileName): 28 | print("==> read ", dp.processedFileName) 29 | data = pd.read_csv(dp.processedFileName) 30 | print(aux.stastic_SecNumber_UserNumber_SkillNumber(data, dp)) 31 | return data 32 | 33 | # processfile not exist, load connect data and process it 34 | if os.path.exists(dp.connect_file_name): 35 | print("==> read ", dp.connect_file_name) 36 | data = pd.read_csv(dp.connect_file_name) 37 | else: # read raw data and connect 38 | try: 39 | data = pd.read_csv(dp.csv_file_name, encoding='latin-1', error_bad_lines=False, index_col=False) 40 | if dp.csv_file_name == "./data/assistment2009/skill_builder_data_corrected.csv": 41 | data = data.loc[:338000] 42 | elif dp.csv_file_name == "./data/assistment2009/skill_builder_data.csv": 43 | data = data.loc[:450000] 44 | else: 45 | pass 46 | print("==> read ", dp.csv_file_name) 47 | except: 48 | raise NameError("can't load " + dp.csv_file_name + " pleace check your file") 49 | print('==> columns names\t', data.columns) 50 | 51 | data.rename(columns={'ms_first_response': 'time', 'hint_count': '_hint_count', 'hint_total': 'hint_count'}, 52 | inplace=True) 53 | 54 | data = data[dp.filtedColumnNameList].fillna(0) 55 | if dp.dataSetSize == "small": 56 | data = data[0:50000] 57 | print("==> run ", dp.dataSetSize, " dataset") 58 | 59 | data = data[data['original'] == 1] 60 | data = data.reset_index(drop=True) 61 | print("==> consider original==1, data shape\t", data.shape) 62 | 63 | data = uril_connectUser.connectUser(data, dp.connect_file_name) 64 | print("==> save ", dp.connect_file_name) 65 | 66 | ### data process 67 | # correct process 68 | print("==> remove records whose correct is not 1 or 0") 69 | data = data[(data['correct'] == 1) | (data['correct'] == 0)] 70 | data = data.reset_index(drop=True) 71 | 72 | # time process 73 | data = time_basic_process(data) 74 | data = time_add_level_process(data) 75 | data = data.reset_index(drop=True) 76 | 77 | # attempt process 78 | data = attempt_add_level_process(data) 79 | 80 | print("==> dataset column name\n", data.columns) 81 | print("==> dataset shape\t", data.shape) 82 | 83 | data.to_csv(dp.processedFileName, index=False) 84 | print("==> save file to ", dp.processedFileName) 85 | 86 | aux.stastic_SecNumber_UserNumber_SkillNumber(data, dp) 87 | return data 88 | 89 | 90 | def time_basic_process(data): 91 | # -1-transfer to second unit 92 | print("==> transfer time unit: millsecond to second") 93 | tempTimeList = list(data['time']) 94 | newTimeList = [int(x / 1000) for x in tempTimeList] 95 | data['time'] = newTimeList 96 | del newTimeList, tempTimeList 97 | 98 | # -2-remove outlier records 99 | print('==> delete outlier of time feature') 100 | print('==> length before delete\t', len(data)) 101 | data = data[(data['time'] <= code0.DatasetParameter().time_threshold) & (data['time'] > 0)] 102 | print('==> length after delete\t', len(data)) 103 | 104 | # -3-transfer to z-score 105 | time_z_level = code0.DatasetParameter().time_z_level 106 | print('==> 
preprocerss time to z-score based on ', time_z_level) 107 | time_z_id_set = np.unique(data[time_z_level]) 108 | std_dict = {} 109 | mean_dict = {} 110 | for itme_id in pp.prog_percent(time_z_id_set, stream=sys.stdout, title='==> extract mean and std of time'): 111 | temp_data = data[data[time_z_level] == itme_id] 112 | temp_list = list(temp_data['time']) 113 | # print ('-- problem_id ',problem_id,' -- ',len(temp_list),' --') 114 | std_dict[itme_id] = np.std(temp_list, axis=0) 115 | mean_dict[itme_id] = np.mean(temp_list, axis=0) 116 | 117 | assert len(std_dict) == len(mean_dict) 118 | 119 | data = data.reset_index(drop=True) 120 | for id in pp.prog_percent(range(len(data)), stream=sys.stdout, title='==> cast time to z-score'): 121 | data.loc[id, 'time'] = (data.loc[id, 'time'] - mean_dict[data.loc[id, time_z_level]]) / ( 122 | std_dict[data.loc[id, time_z_level]] * 1.0) 123 | 124 | data = data.fillna(0) 125 | 126 | """ 127 | plt.hist(list(data['time']), bins=np.arange(min(data['time']), max(data['time']), code0.DatasetParameter().time_interval*2)) 128 | plt.title("time z score distribution") 129 | plt.savefig('./result/assistment2009/time_distribution' + str(datetime.datetime.now().strftime("%Y-%m-%d-%H-%M")) + '.png') 130 | """ 131 | return data 132 | 133 | 134 | def time_add_level_process(data): 135 | data = data.reset_index(drop=True) 136 | bins = np.arange(min(data['time']), max(data['time']), code0.DatasetParameter().time_interval * 2) 137 | correct_mean_list = [] 138 | correct_std_list = [] 139 | correct_num_list = [] 140 | for item_index in pp.prog_percent(range(len(bins)), stream=sys.stdout, title='==> get correctness'): 141 | up_bin = bins[item_index] + code0.DatasetParameter().time_interval 142 | down_bin = bins[item_index] - code0.DatasetParameter().time_interval 143 | 144 | temp_data = data[data['time'] >= down_bin] 145 | temp_data = temp_data[temp_data['time'] < up_bin] 146 | 147 | temp_correct_list = list(temp_data['correct']) 148 | correct_num_list.append(len(temp_correct_list)) 149 | if (len(temp_correct_list) != 0): 150 | correct_mean_list.append(np.mean(temp_correct_list, axis=0)) 151 | correct_std_list.append(np.std(temp_correct_list, axis=0)) 152 | else: 153 | correct_mean_list.append(0) 154 | correct_std_list.append(0) 155 | 156 | # plot the relationship 157 | fig, axs = plt.subplots(nrows=2, ncols=1, sharex=True) 158 | ax = axs[0] 159 | ax.plot(bins, correct_mean_list) 160 | ax.set_title('correctness') 161 | boundary_list = code0.DatasetParameter().correct_boundary_list 162 | for nmber in boundary_list: 163 | ax.axhline(y=nmber, xmin=0, xmax=1, c="red", linewidth=0.5, zorder=0) 164 | 165 | ax = axs[1] 166 | ax.plot(bins, correct_num_list) 167 | ax.set_title("time z score distribution") 168 | 169 | ax.set_xlim([-2, 4]) 170 | plt.savefig('./result/assistment2009/time_distribution_correctness_' + str( 171 | datetime.datetime.now().strftime("%Y-%m-%d-%H-%M")) + '.png') 172 | # plt.show() 173 | 174 | # add a colum according to correctness boundary 175 | time_level_list = [] 176 | temp_list = list(data['time']) 177 | bd = code0.DatasetParameter().time_boundary_list 178 | # 0 ~ time <-0.8 179 | # 1 ~ -0.8 < time < -0.6 180 | # 2 ~ -0.6 < time < 0 181 | # 3 ~ 0 < time 182 | for idx in range(len(temp_list)): 183 | if temp_list[idx] <= bd[0]: 184 | time_level_list.append(0) 185 | elif (bd[0] < temp_list[idx] and temp_list[idx] <= bd[1]): 186 | time_level_list.append(1) 187 | elif (bd[1] < temp_list[idx] and temp_list[idx] <= bd[2]): 188 | time_level_list.append(2) 189 | elif 
(temp_list[idx] > bd[2]): 190 | time_level_list.append(3) 191 | else: 192 | raise Exception("Error in time division") 193 | 194 | data['time_level'] = time_level_list 195 | return data 196 | 197 | 198 | def attempt_add_level_process(data): 199 | """ 200 | based on correctness and attempt relationship 201 | 0 - attempt: 0 - 0 202 | 1 - attempt: 1 - 81.7% 203 | 2 - attempt: 2 - 204 | 3 - attempt: 0 - 0 205 | """ 206 | temp_list = [] 207 | 208 | for item in pp.prog_percent(list(data['attempt_count']), stream=sys.stdout, title='==> cast attmept to attempt_level'): 209 | if item == 0: 210 | temp = 0 211 | elif item == 1: 212 | temp = 1 213 | else: 214 | temp = 2 215 | 216 | temp_list.append(temp) 217 | data['attempt_level'] = temp_list 218 | return data 219 | 220 | 221 | def attempt_and_hint_process(data): 222 | print('==> remove records whose attempt_account is more than 15') 223 | data = data[data['attempt_count'] <= 15] 224 | data = data.reset_index(drop=True) 225 | 226 | problem_list = np.unique(data['problem_id']) 227 | attempt_dict = {} 228 | hint_dict = {} 229 | attempt_list = [] 230 | hint_list = [] 231 | for idx in pp.prog_percent(range(len(problem_list)), stream=sys.stdout, 232 | title='==> get attmept and hint max value at problem level'): 233 | temp_data = data[data['problem_id'] == problem_list[idx]] 234 | attempt_dict[problem_list[idx]] = max(temp_data['attempt_count']) 235 | attempt_list.append(max(temp_data['attempt_count'])) 236 | hint_dict[problem_list[idx]] = max(temp_data['hint_count']) 237 | hint_list.append(max(temp_data['hint_count'])) 238 | 239 | fig, axs = plt.subplots(nrows=2, ncols=1, sharex=False) 240 | ax = axs[0] 241 | ax.hist(attempt_list, bins=np.arange(0, 16, 1)) 242 | ax.set_title('max attempt distribution') 243 | ax.set_xlabel("attempt(max)") 244 | ax.set_ylabel("number") 245 | 246 | ax = axs[1] 247 | ax.hist(hint_list) 248 | ax.set_title("max hint distribution") 249 | ax.set_xlabel("hint(max)") 250 | ax.set_ylabel("number") 251 | 252 | plt.savefig( 253 | './result/assistment2009/attempt_hint_number_' + datetime.datetime.now().strftime("%Y-%m-%d-%H-%M") + '.png') 254 | 255 | for idx in pp.prog_percent(range(len(data)), stream=sys.stdout, 256 | title='==> cast attempt count and hint count to value/max'): 257 | if attempt_dict[data.loc[idx, 'problem_id']] == 0: 258 | data.loc[idx, 'attempt_count_level'] = -1 259 | else: 260 | data.loc[idx, 'attempt_count_level'] = data.loc[idx, 'attempt_count'] / ( 261 | attempt_dict[data.loc[idx, 'problem_id']] * 1.0) 262 | 263 | if hint_dict[data.loc[idx, 'problem_id']] == 0: 264 | data.loc[idx, 'hint_count_level'] = -1 265 | else: 266 | data.loc[idx, 'hint_count_level'] = data.loc[idx, 'hint_count'] / ( 267 | hint_dict[data.loc[idx, 'problem_id']] * 1.0) 268 | 269 | return data 270 | 271 | 272 | def attemp_hint_and_correctness_analysis(data): 273 | data = data.reset_index(drop=True) 274 | bins = np.concatenate([[-1], np.arange(0.0, 1.1, 0.1)]) 275 | 276 | for attri in ['hint_count_level', 'attempt_count_level']: 277 | correct_mean_list = [] 278 | correct_std_list = [] 279 | correct_num_list = [] 280 | 281 | for item_index in pp.prog_percent(range(len(bins)), stream=sys.stdout, 282 | title='==> get correctness according to ' + attri): 283 | up_bin = bins[item_index] + 0.05 284 | down_bin = bins[item_index] - 0.05 285 | 286 | temp_data = data[(data[attri] >= down_bin) & (data[attri] < up_bin)] 287 | temp_correct_list = list(temp_data['correct']) 288 | correct_num_list.append(len(temp_correct_list)) 289 | 290 | if 
(len(temp_correct_list) != 0): 291 | correct_mean_list.append(np.mean(temp_correct_list, axis=0)) 292 | correct_std_list.append(np.std(temp_correct_list, axis=0)) 293 | else: 294 | correct_mean_list.append(0) 295 | correct_std_list.append(0) 296 | 297 | fig, axs = plt.subplots(nrows=2, ncols=1, sharex=True) 298 | ax = axs[0] 299 | ax.plot(bins, correct_mean_list) 300 | ax.set_title('correctness ' + attri) 301 | 302 | boundary_list = code0.DatasetParameter().correct_boundary_list 303 | for nmber in boundary_list: 304 | ax.axhline(y=nmber, xmin=0, xmax=1, c="red", linewidth=0.5, zorder=0) 305 | 306 | ax = axs[1] 307 | ax.plot(bins, correct_num_list) 308 | ax.set_title(attri + " number distribution") 309 | ax.set_xlim([-1.1, 1.1]) 310 | plt.savefig('./result/assistment2009/' + attri + '_correctness_' + str( 311 | datetime.datetime.now().strftime("%Y-%m-%d-%H-%M")) + '.png') 312 | 313 | 314 | def attempt_correct_analysis(data): 315 | data = data[data['attempt_count'] <= code0.DatasetParameter().attemp_max] 316 | u, c = aux.counter(list(data['attempt_count'])) 317 | 318 | atempt_list = np.arange(code0.DatasetParameter().attemp_max + 1) 319 | correct_num_list = [] 320 | for item in atempt_list: 321 | temp_data = data[(data['attempt_count'] == item)] 322 | if len(temp_data) != 0: 323 | correct_num_list.append(sum(temp_data['correct']) * 1.0 / len(temp_data)) 324 | else: 325 | correct_num_list.append(0) 326 | print(u, "\n", c) 327 | print(atempt_list, "\n", correct_num_list) 328 | 329 | for a in correct_num_list: 330 | print("%.3f" % a) 331 | 332 | 333 | if __name__ == "__main__": 334 | dp = code0.DatasetParameter() 335 | data = read_asssistment2009_data_from_csv(dp) 336 | attempt_correct_analysis(data) 337 | """ 338 | data = pd.read_csv("./data/assistment2009/time_connect_data.csv") 339 | data = data[:30000] 340 | data = attempt_and_hint_process(data) 341 | attemp_hint_and_correctness_analysis(data) 342 | 343 | data = pd.read_csv(dp.connect_file_name) 344 | data = data[:10000] 345 | print(data[:10]) 346 | data = attempt_and_hint_process(data) 347 | print(data[:10]) 348 | 349 | data = pd.read_csv(dp.connect_file_name) 350 | data = data[:10000] 351 | print(data[:10]) 352 | data = time_process(data) 353 | print(data[:10]) 354 | data.to_csv('./data/assistment2009/kkk.csv') 355 | data = time_correctness_relation_analysis(data) 356 | print(data[:10]) 357 | data = pd.read_csv('./data/assistment2009/kkk.csv') 358 | """ 359 | --------------------------------------------------------------------------------
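Side note on the time preprocessing shared by uril_cmu_statistic.py and uril_assistment2009.py: both scripts convert 'time' to a per-group z-score (grouped by time_z_level, typically 'skill_id') by looping over every row with data.loc. The loop is correct but slow on large logs; the same transform can be written with a pandas groupby. The snippet below is only an illustrative sketch, not part of the pipeline, and the helper name time_to_zscore is made up for this example; it assumes a DataFrame with 'time' and 'skill_id' columns like the processed files above.

import pandas as pd

def time_to_zscore(data, group_col='skill_id'):
    # Same intent as time_basic_process: replace 'time' with its z-score computed
    # within each group_col group (per-skill mean and std).
    data = data.copy()
    grouped = data.groupby(group_col)['time']
    mean = grouped.transform('mean')
    std = grouped.transform(lambda s: s.std(ddof=0))  # ddof=0 to match np.std used in the scripts above
    data['time'] = (data['time'] - mean) / std
    # groups with zero variance produce NaN; fill with 0 as the ASSISTments version does
    return data.fillna(0)

For example, calling time_to_zscore(filer_data) after the string-to-integer conversion step would yield the same z-scored 'time' column without the per-row loop.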