├── .gitignore ├── README.md ├── command.py ├── dataprep.py ├── datasplit.py ├── gen_test.py ├── model.py ├── run.py └── util.py /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__/ 2 | *.pyc 3 | *~ 4 | .DS_Store 5 | data/ 6 | ws/ 7 | .idea -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # EKT/EERNN code 2 | 3 | To run: `python run.py -w {config,train,test,stat,...}` 4 | 5 | If this code helps with your studies, please kindly cite the following publication: 6 | 7 | ``` 8 | @article{liu2019ekt, 9 | title={EKT: Exercise-aware Knowledge Tracing for Student Performance Prediction}, 10 | author={Liu, Qi and Huang, Zhenya and Yin, Yu and Chen, Enhong and Xiong, Hui and Su, Yu and Hu, Guoping}, 11 | journal={IEEE Transactions on Knowledge and Data Engineering}, 12 | year={2019}, 13 | publisher={IEEE} 14 | } 15 | ``` 16 | 17 | Also, visit https://base.ustc.edu.cn for more of our works. 18 | 19 | ## Configure 20 | 21 | ``` 22 | python run.py -w ws/test config EKTA -h # check parameters available 23 | python run.py -w ws/test config EKTA 24 | ``` 25 | 26 | ## Train 27 | 28 | Specify dataset to train (no dataset publicly available, but demo dataset is on the way) 29 | 30 | ``` 31 | python run.py -w ws/test train -d full -N 1 32 | ``` 33 | 34 | ## Test 35 | 36 | Test predicting result on sequeence #10000: 37 | 38 | ``` 39 | python run.py -w ws/test test -d full_test -s 0.10000 40 | ``` 41 | 42 | ## Evaluation 43 | 44 | Results are under `ws/test/results`. To evaluate: 45 | 46 | ``` 47 | python run.py stat ws/test/results/school.0.10000 48 | ``` 49 | -------------------------------------------------------------------------------- /command.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import print_function, division 3 | import torch 4 | import torch.nn.functional as F 5 | import numpy as np 6 | import matplotlib.pyplot as plt 7 | from scipy.stats import pearsonr 8 | import time 9 | import random 10 | import math 11 | import logging 12 | import json 13 | from pathlib import Path 14 | from six.moves import input 15 | from collections import namedtuple 16 | from operator import itemgetter 17 | from yata.fields import Words, Categorical 18 | 19 | from dataprep import get_dataset, get_topics 20 | from util import save_snapshot, load_snapshot, load_last_snapshot, \ 21 | open_result, Variable, use_cuda 22 | 23 | 24 | def train(model, args): 25 | logging.info('args: %s' % str(args)) 26 | logging.info('model: %s, setup: %s' % 27 | (type(model).__name__, str(model.args))) 28 | logging.info('loading dataset') 29 | data = get_dataset(args.dataset) 30 | data.random_level = args.random_level 31 | 32 | if args.split_method == 'user': 33 | data, _ = data.split_user(args.frac) 34 | elif args.split_method == 'future': 35 | data, _ = data.split_future(args.frac) 36 | elif args.split_method == 'old': 37 | data, _, _, _ = data.split() 38 | 39 | data = data.get_seq() 40 | 41 | if type(model).__name__.startswith('DK'): 42 | topic_dic = {} 43 | kcat = Categorical(one_hot=True) 44 | kcat.load_dict(open('data/know_list.txt').read().split('\n')) 45 | for line in open('data/id_know.txt'): 46 | uuid, know = line.strip().split(' ') 47 | know = know.split(',') 48 | topic_dic[uuid] = \ 49 | torch.LongTensor(kcat.apply(None, know)) \ 50 | .max(0)[0] \ 51 | .type(torch.LongTensor) 52 | zero = [0] * len(kcat.apply(None, '')) 53 | else: 54 | topics = get_topics(args.dataset, model.words) 55 | 56 | optimizer = torch.optim.Adam(model.parameters()) 57 | 58 | start_epoch = load_last_snapshot(model, args.workspace) 59 | if use_cuda: 60 | model.cuda() 61 | 62 | for epoch in range(start_epoch, args.epochs): 63 | logging.info(('epoch {}:'.format(epoch))) 64 | then = time.time() 65 | 66 | total_loss = 0 67 | total_mae = 0 68 | total_acc = 0 69 | total_seq_cnt = 0 70 | 71 | users = list(data) 72 | random.shuffle(users) 73 | seq_cnt = len(users) 74 | 75 | MSE = torch.nn.MSELoss() 76 | MAE = torch.nn.L1Loss() 77 | 78 | for user in users: 79 | total_seq_cnt += 1 80 | 81 | seq = data[user] 82 | length = len(seq) 83 | 84 | optimizer.zero_grad() 85 | 86 | loss = 0 87 | mae = 0 88 | acc = 0 89 | 90 | h = None 91 | 92 | for i, item in enumerate(seq): 93 | if type(model).__name__.startswith('DK'): 94 | if item.topic in topic_dic: 95 | x = topic_dic[item.topic] 96 | else: 97 | x = zero 98 | else: 99 | x = topics.get(item.topic).content 100 | x = Variable(torch.LongTensor(x)) 101 | # print(x.size()) 102 | score = Variable(torch.FloatTensor([round(item.score)])) 103 | t = Variable(torch.FloatTensor([item.time])) 104 | s, h = model(x, score, t, h) 105 | if args.loss == 'cross_entropy': 106 | loss += F.binary_cross_entropy_with_logits( 107 | s, score.view_as(s)) 108 | m = MAE(F.sigmoid(s), score).data[0] 109 | else: 110 | loss += MSE(s, score) 111 | m = MAE(s, score).data[0] 112 | mae += m 113 | acc += m < 0.5 114 | 115 | loss /= length 116 | mae /= length 117 | acc /= length 118 | 119 | total_loss += loss.data[0] 120 | total_mae += mae 121 | total_acc += acc 122 | 123 | loss.backward() 124 | optimizer.step() 125 | 126 | if total_seq_cnt % args.save_every == 0: 127 | save_snapshot(model, args.workspace, 128 | '%d.%d' % (epoch, total_seq_cnt)) 129 | 130 | if total_seq_cnt % args.print_every != 0 and \ 131 | total_seq_cnt != seq_cnt: 132 | continue 133 | 134 | now = time.time() 135 | duration = (now - then) / 60 136 | 137 | logging.info('[%d:%d/%d] (%.2f seqs/min) ' 138 | 'loss %.6f, mae %.6f, acc %.6f' % 139 | (epoch, total_seq_cnt, seq_cnt, 140 | ((total_seq_cnt - 1) % 141 | args.print_every + 1) / duration, 142 | total_loss / total_seq_cnt, 143 | total_mae / total_seq_cnt, 144 | total_acc / total_seq_cnt)) 145 | then = now 146 | 147 | save_snapshot(model, args.workspace, epoch + 1) 148 | 149 | 150 | def trainn(model, args): 151 | logging.info('model: %s, setup: %s' % (type(model).__name__, str(model.args))) 152 | logging.info('loading dataset') 153 | data = get_dataset(args.dataset) 154 | data.random_level = args.random_level 155 | 156 | if args.split_method == 'user': 157 | data, _ = data.split_user(args.frac) 158 | elif args.split_method == 'future': 159 | data, _ = data.split_future(args.frac) 160 | elif args.split_method == 'old': 161 | data, _, _, _ = data.split() 162 | 163 | data = data.get_seq() 164 | 165 | if args.input_knowledge: 166 | logging.info('loading knowledge concepts') 167 | topic_dic = {} 168 | kcat = Categorical(one_hot=True) 169 | kcat.load_dict(open(model.args['knows']).read().split('\n')) 170 | know = 'data/id_firstknow.txt' if 'first' in model.args['knows'] \ 171 | else 'data/id_know.txt' 172 | for line in open(know): 173 | uuid, know = line.strip().split(' ') 174 | know = know.split(',') 175 | topic_dic[uuid] = torch.LongTensor(kcat.apply(None, know)).max(0)[0] 176 | zero = [0] * len(kcat.apply(None, '')) 177 | 178 | if args.input_text: 179 | logging.info('loading exercise texts') 180 | topics = get_topics(args.dataset, model.words) 181 | 182 | optimizer = torch.optim.Adam(model.parameters()) 183 | 184 | start_epoch = load_last_snapshot(model, args.workspace) 185 | if use_cuda: 186 | model.cuda() 187 | 188 | for epoch in range(start_epoch, args.epochs): 189 | logging.info('epoch {}:'.format(epoch)) 190 | then = time.time() 191 | 192 | total_loss = 0 193 | total_mae = 0 194 | total_acc = 0 195 | total_seq_cnt = 0 196 | 197 | users = list(data) 198 | random.shuffle(users) 199 | seq_cnt = len(users) 200 | 201 | MSE = torch.nn.MSELoss() 202 | MAE = torch.nn.L1Loss() 203 | 204 | for user in users: 205 | total_seq_cnt += 1 206 | 207 | seq = data[user] 208 | seq_length = len(seq) 209 | 210 | optimizer.zero_grad() 211 | 212 | loss = 0 213 | mae = 0 214 | acc = 0 215 | 216 | h = None 217 | 218 | for i, item in enumerate(seq): 219 | # score = round(item.score) 220 | if args.input_knowledge: 221 | if item.topic in topic_dic: 222 | knowledge = topic_dic[item.topic] 223 | else: 224 | knowledge = zero 225 | # knowledge = torch.LongTensor(knowledge).view(-1).type(torch.FloatTensor) 226 | # one_index = torch.nonzero(knowledge).view(-1) 227 | # expand_vec = torch.zeros(knowledge.size()).view(-1) 228 | # expand_vec[one_index] = score 229 | # cks = torch.cat([knowledge, expand_vec]).view(1, -1) 230 | knowledge = Variable(torch.LongTensor(knowledge)) 231 | # cks = Variable(cks) 232 | 233 | if args.input_text: 234 | text = topics.get(item.topic).content 235 | text = Variable(torch.LongTensor(text)) 236 | score = Variable(torch.FloatTensor([item.score])) 237 | item_time = Variable(torch.FloatTensor([item.time])) 238 | 239 | if type(model).__name__.startswith('DK'): 240 | s, h = model(knowledge, score, item_time, h) 241 | elif type(model).__name__.startswith('RA'): 242 | s, h = model(text, score, item_time, h) 243 | elif type(model).__name__.startswith('EK'): 244 | s, h = model(text, knowledge, score, item_time, h) 245 | 246 | s = s[0] 247 | 248 | if args.loss == 'cross_entropy': 249 | loss += F.binary_cross_entropy_with_logits(s, score.view_as(s)) 250 | m = MAE(F.sigmoid(s), score).data[0] 251 | else: 252 | loss += MSE(s, score) 253 | m = MAE(s, score).data[0] 254 | mae += m 255 | acc += m < 0.5 256 | 257 | loss /= seq_length 258 | mae /= seq_length 259 | acc = float(acc) / seq_length 260 | 261 | total_loss += loss.data[0] 262 | total_mae += mae 263 | total_acc += acc 264 | 265 | loss.backward() 266 | optimizer.step() 267 | 268 | if total_seq_cnt % args.save_every == 0: 269 | save_snapshot(model, args.workspace, '%d.%d' % (epoch, total_seq_cnt)) 270 | 271 | if total_seq_cnt % args.print_every != 0 and total_seq_cnt != seq_cnt: 272 | continue 273 | 274 | now = time.time() 275 | duration = (now - then) / 60 276 | 277 | logging.info('[%d:%d/%d] (%.2f seqs/min) loss %.6f, mae %.6f, acc %.6f' % 278 | (epoch,total_seq_cnt, seq_cnt, ((total_seq_cnt-1) % args.print_every + 1)/duration, 279 | total_loss/total_seq_cnt, total_mae/total_seq_cnt, total_acc/total_seq_cnt)) 280 | then = now 281 | 282 | save_snapshot(model, args.workspace, epoch + 1) 283 | 284 | 285 | def test(model, args): 286 | try: 287 | torch.set_grad_enabled(False) 288 | except AttributeError: 289 | pass 290 | logging.info('model: %s, setup: %s' % 291 | (type(model).__name__, str(model.args))) 292 | logging.info('loading dataset') 293 | data = get_dataset(args.dataset) 294 | data.random_level = args.random_level 295 | 296 | if not args.dataset.endswith('test'): 297 | if args.split_method == 'user': 298 | _, data = data.split_user(args.frac) 299 | testsets = [('user_split', data, {})] 300 | elif args.split_method == 'future': 301 | _, data = data.split_future(args.frac) 302 | testsets = [('future_split', data, {})] 303 | elif args.split_method == 'old': 304 | trainset, _, _, _ = data.split() 305 | data = trainset.get_seq() 306 | train, user, exam, new = data.split() 307 | train = train.get_seq() 308 | user = user.get_seq() 309 | exam = exam.get_seq() 310 | new = new.get_seq() 311 | testsets = zip(['user', 'exam', 'new'], [user, exam, new], 312 | [{}, train, user]) 313 | else: 314 | if args.ref_set: 315 | ref = get_dataset(args.ref_set) 316 | ref.random_level = args.random_level 317 | testsets = [(args.dataset.split('/')[-1], 318 | data.get_seq(), ref.get_seq())] 319 | else: 320 | testsets = [('student', data.get_seq(), {})] 321 | else: 322 | testsets = [('school', data.get_seq(), {})] 323 | 324 | if type(model).__name__.startswith('DK'): 325 | topic_dic = {} 326 | kcat = Categorical(one_hot=True) 327 | kcat.load_dict(open('data/know_list.txt').read().split('\n')) 328 | for line in open('data/id_know.txt'): 329 | uuid, know = line.strip().split(' ') 330 | know = know.split(',') 331 | topic_dic[uuid] = \ 332 | torch.LongTensor(kcat.apply(None, know)) \ 333 | .max(0)[0] \ 334 | .type(torch.LongTensor) 335 | zero = [0] * len(kcat.apply(None, '')) 336 | else: 337 | topics = get_topics(args.dataset, model.words) 338 | 339 | if args.snapshot is None: 340 | epoch = load_last_snapshot(model, args.workspace) 341 | else: 342 | epoch = args.snapshot 343 | load_snapshot(model, args.workspace, epoch) 344 | logging.info('loaded model at epoch %s', str(epoch)) 345 | 346 | if use_cuda: 347 | model.cuda() 348 | 349 | for testset, data, ref_data in testsets: 350 | logging.info('testing on: %s', testset) 351 | f = open_result(args.workspace, testset, epoch) 352 | 353 | then = time.time() 354 | 355 | total_mse = 0 356 | total_mae = 0 357 | total_acc = 0 358 | total_seq_cnt = 0 359 | 360 | users = list(data) 361 | random.shuffle(users) 362 | seq_cnt = len(users) 363 | 364 | MSE = torch.nn.MSELoss() 365 | MAE = torch.nn.L1Loss() 366 | 367 | for user in users[:5000]: 368 | seq = data[user] 369 | if user in ref_data: 370 | ref_seq = ref_data[user] 371 | else: 372 | ref_seq = [] 373 | 374 | seq2 = [] 375 | seen = set() 376 | for item in ref_seq: 377 | if item.topic in seen: 378 | continue 379 | seen.add(item.topic) 380 | seq2.append(item) 381 | ref_seq = seq2 382 | 383 | seq2 = [] 384 | for item in seq: 385 | if item.topic in seen: 386 | continue 387 | seen.add(item.topic) 388 | seq2.append(item) 389 | seq = seq2 390 | 391 | ref_len = len(ref_seq) 392 | seq = ref_seq + seq 393 | length = len(seq) 394 | 395 | if ref_len < args.ref_len: 396 | length = length + ref_len - args.ref_len 397 | ref_len = args.ref_len 398 | 399 | if length < 1: 400 | continue 401 | total_seq_cnt += 1 402 | 403 | mse = 0 404 | mae = 0 405 | acc = 0 406 | 407 | pred_scores = Variable(torch.zeros(len(seq))) 408 | 409 | s = None 410 | h = None 411 | 412 | for i, item in enumerate(seq): 413 | if args.test_on_last: 414 | x = topics.get(seq[-1].topic).content 415 | x = Variable(torch.LongTensor(x), volatile=True) 416 | score = Variable(torch.FloatTensor([round(seq[-1].score)]), 417 | volatile=True) 418 | t = Variable(torch.FloatTensor([seq[-1].time]), 419 | volatile=True) 420 | s, _ = model(x, score, t, h) 421 | s_last = torch.clamp(s, 0, 1) 422 | if type(model).__name__.startswith('DK'): 423 | if item.topic in topic_dic: 424 | x = topic_dic[item.topic] 425 | else: 426 | x = zero 427 | else: 428 | x = topics.get(item.topic).content 429 | x = Variable(torch.LongTensor(x)) 430 | score = Variable(torch.FloatTensor([round(item.score)]), 431 | volatile=True) 432 | t = Variable(torch.FloatTensor([item.time]), volatile=True) 433 | if args.test_as_seq and i > ref_len and ref_len > 0: 434 | s, h = model(x, s.view(1), t, h) 435 | else: 436 | if ref_len > 0 and i > ref_len and not args.test_on_one: 437 | s, _ = model(x, score, t, h) 438 | else: 439 | s, h = model(x, score, t, h) 440 | if args.loss == 'cross_entropy': 441 | s = F.sigmoid(s) 442 | else: 443 | s = torch.clamp(s, 0, 1) 444 | if args.test_on_last: 445 | pred_scores[i] = s_last 446 | else: 447 | pred_scores[i] = s 448 | if i < ref_len: 449 | continue 450 | mse += MSE(s, score) 451 | m = MAE(s, score).data[0] 452 | mae += m 453 | acc += m < 0.5 454 | 455 | print_seq(seq, pred_scores.data.cpu().numpy(), ref_len, f, 456 | args.test_on_last) 457 | 458 | mse /= length 459 | mae /= length 460 | acc /= length 461 | 462 | total_mse += mse.data[0] 463 | total_mae += mae 464 | total_acc += acc 465 | 466 | if total_seq_cnt % args.print_every != 0 and \ 467 | total_seq_cnt != seq_cnt: 468 | continue 469 | 470 | now = time.time() 471 | duration = (now - then) / 60 472 | 473 | logging.info('[%d/%d] (%.2f seqs/min) ' 474 | 'rmse %.6f, mae %.6f, acc %.6f' % 475 | (total_seq_cnt, seq_cnt, 476 | ((total_seq_cnt - 1) % 477 | args.print_every + 1) / duration, 478 | math.sqrt(total_mse / total_seq_cnt), 479 | total_mae / total_seq_cnt, 480 | total_acc / total_seq_cnt)) 481 | then = now 482 | 483 | f.close() 484 | 485 | 486 | def testfuture(model, args): 487 | try: 488 | torch.set_grad_enabled(False) 489 | except AttributeError: 490 | pass 491 | logging.info('model: %s, setup: %s' % (type(model).__name__, str(model.args))) 492 | logging.info('loading dataset') 493 | 494 | data = get_dataset(args.dataset) 495 | data.random_level = args.random_level 496 | 497 | if not args.dataset.endswith('test'): 498 | if args.split_method == 'user': 499 | _, data = data.split_user(args.frac) 500 | testsets = [('user_split', data, {})] 501 | elif args.split_method == 'future': 502 | _, data = data.split_future(args.frac) 503 | testsets = [('future_split', data, {})] 504 | elif args.split_method == 'old': 505 | trainset, _, _, _ = data.split() 506 | data = trainset.get_seq() 507 | train, user, exam, new = data.split() 508 | train = train.get_seq() 509 | user = user.get_seq() 510 | exam = exam.get_seq() 511 | new = new.get_seq() 512 | testsets = zip(['user', 'exam', 'new'], [user, exam, new], 513 | [{}, train, user]) 514 | else: 515 | if args.ref_set: 516 | ref = get_dataset(args.ref_set) 517 | ref.random_level = args.random_level 518 | testsets = [(args.dataset.split('/')[-1], 519 | data.get_seq(), ref.get_seq())] 520 | else: 521 | testsets = [('student', data.get_seq(), {})] 522 | else: 523 | testsets = [('school', data.get_seq(), {})] 524 | 525 | if args.input_knowledge: 526 | logging.info('loading knowledge concepts') 527 | topic_dic = {} 528 | kcat = Categorical(one_hot=True) 529 | kcat.load_dict(open(model.args['knows']).read().split('\n')) 530 | know = 'data/id_firstknow.txt' if 'first' in model.args['knows'] \ 531 | else 'data/id_know.txt' 532 | for line in open(know): 533 | uuid, know = line.strip().split(' ') 534 | know = know.split(',') 535 | topic_dic[uuid] = torch.LongTensor(kcat.apply(None, know)).max(0)[0] 536 | zero = [0] * len(kcat.apply(None, '')) 537 | 538 | if args.input_text: 539 | logging.info('loading exercise texts') 540 | topics = get_topics(args.dataset, model.words) 541 | 542 | if args.snapshot is None: 543 | epoch = load_last_snapshot(model, args.workspace) 544 | else: 545 | epoch = args.snapshot 546 | load_snapshot(model, args.workspace, epoch) 547 | logging.info('loaded model at epoch %s', str(epoch)) 548 | 549 | if use_cuda: 550 | model.cuda() 551 | 552 | for testset, data, ref_data in testsets: 553 | logging.info('testing on: %s', testset) 554 | f = open_result(args.workspace, testset, epoch) 555 | 556 | then = time.time() 557 | 558 | total_mse = 0 559 | total_mae = 0 560 | total_acc = 0 561 | total_seq_cnt = 0 562 | 563 | users = list(data) 564 | random.shuffle(users) 565 | seq_cnt = len(users) 566 | 567 | MSE = torch.nn.MSELoss() 568 | MAE = torch.nn.L1Loss() 569 | 570 | for user in users[:5000]: 571 | seq = data[user] 572 | if user in ref_data: 573 | ref_seq = ref_data[user] 574 | else: 575 | ref_seq = [] 576 | 577 | # seq2 = [] 578 | # seen = set() 579 | # for item in ref_seq: 580 | # if item.topic in seen: 581 | # continue 582 | # seen.add(item.topic) 583 | # seq2.append(item) 584 | # ref_seq = seq2 585 | 586 | # seq2 = [] 587 | # for item in seq: 588 | # if item.topic in seen: 589 | # continue 590 | # seen.add(item.topic) 591 | # seq2.append(item) 592 | # seq = seq2 593 | 594 | ref_len = len(ref_seq) 595 | seq = ref_seq + seq 596 | length = len(seq) 597 | 598 | if ref_len < args.ref_len: 599 | length = length + ref_len - args.ref_len 600 | ref_len = args.ref_len 601 | 602 | if length < 1: 603 | continue 604 | 605 | length -= ref_len 606 | 607 | mse = 0 608 | mae = 0 609 | acc = 0 610 | 611 | pred_scores = Variable(torch.zeros(len(seq))) 612 | 613 | s = None 614 | h = None 615 | 616 | for i, item in enumerate(seq): 617 | if args.input_knowledge: 618 | if item.topic in topic_dic: 619 | knowledge = topic_dic[item.topic] 620 | else: 621 | knowledge = zero 622 | knowledge = Variable(torch.LongTensor(knowledge)) 623 | 624 | if args.input_text: 625 | text = topics.get(item.topic).content 626 | text = Variable(torch.LongTensor(text)) 627 | 628 | score = Variable(torch.FloatTensor([item.score]), volatile=True) 629 | item_time = Variable(torch.FloatTensor([item.time]), volatile=True) 630 | 631 | # change student state h until the fit process reaches trainset 632 | # predict on one 633 | if ref_len > 0 and i > ref_len: 634 | if type(model).__name__.startswith('DK'): 635 | s, _ = model(knowledge, score, item_time, h) 636 | elif type(model).__name__.startswith('RA'): 637 | s, _ = model(text, score, item_time, h) 638 | elif type(model).__name__.startswith('EK'): 639 | s, _ = model(text, knowledge, score, item_time, h) 640 | else: 641 | if type(model).__name__.startswith('DK'): 642 | s, h = model(knowledge, score, item_time, h) 643 | elif type(model).__name__.startswith('RA'): 644 | s, h = model(text, score, item_time, h) 645 | elif type(model).__name__.startswith('EK'): 646 | s, h = model(text, knowledge, score, item_time, h) 647 | 648 | pred_scores[i] = s 649 | 650 | if args.loss == 'cross_entropy': 651 | s = F.sigmoid(s) 652 | else: 653 | s = torch.clamp(s, 0, 1) 654 | 655 | # ignore the result if the fit process is not enough 656 | if i < ref_len: 657 | continue 658 | 659 | mse += MSE(s, score).data[0] 660 | m = MAE(s, score).data[0] 661 | mae += m 662 | acc += m < 0.5 663 | 664 | print_seq(seq, pred_scores.data.cpu().numpy(), ref_len, f, False) 665 | mse /= length 666 | mae /= length 667 | acc = float(acc) / length 668 | 669 | total_mse += mse 670 | total_mae += mae 671 | total_acc += acc 672 | 673 | total_seq_cnt += 1 674 | 675 | if total_seq_cnt % args.print_every != 0 and total_seq_cnt != seq_cnt: 676 | continue 677 | 678 | now = time.time() 679 | duration = (now - then) / 60 680 | 681 | logging.info('[%d/%d] (%.2f seqs/min) ' 682 | 'rmse %.6f, mae %.6f, acc %.6f' % 683 | (total_seq_cnt, seq_cnt, 684 | ((total_seq_cnt - 1) % 685 | args.print_every + 1) / duration, 686 | math.sqrt(total_mse / total_seq_cnt), 687 | total_mae / total_seq_cnt, 688 | total_acc / total_seq_cnt)) 689 | then = now 690 | f.close() 691 | 692 | 693 | def test_future_on_seq(model, args): 694 | try: 695 | torch.set_grad_enabled(False) 696 | except AttributeError: 697 | pass 698 | logging.info('model: %s, setup: %s' % (type(model).__name__, str(model.args))) 699 | logging.info('loading dataset') 700 | 701 | data = get_dataset(args.dataset) 702 | data.random_level = args.random_level 703 | 704 | if not args.dataset.endswith('test'): 705 | if args.split_method == 'user': 706 | _, data = data.split_user(args.frac) 707 | testsets = [('user_split', data, {})] 708 | elif args.split_method == 'future': 709 | _, data = data.split_future(args.frac) 710 | testsets = [('future_split', data, {})] 711 | elif args.split_method == 'old': 712 | trainset, _, _, _ = data.split() 713 | data = trainset.get_seq() 714 | train, user, exam, new = data.split() 715 | train = train.get_seq() 716 | user = user.get_seq() 717 | exam = exam.get_seq() 718 | new = new.get_seq() 719 | testsets = zip(['user', 'exam', 'new'], [user, exam, new], 720 | [{}, train, user]) 721 | else: 722 | if args.ref_set: 723 | ref = get_dataset(args.ref_set) 724 | ref.random_level = args.random_level 725 | testsets = [(args.dataset.split('/')[-1], 726 | data.get_seq(), ref.get_seq())] 727 | else: 728 | testsets = [('student', data.get_seq(), {})] 729 | else: 730 | testsets = [('school', data.get_seq(), {})] 731 | 732 | if args.input_knowledge: 733 | logging.info('loading knowledge concepts') 734 | topic_dic = {} 735 | kcat = Categorical(one_hot=True) 736 | kcat.load_dict(open(model.args['knows']).read().split('\n')) 737 | know = 'data/id_firstknow.txt' if 'first' in model.args['knows'] \ 738 | else 'data/id_know.txt' 739 | for line in open(know): 740 | uuid, know = line.strip().split(' ') 741 | know = know.split(',') 742 | topic_dic[uuid] = torch.LongTensor(kcat.apply(None, know)).max(0)[0] 743 | zero = [0] * len(kcat.apply(None, '')) 744 | 745 | if args.input_text: 746 | logging.info('loading exercise texts') 747 | topics = get_topics(args.dataset, model.words) 748 | 749 | if args.snapshot is None: 750 | epoch = load_last_snapshot(model, args.workspace) 751 | else: 752 | epoch = args.snapshot 753 | load_snapshot(model, args.workspace, epoch) 754 | logging.info('loaded model at epoch %s', str(epoch)) 755 | 756 | if use_cuda: 757 | model.cuda() 758 | 759 | for testset, data, ref_data in testsets: 760 | logging.info('testing on: %s', testset) 761 | f = open_result(args.workspace, testset, epoch) 762 | 763 | then = time.time() 764 | 765 | total_mse = 0 766 | total_mae = 0 767 | total_acc = 0 768 | total_seq_cnt = 0 769 | 770 | users = list(data) 771 | random.shuffle(users) 772 | seq_cnt = len(users) 773 | 774 | MSE = torch.nn.MSELoss() 775 | MAE = torch.nn.L1Loss() 776 | 777 | for user in users[:5000]: 778 | total_seq_cnt += 1 779 | 780 | seq = data[user] 781 | if user in ref_data: 782 | ref_seq = ref_data[user] 783 | else: 784 | ref_seq = [] 785 | 786 | length = len(seq) 787 | ref_len = len(ref_seq) 788 | seq = ref_seq + seq 789 | 790 | if ref_len < args.ref_len: 791 | length = length + ref_len - args.ref_len 792 | ref_len = args.ref_len 793 | 794 | if length < 1: 795 | ref_len = ref_len + length - 1 796 | length = 1 797 | 798 | mse = 0 799 | mae = 0 800 | acc = 0 801 | 802 | seq2 = [] 803 | seen = set() 804 | for item in seq: 805 | if item.topic in seen: 806 | continue 807 | seen.add(item.topic) 808 | seq2.append(item) 809 | 810 | seq = seq2 811 | length = len(seq) - ref_len 812 | 813 | pred_scores = Variable(torch.zeros(len(seq))) 814 | 815 | s = None 816 | h = None 817 | 818 | for i, item in enumerate(seq): 819 | if args.input_knowledge: 820 | if item.topic in topic_dic: 821 | knowledge = topic_dic[item.topic] 822 | else: 823 | knowledge = zero 824 | knowledge = Variable(torch.LongTensor(knowledge)) 825 | 826 | if args.input_text: 827 | text = topics.get(item.topic).content 828 | text = Variable(torch.LongTensor(text)) 829 | 830 | score = Variable(torch.FloatTensor([round(item.score)]), volatile=True) 831 | item_time = Variable(torch.FloatTensor([item.time]), volatile=True) 832 | 833 | # change student state h by true score if the fit process does not reach trainset 834 | # change student state h by pred score if the fit process reaches trainset 835 | # predict on seq 836 | if ref_len > 0 and i > ref_len: 837 | if type(model).__name__.startswith('DK'): 838 | s, h = model(knowledge, s.view(1), item_time, h) 839 | elif type(model).__name__.startswith('RA'): 840 | s, h = model(text, s.view(1), item_time, h) 841 | elif type(model).__name__.startswith('EK'): 842 | s, h = model(text, knowledge, s.view(1), item_time, h) 843 | else: 844 | if type(model).__name__.startswith('DK'): 845 | s, h = model(knowledge, score, item_time, h) 846 | elif type(model).__name__.startswith('RA'): 847 | s, h = model(text, score, item_time, h) 848 | elif type(model).__name__.startswith('EK'): 849 | s, h = model(text, knowledge, score, item_time, h) 850 | 851 | pred_scores[i] = s 852 | 853 | if args.loss == 'cross_entropy': 854 | s = F.sigmoid(s) 855 | else: 856 | s = torch.clamp(s, 0, 1) 857 | 858 | # ignore the result if the fit process is not enough 859 | if i < ref_len: 860 | continue 861 | 862 | mse += MSE(s, score) 863 | m = MAE(s, score).data[0] 864 | mae += m 865 | acc += m < 0.5 866 | 867 | print_seq(seq, pred_scores.data.cpu().numpy(), ref_len, f, args.test_on_last) 868 | 869 | mse /= length 870 | mae /= length 871 | acc /= length 872 | 873 | total_mse += mse.data[0] 874 | total_mae += mae 875 | total_acc += acc 876 | 877 | if total_seq_cnt % args.print_every != 0 and total_seq_cnt != seq_cnt: 878 | continue 879 | 880 | now = time.time() 881 | duration = (now - then) / 60 882 | 883 | logging.info('[%d/%d] (%.2f seqs/min) ' 884 | 'rmse %.6f, mae %.6f, acc %.6f' % 885 | (total_seq_cnt, seq_cnt, 886 | ((total_seq_cnt - 1) % 887 | args.print_every + 1) / duration, 888 | math.sqrt(total_mse / total_seq_cnt), 889 | total_mae / total_seq_cnt, 890 | total_acc / total_seq_cnt)) 891 | then = now 892 | f.close() 893 | 894 | def testseq(model, args): 895 | try: 896 | torch.set_grad_enabled(False) 897 | except AttributeError: 898 | pass 899 | logging.info('model: %s, setup: %s' % (type(model).__name__, str(model.args))) 900 | logging.info('loading dataset') 901 | 902 | data = get_dataset(args.dataset) 903 | data.random_level = args.random_level 904 | 905 | if not args.dataset.endswith('test'): 906 | if args.split_method == 'user': 907 | _, data = data.split_user(args.frac) 908 | testsets = [('user_split', data, {})] 909 | elif args.split_method == 'future': 910 | _, data = data.split_future(args.frac) 911 | testsets = [('future_split', data, {})] 912 | elif args.split_method == 'old': 913 | trainset, _, _, _ = data.split() 914 | data = trainset.get_seq() 915 | train, user, exam, new = data.split() 916 | train = train.get_seq() 917 | user = user.get_seq() 918 | exam = exam.get_seq() 919 | new = new.get_seq() 920 | testsets = zip(['user', 'exam', 'new'], [user, exam, new], 921 | [{}, train, user]) 922 | else: 923 | if args.ref_set: 924 | ref = get_dataset(args.ref_set) 925 | ref.random_level = args.random_level 926 | testsets = [(args.dataset.split('/')[-1], 927 | data.get_seq(), ref.get_seq())] 928 | else: 929 | testsets = [('student', data.get_seq(), {})] 930 | else: 931 | testsets = [('school', data.get_seq(), {})] 932 | 933 | if args.input_knowledge: 934 | logging.info('loading knowledge concepts') 935 | topic_dic = {} 936 | kcat = Categorical(one_hot=True) 937 | kcat.load_dict(open(model.args['knows']).read().split('\n')) 938 | know = 'data/id_firstknow.txt' if 'first' in model.args['knows'] \ 939 | else 'data/id_know.txt' 940 | for line in open(know): 941 | uuid, know = line.strip().split(' ') 942 | know = know.split(',') 943 | topic_dic[uuid] = torch.LongTensor(kcat.apply(None, know)).max(0)[0] 944 | zero = [0] * len(kcat.apply(None, '')) 945 | 946 | if args.input_text: 947 | logging.info('loading exercise texts') 948 | topics = get_topics(args.dataset, model.words) 949 | 950 | if args.snapshot is None: 951 | epoch = load_last_snapshot(model, args.workspace) 952 | else: 953 | epoch = args.snapshot 954 | load_snapshot(model, args.workspace, epoch) 955 | logging.info('loaded model at epoch %s', str(epoch)) 956 | 957 | if use_cuda: 958 | model.cuda() 959 | 960 | for testset, data, ref_data in testsets: 961 | logging.info('testing on: %s', testset) 962 | f = open_result(args.workspace, testset, epoch) 963 | 964 | then = time.time() 965 | 966 | total_mse = 0 967 | total_mae = 0 968 | total_acc = 0 969 | total_seq_cnt = 0 970 | 971 | users = list(data) 972 | random.shuffle(users) 973 | seq_cnt = len(users) 974 | 975 | MSE = torch.nn.MSELoss() 976 | MAE = torch.nn.L1Loss() 977 | 978 | for user in users[:5000]: 979 | total_seq_cnt += 1 980 | 981 | seq = data[user] 982 | if user in ref_data: 983 | ref_seq = ref_data[user] 984 | else: 985 | ref_seq = [] 986 | 987 | length = len(seq) 988 | ref_len = len(ref_seq) 989 | seq = ref_seq + seq 990 | 991 | if ref_len < args.ref_len: 992 | length = length + ref_len - args.ref_len 993 | ref_len = args.ref_len 994 | 995 | if length < 1: 996 | ref_len = ref_len + length - 1 997 | length = 1 998 | 999 | mse = 0 1000 | mae = 0 1001 | acc = 0 1002 | 1003 | # seq2 = [] 1004 | # seen = set() 1005 | # for item in seq: 1006 | # if item.topic in seen: 1007 | # continue 1008 | # seen.add(item.topic) 1009 | # seq2.append(item) 1010 | 1011 | # seq = seq2 1012 | # length = len(seq) - ref_len 1013 | 1014 | pred_scores = Variable(torch.zeros(len(seq))) 1015 | 1016 | s = None 1017 | h = None 1018 | 1019 | for i, item in enumerate(seq): 1020 | # get last record for testing and current record for updating 1021 | if args.input_knowledge: 1022 | if item.topic in topic_dic: 1023 | knowledge = topic_dic[item.topic] 1024 | knowledge_last = topic_dic[seq[-1].topic] 1025 | else: 1026 | knowledge = zero 1027 | knowledge_last = zero 1028 | knowledge = Variable(torch.LongTensor(knowledge)) 1029 | knowledge_last = Variable(torch.LongTensor(knowledge_last), volatile=True) 1030 | 1031 | if args.input_text: 1032 | text = topics.get(item.topic).content 1033 | text = Variable(torch.LongTensor(text)) 1034 | text_last = topics.get(seq[-1].topic).content 1035 | text_last = Variable(torch.LongTensor(text_last), volatile=True) 1036 | 1037 | score = Variable(torch.FloatTensor([item.score]), volatile=True) 1038 | score_last = Variable(torch.FloatTensor([round(seq[-1].score)]), volatile=True) 1039 | item_time = Variable(torch.FloatTensor([item.time]), volatile=True) 1040 | time_last = Variable(torch.FloatTensor([seq[-1].time]), volatile=True) 1041 | 1042 | # test last score of each seq for seq figure 1043 | if type(model).__name__.startswith('DK'): 1044 | s, _ = model(knowledge_last, score_last, time_last, h) 1045 | elif type(model).__name__.startswith('RA'): 1046 | s, _ = model(text_last, score_last, time_last, h) 1047 | elif type(model).__name__.startswith('EK'): 1048 | s, _ = model(text_last, knowledge_last, score_last, time_last, h) 1049 | s_last = torch.clamp(s, 0, 1) 1050 | 1051 | # update student state h until the fit process reaches trainset 1052 | if ref_len > 0 and i > ref_len: 1053 | if type(model).__name__.startswith('DK'): 1054 | s, _ = model(knowledge, score, item_time, h) 1055 | elif type(model).__name__.startswith('RA'): 1056 | s, _ = model(text, score, item_time, h) 1057 | elif type(model).__name__.startswith('EK'): 1058 | s, _ = model(text, knowledge, score, item_time, h) 1059 | else: 1060 | if type(model).__name__.startswith('DK'): 1061 | s, h = model(knowledge, score, item_time, h) 1062 | elif type(model).__name__.startswith('RA'): 1063 | s, h = model(text, score, item_time, h) 1064 | elif type(model).__name__.startswith('EK'): 1065 | s, h = model(text, knowledge, score, item_time, h) 1066 | 1067 | pred_scores[i] = s_last 1068 | 1069 | if args.loss == 'cross_entropy': 1070 | s = F.sigmoid(s) 1071 | else: 1072 | s = torch.clamp(s, 0, 1) 1073 | if i < ref_len: 1074 | continue 1075 | 1076 | mse += MSE(s, score) 1077 | m = MAE(s, score).data[0] 1078 | mae += m 1079 | acc += m < 0.5 1080 | 1081 | print_seq(seq, pred_scores.data.cpu().numpy(), ref_len, f, True) 1082 | 1083 | mse /= length 1084 | mae /= length 1085 | acc = float(acc) / length 1086 | 1087 | total_mse += mse.data[0] 1088 | total_mae += mae 1089 | total_acc += acc 1090 | 1091 | if total_seq_cnt % args.print_every != 0 and total_seq_cnt != seq_cnt: 1092 | continue 1093 | 1094 | now = time.time() 1095 | duration = (now - then) / 60 1096 | 1097 | logging.info('[%d/%d] (%.2f seqs/min) ' 1098 | 'rmse %.6f, mae %.6f, acc %.6f' % 1099 | (total_seq_cnt, seq_cnt, 1100 | ((total_seq_cnt - 1) % 1101 | args.print_every + 1) / duration, 1102 | math.sqrt(total_mse / total_seq_cnt), 1103 | total_mae / total_seq_cnt, 1104 | total_acc / total_seq_cnt)) 1105 | then = now 1106 | f.close() 1107 | 1108 | 1109 | def print_seq(seq, pred_scores, ref_len, f, last=False): 1110 | for i in range(len(seq)): 1111 | print(seq[i].topic, i, ref_len, seq[-1 if last else i].score, 1112 | pred_scores[i], file=f) 1113 | f.flush() 1114 | 1115 | 1116 | def stat_overall(f, with_auc, round_score=False, short=False): 1117 | max_cnt = 0 1118 | total_mse = 0 1119 | total_mae = 0 1120 | total_acc = 0 1121 | total_cnt = 0 1122 | sps = [] 1123 | sns = [] 1124 | seen = set() 1125 | 1126 | seq_cnt = 0 1127 | new_seq_flat = False 1128 | auc = 0 1129 | for line in f: 1130 | uuid, i, ref_len, true, pred = line.split() 1131 | 1132 | i = int(i) 1133 | ref_len = int(ref_len) 1134 | true = float(true) 1135 | if round_score: 1136 | true = round(true) 1137 | pred = float(pred) 1138 | pred = np.clip(pred, 0, 1) 1139 | mae = abs(true - pred) 1140 | acc = mae < 0.5 1141 | 1142 | if i == 0 and len(sps) > 0 and len(sns) > 0: 1143 | seq_cnt += 1 1144 | auc += calc_auc(sps, sns) 1145 | sps = [] 1146 | sns = [] 1147 | seen = set() 1148 | 1149 | if uuid in seen: 1150 | continue 1151 | else: 1152 | seen.add(uuid) 1153 | 1154 | if i < ref_len: 1155 | continue 1156 | if short and i > ref_len + 10: 1157 | continue 1158 | 1159 | total_cnt += 1 1160 | total_mae += mae 1161 | total_mse += mae * mae 1162 | total_acc += acc 1163 | 1164 | if true > 0.8: 1165 | sps.append(pred) 1166 | elif true < 0.2: 1167 | sns.append(pred) 1168 | 1169 | logging.info('mae: %f\trmse:%f\tacc:%f\tauc:%f' % 1170 | (total_mae / total_cnt, 1171 | math.sqrt(total_mse / total_cnt), 1172 | total_acc / total_cnt, 1173 | auc/seq_cnt)) 1174 | 1175 | 1176 | def stat_seq(f, with_auc, round_score=False, short=False): 1177 | cnt = [0 for _ in range(5000)] 1178 | accs = [0.0 for _ in range(5000)] 1179 | spss = [[] for _ in range(5000)] 1180 | snss = [[] for _ in range(5000)] 1181 | max_cnt = 0 1182 | total_mse = 0 1183 | total_mae = 0 1184 | total_acc = 0 1185 | total_cnt = 0 1186 | sps = [] 1187 | sns = [] 1188 | for line in f: 1189 | uuid, i, ref_len, true, pred = line.split() 1190 | i = int(i) 1191 | ref_len = int(ref_len) 1192 | true = float(true) 1193 | if round_score: 1194 | true = round(true) 1195 | pred = float(pred) 1196 | pred = np.clip(pred, 0, 1) 1197 | mae = abs(true - pred) 1198 | acc = mae < 0.5 1199 | cnt[i] += 1 1200 | if cnt[i] > max_cnt: 1201 | max_cnt = cnt[i] 1202 | accs[i] += acc 1203 | 1204 | if i < ref_len: 1205 | continue 1206 | if short and i > ref_len + 10: 1207 | continue 1208 | 1209 | if true >= 0.5: 1210 | sps.append(pred) 1211 | spss[i].append(pred) 1212 | else: 1213 | sns.append(pred) 1214 | snss[i].append(pred) 1215 | total_cnt += 1 1216 | total_mae += mae 1217 | total_mse += mae * mae 1218 | total_acc += acc 1219 | 1220 | for i in range(0, 500): 1221 | if cnt[i] < 10: 1222 | break 1223 | print(i, accs[i] / cnt[i], calc_auc(spss[i], snss[i]), sep='\t') 1224 | 1225 | auc = 0.5 1226 | if with_auc: 1227 | auc = calc_auc(sps, sns) 1228 | 1229 | logging.info('mae: %f\trmse:%f\tacc:%f\tauc:%f' % 1230 | (total_mae / total_cnt, 1231 | math.sqrt(total_mse / total_cnt), 1232 | total_acc / total_cnt, 1233 | auc)) 1234 | 1235 | 1236 | # deprecated 1237 | def stat(f, with_auc, round_score=False, short=False): 1238 | cnt = [0 for _ in range(5000)] 1239 | accs = [0.0 for _ in range(5000)] 1240 | spss = [[] for _ in range(5000)] 1241 | snss = [[] for _ in range(5000)] 1242 | max_cnt = 0 1243 | total_mse = 0 1244 | total_mae = 0 1245 | total_acc = 0 1246 | total_cnt = 0 1247 | sps = [] 1248 | sns = [] 1249 | for line in f: 1250 | uuid, i, ref_len, true, pred = line.split() 1251 | i = int(i) 1252 | ref_len = int(ref_len) 1253 | true = float(true) 1254 | if round_score: 1255 | true = round(true) 1256 | pred = float(pred) 1257 | # pred = np.clip(pred, 0, 1) 1258 | mae = abs(true - pred) 1259 | acc = mae < 0.5 1260 | cnt[i] += 1 1261 | if cnt[i] > max_cnt: 1262 | max_cnt = cnt[i] 1263 | accs[i] += acc 1264 | 1265 | if true >= 0.5: 1266 | sps.append(pred) 1267 | spss[i].append(pred) 1268 | else: 1269 | sns.append(pred) 1270 | snss[i].append(pred) 1271 | 1272 | if i < ref_len: 1273 | continue 1274 | if short and i > ref_len + 10: 1275 | continue 1276 | 1277 | total_cnt += 1 1278 | total_mae += mae 1279 | total_mse += mae * mae 1280 | total_acc += acc 1281 | 1282 | for i in range(0, 500): 1283 | if cnt[i] < 10: 1284 | break 1285 | print(i, accs[i] / cnt[i], calc_auc(spss[i], snss[i]), sep='\t') 1286 | 1287 | auc = 0.5 1288 | if with_auc: 1289 | auc = calc_auc(sps, sns) 1290 | 1291 | logging.info('mae: %f\trmse:%f\tacc:%f\tauc:%f' % 1292 | (total_mae / total_cnt, 1293 | math.sqrt(total_mse / total_cnt), 1294 | total_acc / total_cnt, 1295 | auc)) 1296 | 1297 | 1298 | def calc_auc(sps, sns): 1299 | auc = 0 1300 | cnt = 0 1301 | sns.sort() 1302 | for sp in sps: 1303 | i = np.searchsorted(sns, sp) 1304 | auc += i 1305 | if i < len(sns) and sns[i] - sp < 1e-6: 1306 | auc += 1 1307 | cnt += len(sns) 1308 | if cnt == 0: 1309 | return 1 1310 | auc /= cnt 1311 | return auc 1312 | 1313 | 1314 | def predict(model, args): 1315 | try: 1316 | torch.set_grad_enabled(False) 1317 | except AttributeError: 1318 | pass 1319 | logging.info('model: %s, setup: %s' % 1320 | (type(model).__name__, str(model.args))) 1321 | logging.info('loading dataset') 1322 | 1323 | if args.snapshot is None: 1324 | epoch = load_last_snapshot(model, args.workspace) 1325 | else: 1326 | epoch = args.snapshot 1327 | load_snapshot(model, args.workspace, epoch) 1328 | logging.info('loaded model at epoch %s', str(epoch)) 1329 | 1330 | to_categorical = Categorical('') 1331 | to_categorical.load_dict(model.words) 1332 | trans = to_categorical(Words(':', null='')) 1333 | 1334 | while True: 1335 | # loop over inputs 1336 | try: 1337 | line = input() 1338 | except EOFError: 1339 | logging.info('bye') 1340 | break 1341 | 1342 | try: 1343 | obj = json.loads(line, encoding='utf-8') 1344 | ref_seq = obj['ref'] 1345 | pred_seq = obj['pred'] 1346 | except (json.decoder.JSONDecodeError, KeyError): 1347 | print('[]') 1348 | continue 1349 | 1350 | h = None 1351 | for i, item in enumerate(ref_seq): 1352 | x = trans.apply(None, item['fea']) 1353 | x = Variable(torch.LongTensor(x), volatile=True) 1354 | score = Variable(torch.FloatTensor([item['t']]), 1355 | volatile=True) 1356 | t = Variable(torch.FloatTensor([item['s']]), volatile=True) 1357 | _, h = model(x, score, t, h) 1358 | 1359 | pred_scores = [] 1360 | 1361 | for i, item in enumerate(pred_seq): 1362 | x = trans.apply(None, item['fea']) 1363 | x = Variable(torch.LongTensor(x), volatile=True) 1364 | score = Variable(torch.FloatTensor([0.]), 1365 | volatile=True) 1366 | t = Variable(torch.FloatTensor([item['t']]), volatile=True) 1367 | s, _ = model(x, score, t, h) 1368 | pred_scores.append(s.cpu().data[0][0]) 1369 | 1370 | print(pred_scores) 1371 | 1372 | 1373 | if __name__ == '__main__': 1374 | print(calc_auc([1], [1])) 1375 | -------------------------------------------------------------------------------- /dataprep.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import division, print_function, unicode_literals 3 | 4 | from datetime import datetime 5 | from collections import namedtuple, defaultdict 6 | from operator import itemgetter 7 | from io import open 8 | import os 9 | import time 10 | import json 11 | import numpy as np 12 | from yata.fields import Words, Categorical 13 | from yata.loaders import TableLoader 14 | from random import sample, randint, shuffle, seed 15 | 16 | # 做题记录 17 | Record = namedtuple('Record', ['user', 'school', 'topic', 'exam', 18 | 'score', 'time']) 19 | 20 | # 学生做题序列,其中bias用于执行考试内打乱 21 | Item = namedtuple('Item', ['topic', 'score', 'time', 'bias']) 22 | 23 | 24 | class Dataset: 25 | def __init__(self, random_level=1): 26 | """ 27 | 构造一个空的Dataset 28 | :param random_level: 获取序列时随机打乱的程度,0为不打乱,1为考试内打乱,2为全 29 | 部打乱。默认1 30 | """ 31 | self.topics = set() 32 | self.exams = set() 33 | self.users = set() 34 | self.schools = defaultdict(set) 35 | self.records = list() 36 | self.user_school_map = dict() 37 | self.topic_exam_map = dict() 38 | self.random_level = random_level 39 | 40 | @staticmethod 41 | def from_matrix(filename): 42 | f = open(filename, encoding='utf-8') 43 | d = Dataset() 44 | for line in f: 45 | desc, seq = line.strip().split('\t') 46 | scl, exam, grd = desc.strip().split('@') 47 | # 去除异常考试 48 | if exam == 'e061cfc1-86e9-4486-abc7-0630d0c1ad2d': 49 | continue 50 | if exam == '8c7a9923-0856-4fed-b326-c0f124129b86': 51 | continue 52 | data_fields = seq.strip().split(' ') 53 | exam_time = int(data_fields[0]) 54 | for record in data_fields[5:]: 55 | user, topic, score, std_score = record.split('@') 56 | score, std_score = float(score), float(std_score) 57 | r = Record(user, scl, topic, exam, 58 | score / std_score, exam_time) 59 | d._insert(r) 60 | return d 61 | 62 | @staticmethod 63 | def from_records(dirname): 64 | d = Dataset() 65 | 66 | record_f = open(os.path.join(dirname, 'records.txt'), encoding='utf-8') 67 | school_f = open(os.path.join(dirname, 'schools.txt'), encoding='utf-8') 68 | exam_f = open(os.path.join(dirname, 'exams.txt'), encoding='utf-8') 69 | 70 | topic_exam_map = dict() 71 | exam_info = dict() 72 | student_scl_map = dict() 73 | 74 | for line in exam_f: 75 | fields = line.strip().split(' ') 76 | exam_id, exam_type, exam_time, _ = fields[0].split(',') 77 | exam_id = exam_id + '_' + exam_type 78 | exam_time = \ 79 | int(time.mktime(datetime.strptime(exam_time, 80 | '%Y-%m-%d').timetuple())) 81 | exam_info[exam_id] = exam_time 82 | for topic in fields[1:]: 83 | topic_exam_map[topic] = exam_id 84 | 85 | for line in school_f: 86 | fields = line.strip().split(' ') 87 | school_id = fields[0] 88 | for student in fields[1:]: 89 | student_scl_map[student] = school_id 90 | 91 | exam_by_school = defaultdict(set) 92 | 93 | for line in record_f: 94 | fields = line.strip().split(' ') 95 | student_id, _, _ = fields[0].split(',') 96 | if student_id not in student_scl_map: 97 | continue 98 | for item in fields[1:]: 99 | topic_id, score = item.split(',') 100 | score = float(score) 101 | scl_id = student_scl_map[student_id] 102 | exam_id = topic_exam_map[topic_id] 103 | r = Record(student_id, scl_id, topic_id, exam_id, 104 | score, exam_info[exam_id]) 105 | exam_by_school[scl_id].add(exam_id) 106 | d._insert(r) 107 | 108 | return d 109 | 110 | def select(self, filter): 111 | """ 112 | 选择满足条件的记录集合,分别返回满足和不满足的两个dataset 113 | :param filter: 判断条件(函数) 114 | :return: selected, others 115 | """ 116 | selected = Dataset() 117 | others = Dataset() 118 | for r in self.records: 119 | if filter(r): 120 | selected._insert(r) 121 | else: 122 | others._insert(r) 123 | return selected, others 124 | 125 | def split(self): 126 | """ 127 | 划分数据为训练集、测试集(新学生、新考试、学生考试都未出现) 128 | :return: train, user, exam, new 129 | """ 130 | train = Dataset() 131 | user = Dataset() 132 | exam = Dataset() 133 | new = Dataset() 134 | 135 | train_exams = [] 136 | schools = dict() 137 | for s in self.schools: 138 | schools[s] = sorted(list(self.schools[s]), 139 | key=itemgetter(1)) 140 | for s in schools: 141 | train_exams.extend([x[0] for x in schools[s][:-1]]) 142 | train_exams = set(train_exams) 143 | train_users = sample(sorted(self.users), 0.9) 144 | train_users = set(train_users) 145 | 146 | for r in self.records: 147 | if r.exam in train_exams and r.user in train_users: 148 | # train set 149 | train._insert(r) 150 | elif r.exam in train_exams: 151 | # new user 152 | user._insert(r) 153 | elif r.user in train_users: 154 | # new exam 155 | exam._insert(r) 156 | else: 157 | # completely new record 158 | new._insert(r) 159 | 160 | train.random_level = self.random_level 161 | user.random_level = self.random_level 162 | exam.random_level = self.random_level 163 | new.random_level = self.random_level 164 | 165 | return train, user, exam, new 166 | 167 | def split_future(self, frac, rand_seed=324): 168 | seq = self.get_seq() 169 | train_data = Dataset() 170 | test_data = Dataset() 171 | seed(rand_seed) 172 | 173 | for user in seq: 174 | school = self.user_school_map[user] 175 | u_seq = seq[user] 176 | train_len = int(frac * len(u_seq)) 177 | for topic, score, time, _ in u_seq[:train_len]: 178 | exam = self.topic_exam_map[topic] 179 | train_data._insert(Record(user, school, topic, exam, 180 | score, time)) 181 | for topic, score, time, _ in u_seq[train_len:]: 182 | exam = self.topic_exam_map[topic] 183 | test_data._insert(Record(user, school, topic, exam, 184 | score, time)) 185 | 186 | return train_data, test_data 187 | 188 | def split_user(self, frac, rand_seed=101): 189 | seed(rand_seed) 190 | train_users = sample(sorted(self.users), 191 | int(len(self.users) * frac)) 192 | train_users = set(train_users) 193 | train_data = Dataset() 194 | test_data = Dataset() 195 | for r in self.records: 196 | if r.user in train_users: 197 | train_data._insert(r) 198 | else: 199 | test_data._insert(r) 200 | 201 | return train_data, test_data 202 | 203 | def get_seq(self): 204 | """ 205 | 返回每个学生的做题序列,根据设定的打乱程度(random_level,0为不打乱,1为考试内打 206 | 乱,2为全部打乱)对序列进行随机打乱 207 | :return: 一个学生到该学生做题记录(Item)序列的字典 208 | """ 209 | seq = defaultdict(list) 210 | for r in self.records: 211 | seq[r.user].append(Item(r.topic, r.score, 212 | r.time, randint(-5000, 5000))) 213 | for user in seq: 214 | if self.random_level == 1: 215 | seq[user].sort(key=lambda x: x.time + x.bias) 216 | elif self.random_level == 2: 217 | shuffle(seq[user]) 218 | return seq 219 | 220 | def get_dict(self): 221 | """ 222 | 返回学生、题目的序号以及反查表 223 | :return: 学生序号、序号反查、题目序号、序号反查 224 | """ 225 | user_dic = {} 226 | topic_dic = {} 227 | user_inv_dic = {} 228 | topic_inv_dic = {} 229 | for i, user in enumerate(sorted(self.users)): 230 | user_dic[user] = i + 1 231 | user_inv_dic[i + 1] = user 232 | for i, topic in enumerate(sorted(self.topics)): 233 | topic_dic[topic] = i + 1 234 | topic_inv_dic[i + 1] = topic 235 | return user_dic, user_inv_dic, topic_dic, topic_inv_dic 236 | 237 | def save(self, filename): 238 | f = open(filename, 'w') 239 | json.dump(self.records, f) 240 | f.close() 241 | 242 | def load(self, filename): 243 | f = open(filename) 244 | records = json.load(f) 245 | for r in records: 246 | self._insert(Record(*r)) 247 | 248 | def _insert(self, r): 249 | self.topics.add(r.topic) 250 | self.exams.add(r.exam) 251 | self.users.add(r.user) 252 | self.schools[r.school].add((r.exam, r.time)) 253 | self.user_school_map[r.user] = r.school 254 | self.topic_exam_map[r.topic] = r.exam 255 | self.records.append(r) 256 | 257 | 258 | def get_dataset(type, random_level=0): 259 | """ 260 | 返回数据集 261 | :param type: {full,some}[_test] 262 | :param random_level: 0为不打乱,1为考试内打乱,2为全部打乱,默认1 263 | :return: 对应数据集 264 | """ 265 | some_schools = ['2300000001000000032', 266 | '2300000001000674122', 267 | '4444000020000000449', 268 | '2300000001000649665', 269 | '2300000001000053674', 270 | '2300000001000649702'] 271 | some_test_schools = ['4444000020000000470'] 272 | 273 | if type.startswith('full'): 274 | if type.endswith('test'): 275 | rv = Dataset.from_records('data/test') 276 | else: 277 | rv = Dataset.from_records('data/full') 278 | elif type.startswith('some'): 279 | d = Dataset.from_matrix('data/02.10.matrix') 280 | if type.endswith('test'): 281 | rv, _ = d.select(lambda r: r.school in some_test_schools) 282 | else: 283 | rv, _ = d.select(lambda r: r.school in some_schools) 284 | else: 285 | rv = Dataset() 286 | rv.load(type) 287 | rv.random_level = random_level 288 | return rv 289 | 290 | 291 | def load_embedding(filename): 292 | f = open(filename, encoding='utf-8') 293 | wcnt, emb_size = next(f).strip().split(' ') 294 | wcnt = int(wcnt) 295 | emb_size = int(emb_size) 296 | 297 | words = [] 298 | embs = [] 299 | for line in f: 300 | fields = line.strip().split(' ') 301 | word = fields[0] 302 | emb = np.array([float(x) for x in fields[1:]]) 303 | words.append(word) 304 | embs.append(emb) 305 | 306 | embs = np.asarray(embs) 307 | return wcnt, emb_size, words, embs 308 | 309 | 310 | def get_topics(type, words): 311 | if type.startswith('some'): 312 | feature_file = 'data/features.dump.some' 313 | else: 314 | feature_file = 'data/features.dump.full' 315 | 316 | to_categorical = Categorical('') 317 | to_categorical.load_dict(words) 318 | topic_fields = { 319 | '2->content': to_categorical(Words(':', null='')), 320 | } 321 | topics = TableLoader(feature_file, with_header=False, 322 | key=0, fields=topic_fields, index=['content']) 323 | return topics 324 | 325 | 326 | if __name__ == '__main__': 327 | # data = Dataset() 328 | # data.load('data/raw50/full_sampled.json') 329 | data = get_dataset('full', random_level=2) 330 | 331 | # print('#topic', len(data.topics)) 332 | # print('#exam', len(data.exams)) 333 | # print('#users', len(data.users)) 334 | # print('#school', len(data.schools)) 335 | # print('#records', len(data.records)) 336 | 337 | data = data.get_seq() 338 | users = list(data) 339 | shuffle(users) 340 | for user in users[:20]: 341 | seq = data[user] 342 | for item in seq: 343 | print(item.topic, item.score, sep=',', end=' ') 344 | print() 345 | 346 | # f = open('data/id_dict', 'w') 347 | # json.dump(data.get_dict(), f) 348 | # f.close() 349 | 350 | ''' 351 | print(data.topics) 352 | 353 | trainset, test_user, test_exam, test_new = data.split() 354 | 355 | print('trainset:') 356 | print('#topic', len(trainset.topics)) 357 | print('#exam', len(trainset.exams)) 358 | print('#users', len(trainset.users)) 359 | print('#school', len(trainset.schools)) 360 | print('#records', len(trainset.records)) 361 | print('user:') 362 | print('#topic', len(test_user.topics)) 363 | print('#exam', len(test_user.exams)) 364 | print('#users', len(test_user.users)) 365 | print('#school', len(test_user.schools)) 366 | print('#records', len(test_user.records)) 367 | print('exam:') 368 | print('#topic', len(test_exam.topics)) 369 | print('#exam', len(test_exam.exams)) 370 | print('#users', len(test_exam.users)) 371 | print('#school', len(test_exam.schools)) 372 | print('#records', len(test_exam.records)) 373 | print('new:') 374 | print('#topic', len(test_new.topics)) 375 | print('#exam', len(test_new.exams)) 376 | print('#users', len(test_new.users)) 377 | print('#school', len(test_new.schools)) 378 | print('#records', len(test_new.records)) 379 | ''' 380 | -------------------------------------------------------------------------------- /datasplit.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import division, print_function, unicode_literals 3 | 4 | from dataprep import Dataset, get_dataset 5 | from collections import defaultdict 6 | from random import sample, random 7 | 8 | 9 | if __name__ == '__main__': 10 | data = get_dataset('full') 11 | sampled = Dataset() 12 | user_len = defaultdict(int) 13 | for r in sample(data.records, int(len(data.records) / 1.25)): 14 | user_len[r.user] += 1 15 | ll = user_len[r.user] 16 | sampled._insert(r) 17 | sampled.save('data/raw_long/full_sampled.json') 18 | 19 | data = sampled 20 | data.random_level = 2 21 | # data.load('data/raw_long/full_sampled.json') 22 | 23 | for frac in [0.6, 0.7, 0.8, 0.9]: 24 | f = int(frac * 100) 25 | # print('splitting user at rate %.d%%' % f) 26 | # train, test = data.split_user(frac) 27 | # print(len(train.records)) 28 | # print('saving') 29 | # train.save('data/raw_long/user.train.%.d' % f) 30 | # test.save('data/raw_long/user.test.%.d' % f) 31 | 32 | print('splitting future at rate %.d' % f) 33 | train, test = data.split_future(frac) 34 | print('saving') 35 | train.save('data/raw_long/future.train.%d' % f) 36 | test.save('data/raw_long/future.test.%d' % f) 37 | -------------------------------------------------------------------------------- /gen_test.py: -------------------------------------------------------------------------------- 1 | from dataprep import Dataset, get_dataset, Record 2 | from random import sample 3 | 4 | full = get_dataset('full') 5 | full.random_level = 0 6 | seq = full.get_seq() 7 | 8 | for frac in [60, 70, 80, 90]: 9 | train = get_dataset('data/raw50/future.train.%d' % frac) 10 | train_topics = set(train.topics) 11 | test = Dataset() 12 | cold_test = Dataset() 13 | train_seq = train.get_seq() 14 | for user in train_seq: 15 | L = int(len(train_seq[user]) / frac * (100 - frac)) 16 | topics = set(x.topic for x in seq[user]) 17 | same = topics & train_topics - set(x.topic for x in train_seq[user]) 18 | diff = topics - train_topics 19 | pop = same | diff 20 | print(len(same), len(diff)) 21 | L = max(5, L - len(diff)) 22 | selected = set(sample(list(same), L)) | diff 23 | 24 | for topic, score, time, _ in seq[user]: 25 | r = Record(user, full.user_school_map[user], 26 | topic, full.topic_exam_map[topic], 27 | score, time) 28 | if topic in diff: 29 | cold_test._insert(r) 30 | if topic in selected: 31 | test._insert(r) 32 | 33 | test.save('data/testsets2/future.test.%d' % frac) 34 | cold_test.save('data/testsets2/coldstart.%d' % frac) 35 | -------------------------------------------------------------------------------- /model.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import print_function, division 3 | 4 | import torch 5 | import torch.nn as nn 6 | from dataprep import load_embedding 7 | from util import Variable 8 | import json 9 | 10 | 11 | ##### 12 | # 得分预测模型 13 | ### 14 | 15 | class RNN(nn.Module): 16 | """ 17 | 得分预测的RNN模型 18 | """ 19 | @staticmethod 20 | def add_arguments(parser): 21 | parser.add_argument('--emb_file', default='data/emb_50.txt', 22 | help='pretrained word embedding') 23 | parser.add_argument('--topic_size', '-ts', type=int, default=50, 24 | help='topic embedding size') 25 | parser.add_argument('--seq_hidden_size', '-hs', type=int, default=50, 26 | help='sequence embedding size') 27 | parser.add_argument('--score_mode', '-s', 28 | choices=['concat', 'double'], default='double', 29 | help='way to combine topics and scores') 30 | 31 | def __init__(self, args): 32 | super(RNN, self).__init__() 33 | self.args = args 34 | wcnt, emb_size, words, embs = load_embedding(args['emb_file']) 35 | self.words = words 36 | self.topic_model = TopicRNNModel(wcnt, emb_size, args['topic_size']) 37 | self.topic_model.load_emb(embs) 38 | self.seq_model = SeqModel(args['topic_size'], args['seq_hidden_size'], 39 | args['score_mode']) 40 | 41 | def forward(self, topic, score, time, hidden=None): 42 | # topic rnn part size: (seq_len, bz) -> (bz, topic_size) 43 | h = self.topic_model.default_hidden(1) 44 | topic_v, _ = self.topic_model(topic.view(-1, 1), h) 45 | 46 | s, hidden = self.seq_model(topic_v[0], score, hidden) 47 | return s, hidden 48 | 49 | 50 | class Attn(nn.Module): 51 | """ 52 | 得分预测的纯attention模型 53 | """ 54 | @staticmethod 55 | def add_arguments(parser): 56 | parser.add_argument('--emb_file', default='data/emb_50.txt', 57 | help='pretrained word embedding') 58 | parser.add_argument('--topic_size', '-ts', type=int, default=50, 59 | help='topic embedding size') 60 | parser.add_argument('-k', type=int, default=5, 61 | help='use top k similar topics to predict') 62 | 63 | def __init__(self, args): 64 | super(Attn, self).__init__() 65 | self.args = args 66 | wcnt, emb_size, words, embs = load_embedding(args['emb_file']) 67 | self.words = words 68 | self.topic_model = TopicRNNModel(wcnt, emb_size, args['topic_size']) 69 | self.topic_model.load_emb(embs) 70 | self.seq_model = AttnModel(args['topic_size'], args['k']) 71 | 72 | def forward(self, topic, score, time, hidden=None): 73 | # topic rnn part size: (seq_len, bz) -> (bz, topic_size) 74 | h = self.topic_model.default_hidden(1) 75 | topic_v, _ = self.topic_model(topic.view(-1, 1), h) 76 | 77 | s = self.seq_model(topic_v[0], hidden) 78 | 79 | if hidden is None: 80 | hidden = topic_v, score 81 | 82 | else: 83 | vs, scores = hidden 84 | vs = torch.cat([vs, topic_v]) 85 | scores = torch.cat([scores, score]) 86 | hidden = vs, scores 87 | 88 | return s, hidden 89 | 90 | 91 | class RA(nn.Module): 92 | """ 93 | 得分预测的RNN+attention模型 94 | """ 95 | @staticmethod 96 | def add_arguments(parser): 97 | RNN.add_arguments(parser) 98 | parser.add_argument('-k', type=int, default=10, 99 | help='use top k similar topics to predict') 100 | parser.add_argument('-w', '--with_last', 101 | action='store_true', help='with last h') 102 | parser.add_argument('-l', '--num_layers', type=int, default=2, 103 | help='#topic rnn layers') 104 | 105 | def __init__(self, args): 106 | super(RA, self).__init__() 107 | self.args = args 108 | #print(self.args) 109 | wcnt, emb_size, words, embs = load_embedding(args['emb_file']) 110 | self.words = words 111 | self.topic_model = TopicRNNModel(wcnt, emb_size, args['topic_size'], 112 | num_layers=args['num_layers']) 113 | self.topic_model.load_emb(embs) 114 | self.seq_model = AttnSeqModel(args['topic_size'], 115 | args['seq_hidden_size'], args['k'], 116 | args['score_mode'], args['with_last']) 117 | 118 | def forward(self, topic, score, time, hidden=None): 119 | # topic rnn part size: (seq_len, bz) -> (bz, topic_size) 120 | h = self.topic_model.default_hidden(1) 121 | topic_v, _ = self.topic_model(topic.view(-1, 1), h) 122 | 123 | s, h = self.seq_model(topic_v[0], score, hidden) 124 | if hidden is None: 125 | hidden = h, topic_v, h 126 | else: 127 | _, vs, hs = hidden 128 | vs = torch.cat([vs, topic_v]) 129 | hs = torch.cat([hs, h]) 130 | hidden = h, vs, hs 131 | 132 | return s, hidden 133 | 134 | 135 | class RADecay(nn.Module): 136 | """ 137 | 得分预测的RNN+attention模型(增加时间衰减) 138 | """ 139 | @staticmethod 140 | def add_arguments(parser): 141 | RNN.add_arguments(parser) 142 | parser.add_argument('-k', type=int, default=5, 143 | help='use top k similar topics to predict') 144 | 145 | def __init__(self, args): 146 | super(RADecay, self).__init__() 147 | self.args = args 148 | wcnt, emb_size, words, embs = load_embedding(args['emb_file']) 149 | self.words = words 150 | self.topic_model = TopicRNNModel(wcnt, emb_size, args['topic_size']) 151 | self.topic_model.load_emb(embs) 152 | self.seq_model = AttnSeqTimeDecayModel(args['topic_size'], 153 | args['seq_hidden_size'], 154 | args['k'], 155 | args['score_mode']) 156 | self.vs = None 157 | self.hs = None 158 | 159 | def forward(self, topic, score, time, hidden=None): 160 | # topic rnn part size: (seq_len, bz) -> (bz, topic_size) 161 | h = self.topic_model.default_hidden(1) 162 | topic_v, _ = self.topic_model(topic.view(-1, 1), h) 163 | 164 | s, h = self.seq_model(topic_v[0], score, time, hidden) 165 | 166 | if hidden is None: 167 | hidden = topic_v, h, time 168 | else: 169 | vs, hs, ts = hidden 170 | vs = torch.cat([vs, topic_v]) 171 | hs = torch.cat([hs, h]) 172 | ts = torch.cat([ts, time]) 173 | hidden = vs, hs, ts 174 | 175 | return s, hidden 176 | 177 | 178 | class LSTMM(nn.Module): 179 | @staticmethod 180 | def add_arguments(parser): 181 | parser.add_argument('--emb_file', default='data/emb_50.txt', 182 | help='pretrained word embedding') 183 | parser.add_argument('--seq_hidden_size', '-hs', type=int, default=50, 184 | help='sequence embedding size') 185 | parser.add_argument('--topic_size', '-ts', type=int, default=50, 186 | help='topic embedding size') 187 | parser.add_argument('--score_mode', '-s', 188 | choices=['concat', 'double'], default='double', 189 | help='way to combine topics and scores') 190 | 191 | def __init__(self, args): 192 | super(LSTMM, self).__init__() 193 | self.args = args 194 | wcnt, emb_size, words, embs = load_embedding(args['emb_file']) 195 | self.words = words 196 | self.embedding = nn.Embedding(wcnt, emb_size, padding_idx=0) 197 | self.embedding.weight.data.copy_(torch.from_numpy(embs)) 198 | self.seq_model = SeqModel(args['topic_size'], args['seq_hidden_size'], 199 | args['score_mode']) 200 | 201 | def forward(self, topic, score, time, hidden=None): 202 | topic_v = self.embedding(topic).mean(0, keepdim=True) 203 | s, hidden = self.seq_model(topic_v[0], score, hidden) 204 | return s, hidden 205 | 206 | 207 | class LSTMA(nn.Module): 208 | @staticmethod 209 | def add_arguments(parser): 210 | LSTMM.add_arguments(parser) 211 | parser.add_argument('-k', type=int, default=10, 212 | help='use top k similar topics to predict') 213 | parser.add_argument('-w', '--with_last', 214 | action='store_true', help='with last h') 215 | 216 | def __init__(self, args): 217 | super(LSTMA, self).__init__() 218 | self.args = args 219 | wcnt, emb_size, words, embs = load_embedding(args['emb_file']) 220 | self.words = words 221 | self.embedding = nn.Embedding(wcnt, emb_size, padding_idx=0) 222 | self.embedding.weight.data.copy_(torch.from_numpy(embs)) 223 | self.seq_model = AttnSeqModel(args['topic_size'], 224 | args['seq_hidden_size'], 225 | args['k'], 226 | args['score_mode'], 227 | args['with_last']) 228 | self.vs = None 229 | self.hs = None 230 | 231 | def forward(self, topic, score, time, hidden=None): 232 | topic_v = self.embedding(topic).mean(0, keepdim=True) 233 | 234 | s, h = self.seq_model(topic_v[0], score, hidden) 235 | if hidden is None: 236 | hidden = h, topic_v, h 237 | else: 238 | _, vs, hs = hidden 239 | vs = torch.cat([vs, topic_v]) 240 | hs = torch.cat([hs, h]) 241 | hidden = h, vs, hs 242 | 243 | return s, hidden 244 | 245 | 246 | class DKT(nn.Module): 247 | @staticmethod 248 | def add_arguments(parser): 249 | parser.add_argument('--tcnt', '-tc', type=int, default=0, 250 | help='different topic count') 251 | parser.add_argument('--score_mode', '-s', 252 | choices=['concat', 'double'], default='double', 253 | help='way to combine topics and scores') 254 | 255 | def __init__(self, args): 256 | super(DKT, self).__init__() 257 | self.args = args 258 | if args['tcnt'] == 0: 259 | dic = open('data/know_list.txt').read().split('\n') 260 | args['tcnt'] = len(dic) 261 | self.tcnt = args['tcnt'] 262 | self.seq_model = DKTModel(self.tcnt) 263 | 264 | def forward(self, topic, score, time, hidden=None): 265 | s, hidden = self.seq_model(topic, score, hidden) 266 | return s, hidden 267 | 268 | 269 | class DKNM(nn.Module): 270 | @staticmethod 271 | def add_arguments(parser): 272 | parser.add_argument('--emb_file', default='data/emb_50.txt', 273 | help='pretrained word embedding') 274 | parser.add_argument('--tcnt', '-tc', type=int, default=0, 275 | help='different topic count') 276 | parser.add_argument('--seq_hidden_size', '-hs', type=int, default=50, 277 | help='sequence embedding size') 278 | parser.add_argument('--topic_size', '-ts', type=int, default=50, 279 | help='topic embedding size') 280 | parser.add_argument('--score_mode', '-s', 281 | choices=['concat', 'double'], default='double', 282 | help='way to combine topics and scores') 283 | 284 | def __init__(self, args): 285 | super(DKNM, self).__init__() 286 | self.args = args 287 | if args['tcnt'] == 0: 288 | dic = open('data/know_list.txt').read().split('\n') 289 | args['tcnt'] = len(dic) 290 | self.tcnt = args['tcnt'] 291 | self.embedding = nn.Linear(self.tcnt, args['topic_size']) 292 | self.seq_model = SeqModel(args['topic_size'], args['seq_hidden_size'], 293 | args['score_mode']) 294 | 295 | def forward(self, topic, score, time, hidden=None): 296 | topic_v = self.embedding(topic.type_as(score).view(1, -1)) 297 | s, hidden = self.seq_model(topic_v[0], score, hidden) 298 | return s, hidden 299 | 300 | 301 | class DKNA(nn.Module): 302 | @staticmethod 303 | def add_arguments(parser): 304 | DKNM.add_arguments(parser) 305 | parser.add_argument('-k', type=int, default=10, 306 | help='use top k similar topics to predict') 307 | parser.add_argument('-w', '--with_last', 308 | action='store_true', help='with last h') 309 | 310 | def __init__(self, args): 311 | super(DKNA, self).__init__() 312 | self.args = args 313 | if args['tcnt'] == 0: 314 | dic = open('data/know_list.txt').read().split('\n') 315 | args['tcnt'] = len(dic) 316 | self.tcnt = args['tcnt'] 317 | self.embedding = nn.Linear(self.tcnt, args['topic_size']) 318 | self.seq_model = AttnSeqModel(args['topic_size'], 319 | args['seq_hidden_size'], 320 | args['k'], 321 | args['score_mode'], 322 | args['with_last']) 323 | self.vs = None 324 | self.hs = None 325 | 326 | def forward(self, topic, score, time, hidden=None): 327 | topic_v = self.embedding(topic.type_as(score).view(1, -1)) 328 | 329 | s, h = self.seq_model(topic_v[0], score, hidden) 330 | if hidden is None: 331 | hidden = h, topic_v, h 332 | else: 333 | _, vs, hs = hidden 334 | vs = torch.cat([vs, topic_v]) 335 | hs = torch.cat([hs, h]) 336 | hidden = h, vs, hs 337 | 338 | return s, hidden 339 | 340 | 341 | class EKTM(nn.Module): 342 | """ 343 | Knowledge Tracing Model with Markov property combined with exercise texts and knowledge concepts 344 | """ 345 | @staticmethod 346 | def add_arguments(parser): 347 | RNN.add_arguments(parser) 348 | parser.add_argument('-k', type=int, default=10, help='use top k similar topics to predict') 349 | parser.add_argument('-kc', '--kcnt', type=int, default=0, help='numbers of knowledge concepts') 350 | parser.add_argument('-ks', '--knowledge_hidden_size', type=int, default=25, help='knowledge emb size') 351 | parser.add_argument('-l', '--num_layers', type=int, default=1, help='#topic rnn layers') 352 | # parser.add_argument('-hs', '--seq_hidden_size', type=int, default=50, help='student seq emb size') 353 | # parser.add_argument('-ts', '--topic_size', type=int, default=50, help='exercise emb size') 354 | # parser.add_argument('-s', '--score_mode', choices=['concat', 'double'], default='double', 355 | # help='way to combine exercise and score') 356 | 357 | def __init__(self, args): 358 | super(EKTM, self).__init__() 359 | self.args = args 360 | wcnt, emb_size, words, embs = load_embedding(args['emb_file']) 361 | self.words = words 362 | if args['kcnt'] == 0: 363 | know_dic = open('data/firstknow_list.txt').read().split('\n') 364 | args['kcnt'] = len(know_dic) 365 | self.kcnt = args['kcnt'] 366 | 367 | # knowledge embedding module 368 | self.knowledge_model = KnowledgeModel(self.kcnt, args['knowledge_hidden_size']) 369 | 370 | # exercise embedding module 371 | self.topic_model = TopicRNNModel(wcnt, emb_size, args['topic_size'], num_layers=args['num_layers']) 372 | 373 | self.topic_model.load_emb(embs) 374 | 375 | # student seq module 376 | self.seq_model = EKTSeqModel(args['topic_size'], args['knowledge_hidden_size'], args['kcnt'], 377 | args['seq_hidden_size'], args['score_mode']) 378 | 379 | def forward(self, topic, knowledge, score, time, hidden=None): 380 | # print(knowledge.size()) 381 | k = self.knowledge_model(knowledge) 382 | # print(knowledge.size()) 383 | 384 | topic_h = self.topic_model.default_hidden(1) 385 | topic_v, _ = self.topic_model(topic.view(-1, 1), topic_h) 386 | 387 | s, h = self.seq_model(topic_v[0], k, knowledge, score, hidden) 388 | return s, h 389 | 390 | 391 | class EKTA(nn.Module): 392 | """ 393 | Knowledge Tracing Model with Attention mechnaism combined with exercise texts and knowledge concepts 394 | """ 395 | @staticmethod 396 | def add_arguments(parser): 397 | RNN.add_arguments(parser) 398 | parser.add_argument('-k', type=int, default=10, help='use top k similar topics to predict') 399 | parser.add_argument('-kc', '--kcnt', type=int, default=0, help='numbers of knowledge concepts') 400 | parser.add_argument('-ks', '--knowledge_hidden_size', type=int, default=25, help='knowledge emb size') 401 | parser.add_argument('-l', '--num_layers', type=int, default=1, help='#topic rnn layers') 402 | 403 | def __init__(self, args): 404 | super(EKTA, self).__init__() 405 | self.args = args 406 | wcnt, emb_size, words, embs = load_embedding(args['emb_file']) 407 | self.words = words 408 | if args['kcnt'] == 0: 409 | know_dic = open('data/firstknow_list.txt').read().split('\n') 410 | args['kcnt'] = len(know_dic) 411 | self.kcnt = args['kcnt'] 412 | 413 | # knowledge embedding module 414 | self.knowledge_model = KnowledgeModel(self.kcnt, args['knowledge_hidden_size']) 415 | 416 | # exercise embedding module 417 | self.topic_model = TopicRNNModel(wcnt, emb_size, args['topic_size'], num_layers=args['num_layers']) 418 | 419 | self.topic_model.load_emb(embs) 420 | 421 | # student seq module 422 | self.seq_model = EKTAttnSeqModel(args['topic_size'], args['knowledge_hidden_size'], args['kcnt'], 423 | args['seq_hidden_size'], args['k'], args['score_mode']) 424 | 425 | def forward(self, topic, knowledge, score, time, hidden=None, alpha=False): 426 | # print(knowledge.size()) 427 | k = self.knowledge_model(knowledge) 428 | # print(knowledge.size()) 429 | topic_h = self.topic_model.default_hidden(1) 430 | topic_v, _ = self.topic_model(topic.view(-1, 1), topic_h) 431 | 432 | s, h, a = self.seq_model(topic_v[0], k, knowledge, score, hidden) 433 | if hidden is None: 434 | hidden = h, topic_v, h 435 | else: 436 | _, vs, hs = hidden 437 | vs = torch.cat([vs, topic_v]) 438 | hs = torch.cat([hs, h]) 439 | hidden = h, vs, hs 440 | 441 | if alpha: 442 | return s, hidden, a 443 | else: 444 | return s, hidden 445 | 446 | 447 | class DKVMN(nn.Module): 448 | """ 449 | Dynamic Key-Value Memory Networks for Knowledge Tracing at WWW'2017 450 | """ 451 | 452 | @staticmethod 453 | def add_arguments(parser): 454 | RNN.add_arguments(parser) 455 | parser.add_argument('-k', type=int, default=10, help='use top k similar topics to predict') 456 | parser.add_argument('--knows', default='data/know_list.txt', help='numbers of knowledge concepts') 457 | parser.add_argument('-ks', '--knowledge_hidden_size', type=int, default=25, help='knowledge emb size') 458 | parser.add_argument('-l', '--num_layers', type=int, default=2, help='#topic rnn layers') 459 | # parser.add_argument('-es', '--erase_vector_size', type=float, default=25, help='erase vector emb size') 460 | # parser.add_argument('-as', '--add_vector_size', type=float, default=25, help='add vector emb size') 461 | 462 | def __init__(self, args): 463 | super(DKVMN, self).__init__() 464 | self.args = args 465 | know_dic = open(args['knows']).read().split('\n') 466 | args['kcnt'] = len(know_dic) 467 | self.kcnt = args['kcnt'] 468 | self.valve_size = args['knowledge_hidden_size'] * 2 469 | 470 | # knowledge embedding module 471 | self.knowledge_model = KnowledgeModel(self.kcnt, args['knowledge_hidden_size']) 472 | # student seq module 473 | self.seq_model = DKVMNSeqModel(args['knowledge_hidden_size'], 30, args['kcnt'], args['seq_hidden_size'], 474 | self.valve_size) 475 | 476 | def forward(self, knowledge, score, time, hidden=None): 477 | # print(knowledge) 478 | expand_vec = knowledge.float().view(-1) * score 479 | # print(expand_vec) 480 | cks = torch.cat([knowledge.float().view(-1), expand_vec]).view(1, -1) 481 | # print(cks) 482 | 483 | knowledge = self.knowledge_model(knowledge) 484 | 485 | s, h = self.seq_model(cks, knowledge, score, hidden) 486 | return s, h 487 | 488 | 489 | ####### 490 | # knowledge Representation module 491 | ####### 492 | class KnowledgeModel(nn.Module): 493 | """ 494 | Transform Knowledge index to knowledge embedding 495 | """ 496 | 497 | def __init__(self, know_len, know_emb_size): 498 | super(KnowledgeModel, self).__init__() 499 | self.knowledge_embedding = nn.Linear(know_len, know_emb_size) 500 | 501 | def forward(self, knowledge): 502 | return self.knowledge_embedding(knowledge.float().view(1, -1)) 503 | 504 | 505 | class DKVMNSeqModel(nn.Module): 506 | """ 507 | DKVMN seq model 508 | """ 509 | 510 | def __init__(self, know_emb_size, know_length, kcnt, seq_hidden_size, value_size): 511 | super(DKVMNSeqModel, self).__init__() 512 | self.know_emb_size = know_emb_size 513 | self.know_length = know_length 514 | self.seq_hidden_size = seq_hidden_size 515 | # self.erase_size = erase_size 516 | # self.add_size = add_size 517 | self.value_size = value_size 518 | 519 | # knowledge memory matrix 520 | self.knowledge_memory = nn.Parameter(torch.zeros(self.know_length, self.know_emb_size)) 521 | self.knowledge_memory.data.uniform_(-1, 1) 522 | 523 | # read process embedding module 524 | self.ft_embedding = nn.Linear(self.seq_hidden_size + self.know_emb_size, 50) 525 | self.score_layer = nn.Linear(50, 1) 526 | 527 | # write process embedding module 528 | # erase_size = add_size = seq_hidden_size 529 | self.cks_embedding = nn.Linear(kcnt * 2, self.value_size) 530 | self.erase_embedding = nn.Linear(self.value_size, self.seq_hidden_size) 531 | self.add_embedding = nn.Linear(self.value_size, self.seq_hidden_size) 532 | 533 | # the first student state 534 | self.h_initial = nn.Parameter(torch.zeros(know_length, seq_hidden_size)) 535 | self.h_initial.data.uniform_(-1, 1) 536 | 537 | def forward(self, cks, kn, s, h): 538 | if h is None: 539 | h = self.h_initial.view(self.know_length * self.seq_hidden_size) 540 | 541 | # calculate alpha weights of knowledges using dot product 542 | alpha = torch.mm(self.knowledge_memory, kn.view(-1, 1)).view(-1) 543 | alpha = nn.functional.softmax(alpha.view(1, -1), dim=-1) 544 | 545 | # read process 546 | rt = torch.mm(alpha, h.view(self.know_length, self.seq_hidden_size)).view(-1) 547 | com_r_k = torch.cat([rt, kn.view(-1)]).view(1, -1) 548 | # print(com_r_k.size()) 549 | ft = torch.tanh(self.ft_embedding(com_r_k)) 550 | predict_score = torch.sigmoid(self.score_layer(ft)) 551 | 552 | # write process 553 | vt = self.cks_embedding(cks) 554 | et = torch.sigmoid(self.erase_embedding(vt)) 555 | at = torch.tanh(self.add_embedding(vt)) 556 | ht = h * (1 - (alpha.view(-1, 1) * et).view(-1)) 557 | h = ht + (alpha.view(-1, 1) * at).view(-1) 558 | return predict_score.view(1), h 559 | 560 | 561 | class EKTSeqModel(nn.Module): 562 | """ 563 | Student seq modeling combined with exercise texts and knowledge point 564 | """ 565 | 566 | def __init__(self, topic_size, know_emb_size, know_length, seq_hidden_size, score_mode, num_layers=1): 567 | super(EKTSeqModel, self).__init__() 568 | self.topic_size = topic_size 569 | self.know_emb_size = know_emb_size 570 | self.seq_hidden_size = seq_hidden_size 571 | self.know_length = know_length 572 | self.score_mode = score_mode 573 | self.num_layers = num_layers 574 | # self.with_last = with_last 575 | 576 | # Knowledge memory matrix 577 | self.knowledge_memory = nn.Parameter(torch.zeros(self.know_length, self.know_emb_size)) 578 | self.knowledge_memory.data.uniform_(-1, 1) 579 | 580 | # Student seq rnn 581 | if self.score_mode == 'concat': 582 | self.rnn = nn.GRU(self.topic_size + 1, seq_hidden_size, num_layers) 583 | else: 584 | self.rnn = nn.GRU(self.topic_size * 2 + 1, seq_hidden_size, num_layers) 585 | 586 | # the first student state 587 | self.h_initial = nn.Parameter(torch.zeros(know_length, seq_hidden_size)) 588 | self.h_initial.data.uniform_(-1, 1) 589 | 590 | # prediction layer 591 | self.score_layer = nn.Linear(topic_size + seq_hidden_size, 1) 592 | 593 | def forward(self, v, kn, ko, s, h, beta=None): 594 | if h is None: 595 | h = self.h_initial.view(self.num_layers, self.know_length, self.seq_hidden_size) 596 | length = Variable(torch.FloatTensor([0.])) 597 | 598 | # calculate alpha weights of knowledges using dot product 599 | # print(self.knowledge_memory.size()) 600 | # print(kn.view(-1, 1)) 601 | if beta is None: 602 | alpha = torch.mm(self.knowledge_memory, kn.view(-1, 1)).view(-1) 603 | beta = nn.functional.softmax(alpha.view(1, -1), dim=-1) 604 | # print(beta.argmax(1)) 605 | 606 | # print(alpha.size()) 607 | 608 | # print(h.view(self.know_length, self.seq_hidden_size).size()) 609 | # print(h.type()) 610 | # predict score at time t 611 | hkp = torch.mm(beta, h.view(self.know_length, self.seq_hidden_size)).view(-1) 612 | # print(hkp.size()) 613 | pred_v = torch.cat([v, hkp]).view(1, -1) 614 | # print(pred_v.size()) 615 | predict_score = self.score_layer(pred_v) 616 | 617 | # seq states update 618 | if self.score_mode == 'concat': 619 | x = v 620 | else: 621 | x = torch.cat([v * (s >= 0.5).type_as(v).expand_as(v), 622 | v * (s < 0.5).type_as(v).expand_as(v)]) 623 | x = torch.cat([x, s]) 624 | 625 | # print(x.size()) 626 | # print(torch.ones(self.know_length,1).size()) 627 | # print(x.view(1, -1).size()) 628 | # print(x.type()) 629 | # xk = torch.mm(torch.ones(self.know_length, 1), x.view(1, -1)) 630 | xk = x.view(1, -1).expand(self.know_length, -1) 631 | xk = beta.view(-1, 1) * xk 632 | # xk = ko.float().view(-1, 1) * xk 633 | # print(xk.size()) 634 | # print(alpha.size()) 635 | # xk = torch.mm(alpha, xk).view(-1) 636 | # thresh, idx = alpha.topk(5) 637 | # alpha = (alpha >= thresh[0, 4]).float() 638 | # xk = alpha.view(-1, 1) * xk 639 | # xk = Variable(torch.zeros_like(x)).expand(self.know_length, -1) 640 | 641 | _, h = self.rnn(xk.unsqueeze(0), h) 642 | return predict_score.view(1), h 643 | 644 | 645 | class EKTAttnSeqModel(nn.Module): 646 | """ 647 | Student seq modeling combined with exercise texts and knowledge point 648 | """ 649 | 650 | def __init__(self, topic_size, know_emb_size, know_length, seq_hidden_size, k, score_mode, num_layers=1): 651 | super(EKTAttnSeqModel, self).__init__() 652 | self.topic_size = topic_size 653 | self.know_emb_size = know_emb_size 654 | self.seq_hidden_size = seq_hidden_size 655 | self.know_length = know_length 656 | self.score_mode = score_mode 657 | self.num_layers = num_layers 658 | self.k = k 659 | # self.with_last = with_last 660 | 661 | # Knowledge memory matrix 662 | self.knowledge_memory = nn.Parameter(torch.zeros(self.know_length, self.know_emb_size)) 663 | self.knowledge_memory.data.uniform_(-1, 1) 664 | 665 | # Student seq rnn 666 | if self.score_mode == 'concat': 667 | self.rnn = nn.GRU(self.topic_size + 1, seq_hidden_size, num_layers) 668 | else: 669 | self.rnn = nn.GRU(self.topic_size * 2 + 1, seq_hidden_size, num_layers) 670 | 671 | # the first student state 672 | self.h_initial = nn.Parameter(torch.zeros(know_length, seq_hidden_size)) 673 | self.h_initial.data.uniform_(-1, 1) 674 | 675 | # prediction layer 676 | self.score_layer = nn.Linear(topic_size + seq_hidden_size, 1) 677 | self.k = k 678 | 679 | def forward(self, v, kn, ko, s, hidden): 680 | if hidden is None: 681 | h = self.h_initial.view(self.num_layers, self.know_length, self.seq_hidden_size) 682 | attn_h = self.h_initial 683 | length = Variable(torch.FloatTensor([0.])) 684 | beta = None 685 | 686 | else: 687 | 688 | h, vs, hs = hidden 689 | 690 | # calculate beta weights of seqs using dot product 691 | beta = torch.mm(vs, v.view(-1, 1)).view(-1) 692 | beta, idx = beta.topk(min(len(beta), self.k), sorted=False) 693 | beta = nn.functional.softmax(beta.view(1, -1), dim=-1) 694 | length = Variable(torch.FloatTensor([beta.size()[1]])) 695 | 696 | hs = hs.view(-1, self.know_length * self.seq_hidden_size) 697 | attn_h = torch.mm(beta, torch.index_select(hs, 0, idx)).view(-1) 698 | 699 | # calculate alpha weights of knowledges using dot product 700 | alpha = torch.mm(self.knowledge_memory, kn.view(-1, 1)).view(-1) 701 | alpha = nn.functional.softmax(alpha.view(1, -1), dim=-1) 702 | 703 | hkp = torch.mm(alpha, attn_h.view(self.know_length, self.seq_hidden_size)).view(-1) 704 | pred_v = torch.cat([v, hkp]).view(1, -1) 705 | predict_score = self.score_layer(pred_v) 706 | 707 | # seq states update 708 | if self.score_mode == 'concat': 709 | x = v 710 | else: 711 | x = torch.cat([v * (s >= 0.5).type_as(v).expand_as(v), v * (s < 0.5).type_as(v).expand_as(v)]) 712 | x = torch.cat([x, s]) 713 | 714 | # print(x.size()) 715 | # print(torch.ones(self.know_length,1).size()) 716 | # print(x.view(1, -1).size()) 717 | # print(x.type()) 718 | # xk = torch.mm(torch.ones(self.know_length, 1), x.view(1, -1)) 719 | xk = x.view(1, -1).expand(self.know_length, -1) 720 | xk = alpha.view(-1, 1) * xk 721 | # xk = ko.float().view(-1, 1) * xk 722 | # xk = torch.mm(alpha, xk).view(-1) 723 | 724 | _, h = self.rnn(xk.unsqueeze(0), h) 725 | return predict_score.view(1), h, beta 726 | 727 | 728 | ##### 729 | # 题目表示、序列表示等模块 730 | ### 731 | class TopicIdModel(nn.Module): 732 | """ 733 | 对题号embedding 734 | """ 735 | 736 | def __init__(self, wcnt, word_emb_size): 737 | super(TopicIdModel, self).__init__() 738 | self.embedding = nn.Embedding(wcnt, word_emb_size, padding_idx=0) 739 | 740 | def forward(self, x): 741 | return self.embedding(x)[0] 742 | 743 | 744 | class TopicRNNModel(nn.Module): 745 | """ 746 | 双向RNN(GRU)建模题面 747 | """ 748 | 749 | def __init__(self, wcnt, emb_size, topic_size, num_layers=2): 750 | super(TopicRNNModel, self).__init__() 751 | self.num_layers = num_layers 752 | self.embedding = nn.Embedding(wcnt, emb_size, padding_idx=0) 753 | if num_layers > 1: 754 | self.emb_size = topic_size 755 | self.rnn = nn.GRU(emb_size, topic_size, 1, 756 | bidirectional=True, 757 | dropout=0.1) 758 | self.output = nn.GRU(topic_size * 2, 759 | topic_size, num_layers - 1, 760 | dropout=0.1) 761 | else: 762 | self.emb_size = topic_size // 2 763 | self.rnn = nn.GRU(emb_size, topic_size // 2, 1, 764 | bidirectional=True) 765 | 766 | def forward(self, input, hidden): 767 | x = self.embedding(input) 768 | # print(x.size()) 769 | # exit(0) 770 | y, h1 = self.rnn(x, hidden[0]) 771 | if self.num_layers > 1: 772 | y, h2 = self.output(y, hidden[1]) 773 | return y[-1], (h1, h2) 774 | else: 775 | y, _ = torch.max(y, 0) 776 | return y, (h1, None) 777 | 778 | def default_hidden(self, batch_size): 779 | return Variable(torch.zeros(2, batch_size, self.emb_size)), \ 780 | Variable(torch.zeros(self.num_layers - 1, 781 | batch_size, self.emb_size)) \ 782 | if self.num_layers > 1 else None 783 | 784 | def load_emb(self, emb): 785 | self.embedding.weight.data.copy_(torch.from_numpy(emb)) 786 | 787 | 788 | class DKTModel(nn.Module): 789 | """ 790 | 做题记录序列的RNN(GRU)单元 791 | """ 792 | 793 | def __init__(self, topic_size): 794 | super(DKTModel, self).__init__() 795 | self.topic_size = topic_size 796 | self.rnn = nn.GRU(topic_size * 2, topic_size, 1) 797 | self.score = nn.Linear(topic_size * 2, 1) 798 | 799 | def forward(self, v, s, h): 800 | if h is None: 801 | h = self.default_hidden() 802 | 803 | v = v.type_as(h) 804 | score = self.score(torch.cat([h.view(-1), v.view(-1)])) 805 | 806 | x = torch.cat([v.view(-1), 807 | (v * (s > 0.5).type_as(v). 808 | expand_as(v).type_as(v)).view(-1)]) 809 | _, h = self.rnn(x.view(1, 1, -1), h) 810 | return score.view(1), h 811 | 812 | def default_hidden(self): 813 | return Variable(torch.zeros(1, 1, self.topic_size)) 814 | 815 | 816 | class SeqModel(nn.Module): 817 | """ 818 | 做题记录序列的RNN(GRU)单元 819 | """ 820 | 821 | def __init__(self, topic_size, seq_hidden_size, score_mode, num_layers=1): 822 | super(SeqModel, self).__init__() 823 | self.topic_size = seq_hidden_size 824 | self.seq_hidden_size = topic_size 825 | self.num_layers = num_layers 826 | self.score_mode = score_mode 827 | if self.score_mode == 'concat': 828 | self.rnn = nn.GRU(topic_size + 1, seq_hidden_size, num_layers) 829 | else: 830 | self.rnn = nn.GRU(topic_size * 2 + 1, seq_hidden_size, num_layers) 831 | self.score = nn.Linear(seq_hidden_size + topic_size, 1) 832 | 833 | def forward(self, v, s, h): 834 | if h is None: 835 | h = self.default_hidden() 836 | pred_v = torch.cat([v, h.view(-1)]) 837 | score = self.score(pred_v.view(1, -1)) 838 | 839 | if self.score_mode == 'concat': 840 | x = v 841 | else: 842 | x = torch.cat([v * (s >= 0.5).type_as(v).expand_as(v), 843 | v * (s < 0.5).type_as(v).expand_as(v)]) 844 | x = torch.cat([x, s]) 845 | 846 | _, h = self.rnn(x.view(1, 1, -1), h) 847 | return score.view(1), h 848 | 849 | def default_hidden(self): 850 | return Variable(torch.zeros(self.num_layers, 1, self.seq_hidden_size)) 851 | 852 | 853 | class AttnModel(nn.Module): 854 | """ 855 | 做题记录序列的纯attention模型单元(alpha:题面embedding点乘) 856 | """ 857 | 858 | def __init__(self, topic_size, k): 859 | super(AttnModel, self).__init__() 860 | self.user_emb_size = topic_size 861 | self.k = k 862 | self.initial_guess = Variable(torch.zeros(1), requires_grad=True) 863 | 864 | def forward(self, v, h): 865 | if h is None: 866 | return self.initial_guess 867 | else: 868 | vs, scores = h 869 | scores = scores.view(-1, 1) 870 | 871 | # calculate alpha using dot product 872 | alpha = torch.mm(vs, v.view(-1, 1)).view(-1) 873 | alpha, idx = alpha.topk(min(len(alpha), self.k), sorted=False) 874 | alpha = nn.functional.softmax(alpha.view(1, -1), dim=-1) 875 | 876 | score = torch.mm(alpha, torch.index_select(scores, 0, idx)) 877 | return score.view(1, 1) 878 | 879 | 880 | class AttnSeqModel(nn.Module): 881 | """ 882 | 做题记录序列的RNN+attention模型单元(alpha:题面embedding点乘) 883 | """ 884 | 885 | def __init__(self, topic_size, seq_hidden_size, k, 886 | score_mode, with_last, num_layers=1): 887 | super(AttnSeqModel, self).__init__() 888 | self.topic_size = topic_size 889 | self.seq_hidden_size = seq_hidden_size 890 | self.num_layers = num_layers 891 | self.score_mode = score_mode 892 | if self.score_mode == 'concat': 893 | self.rnn = nn.GRU(topic_size + 1, seq_hidden_size, num_layers) 894 | else: 895 | self.rnn = nn.GRU(topic_size * 2 + 1, seq_hidden_size, num_layers) 896 | self.with_last = with_last 897 | h_size = seq_hidden_size * 2 + 1 if with_last else seq_hidden_size 898 | self.score = nn.Linear(topic_size + h_size, 1) 899 | self.initial_h = nn.Parameter(torch.zeros(self.num_layers * 900 | self.seq_hidden_size)) 901 | self.initial_h.data.uniform_(-1., 1.) 902 | self.k = k 903 | 904 | def forward(self, v, s, hidden): 905 | if hidden is None: 906 | h = self.initial_h.view(self.num_layers, 1, self.seq_hidden_size) 907 | attn_h = self.initial_h 908 | length = Variable(torch.FloatTensor([0.])) 909 | else: 910 | h, vs, hs = hidden 911 | # print(h) 912 | # print('start') 913 | # print(vs.size()) 914 | # print(v.size()) 915 | # print(v.view(-1,1).size()) 916 | # print(torch.mm(vs,v.view(-1,1)).size()) 917 | 918 | # print(hs) 919 | 920 | # calculate alpha using dot product 921 | alpha = torch.mm(vs, v.view(-1, 1)).view(-1) 922 | # print(alpha.size()) 923 | # print('end') 924 | # print(alpha.size()) 925 | alpha, idx = alpha.topk(min(len(alpha), self.k), sorted=False) 926 | alpha = nn.functional.softmax(alpha.view(1, -1), dim=-1) 927 | 928 | length = Variable(torch.FloatTensor([alpha.size()[1]])) 929 | 930 | # flatten each h 931 | hs = hs.view(-1, self.num_layers * self.seq_hidden_size) 932 | attn_h = torch.mm(alpha, torch.index_select(hs, 0, idx)).view(-1) 933 | 934 | if self.with_last: 935 | pred_v = torch.cat([v, attn_h, h.view(-1), length]).view(1, -1) 936 | else: 937 | pred_v = torch.cat([v, attn_h]).view(1, -1) 938 | score = self.score(pred_v) 939 | 940 | if self.score_mode == 'concat': 941 | x = v 942 | else: 943 | x = torch.cat([v * (s >= 0.5).type_as(v).expand_as(v), 944 | v * (s < 0.5).type_as(v).expand_as(v)]) 945 | x = torch.cat([x, s]) 946 | 947 | _, h = self.rnn(x.view(1, 1, -1), h) 948 | return score, h 949 | 950 | def default_hidden(self): 951 | return Variable(torch.zeros(self.num_layers, 1, self.seq_hidden_size)) 952 | 953 | 954 | class AttnSeqTimeDecayModel(nn.Module): 955 | """ 956 | 同AttnSeqModel,但增加了依照考试时间远近调整alpha 957 | """ 958 | 959 | def __init__(self, topic_size, seq_hidden_size, k, 960 | score_mode, num_layers=1): 961 | super(AttnSeqTimeDecayModel, self).__init__() 962 | self.topic_size = topic_size 963 | self.seq_hidden_size = seq_hidden_size 964 | self.num_layers = num_layers 965 | self.score_mode = score_mode 966 | if self.score_mode == 'concat': 967 | self.rnn = nn.GRU(topic_size + 1, seq_hidden_size, num_layers) 968 | else: 969 | self.rnn = nn.GRU(topic_size * 2 + 1, seq_hidden_size, num_layers) 970 | self.score = nn.Linear(topic_size + seq_hidden_size, 1) 971 | self.k = k 972 | self.initial_h = Variable(torch.zeros(self.num_layers * 973 | self.seq_hidden_size), 974 | requires_grad=True) 975 | 976 | def forward(self, v, s, t, hidden): 977 | if hidden is None: 978 | h = self.default_hidden() 979 | attn_h = self.initial_h 980 | else: 981 | vs, hs, ts = hidden 982 | h = hs[-1:] 983 | ts = t.expand_as(ts) - ts 984 | # calculate alpha using dot product 985 | alpha = torch.mm(vs, v.view(-1, 1)).view(-1) 986 | alpha, idx = alpha.topk(min(len(alpha), self.k), sorted=False) 987 | alpha = alpha * ((1 - 1e-7) ** torch.index_select(ts, 0, idx)) 988 | alpha = nn.functional.softmax(alpha.view(1, -1), dim=-1) 989 | 990 | # flatten each h 991 | hs = hs.view(-1, self.num_layers * self.seq_hidden_size) 992 | attn_h = torch.mm(alpha, torch.index_select(hs, 0, idx)).view(-1) 993 | 994 | pred_v = torch.cat([v, attn_h]).view(1, -1) 995 | score = self.score(pred_v) 996 | 997 | if self.score_mode == 'concat': 998 | x = v 999 | else: 1000 | x = torch.cat([v * (s >= 0.5).type_as(v).expand_as(v), 1001 | v * (s < 0.5).type_as(v).expand_as(v)]) 1002 | x = torch.cat([x, s]) 1003 | 1004 | _, h = self.rnn(x.view(1, 1, -1), h) 1005 | return score, h 1006 | 1007 | def default_hidden(self): 1008 | return Variable(torch.zeros(self.num_layers, 1, self.seq_hidden_size)) 1009 | -------------------------------------------------------------------------------- /run.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | 执行模型训练、测试等 5 | """ 6 | 7 | import argparse 8 | import os 9 | import sys 10 | import logging 11 | 12 | from model import * 13 | from command import * 14 | from util import * 15 | 16 | commands = ['config', 'train', 'test', 'testfuture', 'testseq', 'stat', 17 | 'statoverall', 'predict', 'trend', 'kt', 'corr', 'at', 'corr2'] 18 | models = ['RNN', 'Attn', 'RA', 'RADecay', 'LSTMM', 'LSTMA', 'DKT', 19 | 'DKNM', 'DKNA', 'EKTM', 'EKTA', 'DKVMN'] 20 | 21 | 22 | class Config: 23 | def __init__(self, parser): 24 | subs = parser.add_subparsers(title='models available', dest='model') 25 | subs.required = True 26 | group_options = set() 27 | for model in models: 28 | sub = subs.add_parser(model, formatter_class=parser_formatter) 29 | group = sub.add_argument_group('setup') 30 | Model = get_class(model) 31 | Model.add_arguments(group) 32 | for action in group._group_actions: 33 | group_options.add(action.dest) 34 | 35 | def save(args): 36 | for file in os.listdir(args.workspace): 37 | if file.endswith('.json'): 38 | os.remove(os.path.join(args.workspace, file)) 39 | model = args.model 40 | Model = get_class(model) 41 | setup = {name: value for (name, value) in args._get_kwargs() 42 | if name in group_options} 43 | conf = os.path.join(args.workspace, 44 | str(model) + '.json') 45 | m = Model(setup) 46 | print('model: %s, setup: %s' % (model, str(m.args))) 47 | save_config(m, conf) 48 | 49 | sub.set_defaults(func=save) 50 | 51 | def run(self, args): 52 | pass 53 | 54 | 55 | class Train: 56 | def __init__(self, parser): 57 | parser.add_argument('-N', '--epochs', type=int, default=1, 58 | help='number of epochs to train') 59 | parser.add_argument('-d', '--dataset', required=True) 60 | parser.add_argument('-s', '--split_method', 61 | choices=['future', 'user', 'old', 'none']) 62 | parser.add_argument('-f', '--frac', default=0.1, type=float, 63 | help='train data fraction') 64 | parser.add_argument('-rl', '--random_level', default=0, type=int, 65 | help='random level') 66 | parser.add_argument('-l', '--loss', default='mse', 67 | choices=['mse', 'cross_entropy']) 68 | parser.add_argument('--print_every', type=int, default=10, 69 | help='logging interval') 70 | parser.add_argument('--save_every', type=int, default=1000, 71 | help='saving interval') 72 | parser.add_argument('-ik', '--input_knowledge', action='store_true') 73 | parser.add_argument('-it', '--input_text',action='store_true') 74 | 75 | def run(self, args): 76 | for name in os.listdir(args.workspace): 77 | if name.endswith('.json'): 78 | Model = get_class(name.split('.')[0]) 79 | config = os.path.join(args.workspace, name) 80 | break 81 | else: 82 | print('you must run config first!') 83 | sys.exit(1) 84 | 85 | model = load_config(Model, config) 86 | # train(model, args) 87 | trainn(model, args) 88 | 89 | 90 | class Test: 91 | def __init__(self, parser): 92 | parser.add_argument('-e', '--snapshot', 93 | help='model snapshot to test with') 94 | parser.add_argument('-d', '--dataset', 95 | required=True) 96 | parser.add_argument('-t', '--test_as_seq', action='store_true', 97 | help='test sequences using output scores') 98 | parser.add_argument('-o', '--test_on_one', action='store_true', 99 | help='test on next one') 100 | parser.add_argument('-z', '--test_on_last', action='store_true', 101 | help='test last') 102 | parser.add_argument('-r', '--ref_len', type=int, default=0, 103 | help='length of sequence with true scores') 104 | parser.add_argument('-rs', '--ref_set') 105 | parser.add_argument('-s', '--split_method', 106 | choices=['future', 'user', 'old', 'none']) 107 | parser.add_argument('-f', '--frac', default=0.1, type=float, 108 | help='train data fraction') 109 | parser.add_argument('-rl', '--random_level', default=0, type=int, 110 | help='random level') 111 | parser.add_argument('-l', '--loss', default='mse', 112 | choices=['mse', 'cross_entropy']) 113 | parser.add_argument('--print_every', type=int, default=10, 114 | help='logging interval') 115 | parser.add_argument('-ik', '--input_knowledge', action='store_true') 116 | parser.add_argument('-it', '--input_text',action='store_true') 117 | 118 | def run(self, args): 119 | for name in os.listdir(args.workspace): 120 | if name.endswith('.json'): 121 | Model = get_class(name.split('.')[0]) 122 | config = os.path.join(args.workspace, name) 123 | break 124 | else: 125 | print('you must run config first!') 126 | sys.exit(1) 127 | 128 | model = load_config(Model, config) 129 | test(model, args) 130 | 131 | 132 | class Testseq: 133 | def __init__(self, parser): 134 | parser.add_argument('-e', '--snapshot', 135 | help='model snapshot to test with') 136 | parser.add_argument('-d', '--dataset', 137 | required=True) 138 | parser.add_argument('-r', '--ref_len', type=int, default=0, 139 | help='length of sequence with true scores') 140 | parser.add_argument('-rs', '--ref_set') 141 | parser.add_argument('-s', '--split_method', 142 | choices=['future', 'user', 'old', 'none']) 143 | parser.add_argument('-f', '--frac', default=0.1, type=float, 144 | help='train data fraction') 145 | parser.add_argument('-rl', '--random_level', default=0, type=int, 146 | help='random level') 147 | parser.add_argument('-l', '--loss', default='mse', 148 | choices=['mse', 'cross_entropy']) 149 | parser.add_argument('--print_every', type=int, default=10, 150 | help='logging interval') 151 | parser.add_argument('-ik', '--input_knowledge', action='store_true') 152 | parser.add_argument('-it', '--input_text',action='store_true') 153 | 154 | def run(self, args): 155 | for name in os.listdir(args.workspace): 156 | if name.endswith('.json'): 157 | Model = get_class(name.split('.')[0]) 158 | config = os.path.join(args.workspace, name) 159 | break 160 | else: 161 | print('you must run config first!') 162 | sys.exit(1) 163 | 164 | model = load_config(Model, config) 165 | # test(model, args) 166 | testseq(model, args) 167 | 168 | 169 | class Testfuture: 170 | def __init__(self, parser): 171 | parser.add_argument('-e', '--snapshot', 172 | help='model snapshot to test with') 173 | parser.add_argument('-d', '--dataset', 174 | required=True) 175 | parser.add_argument('-r', '--ref_len', type=int, default=0, 176 | help='length of sequence with true scores') 177 | parser.add_argument('-rs', '--ref_set') 178 | parser.add_argument('-s', '--split_method', 179 | choices=['future', 'user', 'old', 'none']) 180 | parser.add_argument('-f', '--frac', default=0.1, type=float, 181 | help='train data fraction') 182 | parser.add_argument('-rl', '--random_level', default=0, type=int, 183 | help='random level') 184 | parser.add_argument('-l', '--loss', default='mse', 185 | choices=['mse', 'cross_entropy']) 186 | parser.add_argument('--print_every', type=int, default=10, 187 | help='logging interval') 188 | parser.add_argument('-ik', '--input_knowledge', action='store_true') 189 | parser.add_argument('-it', '--input_text',action='store_true') 190 | 191 | def run(self, args): 192 | for name in os.listdir(args.workspace): 193 | if name.endswith('.json'): 194 | Model = get_class(name.split('.')[0]) 195 | config = os.path.join(args.workspace, name) 196 | break 197 | else: 198 | print('you must run config first!') 199 | sys.exit(1) 200 | 201 | model = load_config(Model, config) 202 | # test(model, args) 203 | testfuture(model, args) 204 | 205 | 206 | class Stat: 207 | def __init__(self, parser): 208 | parser.add_argument('result_file') 209 | parser.add_argument('-a', '--with_auc', action='store_true') 210 | parser.add_argument('-r', '--round_score', action='store_true') 211 | parser.add_argument('-s', '--short', action='store_true') 212 | 213 | def run(self, args): 214 | stat(open(args.result_file), args.with_auc, args.round_score) 215 | 216 | 217 | class Statoverall: 218 | def __init__(self, parser): 219 | parser.add_argument('result_file') 220 | parser.add_argument('-a', '--with_auc', action='store_true') 221 | parser.add_argument('-r', '--round_score', action='store_true') 222 | parser.add_argument('-s', '--short', action='store_true') 223 | 224 | def run(self, args): 225 | stat_overall(open(args.result_file), args.with_auc, args.round_score) 226 | 227 | 228 | class Trend: 229 | def __init__(self, parser): 230 | parser.add_argument('-e', '--snapshot', 231 | help='model snapshot to test with') 232 | parser.add_argument('-d', '--dataset', required=True) 233 | 234 | def run(self, args): 235 | for name in os.listdir(args.workspace): 236 | if name.endswith('.json'): 237 | Model = get_class(name.split('.')[0]) 238 | config = os.path.join(args.workspace, name) 239 | break 240 | else: 241 | print('you must run config first!') 242 | sys.exit(1) 243 | 244 | model = load_config(Model, config) 245 | if use_cuda: 246 | model.cuda() 247 | trend(model, args.dataset) 248 | 249 | 250 | class Predict: 251 | def __init__(self, parser): 252 | parser.add_argument('-e', '--snapshot', 253 | help='model snapshot to test with') 254 | 255 | def run(self, args): 256 | for name in os.listdir(args.workspace): 257 | if name.endswith('.json'): 258 | Model = get_class(name.split('.')[0]) 259 | config = os.path.join(args.workspace, name) 260 | break 261 | else: 262 | print('you must run config first!') 263 | sys.exit(1) 264 | 265 | model = load_config(Model, config) 266 | predict(model, args) 267 | 268 | 269 | def get_class(name): 270 | return globals()[name[0].upper() + name[1:]] 271 | 272 | 273 | if __name__ == '__main__': 274 | for command in commands: 275 | sub = subparsers.add_parser(command, formatter_class=parser_formatter) 276 | subcommand = get_class(command)(sub) 277 | sub.set_defaults(func=subcommand.run) 278 | 279 | args = parser.parse_args() 280 | workspace = args.workspace 281 | try: 282 | os.makedirs(os.path.join(workspace, 'snapshots')) 283 | os.makedirs(os.path.join(workspace, 'results')) 284 | os.makedirs(os.path.join(workspace, 'logs')) 285 | except OSError: 286 | pass 287 | 288 | logger = logging.getLogger() 289 | logger.setLevel(logging.INFO) 290 | 291 | logFormatter = ColoredFormatter('%(levelname)s %(asctime)s %(message)s', 292 | datefmt='%Y-%m-%d %H:%M:%S') 293 | fileFormatter = logging.Formatter('%(levelname)s %(asctime)s %(message)s', 294 | datefmt='%Y-%m-%d %H:%M:%S') 295 | 296 | if args.command != 'config': 297 | fileHandler = logging.FileHandler(os.path.join(workspace, 'logs', 298 | args.command + '.log')) 299 | fileHandler.setFormatter(fileFormatter) 300 | logger.addHandler(fileHandler) 301 | 302 | consoleHandler = logging.StreamHandler() 303 | consoleHandler.setFormatter(logFormatter) 304 | logger.addHandler(consoleHandler) 305 | 306 | try: 307 | args.func(args) 308 | except KeyboardInterrupt: 309 | logging.warn('cancelled by user') 310 | except Exception as e: 311 | import traceback 312 | sys.stderr.write(traceback.format_exc()) 313 | logging.warn('exception occurred: %s', e) 314 | -------------------------------------------------------------------------------- /util.py: -------------------------------------------------------------------------------- 1 | import json 2 | import torch 3 | import os 4 | import logging 5 | 6 | 7 | def save_config(obj, path): 8 | f = open(path, 'w') 9 | json.dump(obj.args, f, indent=' ') 10 | f.write('\n') 11 | f.close() 12 | 13 | 14 | def load_config(Model, path): 15 | f = open(path, 'r') 16 | return Model(json.load(f)) 17 | 18 | 19 | def save_snapshot(model, ws, id): 20 | filename = os.path.join(ws, 'snapshots', 'model.%s' % str(id)) 21 | f = open(filename, 'wb') 22 | torch.save(model.state_dict(), f) 23 | f.close() 24 | 25 | 26 | def load_snapshot(model, ws, id): 27 | filename = os.path.join(ws, 'snapshots', 'model.%s' % str(id)) 28 | f = open(filename, 'rb') 29 | model.load_state_dict(torch.load(f, map_location=lambda s, loc: s)) 30 | f.close() 31 | 32 | 33 | def load_last_snapshot(model, ws): 34 | last = 0 35 | for file in os.listdir(os.path.join(ws, 'snapshots')): 36 | if 'model.' in file: 37 | epoch = int(file.split('.')[1]) 38 | if epoch > last: 39 | last = epoch 40 | if last > 0: 41 | load_snapshot(model, ws, last) 42 | return last 43 | 44 | 45 | def open_result(ws, name, id): 46 | return open(os.path.join(ws, 'results', '%s.%s' % 47 | (name, str(id))), 'w') 48 | 49 | 50 | use_cuda = torch.cuda.is_available() 51 | 52 | 53 | def Variable(*args, **kwargs): 54 | v = torch.autograd.Variable(*args, **kwargs) 55 | if use_cuda: 56 | v = v.cuda() 57 | return v 58 | 59 | class bcolors: 60 | HEADER = '\033[95m' 61 | OKBLUE = '\033[94m' 62 | OKGREEN = '\033[92m' 63 | WARNING = '\033[93m' 64 | FAIL = '\033[91m' 65 | ENDC = '\033[0m' 66 | BOLD = '\033[1m' 67 | UNDERLINE = '\033[4m' 68 | 69 | 70 | def colored(text, color, bold=False): 71 | if bold: 72 | return bcolors.BOLD + color + text + bcolors.ENDC 73 | else: 74 | return color + text + bcolors.ENDC 75 | 76 | 77 | LOG_COLORS = { 78 | 'WARNING': bcolors.WARNING, 79 | 'INFO': bcolors.OKGREEN, 80 | 'DEBUG': bcolors.OKBLUE, 81 | 'CRITICAL': bcolors.WARNING, 82 | 'ERROR': bcolors.FAIL 83 | } 84 | 85 | 86 | class ColoredFormatter(logging.Formatter): 87 | def __init__(self, msg, datefmt, use_color=True): 88 | logging.Formatter.__init__(self, msg, datefmt=datefmt) 89 | self.use_color = use_color 90 | 91 | def format(self, record): 92 | levelname = record.levelname 93 | if self.use_color and levelname in LOG_COLORS: 94 | record.levelname = colored(record.levelname[0], 95 | LOG_COLORS[record.levelname]) 96 | return logging.Formatter.format(self, record) 97 | --------------------------------------------------------------------------------