├── .vscode ├── .ropeproject │ ├── config.py │ └── objectdb └── settings.json ├── DKT ├── DKT.py ├── README.md ├── prepare_sequences.py ├── prepare_sequences_test.py └── train_DKT.py ├── FeedForwardNetwork ├── FFN.py ├── encode_ffw.py ├── encode_ffw_test.py └── train_ffw.py ├── README.md ├── SAKT ├── model_sakt.py ├── train_sakt.py └── utils_sakt │ ├── logger.py │ ├── metrics.py │ └── misc.py ├── das3h ├── README.md ├── clean.py ├── clean_test.py ├── das3h.py ├── encode.py ├── encode_test.py ├── print_df_infos.py └── results_analysis.py ├── prepare_data.py ├── requirements.txt └── utils ├── logger.py ├── metrics.py └── misc.py /.vscode/.ropeproject/config.py: -------------------------------------------------------------------------------- 1 | # The default ``config.py`` 2 | # flake8: noqa 3 | 4 | 5 | def set_prefs(prefs): 6 | """This function is called before opening the project""" 7 | 8 | # Specify which files and folders to ignore in the project. 9 | # Changes to ignored resources are not added to the history and 10 | # VCSs. Also they are not returned in `Project.get_files()`. 11 | # Note that ``?`` and ``*`` match all characters but slashes. 12 | # '*.pyc': matches 'test.pyc' and 'pkg/test.pyc' 13 | # 'mod*.pyc': matches 'test/mod1.pyc' but not 'mod/1.pyc' 14 | # '.svn': matches 'pkg/.svn' and all of its children 15 | # 'build/*.o': matches 'build/lib.o' but not 'build/sub/lib.o' 16 | # 'build//*.o': matches 'build/lib.o' and 'build/sub/lib.o' 17 | prefs['ignored_resources'] = ['*.pyc', '*~', '.ropeproject', 18 | '.hg', '.svn', '_svn', '.git', '.tox'] 19 | 20 | # Specifies which files should be considered python files. It is 21 | # useful when you have scripts inside your project. Only files 22 | # ending with ``.py`` are considered to be python files by 23 | # default. 24 | # prefs['python_files'] = ['*.py'] 25 | 26 | # Custom source folders: By default rope searches the project 27 | # for finding source folders (folders that should be searched 28 | # for finding modules). You can add paths to that list. Note 29 | # that rope guesses project source folders correctly most of the 30 | # time; use this if you have any problems. 31 | # The folders should be relative to project root and use '/' for 32 | # separating folders regardless of the platform rope is running on. 33 | # 'src/my_source_folder' for instance. 34 | # prefs.add('source_folders', 'src') 35 | 36 | # You can extend python path for looking up modules 37 | # prefs.add('python_path', '~/python/') 38 | 39 | # Should rope save object information or not. 40 | prefs['save_objectdb'] = True 41 | prefs['compress_objectdb'] = False 42 | 43 | # If `True`, rope analyzes each module when it is being saved. 44 | prefs['automatic_soa'] = True 45 | # The depth of calls to follow in static object analysis 46 | prefs['soa_followed_calls'] = 0 47 | 48 | # If `False` when running modules or unit tests "dynamic object 49 | # analysis" is turned off. This makes them much faster. 50 | prefs['perform_doa'] = True 51 | 52 | # Rope can check the validity of its object DB when running. 53 | prefs['validate_objectdb'] = True 54 | 55 | # How many undos to hold? 56 | prefs['max_history_items'] = 32 57 | 58 | # Shows whether to save history across sessions. 59 | prefs['save_history'] = True 60 | prefs['compress_history'] = False 61 | 62 | # Set the number spaces used for indenting. According to 63 | # :PEP:`8`, it is best to use 4 spaces. Since most of rope's 64 | # unit-tests use 4 spaces it is more reliable, too. 
65 | prefs['indent_size'] = 4 66 | 67 | # Builtin and c-extension modules that are allowed to be imported 68 | # and inspected by rope. 69 | prefs['extension_modules'] = [] 70 | 71 | # Add all standard c-extensions to extension_modules list. 72 | prefs['import_dynload_stdmods'] = True 73 | 74 | # If `True` modules with syntax errors are considered to be empty. 75 | # The default value is `False`; When `False` syntax errors raise 76 | # `rope.base.exceptions.ModuleSyntaxError` exception. 77 | prefs['ignore_syntax_errors'] = False 78 | 79 | # If `True`, rope ignores unresolvable imports. Otherwise, they 80 | # appear in the importing namespace. 81 | prefs['ignore_bad_imports'] = False 82 | 83 | # If `True`, rope will insert new module imports as 84 | # `from import ` by default. 85 | prefs['prefer_module_from_imports'] = False 86 | 87 | # If `True`, rope will transform a comma list of imports into 88 | # multiple separate import statements when organizing 89 | # imports. 90 | prefs['split_imports'] = False 91 | 92 | # If `True`, rope will remove all top-level import statements and 93 | # reinsert them at the top of the module when making changes. 94 | prefs['pull_imports_to_top'] = True 95 | 96 | # If `True`, rope will sort imports alphabetically by module name instead 97 | # of alphabetically by import statement, with from imports after normal 98 | # imports. 99 | prefs['sort_imports_alphabetically'] = False 100 | 101 | # Location of implementation of 102 | # rope.base.oi.type_hinting.interfaces.ITypeHintingFactory In general 103 | # case, you don't have to change this value, unless you're an rope expert. 104 | # Change this value to inject you own implementations of interfaces 105 | # listed in module rope.base.oi.type_hinting.providers.interfaces 106 | # For example, you can add you own providers for Django Models, or disable 107 | # the search type-hinting in a class hierarchy, etc. 108 | prefs['type_hinting_factory'] = ( 109 | 'rope.base.oi.type_hinting.factory.default_type_hinting_factory') 110 | 111 | 112 | def project_opened(project): 113 | """This function is called after opening the project""" 114 | # Do whatever you like here! 115 | -------------------------------------------------------------------------------- /.vscode/.ropeproject/objectdb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thosgt/edm_main_algorithms/e678dcd0342f251d120af3ea5946cc8a1eae4771/.vscode/.ropeproject/objectdb -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "python.pythonPath": "/Users/tho_sergent/.local/share/virtualenvs/edm_main_algorithms-MrmaA6op/bin/python", 3 | "python.testing.pytestArgs": [ 4 | "src" 5 | ], 6 | "python.testing.unittestEnabled": false, 7 | "python.testing.nosetestsEnabled": false, 8 | "python.testing.pytestEnabled": true 9 | } -------------------------------------------------------------------------------- /DKT/DKT.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | 6 | class DKT(nn.Module): 7 | def __init__(self, num_items, num_skills, hid_size, num_hid_layers, drop_prob, 8 | item_in, skill_in, item_out, skill_out): 9 | """Deep knowledge tracing. 
10 | Arguments: 11 | num_items (int): number of items 12 | num_skills (int): number of skills 13 | hid_size (int): hidden layer dimension 14 | num_hid_layers (int): number of hidden layers 15 | drop_prob (float): dropout probability 16 | item_in (bool): if True, use items as inputs 17 | skill_in (bool): if True, use skills as inputs 18 | item_out (bool): if True, use items as outputs 19 | skill_out (bool): if True, use skills as outputs 20 | """ 21 | super(DKT, self).__init__() 22 | self.num_items = num_items 23 | self.num_skills = num_skills 24 | self.item_in = item_in 25 | self.skill_in = skill_in 26 | self.item_out = item_out 27 | self.skill_out = skill_out 28 | self.input_size = (2 * num_items + 1) * item_in + (2 * num_skills + 1) * skill_in 29 | self.output_size = num_items * item_out + num_skills * skill_out 30 | 31 | self.lstm = nn.LSTM(self.input_size, hid_size, num_hid_layers, batch_first=True) 32 | self.dropout = nn.Dropout(p=drop_prob) 33 | self.out = nn.Linear(hid_size, self.output_size) 34 | 35 | def forward(self, item_inputs, skill_inputs, hidden=None): 36 | # Pad inputs with 0, this explains the +1 37 | if (item_inputs is not None) and (skill_inputs is not None): 38 | items_onehot = F.one_hot(item_inputs, 2 * self.num_items + 1).float() 39 | skills_onehot = F.one_hot(skill_inputs, 2 * self.num_skills + 1).float() 40 | input = torch.cat((items_onehot, skills_onehot), -1) 41 | elif (item_inputs is not None): 42 | input = F.one_hot(item_inputs, 2 * self.num_items + 1).float() 43 | elif (skill_inputs is not None): 44 | input = F.one_hot(skill_inputs, 2 * self.num_skills + 1).float() 45 | else: 46 | raise ValueError("Use at least one of skills or items as input") 47 | 48 | output, hidden = self.lstm(input, hx=hidden) 49 | return self.out(self.dropout(output)), hidden 50 | 51 | def repackage_hidden(self, hidden): 52 | # Return detached hidden for TBPTT 53 | return tuple((v.detach() for v in hidden)) 54 | 55 | -------------------------------------------------------------------------------- /DKT/README.md: -------------------------------------------------------------------------------- 1 | This is an Pytorch implementation of [Deep Knowledge Tracing](https://stanford.edu/~cpiech/bio/papers/deepKnowledgeTracing.pdf) for a dataset like Lalilo's. 2 | 3 | ### Some context 4 | 5 | Knowledge Tracing (KT) is measuring the evolving knowledge of a student over time. Usually this knowledge is captured in a vector of numbers. Several algorithms have been tried over the years like Item Response Theory (IRT), Bayesian Knowledge Tracing (BKT) etc. 6 | 7 | Recently, new KT models based on Neural Networks were tested. Deep Knowledge Tracing (DKT) was the first among them. 8 | 9 | ## How Deep Knowledge Tracing works 10 | 11 | ### High level description 12 | 13 | With any Machine Learning algorithm, the first question is : what do I want to predict ? 14 | 15 | Here I want to predict the probability of success of a student to an exercise. 16 | 17 | Long-Short-Term Memory neural networks (LSTMs) seem a good way to model this : they possess a hidden state supposedly able to capture the evolving student knowledge with each answer. 18 | The hidden state is comprised of ```n_hidden``` dimensions representing the estimated student knowledge state. 19 | 20 | From the student knowledge state we should then be able to predict the probability of success of a student to any exercise after any of their answer. 
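To make this more concrete, here is a minimal sketch of that idea (illustration only, not the code of this repository; `n_available_exercises`, `n_hidden` and the sequence length are made-up values):

```python
import torch
import torch.nn as nn

n_available_exercises = 50  # hypothetical number of distinct exercises
n_hidden = 64               # hypothetical size of the hidden knowledge state

# The LSTM consumes one-hot encoded (exercise, correctness) answers and keeps a hidden
# state per student; a linear layer maps that state to one logit per exercise.
lstm = nn.LSTM(input_size=2 * n_available_exercises, hidden_size=n_hidden, batch_first=True)
to_logits = nn.Linear(n_hidden, n_available_exercises)

answers = torch.zeros(1, 10, 2 * n_available_exercises)    # one student, 10 answers
knowledge_states, _ = lstm(answers)                         # (1, 10, n_hidden)
success_probs = torch.sigmoid(to_logits(knowledge_states))  # (1, 10, n_available_exercises)
```

After each answer, `success_probs` holds the estimated probability of success on every available exercise, read off the current hidden (knowledge) state.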
21 | 
22 | #### More advanced paragraph 
23 | 
24 | A central hypothesis of this algorithm is that the hidden state transitions are the same for each student. Training the model means computing the matrix governing the hidden state transitions. 
25 | 
26 | ### Lower level description - if you are familiar with Machine Learning in general 
27 | 
28 | There are two kinds of matrices that we need to distinguish here: 
29 | - the hidden state, which is specific to a given student. There is one hidden state per student. At the beginning of their exercise sequence, the hidden state of a student is a matrix of zeros. It is updated after each exercise they answer 
30 | - the weights of the network, which govern the transition between hidden states and the mapping between hidden states and predicted probabilities. These weights are the same for all students. They are updated during training so that they fit student transitions as well as possible: training the model means updating these weights 
31 | 
32 | #### What are we going to feed our network to train it? 
33 | We are going to feed our network the exercise sequence of each student, one student after the other (it is indeed a sequence, as the exercises are done one after the other and not simultaneously). Therefore, for each student: 
34 | - we select the exercises and answers of this student 
35 | - the hidden state of the student is set to a zero-like vector 
36 | - then, for each exercise of their exercise sequence: 
37 | - using the hidden state of the student and the mapping between hidden state and expected probabilities, we predict the probability that they answer the exercise they get correctly and compare it to the actual correctness 
38 | - we update the network weights so that the predicted probability is closer to the actual correctness 
39 | - the hidden state of the student is updated 
40 | 
41 | This is how we train the model to find its weights. 
42 | 
43 | #### Lowest level - if you are already familiar with recurrent neural networks in particular 
44 | 
45 | What kind of input does the algorithm take, and what kind of output does it produce? 
46 | 
47 | For each student, the input size varies, as the number of exercises each of them answered varies. We call the number of exercises a student answered ```sequence_length```. 
48 | 
49 | One central question here is: how do we represent an answer to one of the exercises? 
50 | 
51 | The solution chosen in the article is to [one-hot-encode](https://hackernoon.com/what-is-one-hot-encoding-why-and-when-do-you-have-to-use-it-e3c6186d008f) the tuple ```(exercise, correctness)```. 
52 | For each answer, the student takes one out of 2 x ```n_available_exercises``` possible actions: they get one of ```n_available_exercises``` exercises and can answer it correctly or not. Thus, for each answer, the input is a one-hot-encoded vector of size ```2 x n_available_exercises```. 
53 | 
54 | As output, we would like to have the probabilities of success on all available exercises for the *next* exercise the student does. Therefore, for each answer of the student, the LSTM outputs ```n_available_exercises``` probabilities. 
55 | 
56 | When the student hasn't answered any exercise yet, we still want probabilities from the LSTM. These would be the starting probabilities for all students, when there is no hidden state yet. However, the LSTM doesn't output anything when it doesn't have an input. The solution we found is to use a row full of zeros as the first input of the sequence, i.e. to shift the answers by one step. 
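As an illustration only (a toy sketch with made-up values, not the preprocessing code of this repository), building this shifted one-hot input for a single student could look like:

```python
import numpy as np

n_available_exercises = 3        # hypothetical
exercises = [0, 2, 2, 1]         # exercises answered by one student, in order
correctness = [1, 0, 1, 1]       # 1 = correct, 0 = incorrect

# One row per answer: one-hot encoding of the (exercise, correctness) tuple
answers_onehot = np.zeros((len(exercises), 2 * n_available_exercises))
for t, (exercise, correct) in enumerate(zip(exercises, correctness)):
    answers_onehot[t, 2 * exercise + correct] = 1

# Shift by one step: the first input is a row of zeros (no history yet), and the
# answer at step t is never part of the input used to predict step t itself.
inputs = np.vstack([np.zeros((1, 2 * n_available_exercises)), answers_onehot[:-1]])
print(inputs.shape)  # (sequence_length, 2 * n_available_exercises)
```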
57 | 58 | To sum up, the input of the LSTM for each student is of shape ```(sequence_length, 2 x n_available_exercises)```and the output is ```(sequence_length, n_available_exercises)``` 59 | 60 | Computing the loss is straightforward once we have the predictions for every available exercise and for every actual answer. For each anwer, we need to select the prediction that is relevant (we need only one of the ```n_available_exercises``` predictions, the prediction of the exercise that was actually answered) and compute the ```log_loss``` between it and its actual correctness. 61 | 62 | After propagating the gradient and updating the weights we can go to another student sequence and update the weights once more. 63 | 64 | 65 | ### FAQ 66 | ##### Why not feeding the entire sequences of answers of all students to the neural network ? 67 | The hidden state is specific to each student so it has to be zeroed between students. That is why the network is fed sequence by sequence. -------------------------------------------------------------------------------- /DKT/prepare_sequences.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | 4 | from sklearn.preprocessing import LabelEncoder 5 | 6 | 7 | def prepare_df(df: pd.DataFrame)-> pd.DataFrame: 8 | label_encoder = LabelEncoder() 9 | df_copy = df.copy() 10 | df_copy["exercise_code_level_lesson"] = label_encoder.fit_transform( 11 | df_copy["exercise_code_level_lesson"] 12 | ) 13 | df_copy["concat_exercise_correctness"] = ( 14 | df_copy["exercise_code_level_lesson"].map(str) 15 | + "_" 16 | + df_copy["correctness"].map(str) 17 | ) 18 | df_to_feed_network = pd.get_dummies( 19 | df_copy[ 20 | ["correctness", "student_id", "exercise_code_level_lesson", "concat_exercise_correctness"] 21 | ], 22 | columns=["concat_exercise_correctness"], 23 | sparse=True, 24 | ) 25 | n_expected_columns = df_copy.exercise_code_level_lesson.unique().shape[0] 26 | expected_columns = [] 27 | for i in range(n_expected_columns): 28 | for correctness in (0, 1): 29 | expected_columns.append(f'concat_exercise_correctness_{i}_{correctness}') 30 | for column in expected_columns: 31 | if column not in df_to_feed_network.columns: 32 | df_to_feed_network[column] = 0 33 | return df_to_feed_network, label_encoder 34 | 35 | 36 | def prepare_sequences(df: pd.DataFrame): 37 | # idea have a generator to spare memory ? no if several epochs 38 | # idea add shuffling somewhere ? 39 | student_ids = df["student_id"].unique() 40 | exercise_sequences = [] 41 | # le.save somewhere ? 
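# One sequence (DataFrame) per student: the LSTM hidden state is specific to a student,
# so it must be reset between students and the sequences are fed to the network one by one.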
42 | for student_id in student_ids: 43 | df_of_student = df[df["student_id"] == student_id].drop(columns=["student_id"]) 44 | exercise_sequences.append(df_of_student) 45 | return exercise_sequences 46 | 47 | -------------------------------------------------------------------------------- /DKT/prepare_sequences_test.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | from prepare_sequences import prepare_df 4 | from pandas.testing import assert_frame_equal 5 | 6 | df = pd.DataFrame( 7 | { 8 | "student_id": [1, 2, 3, 1, 5, 5, 4], 9 | "exercise_code_level_lesson": [ 10 | "phono_3_lesson_102", 11 | "phono_3_lesson_101", 12 | "phono_3_lesson_103", 13 | "phono_3_lesson_101", 14 | "phono_3_lesson_102", 15 | "phono_3_lesson_103", 16 | "phono_3_lesson_103", 17 | ], 18 | "correctness": [0, 1, 1, 0, 1, 0, 1], 19 | } 20 | ) 21 | 22 | 23 | def test_that_cleaning_of_df_works(): 24 | prepared_df, _ = prepare_df(df) 25 | expected_prepared_df = pd.DataFrame( 26 | { 27 | "student_id": {0: 1, 1: 2, 2: 3, 3: 1, 4: 5, 5: 5, 6: 4}, 28 | "correctness": {0: 0, 1: 1, 2: 1, 3: 0, 4: 1, 5: 0, 6: 1}, 29 | "exercise_code_level_lesson": {0: 1, 1: 0, 2: 2, 3: 0, 4: 1, 5: 2, 6: 2}, 30 | "concat_exercise_correctness_0_0": { 31 | 0: 0, 32 | 1: 0, 33 | 2: 0, 34 | 3: 1, 35 | 4: 0, 36 | 5: 0, 37 | 6: 0, 38 | }, 39 | "concat_exercise_correctness_0_1": { 40 | 0: 0, 41 | 1: 1, 42 | 2: 0, 43 | 3: 0, 44 | 4: 0, 45 | 5: 0, 46 | 6: 0, 47 | }, 48 | "concat_exercise_correctness_1_0": { 49 | 0: 1, 50 | 1: 0, 51 | 2: 0, 52 | 3: 0, 53 | 4: 0, 54 | 5: 0, 55 | 6: 0, 56 | }, 57 | "concat_exercise_correctness_1_1": { 58 | 0: 0, 59 | 1: 0, 60 | 2: 0, 61 | 3: 0, 62 | 4: 1, 63 | 5: 0, 64 | 6: 0, 65 | }, 66 | "concat_exercise_correctness_2_0": { 67 | 0: 0, 68 | 1: 0, 69 | 2: 0, 70 | 3: 0, 71 | 4: 0, 72 | 5: 1, 73 | 6: 0, 74 | }, 75 | "concat_exercise_correctness_2_1": { 76 | 0: 0, 77 | 1: 0, 78 | 2: 1, 79 | 3: 0, 80 | 4: 0, 81 | 5: 0, 82 | 6: 1, 83 | }, 84 | } 85 | ) 86 | assert_frame_equal(prepared_df, expected_prepared_df, check_dtype=False, check_like=True) 87 | 88 | -------------------------------------------------------------------------------- /DKT/train_DKT.py: -------------------------------------------------------------------------------- 1 | # mainly from theophilee/kt-algos 2 | import argparse 3 | import pandas as pd 4 | from random import shuffle 5 | from sklearn.metrics import roc_auc_score, accuracy_score 6 | 7 | import torch.nn as nn 8 | from torch.optim import Adam 9 | from torch.nn.utils.rnn import pad_sequence 10 | from tqdm import tqdm 11 | 12 | from model_dkt import DKT 13 | from utils import * 14 | 15 | 16 | def get_data(df, item_in, skill_in, item_out, skill_out, train_split=0.8): 17 | """Extract sequences from dataframe. 
18 | Arguments: 19 | df (pandas Dataframe): output by prepare_data.py 20 | item_in (bool): if True, use items as inputs 21 | skill_in (bool): if True, use skills as inputs 22 | item_out (bool): if True, use items as outputs 23 | skill_out (bool): if True, use skills as outputs 24 | train_split (float): proportion of data to use for training 25 | """ 26 | item_ids = [torch.tensor(u_df["item_id"].values, dtype=torch.long) 27 | for _, u_df in df.groupby("user_id")] 28 | skill_ids = [torch.tensor(u_df["skill_id"].values, dtype=torch.long) 29 | for _, u_df in df.groupby("user_id")] 30 | labels = [torch.tensor(u_df["correct"].values, dtype=torch.long) 31 | for _, u_df in df.groupby("user_id")] 32 | 33 | item_inputs = [torch.cat((torch.zeros(1, dtype=torch.long), i * 2 + l + 1))[:-1] 34 | for (i, l) in zip(item_ids, labels)] 35 | skill_inputs = [torch.cat((torch.zeros(1, dtype=torch.long), s * 2 + l + 1))[:-1] 36 | for (s, l) in zip(skill_ids, labels)] 37 | 38 | item_inputs = item_inputs if item_in else [None] * len(item_inputs) 39 | skill_inputs = skill_inputs if skill_in else [None] * len(skill_inputs) 40 | item_ids = item_ids if item_out else [None] * len(item_ids) 41 | skill_ids = skill_ids if skill_out else [None] * len(skill_ids) 42 | 43 | data = list(zip(item_inputs, skill_inputs, item_ids, skill_ids, labels)) 44 | shuffle(data) 45 | 46 | # Train-test split across users 47 | train_size = int(train_split * len(data)) 48 | train_data, val_data = data[:train_size], data[train_size:] 49 | return train_data, val_data 50 | 51 | 52 | def prepare_batches(data, batch_size): 53 | """Prepare batches grouping padded sequences. 54 | Arguments: 55 | data (list of lists of torch Tensor): output by get_data 56 | batch_size (int): number of sequences per batch 57 | Output: 58 | batches (list of lists of torch Tensor) 59 | """ 60 | shuffle(data) 61 | batches = [] 62 | 63 | for k in range(0, len(data), batch_size): 64 | batch = data[k:k + batch_size] 65 | seq_lists = list(zip(*batch)) 66 | inputs_and_ids = [pad_sequence(seqs, batch_first=True, padding_value=0) 67 | if (seqs[0] is not None) else None for seqs in seq_lists[:4]] 68 | labels = pad_sequence(seq_lists[-1], batch_first=True, padding_value=-1) # Pad labels with -1 69 | batches.append([*inputs_and_ids, labels]) 70 | 71 | return batches 72 | 73 | 74 | def get_preds(preds, item_ids, skill_ids, labels): 75 | preds = preds[labels >= 0] 76 | 77 | if (item_ids is not None): 78 | item_ids = item_ids[labels >= 0] 79 | preds = preds[torch.arange(preds.size(0)), item_ids] 80 | elif (skill_ids is not None): 81 | skill_ids = skill_ids[labels >= 0] 82 | preds = preds[torch.arange(preds.size(0)), skill_ids] 83 | else: 84 | raise ValueError("Use exactly one of skills or items as output") 85 | 86 | return preds 87 | 88 | 89 | def compute_auc(preds, item_ids, skill_ids, labels): 90 | preds = get_preds(preds, item_ids, skill_ids, labels) 91 | labels = labels[labels >= 0].float() 92 | 93 | if len(torch.unique(labels)) == 1: # Only one class 94 | auc = accuracy_score(labels, torch.sigmoid(preds).round()) 95 | else: 96 | auc = roc_auc_score(labels, preds) 97 | 98 | return auc 99 | 100 | 101 | def compute_loss(preds, item_ids, skill_ids, labels, criterion): 102 | preds = get_preds(preds, item_ids, skill_ids, labels) 103 | labels = labels[labels >= 0].float() 104 | return criterion(preds, labels) 105 | 106 | 107 | def train(train_data, val_data, model, optimizer, logger, saver, num_epochs, batch_size, bptt=50): 108 | """Train DKT model. 
109 | 110 | Arguments: 111 | train_data (list of lists of torch Tensor) 112 | val_data (list of lists of torch Tensor) 113 | model (torch Module) 114 | optimizer (torch optimizer) 115 | logger: wrapper for TensorboardX logger 116 | num_epochs (int): number of epochs to train for 117 | batch_size (int) 118 | bptt (int): length of truncated backprop through time chunks 119 | savepath (str): directory where to save the trained model 120 | """ 121 | criterion = nn.BCEWithLogitsLoss() 122 | metrics = Metrics() 123 | step = 0 124 | 125 | for epoch in tqdm(range(num_epochs)): 126 | train_batches = prepare_batches(train_data, batch_size) 127 | val_batches = prepare_batches(val_data, batch_size) 128 | 129 | # Training 130 | for item_inputs, skill_inputs, item_ids, skill_ids, labels in train_batches: 131 | length = labels.size(1) 132 | preds = torch.empty(labels.size(0), length, model.output_size) 133 | if item_inputs is not None: 134 | item_inputs.to(device=args.device) 135 | if skill_inputs is not None: 136 | skill_inputs.to(device=args.device) 137 | preds.to(device=args.device) 138 | 139 | # Truncated backprop through time 140 | for i in range(0, length, bptt): 141 | item_inp = item_inputs[:, i:i + bptt] if item_inputs is not None else None 142 | skill_inp = skill_inputs[:, i:i + bptt] if skill_inputs is not None else None 143 | if i == 0: 144 | pred, hidden = model(item_inp, skill_inp) 145 | else: 146 | hidden = model.repackage_hidden(hidden) 147 | pred, hidden = model(item_inp, skill_inp, hidden) 148 | preds[:, i:i + bptt] = pred 149 | 150 | loss = compute_loss(preds, item_ids, skill_ids, labels.to(device=args.device), criterion) 151 | train_auc = compute_auc(preds.detach().cpu(), item_ids, skill_ids, labels) 152 | 153 | model.zero_grad() 154 | loss.backward() 155 | optimizer.step() 156 | step += 1 157 | metrics.store({'loss/train': loss.item()}) 158 | metrics.store({'auc/train': train_auc}) 159 | 160 | # Logging 161 | if step % 20 == 0: 162 | logger.log_scalars(metrics.average(), step) 163 | #weights = {"weight/" + name: param for name, param in model.named_parameters()} 164 | #grads = {"grad/" + name: param.grad 165 | # for name, param in model.named_parameters() if param.grad is not None} 166 | #logger.log_histograms(weights, step) 167 | #logger.log_histograms(grads, step) 168 | 169 | # Validation 170 | model.eval() 171 | for item_inputs, skill_inputs, item_ids, skill_ids, labels in val_batches: 172 | with torch.no_grad(): 173 | if item_inputs is not None: 174 | item_inputs.to(device=args.device) 175 | if skill_inputs is not None: 176 | skill_inputs.to(device=args.device) 177 | preds, _ = model(item_inputs, skill_inputs) 178 | val_auc = compute_auc(preds.cpu(), item_ids, skill_ids, labels) 179 | metrics.store({'auc/val': val_auc}) 180 | model.train() 181 | 182 | # Save model 183 | average_metrics = metrics.average() 184 | logger.log_scalars(average_metrics, step) 185 | stop = saver.save(average_metrics['auc/val'], model) 186 | if stop: 187 | break 188 | 189 | 190 | if __name__ == "__main__": 191 | parser = argparse.ArgumentParser(description='Train DKT.') 192 | parser.add_argument('--dataset', type=str) 193 | parser.add_argument('--logdir', type=str, default='runs/dkt') 194 | parser.add_argument('--savedir', type=str, default='save/dkt') 195 | parser.add_argument('--item_in', action='store_true', 196 | help='If True, use items as inputs.') 197 | parser.add_argument('--skill_in', action='store_true', 198 | help='If True, use skills as inputs.') 199 | parser.add_argument('--item_out', 
action='store_true', 200 | help='If True, use items as outputs.') 201 | parser.add_argument('--skill_out', action='store_true', 202 | help='If True, use skills as outputs.') 203 | parser.add_argument('--hid_size', type=int, default=200) 204 | parser.add_argument('--num_hid_layers', type=int, default=1) 205 | parser.add_argument('--drop_prob', type=float, default=0.5) 206 | parser.add_argument('--batch_size', type=int, default=100) 207 | parser.add_argument('--lr', type=float, default=1e-2) 208 | parser.add_argument('--num_epochs', type=int, default=100) 209 | args = parser.parse_args() 210 | 211 | assert (args.item_in or args.skill_in) # Use at least one of skills or items as input 212 | assert (args.item_out != args.skill_out) # Use exactly one of skills or items as output 213 | 214 | df = pd.read_csv(os.path.join('data', args.dataset, 'preprocessed_data.csv'), sep="\t") 215 | 216 | train_data, val_data = get_data(df, args.item_in, args.skill_in, args.item_out, args.skill_out) 217 | 218 | num_items = int(df["item_id"].max() + 1) + 1 219 | num_skills = int(df["skill_id"].max() + 1) + 1 220 | 221 | model = DKT(num_items, num_skills, args.hid_size, args.num_hid_layers, args.drop_prob, 222 | args.item_in, args.skill_in, args.item_out, args.skill_out) 223 | model = nn.DataParallel(model) 224 | model.to(device=args.device) 225 | optimizer = Adam(model.parameters(), lr=args.lr) 226 | 227 | logger.close() -------------------------------------------------------------------------------- /FeedForwardNetwork/FFN.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | class FeedForwardNetwork(nn.Module): 6 | def __init__(self, n_skills, n_items, n_counters, hidden_dim, drop_prob): 7 | super(FeedForwardNetwork, self).__init__() 8 | self.lin_features_to_hidden = nn.Linear(n_counters * (n_items + n_skills), hidden_dim) 9 | self.lin_hidden_to_output = nn.Linear(hidden_dim, n_items) 10 | self.dropout = nn.Dropout(p=drop_prob) 11 | 12 | def forward(self, input): 13 | hidden_state = F.relu(self.lin_features_to_hidden(input)) 14 | output = self.lin_hidden_to_output(self.dropout(hidden_state)) 15 | return output 16 | -------------------------------------------------------------------------------- /FeedForwardNetwork/encode_ffw.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import pandas as pd 4 | import numpy as np 5 | 6 | from scipy import sparse 7 | from scipy.sparse import csr_matrix, hstack, vstack 8 | from tqdm import tqdm 9 | from sklearn.preprocessing import OneHotEncoder 10 | 11 | 12 | COUNTERS = ("attempts", "wins") 13 | 14 | 15 | def encode_df(df, Q_mat, skill_counters=True): 16 | """Build sparse dataset from dense dataset and q-matrix. 
17 | 18 | Arguments: 19 | df (pandas DataFrame): output by prepare_data.py 20 | Q_mat (sparse array): q-matrix, output by prepare_data.py 21 | skill_counters: if we want to include the counters of skill as well 22 | 23 | Output: 24 | sparse_df (sparse array): sparse dataset where first 4 columns are the same as in df 25 | """ 26 | n_items = Q_mat.shape[0] 27 | onehot_items = OneHotEncoder(categories=[range(n_items)]) 28 | onehot_items.fit_transform(df["item_id"].values.reshape(-1, 1)) 29 | 30 | features = [] 31 | for user_id in tqdm(df["user_id"].unique()): 32 | df_user = df[df["user_id"] == user_id] 33 | user_features = encode_user_ffw( 34 | df_user, 35 | onehot_items=onehot_items, 36 | Q_mat=Q_mat, 37 | skill_counters=skill_counters, 38 | ) 39 | user_features = hstack((csr_matrix(df_user.values), user_features)) 40 | features.append(user_features) 41 | return vstack(features) 42 | 43 | 44 | def encode_user_ffw(df_user, Q_mat, onehot_items, skill_counters=True): 45 | labels = csr_matrix(df_user["correct"].values.reshape(-1, 1)) 46 | item_ids = df_user["item_id"].values.reshape(-1, 1) 47 | item_ids_onehot = onehot_items.transform(item_ids) 48 | 49 | skill_ids_onehot = Q_mat[item_ids.flatten()] 50 | 51 | all_counters = [] 52 | for counter in COUNTERS: 53 | user_item_counter = get_user_counter(item_ids_onehot, labels, counter=counter) 54 | all_counters.append(user_item_counter) 55 | if skill_counters: 56 | user_skill_counter = get_user_counter( 57 | skill_ids_onehot, labels, counter=counter 58 | ) 59 | all_counters.append(user_skill_counter) 60 | return hstack(all_counters) 61 | 62 | 63 | def get_user_counter(feature_id_onehot, labels, counter): 64 | array_to_accumulate = feature_id_onehot.toarray() 65 | if counter == "attempts": 66 | pass 67 | elif counter == "wins": 68 | array_to_accumulate *= labels.toarray() 69 | counts = accumulate(array_to_accumulate) 70 | counter = phi(counts) 71 | return counter 72 | 73 | 74 | def accumulate(x): 75 | return vstack((csr_matrix((1, x.shape[1])), csr_matrix(np.cumsum(x, 0))))[:-1] 76 | 77 | 78 | def phi(x): 79 | return x.log1p() 80 | 81 | 82 | if __name__ == "__main__": 83 | parser = argparse.ArgumentParser( 84 | description="Encode feature matrix for feedforward network baseline." 
85 | ) 86 | parser.add_argument("--dataset", type=str) 87 | parser.add_argument("--n_traces", type=int, default=20000) 88 | 89 | args = parser.parse_args() 90 | 91 | data_path = os.path.join("data", args.dataset) 92 | 93 | df = pd.read_csv(os.path.join(data_path, "preprocessed_data.csv"), sep="\t") 94 | Q_mat = sparse.load_npz(os.path.join(data_path, "q_mat.npz")) 95 | X = encode_df(df[-args.n_traces :], Q_mat) 96 | sparse.save_npz(os.path.join(data_path, f"X-ffw-{args.n_traces}-traces"), X) 97 | 98 | -------------------------------------------------------------------------------- /FeedForwardNetwork/encode_ffw_test.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | from scipy import sparse 4 | 5 | from pandas.testing import assert_frame_equal 6 | from numpy.testing import assert_array_equal 7 | from encode_ffw import encode_user_ffw, get_user_counter, encode_df 8 | from sklearn.preprocessing import OneHotEncoder 9 | 10 | 11 | def test_attempts_counter(): 12 | df_exercise_tuple = pd.DataFrame( 13 | { 14 | "phono_3_lesson_102": [1, 1, 0, 1, 0, 1, 1], 15 | "phono_3_lesson_103": [0, 0, 1, 0, 1, 0, 0], 16 | "correct": [0, 1, 1, 0, 1, 0, 1], 17 | } 18 | ) 19 | feature_id_onehot = sparse.csr_matrix( 20 | df_exercise_tuple[["phono_3_lesson_102", "phono_3_lesson_103"]].values 21 | ) 22 | labels = sparse.csr_matrix(df_exercise_tuple["correct"].values.reshape(-1, 1)) 23 | counter = "attempts" 24 | user_attempts = get_user_counter(feature_id_onehot, labels, counter).toarray() 25 | expected_array = np.log( 26 | 1 + np.array([[0, 0], [1, 0], [2, 0], [2, 1], [3, 1], [3, 2], [4, 2]]) 27 | ) 28 | assert_array_equal(user_attempts, expected_array) 29 | 30 | 31 | def test_wins_counter(): 32 | df_exercise_tuple = pd.DataFrame( 33 | { 34 | "phono_3_lesson_102": [1, 1, 0, 1, 0, 1, 1], 35 | "phono_3_lesson_103": [0, 0, 1, 0, 1, 0, 0], 36 | "correct": [0, 1, 1, 0, 1, 0, 1], 37 | } 38 | ) 39 | feature_id_onehot = sparse.csr_matrix( 40 | df_exercise_tuple[["phono_3_lesson_102", "phono_3_lesson_103"]].values 41 | ) 42 | labels = sparse.csr_matrix(df_exercise_tuple["correct"].values.reshape(-1, 1)) 43 | counter = "wins" 44 | user_attempts = get_user_counter(feature_id_onehot, labels, counter).toarray() 45 | expected_array = np.log( 46 | 1 + np.array([[0, 0], [0, 0], [1, 0], [1, 1], [1, 1], [1, 2], [1, 2]]) 47 | ) 48 | assert_array_equal(user_attempts, expected_array) 49 | 50 | 51 | def test_encoding_counter(): 52 | df_user = pd.DataFrame( 53 | { 54 | "item_id": [0, 0, 1, 0, 1, 0, 0], 55 | "skill_id": [0, 0, 0, 0, 0, 0, 0], 56 | "correct": [0, 1, 1, 0, 1, 0, 1], 57 | } 58 | ) 59 | Q_mat = sparse.csr_matrix([[1], [1]]) 60 | onehot_items = OneHotEncoder() 61 | onehot_items.fit(df_user["item_id"].values.reshape(-1, 1)) 62 | 63 | user_ffw_encoding = encode_user_ffw( 64 | df_user, onehot_items=onehot_items, Q_mat=Q_mat, skill_counters=True 65 | ).toarray() 66 | expected_attempts_array = np.array( 67 | [[0, 0, 0], [1, 0, 1], [2, 0, 2], [2, 1, 3], [3, 1, 4], [3, 2, 5], [4, 2, 6]] 68 | ) 69 | expected_wins_array = np.array( 70 | [[0, 0, 0], [0, 0, 0], [1, 0, 1], [1, 1, 2], [1, 1, 2], [1, 2, 3], [1, 2, 3]] 71 | ) 72 | expected_array = np.concatenate( 73 | (expected_attempts_array, expected_wins_array), axis=1 74 | ) 75 | expected_array = np.log(1 + expected_array) 76 | assert_array_equal(user_ffw_encoding, expected_array) 77 | 78 | 79 | def test_encoding_counter_two_users(): 80 | df_user = pd.DataFrame( 81 | { 82 | "user_id": [0, 0, 1, 0, 1, 
0, 0], 83 | "item_id": [0, 0, 1, 0, 1, 0, 0], 84 | "skill_id": [0, 0, 0, 0, 0, 0, 0], 85 | "correct": [0, 1, 1, 0, 1, 0, 1], 86 | } 87 | ) 88 | Q_mat = sparse.csr_matrix([[1], [1]]) # useless here as skill_counters=False 89 | user_ffw_encoding = encode_df(df_user, Q_mat, skill_counters=False).toarray() 90 | expected_attempts_array_0 = np.array([[0, 0], [1, 0], [2, 0], [3, 0], [4, 0]]) 91 | expected_wins_array_0 = np.array([[0, 0], [0, 0], [1, 0], [1, 0], [1, 0]]) 92 | expected_attempts_array_1 = np.array([[0, 0], [0, 1]]) 93 | expected_wins_array_1 = np.array([[0, 0], [0, 1]]) 94 | expected_array_0 = np.concatenate( 95 | (expected_attempts_array_0, expected_wins_array_0), axis=1 96 | ) 97 | expected_array_1 = np.concatenate( 98 | (expected_attempts_array_1, expected_wins_array_1), axis=1 99 | ) 100 | expected_array = np.concatenate((expected_array_0, expected_array_1), axis=0) 101 | expected_array = np.log(1 + expected_array) 102 | 103 | expected_array = np.hstack( 104 | (df_user.sort_values(by="user_id").values, expected_array) 105 | ) 106 | assert_array_equal(user_ffw_encoding, expected_array) 107 | -------------------------------------------------------------------------------- /FeedForwardNetwork/train_ffw.py: -------------------------------------------------------------------------------- 1 | # mainly inspired by Théophile Gervet 2 | # https://github.com/theophilee/kt-algos/blob/master/train_ffw.py 3 | import sys 4 | import os 5 | import argparse 6 | import numpy as np 7 | from scipy.sparse import load_npz, csr_matrix 8 | import torch 9 | import torch.nn as nn 10 | from torch.optim import Adam 11 | 12 | sys.path.append(".") 13 | 14 | from utils.logger import Logger 15 | from utils.metrics import Metrics 16 | from FFN import FeedForwardNetwork 17 | from utils.misc import * 18 | from tqdm import tqdm 19 | 20 | 21 | def get_tensors(sparse): 22 | dense = torch.tensor(sparse.toarray()) 23 | inputs = dense[:, 4:].float() 24 | item_ids = dense[:, 1].long() 25 | labels = dense[:, 3].float() 26 | return inputs, item_ids, labels 27 | 28 | 29 | def train(X_train, X_val, model, optimizer, logger, num_epochs, batch_size): 30 | """Train FFW model. 
31 | Arguments: 32 | X (sparse matrix): output by encode_ffw.py 33 | model (torch Module) 34 | optimizer (torch optimizer) 35 | logger: wrapper for TensorboardX logger 36 | num_epochs (int): number of epochs to train for 37 | batch_size (int) 38 | """ 39 | criterion = nn.BCEWithLogitsLoss() 40 | metrics = Metrics() 41 | train_idxs = np.arange(X_train.shape[0]) 42 | val_idxs = np.arange(X_val.shape[0]) 43 | step = 0 44 | 45 | for epoch in tqdm(range(num_epochs)): 46 | shuffle(train_idxs) 47 | shuffle(val_idxs) 48 | 49 | # Training 50 | for k in range(0, len(train_idxs), batch_size): 51 | inputs, item_ids, labels = get_tensors( 52 | X_train[train_idxs[k : k + batch_size]] 53 | ) 54 | inputs = inputs.to(device=args.device) 55 | preds = model(inputs) 56 | relevant_preds = preds[ 57 | torch.arange(preds.shape[0]), item_ids.to(device=args.device) 58 | ] 59 | loss = criterion(relevant_preds, labels.to(device=args.device)) 60 | 61 | train_auc = compute_auc(preds.detach().cpu(), item_ids, labels) 62 | 63 | model.zero_grad() 64 | loss.backward() 65 | optimizer.step() 66 | step += 1 67 | metrics.store({"loss/train": loss.item()}) 68 | metrics.store({"auc/train": train_auc}) 69 | 70 | # Logging 71 | if step % 20 == 0: 72 | logger.log_scalars(metrics.average(), step * batch_size) 73 | 74 | # Validation 75 | model.eval() 76 | for k in range(0, len(val_idxs), batch_size): 77 | inputs, item_ids, labels = get_tensors(X_val[val_idxs[k : k + batch_size]]) 78 | inputs = inputs.to(device=args.device) 79 | with torch.no_grad(): 80 | preds = model(inputs) 81 | val_auc = compute_auc(preds.cpu(), item_ids, labels) 82 | metrics.store({"auc/val": val_auc}) 83 | model.train() 84 | 85 | 86 | def student_level_split(X): 87 | user_ids = X[:, 0].toarray().flatten() 88 | users = np.unique(user_ids) 89 | np.random.shuffle(users) 90 | split = int(0.8 * len(users)) 91 | users_train, users_val = users[:split], users[split:] 92 | return ( 93 | X[np.where(np.isin(user_ids, users_train))], 94 | X[np.where(np.isin(user_ids, users_val))], 95 | ) 96 | 97 | 98 | def get_number_of_items_and_skills(dataset): 99 | data_path = os.path.join("data", dataset) 100 | Q_mat = load_npz(os.path.join(data_path, "q_mat.npz")) 101 | return Q_mat.shape[0], Q_mat.shape[1] 102 | 103 | 104 | if __name__ == "__main__": 105 | parser = argparse.ArgumentParser( 106 | description="Train feedforward neural network on dense feature matrix." 
107 | ) 108 | parser.add_argument("X_file", type=str) 109 | parser.add_argument("--dataset", type=str) 110 | parser.add_argument("--logdir", type=str, default="runs/ffw") 111 | parser.add_argument("--hid_size", type=int, default=200) 112 | parser.add_argument("--drop_prob", type=float, default=0.2) 113 | parser.add_argument("--batch_size", type=int, default=500) 114 | parser.add_argument("--lr", type=float, default=1e-3) 115 | parser.add_argument("--num_epochs", type=int, default=25) 116 | parser.add_argument("--disable-cuda", action="store_true", help="Disable CUDA") 117 | args = parser.parse_args() 118 | args.device = None 119 | if not args.disable_cuda and torch.cuda.is_available(): 120 | args.device = torch.device("cuda") 121 | else: 122 | args.device = torch.device("cpu") 123 | 124 | # First four columns are original dataset 125 | # then previous interaction encodings and wins/attempts statistics 126 | X = csr_matrix(load_npz(args.X_file)) 127 | 128 | # Student-level train-val split 129 | X_train, X_val = student_level_split(X) 130 | 131 | n_items, n_skills = get_number_of_items_and_skills(dataset=args.dataset) 132 | n_counters = 2 133 | 134 | model = FeedForwardNetwork( 135 | n_skills=n_skills, 136 | n_items=n_items, 137 | n_counters=n_counters, 138 | hidden_dim=args.hid_size, 139 | drop_prob=args.drop_prob, 140 | ).to(device=args.device) 141 | optimizer = Adam(model.parameters(), lr=args.lr) 142 | 143 | param_str = f"{args.dataset}" 144 | logger = Logger(os.path.join(args.logdir, param_str)) 145 | 146 | train(X_train, X_val, model, optimizer, logger, args.num_epochs, args.batch_size) 147 | 148 | logger.close() 149 | 150 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | This repository contains some of the main EDM models for Knowledge Tracing. 
2 | 3 | For now it contains DAS3H model by Choffin et al., a basic FeedForwardModel, DKT and SAKT 4 | 5 | ## Setup 6 | 7 | 8 | 9 | Create a new conda environment with python 3 10 | ``` 11 | conda create --name python3-env python=3.7 12 | ``` 13 | Activate conda env 14 | ``` 15 | conda activate python3-env 16 | ``` 17 | 18 | Install [PyTorch](https://pytorch.org) and the remaining requirements: 19 | 20 | ``` 21 | pip install -r requirements.txt 22 | ``` 23 | 24 | To use a dataset, download the data from one of the links above and: 25 | - place the main file under `data//data.csv` for an ASSISTments dataset 26 | - place the main file under `data//data.txt` for a KDDCup dataset 27 | 28 | ``` 29 | python prepare_data.py --dataset --remove_nan_skills 30 | ``` 31 | 32 | ## Training 33 | 34 | #### Deep Knowledge Tracing 35 | 36 | To train a DKT model: 37 | 38 | ``` 39 | python train_dkt.py --dataset 40 | ``` 41 | 42 | #### Self-Attentive Knowledge Tracing 43 | 44 | To train a SAKT model: 45 | 46 | ``` 47 | python train_sakt.py --dataset 48 | ``` 49 | -------------------------------------------------------------------------------- /SAKT/model_sakt.py: -------------------------------------------------------------------------------- 1 | """ Embeddings module from ONMT""" 2 | import math 3 | import warnings 4 | import numpy as np 5 | 6 | import torch 7 | import torch.nn as nn 8 | 9 | 10 | def future_mask(seq_length): 11 | future_mask = np.triu(np.ones((1, seq_length, seq_length)), k=0).astype("bool") 12 | return torch.from_numpy(future_mask) 13 | 14 | 15 | class PositionalEncoding(nn.Module): 16 | """Sinusoidal positional encoding for non-recurrent neural networks. 17 | Args: 18 | dropout (float): dropout parameter 19 | dim (int): embedding size 20 | """ 21 | 22 | def __init__(self, dropout, dim, max_len=5000): 23 | if dim % 2 != 0: 24 | raise ValueError( 25 | "Cannot use sin/cos positional encoding with " 26 | "odd dim (got dim={:d})".format(dim) 27 | ) 28 | pe = torch.zeros(max_len, dim) 29 | position = torch.arange(0, max_len).unsqueeze(1) 30 | div_term = torch.exp( 31 | (torch.arange(0, dim, 2, dtype=torch.float) * -(math.log(10000.0) / dim)) 32 | ) 33 | pe[:, 0::2] = torch.sin(position.float() * div_term) 34 | pe[:, 1::2] = torch.cos(position.float() * div_term) 35 | pe = pe.unsqueeze(1) 36 | super(PositionalEncoding, self).__init__() 37 | self.register_buffer("pe", pe) 38 | self.dropout = nn.Dropout(p=dropout) 39 | self.dim = dim 40 | 41 | def forward(self, emb): 42 | """Embed interactions. 
43 | Args: 44 | emb (FloatTensor): Sequence of word vectors 45 | ``(seq_len, batch_size, self.dim)`` 46 | """ 47 | 48 | # emb = emb * math.sqrt(self.dim) 49 | emb = emb + self.pe[: emb.size(0)] 50 | emb = self.dropout(emb) 51 | return emb 52 | 53 | """ Multi-Head Attention module from ONMT""" 54 | 55 | 56 | class MultiHeadedAttention(nn.Module): 57 | def __init__(self, head_count, model_dim, dropout=0.1): 58 | assert model_dim % head_count == 0 59 | self.dim_per_head = model_dim // head_count 60 | self.model_dim = model_dim 61 | 62 | super(MultiHeadedAttention, self).__init__() 63 | self.head_count = head_count 64 | 65 | self.linear_keys = nn.Linear(model_dim, head_count * self.dim_per_head) 66 | self.linear_values = nn.Linear(model_dim, head_count * self.dim_per_head) 67 | self.linear_query = nn.Linear(model_dim, head_count * self.dim_per_head) 68 | self.softmax = nn.Softmax(dim=-1) 69 | self.dropout = nn.Dropout(dropout) 70 | self.final_linear = nn.Linear(model_dim, model_dim) 71 | 72 | def forward(self, key, value, query, mask=None, layer_cache=None): 73 | """ 74 | Compute the context vector and the attention vectors. 75 | Args: 76 | key (FloatTensor): set of `key_len` 77 | key vectors ``(batch, key_len, dim)`` 78 | value (FloatTensor): set of `key_len` 79 | value vectors ``(batch, key_len, dim)`` 80 | query (FloatTensor): set of `query_len` 81 | query vectors ``(batch, query_len, dim)`` 82 | mask: binary mask 1/0 indicating which keys have 83 | zero / non-zero attention ``(batch, query_len, key_len)`` 84 | Returns: 85 | (FloatTensor, FloatTensor): 86 | * output context vectors ``(batch, query_len, dim)`` 87 | * one of the attention vectors ``(batch, query_len, key_len)`` 88 | """ 89 | # CHECKS 90 | batch, k_len, d = key.size() 91 | batch_, k_len_, d_ = value.size() 92 | assert batch_ == batch 93 | assert k_len == k_len 94 | assert d == d_ 95 | batch_, q_len, d_ = query.size() 96 | assert batch_ == batch 97 | assert d == d_ 98 | 99 | # aeq(self.model_dim % 8, 0) 100 | if mask is not None: 101 | batch_, q_len_, k_len_ = mask.size() 102 | # assert batch_ == batch mask will be broadcasted 103 | assert k_len_ == k_len 104 | assert q_len_ == q_len 105 | # END CHECKS 106 | batch_size = key.size(0) 107 | dim_per_head = self.dim_per_head 108 | head_count = self.head_count 109 | key_len = key.size(1) 110 | query_len = query.size(1) 111 | 112 | def shape(x): 113 | """Projection.""" 114 | return x.view(batch_size, -1, head_count, dim_per_head).transpose(1, 2) 115 | 116 | def unshape(x): 117 | """Compute context.""" 118 | return ( 119 | x.transpose(1, 2) 120 | .contiguous() 121 | .view(batch_size, -1, head_count * dim_per_head) 122 | ) 123 | 124 | # 1) Project key, value, and query. 125 | key = self.linear_keys(key) 126 | value = self.linear_values(value) 127 | query = self.linear_query(query) 128 | key = shape(key) 129 | value = shape(value) 130 | query = shape(query) 131 | 132 | key_len = key.size(2) 133 | query_len = query.size(2) 134 | 135 | # 2) Calculate and scale scores. 136 | query = query / math.sqrt(dim_per_head) 137 | # batch x heads x query_len x key_len 138 | query_key = torch.matmul(query, key.transpose(2, 3)) 139 | 140 | scores = query_key 141 | scores = scores.float() 142 | 143 | if mask is not None: 144 | mask = mask.unsqueeze(1) # [B, 1, 1 (?), T_values] 145 | scores = scores.masked_fill(mask, -1e18) 146 | 147 | # 3) Apply attention dropout and compute context vectors. 
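# scores has shape (batch, heads, query_len, key_len); the softmax normalizes over the
# key dimension, and dropout is applied to the attention weights before they weight the values.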
148 | attn = self.softmax(scores).to(query.dtype) 149 | drop_attn = self.dropout(attn) 150 | 151 | context_original = torch.matmul(drop_attn, value) 152 | 153 | context = unshape(context_original) 154 | output = self.final_linear(context) 155 | top_attn = attn.view(batch_size, head_count, query_len, key_len)[ 156 | :, 0, :, : 157 | ].contiguous() 158 | 159 | return output, top_attn 160 | 161 | def update_dropout(self, dropout): 162 | self.dropout.p = dropout 163 | 164 | 165 | class PositionwiseFeedForward(nn.Module): 166 | """ A two-layer Feed-Forward-Network with residual layer norm. 167 | Args: 168 | d_model (int): the size of input for the first-layer of the FFN. 169 | d_ff (int): the hidden layer size of the second-layer 170 | of the FNN. 171 | dropout (float): dropout probability in :math:`[0, 1)`. 172 | """ 173 | 174 | def __init__(self, d_model, d_ff, dropout=0.1): 175 | super(PositionwiseFeedForward, self).__init__() 176 | self.w_1 = nn.Linear(d_model, d_ff) 177 | self.w_2 = nn.Linear(d_ff, d_model) 178 | self.layer_norm = nn.LayerNorm(d_model, eps=1e-6) 179 | self.dropout_1 = nn.Dropout(dropout) 180 | self.relu = nn.ReLU() 181 | self.dropout_2 = nn.Dropout(dropout) 182 | 183 | def forward(self, x): 184 | """Layer definition. 185 | Args: 186 | x: ``(batch_size, input_len, model_dim)`` 187 | Returns: 188 | (FloatTensor): Output ``(batch_size, input_len, model_dim)``. 189 | """ 190 | 191 | inter = self.dropout_1(self.relu(self.w_1(self.layer_norm(x)))) 192 | output = self.dropout_2(self.w_2(inter)) 193 | return output + x 194 | 195 | def update_dropout(self, dropout): 196 | self.dropout_1.p = dropout 197 | self.dropout_2.p = dropout 198 | 199 | 200 | class TransformerEncoderLayer(nn.Module): 201 | """ 202 | A single layer of the transformer encoder. 203 | Args: 204 | d_model (int): the dimension of keys/values/queries in 205 | MultiHeadedAttention, also the input size of 206 | the first-layer of the PositionwiseFeedForward. 207 | heads (int): the number of head for MultiHeadedAttention. 208 | d_ff (int): the second-layer of the PositionwiseFeedForward. 209 | dropout (float): dropout probability(0-1.0). 210 | """ 211 | 212 | def __init__(self, d_model, heads, d_ff, dropout, attention_dropout): 213 | super(TransformerEncoderLayer, self).__init__() 214 | 215 | self.self_attn = MultiHeadedAttention(heads, d_model, dropout=attention_dropout) 216 | self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout) 217 | self.layer_norm = nn.LayerNorm(d_model, eps=1e-6) 218 | self.dropout = nn.Dropout(dropout) 219 | 220 | def forward(self, interaction_embeds, item_embeds, mask): 221 | """ 222 | Args: 223 | interaction_embeds (FloatTensor): ``(batch_size, src_len, model_dim)`` 224 | mask (LongTensor): ``(batch_size, 1, src_len)`` pourquoi mask est de cette taille ?? 225 | Returns: 226 | (FloatTensor): 227 | * outputs ``(batch_size, src_len, model_dim)`` 228 | """ 229 | context, _ = self.self_attn( 230 | interaction_embeds, interaction_embeds, item_embeds, mask=mask 231 | ) 232 | out = self.dropout(context) + item_embeds 233 | return self.feed_forward(out) 234 | 235 | def update_dropout(self, dropout, attention_dropout): 236 | self.self_attn.update_dropout(attention_dropout) 237 | self.feed_forward.update_dropout(dropout) 238 | self.dropout.p = dropout 239 | 240 | 241 | class SAKT(nn.Module): 242 | """Self-attentive knowledge tracing. 
243 | 244 | Arguments: 245 | num_items (int): Number of items 246 | hid_size (int): Attention dot-product dimension 247 | heads (int): Number of parallel attention heads 248 | encode_pos (bool): If True, add positional encoding 249 | dropout (float): Dropout probability 250 | """ 251 | 252 | def __init__( 253 | self, num_items, hid_size=512, heads=8, dropout=0.2, position_encoding=True 254 | ): 255 | super(SAKT, self).__init__() 256 | self.num_items = num_items 257 | self.interaction_embedding = nn.Embedding( 258 | 2 * num_items, hid_size 259 | ) # maybe padding is needed 260 | self.item_embedding = nn.Embedding( 261 | num_items, hid_size 262 | ) # maybe padding is needed 263 | self.position_encoding = position_encoding 264 | if self.position_encoding: 265 | self.pe = PositionalEncoding(dropout, hid_size) 266 | self.encoder_layer = TransformerEncoderLayer( 267 | hid_size, heads, hid_size, dropout, dropout 268 | ) 269 | self.layer_norm = nn.LayerNorm(hid_size, eps=1e-6) 270 | self.out = nn.Linear(hid_size, 1) 271 | 272 | def forward(self, interactions, items): 273 | # intercations and items must be batch first 274 | item_embeds = self.item_embedding(items) 275 | interaction_embeds = self.interaction_embedding(interactions) 276 | mask = future_mask(interactions.size(1)) 277 | if interactions.is_cuda: 278 | mask = mask.cuda() 279 | if self.position_encoding: 280 | interaction_embeds = self.pe(interaction_embeds) # idea do a concatenate instead of just adding the position embeds 281 | 282 | out = self.encoder_layer(interaction_embeds, item_embeds, mask) 283 | return self.out(out).squeeze(2) 284 | -------------------------------------------------------------------------------- /SAKT/train_sakt.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import pandas as pd 4 | import sys 5 | 6 | import torch.nn as nn 7 | from torch.optim import Adam 8 | 9 | 10 | sys.path.append(".") 11 | from model_sakt import SAKT 12 | from utils_sakt.logger import Logger 13 | from utils_sakt.metrics import Metrics 14 | from utils_sakt.misc import * 15 | from tqdm import tqdm 16 | 17 | 18 | def train(df, model, optimizer, logger, num_epochs, batch_size): 19 | """Train SAKT model. 
20 | 21 | Arguments: 22 | df (pandas DataFrame): output by prepare_data.py 23 | model (torch Module) 24 | optimizer (torch optimizer) 25 | logger: wrapper for TensorboardX logger 26 | num_epochs (int): number of epochs to train for 27 | batch_size (int) 28 | """ 29 | train_data, val_data = get_data(df) 30 | 31 | criterion = nn.BCEWithLogitsLoss() 32 | brier_score = nn.MSELoss() 33 | m = nn.Sigmoid() 34 | metrics = Metrics() 35 | step = 0 36 | print(args.device) 37 | 38 | for epoch in tqdm(range(num_epochs)): 39 | train_batches = prepare_batches(train_data, batch_size) 40 | val_batches = prepare_batches(val_data, batch_size) 41 | 42 | # Training 43 | for inputs, item_ids, labels in train_batches: 44 | inputs = inputs.to(device=args.device) 45 | item_ids = item_ids.to(device=args.device) 46 | preds = model(inputs, item_ids) 47 | loss = compute_loss( 48 | preds.float(), labels.to(device=args.device).float(), criterion 49 | ) 50 | # loss = compute_loss(preds, item_ids, labels, criterion) 51 | train_auc = compute_auc(preds, labels) 52 | 53 | model.zero_grad() 54 | loss.backward() 55 | optimizer.step() 56 | step += 1 57 | metrics.store({"loss/train": loss.item()}) 58 | metrics.store({"auc/train": train_auc}) 59 | 60 | # Logging 61 | if step % 20 == 0: 62 | logger.log_scalars(metrics.average(), step) 63 | #weights = {"weight/" + name: param for name, param in model.named_parameters()} 64 | #grads = {"grad/" + name: param.grad 65 | # for name, param in model.named_parameters() if param.grad is not None} 66 | #logger.log_histograms(weights, step) 67 | #logger.log_histograms(grads, step) 68 | 69 | # Validation 70 | model.eval() 71 | for inputs, item_ids, labels in val_batches: 72 | inputs = inputs.to(device=args.device) 73 | with torch.no_grad(): 74 | preds = model(inputs, item_ids.to(device=args.device)) 75 | val_loss = compute_loss(preds.float().cpu(), labels.float(), criterion) 76 | val_brier_score = compute_loss( 77 | m(preds).float().cpu(), labels.float(), brier_score 78 | ) 79 | val_auc = compute_auc(preds, labels) 80 | metrics.store({"brier_score/val": val_brier_score.item()}) 81 | metrics.store({"auc/val": val_auc}) 82 | metrics.store({"loss/val": val_loss.item()}) 83 | model.train() 84 | 85 | 86 | 87 | if __name__ == "__main__": 88 | parser = argparse.ArgumentParser(description="Train SAKT.") 89 | parser.add_argument("--dataset", type=str) 90 | parser.add_argument("--logdir", type=str, default="runs/sakt") 91 | parser.add_argument("--embed_inputs", action="store_true") 92 | parser.add_argument("--embed_size", type=int, default=100) 93 | parser.add_argument("--hid_size", type=int, default=100) 94 | parser.add_argument("--num_heads", type=int, default=5) 95 | parser.add_argument("--encode_pos", action="store_true") 96 | parser.add_argument("--drop_prob", type=float, default=0.2) 97 | parser.add_argument("--batch_size", type=int, default=100) 98 | parser.add_argument("--lr", type=float, default=1e-3) 99 | parser.add_argument("--num_epochs", type=int, default=25) 100 | parser.add_argument("--n_traces", type=int, default=20000) 101 | parser.add_argument("--disable-cuda", action="store_true", help="Disable CUDA") 102 | args = parser.parse_args() 103 | if not args.disable_cuda: 104 | args.device = torch.device("cuda") 105 | else: 106 | args.device = torch.device("cpu") 107 | 108 | df = pd.read_csv( 109 | os.path.join("data", args.dataset, "preprocessed_data.csv"), sep="\t" 110 | )[-args.n_traces :] 111 | 112 | num_items = int(df["item_id"].max() + 1) 113 | model = SAKT( 114 | num_items, 
args.hid_size, args.num_heads, args.encode_pos, args.drop_prob 115 | ) 116 | model = nn.DataParallel(model) 117 | model.to(device=args.device) 118 | print("Let's use", torch.cuda.device_count(), "GPUs!") 119 | 120 | optimizer = Adam(model.parameters(), lr=args.lr) 121 | 122 | param_str = ( 123 | f"{args.dataset}, embed={args.embed_inputs}, dropout={args.drop_prob}, batch_size={args.batch_size} " 124 | f"embed_size={args.embed_size}, hid_size={args.hid_size}, encode_pos={args.encode_pos}, n_traces={args.n_traces}" 125 | ) 126 | logger = Logger(os.path.join(args.logdir, param_str)) 127 | 128 | train(df, model, optimizer, logger, args.num_epochs, args.batch_size) 129 | 130 | logger.close() 131 | -------------------------------------------------------------------------------- /SAKT/utils_sakt/logger.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | from tensorboardX import SummaryWriter 4 | 5 | 6 | class Logger: 7 | """Logging with TensorboardX. 8 | """ 9 | 10 | def __init__(self, logdir, verbose=True): 11 | if not os.path.exists(logdir): 12 | os.makedirs(logdir) 13 | try: 14 | shutil.rmtree(logdir) 15 | except FileNotFoundError: 16 | pass 17 | 18 | self.verbose = verbose 19 | self.writer = SummaryWriter(logdir) 20 | 21 | def log_histograms(self, dic, step): 22 | """Log dictionary of tensors as histograms. 23 | """ 24 | for k, v in dic.items(): 25 | self.writer.add_histogram(k, v, step) 26 | 27 | def log_scalars(self, dic, step): 28 | """Log dictionary of scalar values. 29 | """ 30 | for k, v in dic.items(): 31 | self.writer.add_scalar(k, v, step) 32 | 33 | if self.verbose: 34 | print(f"Step {step}, {dic}") 35 | 36 | def close(self): 37 | self.writer.close() -------------------------------------------------------------------------------- /SAKT/utils_sakt/metrics.py: -------------------------------------------------------------------------------- 1 | class Metrics: 2 | """Keep track of metrics over time in a dictionary. 
3 | """ 4 | def __init__(self): 5 | self.metrics = {} 6 | self.counts = {} 7 | 8 | def store(self, new_metrics): 9 | for key in new_metrics: 10 | if key in self.metrics: 11 | self.metrics[key] += new_metrics[key] 12 | self.counts[key] += 1 13 | else: 14 | self.metrics[key] = new_metrics[key] 15 | self.counts[key] = 1 16 | 17 | def average(self): 18 | average = {k: v / self.counts[k] for k, v in self.metrics.items()} 19 | self.metrics, self.counts = {}, {} 20 | return average -------------------------------------------------------------------------------- /SAKT/utils_sakt/misc.py: -------------------------------------------------------------------------------- 1 | import random 2 | import numpy as np 3 | from random import shuffle 4 | from sklearn.metrics import roc_auc_score, accuracy_score 5 | 6 | import torch 7 | from torch.nn.utils.rnn import pad_sequence 8 | 9 | 10 | def set_random_seeds(seed): 11 | torch.manual_seed(seed) 12 | torch.cuda.manual_seed_all(seed) 13 | random.seed(seed) 14 | 15 | 16 | def get_data(df, train_split=0.8): 17 | num_items = df["item_id"].nunique() 18 | data = [(torch.tensor(u_df["item_id"].values, dtype=torch.long), 19 | torch.tensor(u_df["correct"].values, dtype=torch.long)) 20 | for _, u_df in df.groupby("user_id")] 21 | data = [(item_ids + labels * num_items, item_ids, labels) 22 | for (item_ids, labels) in data] 23 | shuffle(data) 24 | 25 | # Train-test split across users 26 | train_size = int(train_split * len(data)) 27 | train_data, val_data = data[:train_size], data[train_size:] 28 | return train_data, val_data 29 | 30 | 31 | def prepare_batches(data, batch_size): 32 | """Prepare batches grouping padded sequences. 33 | 34 | Arguments: 35 | data (list of tuples of torch Tensor) 36 | batch_size (int): number of sequences per batch 37 | 38 | Output: 39 | batches (list of tuples of torch Tensor) 40 | """ 41 | shuffle(data) 42 | 43 | batches = [] 44 | for k in range(0, len(data), batch_size): 45 | batch = data[k:k + batch_size] 46 | inputs, item_ids, labels = zip(*batch) 47 | 48 | inputs = pad_sequence(inputs, batch_first=True, padding_value=0) # Pad with 0 49 | item_ids = pad_sequence(item_ids, batch_first=True, padding_value=0) # Don't care 50 | labels = pad_sequence(labels, batch_first=True, padding_value=-1) # Pad with -1 51 | 52 | batches.append([inputs, item_ids, labels]) 53 | 54 | return batches 55 | 56 | 57 | def compute_auc(preds, labels): 58 | labels = labels.view(-1) 59 | preds = preds.view(-1)[labels >= 0].detach().cpu().numpy() 60 | labels = labels[labels >= 0].detach().cpu().numpy() 61 | 62 | 63 | if len(np.unique(labels)) == 1: # Only one class 64 | auc = accuracy_score(labels, preds.round()) 65 | else: 66 | auc = roc_auc_score(labels, preds) 67 | return auc 68 | 69 | def compute_loss(preds, labels, criterion): 70 | labels = labels.view(-1) 71 | preds = preds.view(-1)[labels >= 0] 72 | labels = labels[labels >= 0] 73 | return criterion(preds, labels) -------------------------------------------------------------------------------- /das3h/README.md: -------------------------------------------------------------------------------- 1 | ## DAS3H 2 | 3 | This folder contains Python code of [_DAS3H: Modeling Student Learning and Forgetting for 4 | Optimally Scheduling Distributed Practice of Skills_](https://arxiv.org/abs/1905.06873). Authors: [Benoît Choffin](https://github.com/BenoitChoffin), [Fabrice Popineau](https://github.com/fpopineau), Yolaine Bourda, and [Jill-Jênn Vie](https://github.com/jilljenn). 
5 | 
 6 | It is different from the [implementation used in the article](https://github.com/BenoitChoffin/das3h) as it is tailored to a dataset shaped like Lalilo's. 
 7 | It also makes heavier use of the pandas library (e.g. the `rolling` function) to create the features. 
 8 | 
 9 | ### What is DAS3H? 
10 | It is a model of student learning with 5 kinds of parameters to learn (same notation as in the article): 
11 | - α: level of a student 
12 | - δ: difficulty of an exercise 
13 | - β: difficulty of a knowledge component (not used in our implementation, as we haven't tagged exercises with KCs yet) 
14 | - θwins, exercise, time-window (>0): speed with which a student learns (?) a given type of exercise in a given time window 
15 | - θattempts, exercise, time-window (>0): speed with which a student forgets (?) a given type of exercise in a given time window 
16 | 
17 | Let's say we have a dataset looking like this. 
18 | #### Original dataset 
19 | | trace_id | date | student_id | exercise_id | correctness | 
20 | |:-:|:-:|:-:|:-:|:-:| 
21 | | 1 | 1 january | 1 | 1 | 1 | 
22 | | 2 | 1 january | 1 | 1 | 0 | 
23 | | 3 | 1 january | 1 | 1 | 0 | 
24 | | 4 | 1 january | 2 | 1 | 0 | 
25 | | 5 | 1 january | 2 | 1 | 1 | 
26 | | 6 | **3 january** | 2 | 1 | 1 | 
27 | 
28 | In the simplest version of the model (not using Factorization Machines), the parameters to learn are those of a logistic regression on a dataset looking like this: 
29 | 
30 | #### Encoded dataset 
31 | | trace_id | student_1 (α1) | student_2 (α2) | exercise_1 (δ1) | exercise_2 (δ2) | wins_on_exo_1_in_the_past_day (θwins, exo_1, one-day) | attempts_on_exo_1_in_the_past_day (θattempts, exo_1, one-day) | wins_on_exo_1_in_the_past_week (θwins, exo_1, one-week) | attempts_on_exo_1_in_the_past_week (θattempts, exo_1, one-week) | other columns like θ parameters on ex 2 | 
32 | |:-:|:-:|:-----:|:-----:|:------:|:----:|:----:|:-:|:-:|:-:| 
33 | | 1 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | ... | 
34 | | 2 | 1 | 0 | 1 | 0 | 1 | 1 | 1 | 1 | ... | 
35 | | 3 | 1 | 0 | 1 | 0 | 1 | 2 | 1 | 2 | ... | 
36 | | 4 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | ... | 
37 | | 5 | 0 | 1 | 1 | 0 | 0 | 1 | 0 | 1 | ... | 
38 | | 6 | 0 | 1 | 1 | 0 | 0 | 0 | 1 | 2 | ... | 
39 | 
40 | To get this encoded dataset, we one-hot-encode the student_id and exercise_id, and add the number of previous wins and attempts the student had on that exercise within each time window. 
41 | 
42 | Try writing out on a piece of paper what you get by doing that, and compare it to the encoded dataset above. See the *Important* note below for further details, and the sketch right after it for a runnable version of this encoding. 
43 | 
44 | #### Logistic Regression 
45 | 
46 | As stated in the [sklearn documentation](https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression), (L2-penalized) Logistic Regression is the optimization of 
47 | `min over (w, c) of ½ wᵀw + C Σᵢ log(1 + exp(−yᵢ (Xᵢᵀ w + c)))` 
48 | Here, the parameters α, δ, β, and θ are concatenated in *w*. 
49 | 
50 | *Important:* 
51 | As you may have noticed, the features in the *encoded* dataset seem to "lag" one trace behind the original dataset. This is intentional: it prevents data leakage, so that the answer at time T is never used to predict itself. If this is not clear, please tell me. 
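The snippet below is a minimal, self-contained sketch of that encoding, written to match the toy tables above. It is *not* the repository's `encode.py`/`das3h.py`: the timestamps, column names and the `add_counters` helper are invented for the example, and only the ideas it illustrates (left-closed rolling windows, the `log(1 + x)` scaling, one-hot encoding, then a plain scikit-learn `LogisticRegression`) come from the code and the description above.

``` python
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression

# Toy dataset from the "Original dataset" table (timestamps invented so the windows make sense).
df = pd.DataFrame({
    "timestamp": pd.to_datetime([
        "2019-01-01 10:00", "2019-01-01 10:05", "2019-01-01 10:10",
        "2019-01-01 11:00", "2019-01-01 11:05", "2019-01-03 11:00",
    ]),
    "student_id": [1, 1, 1, 2, 2, 2],
    "exercise_id": [1, 1, 1, 1, 1, 1],
    "correctness": [1, 0, 0, 0, 1, 1],
})

def add_counters(df, time_windows=("1d", "7d")):
    """Add lagged, log-scaled win/attempt counters per (student, exercise) and time window."""
    df = df.sort_values("timestamp").copy()
    for win in time_windows:
        attempts = pd.Series(0.0, index=df.index)
        wins = pd.Series(0.0, index=df.index)
        for _, g in df.groupby(["student_id", "exercise_id"]):
            # closed="left" excludes the current trace from its own window: this is the "lag"
            # from the Important note, which keeps the answer at time T out of its own features.
            rolled = g.rolling(win, on="timestamp", closed="left")["correctness"]
            attempts.loc[g.index] = rolled.count().fillna(0).values
            wins.loc[g.index] = rolled.sum().fillna(0).values
        df[f"attempts_in_the_past_{win}"] = np.log(1 + attempts)  # scaling function from the article
        df[f"wins_in_the_past_{win}"] = np.log(1 + wins)
    return df

encoded = pd.get_dummies(add_counters(df), columns=["student_id", "exercise_id"])

X = encoded.drop(columns=["timestamp", "correctness"])
y = encoded["correctness"]
model = LogisticRegression(max_iter=1000).fit(X, y)
print(pd.Series(model.coef_[0], index=X.columns))  # alphas, deltas and thetas, concatenated in w
```

On this toy example the counter columns reproduce the encoded table above (before the `log(1 + x)` scaling); on a real dataset you would keep one counter pair per exercise type and time window, as `encode.py` does.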
52 | 53 | *Note :* 54 | Actually the number of wins and number of attempts are not fed directly to the model, instead they go through a scaling function : 55 | ``` python 56 | lambda x: log(1 + x) 57 | ``` 58 | in the article -------------------------------------------------------------------------------- /das3h/clean.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | 4 | from print_df_infos import print_cleaned_df_with_information 5 | 6 | 7 | def clean_df( 8 | df, 9 | exercise_code_level=False, 10 | drop_level=True, 11 | date_to_timestamp=True, 12 | exercise_code_level_lesson=True, 13 | drop_learning_object=True, 14 | verbose=True 15 | ) -> pd.DataFrame: 16 | df = df.drop(columns=["Unnamed: 0", "id"]) 17 | if exercise_code_level: 18 | df = add_exercise_code_level(df) 19 | if date_to_timestamp: 20 | df = change_date_to_timestamp(df) 21 | df = df.drop(columns=["created_at"]) 22 | if exercise_code_level_lesson: 23 | df = add_exercise_code_level_lesson(df) 24 | df = df.drop(columns=["lesson_id"]) 25 | if drop_level: 26 | df = df.drop(columns=["level"]) 27 | if drop_learning_object: 28 | try: 29 | df = df.drop(columns=["learning_object"]) 30 | except: 31 | pass 32 | df["student_id"] = np.unique(df["student_id"], return_inverse=True)[1] 33 | df = df[df["correctness"].isin((True, False))] 34 | df["correctness"] = df["correctness"].astype(int) 35 | if verbose: 36 | print_cleaned_df_with_information(df) 37 | return df 38 | 39 | 40 | def add_exercise_code_level(df) -> pd.DataFrame: 41 | dataset = df.copy() 42 | dataset["exercise_code_level"] = ( 43 | dataset["exercise_code"].map(str) + "_" + dataset["level"].map(str) 44 | ) 45 | return dataset 46 | 47 | 48 | def add_exercise_code_level_lesson(df) -> pd.DataFrame: 49 | dataset = df.copy() 50 | dataset["exercise_code_level_lesson"] = ( 51 | dataset["exercise_code"].map(str) 52 | + "_" 53 | + dataset["level"].map(str) 54 | + "_lesson_" 55 | + dataset["lesson_id"].map(str) 56 | ) 57 | return dataset 58 | 59 | 60 | def change_date_to_timestamp(df) -> pd.DataFrame: 61 | df["timestamp"] = df["created_at"] 62 | df["timestamp"] = pd.to_datetime(df["timestamp"]) 63 | return df 64 | -------------------------------------------------------------------------------- /das3h/clean_test.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | 4 | from pandas.testing import assert_frame_equal 5 | from clean import clean_df 6 | 7 | 8 | def test_that_clean_df_cleans_properly(): 9 | df = pd.DataFrame( 10 | { 11 | "Unnamed: 0": [1, 2, 3], 12 | "id": [1, 2, 3], 13 | "created_at": [ 14 | "2019-03-01 00:00:01", 15 | "2019-03-01 00:00:02", 16 | "2019-03-01 00:00:03", 17 | ], 18 | "student_id": [0, 0, 0], 19 | "exercise_code": ["grapho", "phono", "discovery"], 20 | "level": [3, 4, 1], 21 | "lesson_id": [101, 101, 101], 22 | "correctness": [True, False, None], 23 | } 24 | ) 25 | 26 | cleaned_df = clean_df(df) 27 | expected_df = pd.DataFrame( 28 | { 29 | "timestamp": ["2019-03-01 00:00:01", "2019-03-01 00:00:02"], 30 | "student_id": [0, 0], 31 | "exercise_code": ["grapho", "phono"], 32 | "exercise_code_level_lesson": ["grapho_3_lesson_101", "phono_4_lesson_101"], 33 | "correctness": [1, 0], 34 | } 35 | ) 36 | expected_df["timestamp"] = pd.to_datetime(expected_df["timestamp"]) 37 | assert_frame_equal(cleaned_df, expected_df, check_like=True) # ignore column order 38 | 39 | 
-------------------------------------------------------------------------------- /das3h/das3h.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from datetime import date 3 | 4 | from sklearn.model_selection import KFold 5 | from sklearn.metrics import roc_auc_score, accuracy_score, log_loss 6 | from sklearn.linear_model import LogisticRegression 7 | from encode import encode_df 8 | from clean import clean_df 9 | from results_analysis import ( 10 | get_coefs_in_dataframe, 11 | get_students_alphas, 12 | get_exercise_code_betas, 13 | get_exercise_gammas_of_one_exercise_code, 14 | get_available_exercise_codes, 15 | ) 16 | 17 | csv_to_use = "shorter_training" 18 | # put the dataset you want in a folder with the right path 19 | dataset = pd.read_csv(f"data/lalilo_datasets/{csv_to_use}.csv") 20 | 21 | # select end of the dataset to test if the model is running properly 22 | last_n_traces = 500 23 | if last_n_traces: 24 | dataset = dataset[-last_n_traces:] 25 | 26 | # clean and encode dataset 27 | cleaned_dataset = clean_df(dataset) 28 | encoded_dataset = encode_df(cleaned_dataset) 29 | 30 | # X and y, X has to be sparse when training a huge dataset 31 | X = encoded_dataset.drop(columns=["correctness", "timestamp"]) 32 | X_sparse_df = X.astype(pd.SparseDtype("float", 0.0)) 33 | X_sparse_array = X_sparse_df.sparse.to_coo() 34 | 35 | y = encoded_dataset["correctness"] 36 | 37 | model = LogisticRegression(solver="lbfgs", max_iter=800) 38 | model.fit(X, y) 39 | 40 | coefficients = get_coefs_in_dataframe(model, X) 41 | 42 | # saving the coefficients, you will have to create a 'results' folder somewhere 43 | save_coeffs = False 44 | if save_coeffs: 45 | today = date.today().strftime("%Y-%m-%d") 46 | coefficients.to_csv(f"das3h/results/coefficients_of_{csv_to_use}_done_{today}.csv") 47 | 48 | print("Printing students alphas") 49 | print(get_students_alphas(coefficients)) 50 | print("") 51 | print("Printing exercise_code betas") 52 | print(get_exercise_code_betas(coefficients)) 53 | 54 | for exercise_code in get_available_exercise_codes(coefficients): 55 | print("") 56 | print(f"Printing coefs of {exercise_code} for all its levels and lessons") 57 | print(get_exercise_gammas_of_one_exercise_code(coefficients, exercise_code)) 58 | 59 | -------------------------------------------------------------------------------- /das3h/encode.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | 4 | from typing import Optional 5 | from tqdm import tqdm 6 | 7 | TIME_WINDOWS_DEFAULT = ("60s", "1h", "1d", "5d", "30d", "365d") 8 | COUNTERS_DEFAULT = ("attempts", "wins") 9 | 10 | 11 | def encode_df( 12 | df: pd.DataFrame, 13 | exercises: Optional[tuple] = None, 14 | counters: tuple = COUNTERS_DEFAULT, 15 | time_windows: tuple = TIME_WINDOWS_DEFAULT, 16 | ) -> pd.DataFrame: 17 | """ 18 | Get the wanted counters of each student and 19 | dummifies the categorical variables of the dataset. 
20 | """ 21 | if exercises is None: 22 | exercises = set(df["exercise_code"].unique()) 23 | # exercises done by only one student must be removed 24 | exercises_to_keep = remove_exercises_done_by_only_one_student(df, exercises) 25 | df = add_counters_for_all_exercises( 26 | df, exercises_to_keep, counters, time_windows 27 | ).fillna(0) 28 | df = pd.get_dummies( 29 | df, columns=["student_id", "exercise_code_level_lesson", "exercise_code"], sparse=True 30 | ) 31 | return df 32 | 33 | 34 | def remove_exercises_done_by_only_one_student(df, exercises): 35 | """ 36 | Groupby function is not working with them for now 37 | """ 38 | nb_student_by_exo = ( 39 | df[["student_id", "exercise_code"]] 40 | .drop_duplicates() 41 | .groupby("exercise_code", as_index=False) 42 | .count() 43 | ) 44 | exercises_to_keep = exercises 45 | exercises_to_remove = nb_student_by_exo[nb_student_by_exo["student_id"] == 1][ 46 | "exercise_code" 47 | ].values 48 | for exercise in exercises_to_remove: 49 | exercises_to_keep.remove(exercise) 50 | return exercises_to_keep 51 | 52 | 53 | def add_counters_for_all_exercises( 54 | df: pd.DataFrame, exercises: tuple, counters: tuple, time_windows: tuple 55 | ) -> pd.DataFrame: 56 | """ 57 | Adds a column for all given exercises, all given counters 58 | of a student and all given time windows. 59 | """ 60 | for exercise_code in tqdm( 61 | exercises, desc=f"Adding the counters for {len(exercises)} exercises" 62 | ): 63 | df = add_exercise_code_counters_for_each_time_window( 64 | df, exercise_code, counters, time_windows 65 | ) 66 | return df 67 | 68 | 69 | def add_exercise_code_counters_for_each_time_window( 70 | df: pd.DataFrame, exercise_code: str, counters: tuple, time_windows: tuple 71 | ) -> pd.DataFrame: 72 | """ 73 | For a given exercise_code, adds a column for all given counters 74 | of a student and for all given time windows. 75 | """ 76 | for time_window in tqdm( 77 | time_windows, 78 | desc=f"Adding the counters for the time windows of {exercise_code}", 79 | ): 80 | df = add_exercise_code_counters_in_one_time_window( 81 | df, exercise_code, counters, time_window 82 | ) 83 | return df 84 | 85 | 86 | def add_exercise_code_counters_in_one_time_window( 87 | df: pd.DataFrame, exercise_code: str, counters: tuple, time_window: str 88 | ) -> pd.DataFrame: 89 | """ 90 | For a given exercise_code, adds a column for all given counters 91 | of a student and for one given time window. 
92 | """ 93 | for counter in counters: 94 | df = add_one_exercise_code_counter_in_one_time_window( 95 | df, exercise_code, counter, time_window 96 | ) 97 | return df 98 | 99 | 100 | def add_one_exercise_code_counter_in_one_time_window( 101 | df: pd.DataFrame, exercise_code: str, counter: str, time_window: str 102 | ) -> pd.DataFrame: 103 | df_copy = df.copy() 104 | filtered_df = df[df["exercise_code"] == exercise_code] 105 | filtered_df_and_timestamp_index = filtered_df.set_index("timestamp") 106 | counter_in_the_time_window = filtered_df_and_timestamp_index.groupby( 107 | by=["student_id"], as_index=False 108 | ).rolling(time_window, closed="left")["correctness"] 109 | assert counter in ("wins", "attempts") 110 | if counter == "attempts": 111 | counter_in_the_time_window = counter_in_the_time_window.count() 112 | elif counter == "wins": 113 | counter_in_the_time_window = counter_in_the_time_window.sum() 114 | exercise_code_counter = ( 115 | counter_in_the_time_window.reset_index() 116 | .fillna(0) 117 | .sort_values(by=["timestamp", "correctness"]) 118 | ) 119 | exercise_code_counter["index"] = filtered_df.index 120 | exercise_code_counter = exercise_code_counter.set_index("index") 121 | df_copy[f"{exercise_code}_{counter}_in_the_past_{time_window}"] = scaling_function( 122 | exercise_code_counter["correctness"] 123 | ) 124 | return df_copy 125 | 126 | 127 | def scaling_function(x, how="log"): 128 | if how == "log": 129 | return np.log(1 + x) 130 | else: 131 | return np.nan 132 | -------------------------------------------------------------------------------- /das3h/encode_test.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | from pandas.testing import assert_frame_equal 4 | from encode import ( 5 | scaling_function, 6 | add_one_exercise_code_counter_in_one_time_window, 7 | encode_df, 8 | ) 9 | 10 | # What I noticed : 11 | # - tests won't pass if there is only one student, I don't know if is good. 
I think it is caused by the groupby function 12 | # - timestamp must be different from student to student or else the sorting by counter value and timestamp yields a false result 13 | 14 | 15 | def test_add_one_exercise_code_wins_in_one_time_window(): 16 | exercise_code = "phono" 17 | time_window = "1d" 18 | counter = "wins" 19 | df = pd.DataFrame( 20 | { 21 | "timestamp": [ 22 | "2019-03-01 00:00:01", 23 | "2019-03-01 00:00:02", 24 | "2019-03-01 00:00:03", 25 | "2019-03-01 00:00:04", 26 | "2019-03-01 00:00:05", 27 | ], 28 | "student_id": [1, 1, 2, 2, 1], 29 | "exercise_code": ["phono", "phono", "phono", "grapho", "phono"], 30 | "correctness": [1, 0, 1, 1, 1], 31 | } 32 | ) 33 | df["timestamp"] = pd.to_datetime(df["timestamp"]) 34 | df_with_one_more_column = add_one_exercise_code_counter_in_one_time_window( 35 | df, exercise_code, counter, time_window 36 | ) 37 | expected_df = pd.DataFrame( 38 | { 39 | "timestamp": [ 40 | "2019-03-01 00:00:01", 41 | "2019-03-01 00:00:02", 42 | "2019-03-01 00:00:03", 43 | "2019-03-01 00:00:04", 44 | "2019-03-01 00:00:05", 45 | ], 46 | "student_id": [1, 1, 2, 2, 1], 47 | "exercise_code": ["phono", "phono", "phono", "grapho", "phono"], 48 | "correctness": [1, 0, 1, 1, 1], 49 | "phono_wins_in_the_past_1d": [0, 1, 0, None, 1], 50 | } 51 | ) 52 | expected_df["phono_wins_in_the_past_1d"] = expected_df[ 53 | "phono_wins_in_the_past_1d" 54 | ].apply(scaling_function) 55 | expected_df["timestamp"] = pd.to_datetime(expected_df["timestamp"]) 56 | assert_frame_equal(expected_df, df_with_one_more_column, check_like=True) 57 | 58 | 59 | def test_add_one_exercise_code_attempts_in_one_time_window(): 60 | exercise_code = "phono" 61 | time_window = "1d" 62 | counter = "attempts" 63 | df = pd.DataFrame( 64 | { 65 | "timestamp": [ 66 | "2019-03-01 00:00:01", 67 | "2019-03-01 00:00:02", 68 | "2019-03-01 00:00:03", 69 | "2019-03-01 00:00:04", 70 | "2019-03-01 00:00:05", 71 | ], 72 | "student_id": [1, 1, 2, 2, 1], 73 | "exercise_code": ["phono", "phono", "phono", "grapho", "phono"], 74 | "correctness": [1, 0, 1, 1, 1], 75 | } 76 | ) 77 | df["timestamp"] = pd.to_datetime(df["timestamp"]) 78 | df_with_one_more_column = add_one_exercise_code_counter_in_one_time_window( 79 | df, exercise_code, counter, time_window 80 | ) 81 | expected_df = pd.DataFrame( 82 | { 83 | "timestamp": [ 84 | "2019-03-01 00:00:01", 85 | "2019-03-01 00:00:02", 86 | "2019-03-01 00:00:03", 87 | "2019-03-01 00:00:04", 88 | "2019-03-01 00:00:05", 89 | ], 90 | "student_id": [1, 1, 2, 2, 1], 91 | "exercise_code": ["phono", "phono", "phono", "grapho", "phono"], 92 | "correctness": [1, 0, 1, 1, 1], 93 | "phono_attempts_in_the_past_1d": [0, 1, 0, None, 2], 94 | } 95 | ) 96 | expected_df["phono_attempts_in_the_past_1d"] = expected_df[ 97 | "phono_attempts_in_the_past_1d" 98 | ].apply(scaling_function) 99 | expected_df["timestamp"] = pd.to_datetime(expected_df["timestamp"]) 100 | assert_frame_equal(expected_df, df_with_one_more_column, check_like=True) 101 | 102 | 103 | def test_add_one_exercise_code_wins_only_use_traces_in_the_given_time_window(): 104 | exercise_code = "phono" 105 | time_window = "1d" 106 | counter = "wins" 107 | df = pd.DataFrame( 108 | { 109 | "timestamp": [ 110 | "2019-03-01 00:00:01", 111 | "2019-03-01 00:00:02", 112 | "2019-03-01 00:00:03", 113 | "2019-03-01 00:00:04", 114 | "2019-03-03 00:00:05", 115 | ], 116 | "student_id": [1, 1, 2, 2, 1], 117 | "exercise_code": ["phono", "phono", "phono", "grapho", "phono"], 118 | "correctness": [1, 0, 1, 1, 1], 119 | } 120 | ) 121 | df["timestamp"] = 
pd.to_datetime(df["timestamp"]) 122 | df_with_one_more_column = add_one_exercise_code_counter_in_one_time_window( 123 | df, exercise_code, counter, time_window 124 | ) 125 | expected_df = pd.DataFrame( 126 | { 127 | "timestamp": [ 128 | "2019-03-01 00:00:01", 129 | "2019-03-01 00:00:02", 130 | "2019-03-01 00:00:03", 131 | "2019-03-01 00:00:04", 132 | "2019-03-03 00:00:05", 133 | ], 134 | "student_id": [1, 1, 2, 2, 1], 135 | "exercise_code": ["phono", "phono", "phono", "grapho", "phono"], 136 | "correctness": [1, 0, 1, 1, 1], 137 | "phono_wins_in_the_past_1d": [0, 1, 0, None, 0], 138 | } 139 | ) 140 | expected_df["phono_wins_in_the_past_1d"] = expected_df[ 141 | "phono_wins_in_the_past_1d" 142 | ].apply(scaling_function) 143 | expected_df["timestamp"] = pd.to_datetime(expected_df["timestamp"]) 144 | assert_frame_equal(expected_df, df_with_one_more_column, check_like=True) 145 | 146 | 147 | def test_encode_df(): 148 | counters = ("wins", "attempts") 149 | time_windows = ("1h", "7d") 150 | df = pd.DataFrame( 151 | { 152 | "timestamp": [ 153 | "2019-03-01 00:00:01", 154 | "2019-03-01 00:00:02", 155 | "2019-03-01 00:00:03", 156 | "2019-03-01 00:00:04", 157 | "2019-03-03 00:00:05", 158 | ], 159 | "student_id": [1, 1, 2, 2, 1], 160 | "exercise_code": ["phono", "phono", "phono", "grapho", "phono"], 161 | "exercise_code_level_lesson": [ 162 | "phono_3_lesson_1", 163 | "phono_3_lesson_2", 164 | "phono_3_lesson_1", 165 | "grapho_3_lesson_1", 166 | "phono_3_lesson_1", 167 | ], 168 | "correctness": [1, 0, 1, 1, 1], 169 | } 170 | ) 171 | df["timestamp"] = pd.to_datetime(df["timestamp"]) 172 | encoded_df = encode_df(df, counters=counters, time_windows=time_windows) 173 | expected_df = pd.DataFrame( 174 | { 175 | "timestamp": [ 176 | "2019-03-01 00:00:01", 177 | "2019-03-01 00:00:02", 178 | "2019-03-01 00:00:03", 179 | "2019-03-01 00:00:04", 180 | "2019-03-03 00:00:05", 181 | ], 182 | "student_id_1": [1, 1, 0, 0, 1], 183 | "student_id_2": [0, 0, 1, 1, 0], 184 | "exercise_code_phono": [1, 1, 1, 0, 1], 185 | "exercise_code_grapho": [0, 0, 0, 1, 0], 186 | "exercise_code_level_lesson_phono_3_lesson_1": [1, 0, 1, 0, 1], 187 | "exercise_code_level_lesson_phono_3_lesson_2": [0, 1, 0, 0, 0], 188 | "exercise_code_level_lesson_grapho_3_lesson_1": [0, 0, 0, 1, 0], 189 | "correctness": [1, 0, 1, 1, 1], 190 | "phono_wins_in_the_past_1h": [0, 1, 0, 0, 0], 191 | "phono_wins_in_the_past_7d": [0, 1, 0, 0, 1], 192 | "phono_attempts_in_the_past_1h": [0, 1, 0, 0, 0], 193 | "phono_attempts_in_the_past_7d": [0, 1, 0, 0, 2], 194 | } 195 | ) 196 | expected_df["timestamp"] = pd.to_datetime(expected_df["timestamp"]) 197 | expected_df["phono_wins_in_the_past_1h"] = expected_df[ 198 | "phono_wins_in_the_past_1h" 199 | ].apply(scaling_function) 200 | expected_df["phono_wins_in_the_past_7d"] = expected_df[ 201 | "phono_wins_in_the_past_7d" 202 | ].apply(scaling_function) 203 | expected_df["phono_attempts_in_the_past_1h"] = expected_df[ 204 | "phono_attempts_in_the_past_1h" 205 | ].apply(scaling_function) 206 | expected_df["phono_attempts_in_the_past_7d"] = expected_df[ 207 | "phono_attempts_in_the_past_7d" 208 | ].apply(scaling_function) 209 | assert_frame_equal(encoded_df, expected_df, check_like=True, check_dtype=False) 210 | 211 | -------------------------------------------------------------------------------- /das3h/print_df_infos.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | 4 | def print_cleaned_df_with_information(cleaned_df: pd.DataFrame): 5 | nb_students = 
len(cleaned_df["student_id"].unique()) 6 | print(f"There are {nb_students} students in this dataset.") 7 | print() 8 | trace_repartition = ( 9 | cleaned_df.groupby("exercise_code") 10 | .count() 11 | .rename(columns={"correctness": "traces_count"})["traces_count"] 12 | .sort_values(ascending=False) 13 | ) 14 | print("This is the repartition of traces grouped by exercise_code :") 15 | print(trace_repartition.plot(kind="bar")) 16 | 17 | 18 | -------------------------------------------------------------------------------- /das3h/results_analysis.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | 4 | 5 | def get_coefs_in_dataframe(model, X: pd.DataFrame): 6 | return ( 7 | pd.DataFrame(data=model.coef_[0], index=X.columns) 8 | .reset_index() 9 | .rename(columns={"index": "columns", 0: "coefs"}) 10 | ) 11 | 12 | 13 | def get_students_alphas(coefs: pd.DataFrame): 14 | return coefs[coefs["columns"].str.contains("student_id")].sort_values("coefs") 15 | 16 | 17 | def get_exercise_code_betas(coefs: pd.DataFrame): 18 | exercise_codes = [ 19 | f"exercise_code_{exercise_code}" 20 | for exercise_code in get_available_exercise_codes(coefs) 21 | ] 22 | return coefs[coefs["columns"].isin(exercise_codes)].sort_values("coefs") 23 | 24 | 25 | def get_gamma_of_exercise( 26 | coefs: pd.DataFrame, exercise_code: str, level: int, lesson: int 27 | ): 28 | return coefs[ 29 | coefs["columns"].str.contains( 30 | f"{exercise_code}_{str(level)}_lesson_{str(lesson)}" 31 | ) 32 | ].sort_values("coefs") 33 | 34 | 35 | def get_available_exercise_codes(coefs: pd.DataFrame): 36 | return np.array( 37 | list( 38 | map( 39 | lambda x: x[len("exercise_code_") :], 40 | coefs[coefs["columns"].str.startswith("exercise_code")][ 41 | ~coefs["columns"].str.startswith("exercise_code_level_lesson") 42 | ]["columns"].values, 43 | ) 44 | ) 45 | ) 46 | 47 | 48 | def get_available_exercise_code_level_lesson_tuples(coefs: pd.DataFrame): 49 | return np.array( 50 | list( 51 | map( 52 | lambda x: x[len("exercise_code_level_lesson_") :], 53 | coefs[coefs["columns"].str.startswith("exercise_code_level_lesson")][ 54 | "columns" 55 | ].values, 56 | ) 57 | ) 58 | ) 59 | 60 | 61 | def get_exercise_gammas_of_one_exercise_code(coefs: pd.DataFrame, exercise_code: str): 62 | available_exercise_codes = get_available_exercise_codes(coefs) 63 | if exercise_code not in available_exercise_codes: 64 | print('Error : exercise_code not in the dataset') 65 | return 66 | exercise_code_level_lesson_tuples = get_available_exercise_code_level_lesson_tuples( 67 | coefs 68 | ) 69 | filtered_tuples = [ 70 | f"exercise_code_level_lesson_{tuple}" 71 | for tuple in exercise_code_level_lesson_tuples 72 | if exercise_code in tuple 73 | ] 74 | return coefs[coefs["columns"].isin(filtered_tuples)].sort_values( 75 | by="coefs", ascending=False 76 | ) 77 | 78 | 79 | def get_thetas_of_one_exercise_code(coefs: pd.DataFrame, exercise_code: str): 80 | return coefs[coefs["columns"].str.startswith(exercise_code)] 81 | -------------------------------------------------------------------------------- /prepare_data.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from scipy import sparse 4 | import argparse 5 | import os 6 | from time import process_time 7 | 8 | 9 | def prepare_assistments(data_name, min_interactions_per_user, remove_nan_skills): 10 | """Preprocess ASSISTments dataset. 
11 | 12 | Arguments: 13 | data_name: "assistments09", "assistments12", "assistments15" or "assistments17" 14 | min_interactions_per_user (int): minimum number of interactions per student 15 | remove_nan_skills (bool): if True, remove interactions with no skill tag 16 | Outputs: 17 | df (pandas DataFrame): preprocessed ASSISTments dataset with user_id, item_id, 18 | timestamp and correct features 19 | Q_mat (item-skill relationships sparse array): corresponding q-matrix 20 | """ 21 | data_path = os.path.join("data", data_name) 22 | df = pd.read_csv(os.path.join(data_path, "data.csv"), encoding="ISO-8859-1") 23 | 24 | # Only 2012 and 2017 versions have timestamps 25 | if data_name == "assistments09": 26 | df = df.rename(columns={"problem_id": "item_id"}) 27 | df["timestamp"] = np.zeros(len(df), dtype=np.int64) 28 | elif data_name == "assistments12": 29 | df = df.rename(columns={"problem_id": "item_id"}) 30 | df = add_timestamp(df, "start_time") 31 | elif data_name == "assistments15": 32 | df = df.rename(columns={"sequence_id": "item_id"}) 33 | df["skill_id"] = df["item_id"] 34 | df["timestamp"] = np.zeros(len(df), dtype=np.int64) 35 | elif data_name == "assistments17": 36 | df = df.rename( 37 | columns={ 38 | "startTime": "timestamp", 39 | "studentId": "user_id", 40 | "problemId": "item_id", 41 | "skill": "skill_id", 42 | } 43 | ) 44 | df = add_timestamp(df, "timestamp") 45 | 46 | # Sort data temporally 47 | if data_name in ["assistments12", "assistments17"]: 48 | df.sort_values(by="timestamp", inplace=True) 49 | elif data_name == "assistments09": 50 | df.sort_values(by="order_id", inplace=True) 51 | elif data_name == "assistments15": 52 | df.sort_values(by="log_id", inplace=True) 53 | 54 | df = general_cleaning(df, min_interactions_per_user) 55 | df = remove_nan_skill(remove_nan_skills, df) 56 | 57 | df["user_id"] = np.unique(df["user_id"], return_inverse=True)[1] 58 | df["item_id"] = np.unique(df["item_id"], return_inverse=True)[1] 59 | df["skill_id"] = np.unique(df["skill_id"], return_inverse=True)[1] 60 | 61 | # Build Q-matrix 62 | Q_mat = np.zeros((len(df["item_id"].unique()), len(df["skill_id"].unique()))) 63 | for item_id, skill_id in df[["item_id", "skill_id"]].values: 64 | Q_mat[item_id, skill_id] = 1 65 | 66 | # Remove row duplicates due to multiple skills for one item 67 | if data_name == "assistments09": 68 | df = df.drop_duplicates("order_id") 69 | elif data_name == "assistments17": 70 | df = df.drop_duplicates(["user_id", "timestamp"]) 71 | 72 | # Get unique skill id from combination of all skill ids 73 | df["skill_id"] = np.unique(Q_mat, axis=0, return_inverse=True)[1][df["item_id"]] 74 | 75 | df = df[["user_id", "item_id", "timestamp", "correct", "skill_id"]] 76 | df.reset_index(inplace=True, drop=True) 77 | 78 | # Save data 79 | save_data(df, data_path, Q_mat) 80 | 81 | 82 | def remove_nan_skill(remove_nan_skills, df): 83 | # Filter nan skills 84 | if remove_nan_skills: 85 | df = df[~df["skill_id"].isnull()] 86 | else: 87 | df.ix[df["skill_id"].isnull(), "skill_id"] = -1 88 | return df 89 | 90 | 91 | def prepare_kddcup10( 92 | data_name, min_interactions_per_user, kc_col_name, remove_nan_skills 93 | ): 94 | """Preprocess KDD Cup 2010 dataset. 
95 | Arguments: 96 | data_name (str): "bridge_algebra06" or "algebra05" 97 | min_interactions_per_user (int): minimum number of interactions per student 98 | kc_col_name (str): Skills id column 99 | remove_nan_skills (bool): if True, remove interactions with no skill tag 100 | Outputs: 101 | df (pandas DataFrame): preprocessed KDD Cup 2010 dataset with user_id, item_id, 102 | timestamp and correct features 103 | Q_mat (item-skill relationships sparse array): corresponding q-matrix 104 | """ 105 | data_path = os.path.join("data", data_name) 106 | df = pd.read_csv(os.path.join(data_path, "data.txt"), delimiter="\t") 107 | df = df.rename( 108 | columns={ 109 | "Anon Student Id": "user_id", 110 | "Correct First Attempt": "correct", 111 | kc_col_name: "skill_id", 112 | } 113 | ) 114 | 115 | # Create item from problem and step 116 | df["item_id"] = df["Problem Name"] + ":" + df["Step Name"] 117 | 118 | df = add_timestamp(df, "First Transaction Time") 119 | df = general_cleaning(df, min_interactions_per_user) 120 | df = remove_nan_skill(remove_nan_skills, df) 121 | 122 | # Extract KCs 123 | kc_list = [] 124 | for kc_str in df["skill_id"].unique(): 125 | for kc in kc_str.split("~~"): 126 | kc_list.append(kc) 127 | kc_set = set(kc_list) 128 | kc2idx = {kc: i for i, kc in enumerate(kc_set)} 129 | 130 | df["user_id"] = np.unique(df["user_id"], return_inverse=True)[1] 131 | df["item_id"] = np.unique(df["item_id"], return_inverse=True)[1] 132 | 133 | # Build Q-matrix 134 | Q_mat = np.zeros((len(df["item_id"].unique()), len(kc_set))) 135 | for item_id, kc_str in df[["item_id", "skill_id"]].values: 136 | for kc in kc_str.split("~~"): 137 | Q_mat[item_id, kc2idx[kc]] = 1 138 | 139 | # Get unique skill id from combination of all skill ids 140 | df["skill_id"] = np.unique(Q_mat, axis=0, return_inverse=True)[1][df["item_id"]] 141 | 142 | df = df[["user_id", "item_id", "timestamp", "correct", "skill_id"]] 143 | df.reset_index(inplace=True, drop=True) 144 | 145 | # Save data 146 | save_data(df, data_path, Q_mat) 147 | 148 | 149 | def prepare_lalilo(min_interactions_per_user): 150 | """Preprocess Lalilo dataset. 
151 | 152 | Arguments: 153 | min_interactions_per_user (int): minimum number of interactions per student 154 | 155 | Outputs: 156 | df (pandas DataFrame): preprocessed Lalilo dataset with user_id, item_id, 157 | timestamp and correct features 158 | Q_mat (item-skill relationships sparse array): corresponding q-matrix 159 | """ 160 | data_path = os.path.join("data", "lalilo") 161 | df = pd.read_csv( 162 | os.path.join(data_path, "all_traces_from_2018-08-01_to_2019-04-01.csv") 163 | ) 164 | 165 | def add_exercise_code_level_lesson(df): 166 | dataset = df.copy() 167 | dataset["exercise_code_level_lesson"] = ( 168 | dataset["exercise_code"].map(str) 169 | + "_" 170 | + dataset["level"].map(str) 171 | + "_lesson_" 172 | + dataset["lesson_id"].map(str) 173 | ) 174 | return dataset 175 | 176 | df = add_exercise_code_level_lesson(df) 177 | df = df.rename( 178 | columns={ 179 | "student_id": "user_id", 180 | "created_at": "timestamp", 181 | "exercise_code": "skill_id", 182 | "exercise_code_level_lesson": "item_id", 183 | "correctness": "correct", 184 | } 185 | ) 186 | df = add_timestamp(df, "timestamp") 187 | df = general_cleaning(df, min_interactions_per_user) 188 | 189 | # Maybe we want to store the correspondence with the original dataset somewhere 190 | df["user_id"] = np.unique(df["user_id"], return_inverse=True)[1] 191 | df["item_id"] = np.unique(df["item_id"], return_inverse=True)[1] 192 | df["skill_id"] = np.unique(df["skill_id"], return_inverse=True)[1] 193 | 194 | # Build Q-matrix 195 | Q_mat = np.zeros((len(df["item_id"].unique()), len(df["skill_id"].unique()))) 196 | for item_id, skill_id in df[["item_id", "skill_id"]].values: 197 | Q_mat[item_id, skill_id] = 1 198 | 199 | df = df[["user_id", "item_id", "skill_id", "timestamp", "correct"]] 200 | df.reset_index(inplace=True, drop=True) 201 | 202 | # Save data 203 | # save_data(df, data_path, Q_mat) 204 | 205 | 206 | def general_cleaning(df, min_interactions_per_user): 207 | t1_start = process_time() 208 | # Remove continuous outcomes 209 | df = df.copy() 210 | df = df[df["correct"].isin([0, 1])] 211 | df["correct"] = df["correct"].astype(np.int32) 212 | # Drop duplicates 213 | df.drop_duplicates(subset=["user_id", "item_id", "timestamp"], inplace=True) 214 | # Filter too short sequences 215 | df = df.groupby("user_id").filter(lambda x: len(x) >= min_interactions_per_user) 216 | t1_stop = process_time() 217 | print("Elapsed time during general_cleaning in seconds:", t1_stop - t1_start) 218 | return df 219 | 220 | 221 | def add_timestamp(df, column_name): 222 | t1_start = process_time() 223 | df = df.copy() 224 | df["timestamp"] = pd.to_datetime(df[column_name]) 225 | # df.dropna(subset=["timestamp"], inplace=True) 226 | df["timestamp"] = df["timestamp"] - df["timestamp"].min() 227 | df["timestamp"] = ( 228 | df["timestamp"].apply(lambda x: x.total_seconds()).astype(np.int64) 229 | ) 230 | df.sort_values(by="timestamp", inplace=True) 231 | t1_stop = process_time() 232 | print("Elapsed time during add_timestamp in seconds:", t1_stop - t1_start) 233 | return df 234 | 235 | 236 | def save_data(df, data_path, Q_mat): 237 | sparse.save_npz(os.path.join(data_path, "q_mat.npz"), sparse.csr_matrix(Q_mat)) 238 | df.to_csv(os.path.join(data_path, "preprocessed_data.csv"), sep="\t", index=False) 239 | 240 | 241 | if __name__ == "__main__": 242 | parser = argparse.ArgumentParser(description="Prepare datasets.") 243 | parser.add_argument("--dataset", type=str, default="assistments12") 244 | parser.add_argument("--min_interactions", type=int, 
default=10) 245 | parser.add_argument("--remove_nan_skills", type=bool, default=True) 246 | args = parser.parse_args() 247 | 248 | if args.dataset in [ 249 | "assistments09", 250 | "assistments12", 251 | "assistments15", 252 | "assistments17", 253 | ]: 254 | prepare_assistments( 255 | data_name=args.dataset, 256 | min_interactions_per_user=args.min_interactions, 257 | remove_nan_skills=args.remove_nan_skills, 258 | ) 259 | elif args.dataset == "bridge_algebra06": 260 | prepare_kddcup10( 261 | data_name="bridge_algebra06", 262 | min_interactions_per_user=args.min_interactions, 263 | kc_col_name="KC(SubSkills)", 264 | remove_nan_skills=args.remove_nan_skills, 265 | ) 266 | elif args.dataset == "algebra05": 267 | prepare_kddcup10( 268 | data_name="algebra05", 269 | min_interactions_per_user=args.min_interactions, 270 | kc_col_name="KC(Default)", 271 | remove_nan_skills=args.remove_nan_skills, 272 | ) 273 | elif args.dataset == "lalilo": 274 | prepare_lalilo(min_interactions_per_user=args.min_interactions) 275 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | pandas 2 | tqdm 3 | sklearn 4 | numpy 5 | matplotlib 6 | torch 7 | tensorboardX 8 | tensorboard -------------------------------------------------------------------------------- /utils/logger.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | from tensorboardX import SummaryWriter 4 | 5 | 6 | class Logger: 7 | """Logging with TensorboardX. 8 | """ 9 | 10 | def __init__(self, logdir, verbose=True): 11 | if not os.path.exists(logdir): 12 | os.makedirs(logdir) 13 | try: 14 | shutil.rmtree(logdir) 15 | except FileNotFoundError: 16 | pass 17 | 18 | self.verbose = verbose 19 | self.writer = SummaryWriter(logdir) 20 | 21 | def log_histogram(self, tag, array, step): 22 | """Log histogram of numpy array of values. 23 | """ 24 | self.writer.add_histogram(tag, array, step) 25 | 26 | def log_scalars(self, dic, step): 27 | """Log dictionary of scalar values. 28 | """ 29 | for k, v in dic.items(): 30 | self.writer.add_scalar(k, v, step) 31 | 32 | if self.verbose: 33 | print(f"Step {step}, {dic}") 34 | 35 | def close(self): 36 | self.writer.close() -------------------------------------------------------------------------------- /utils/metrics.py: -------------------------------------------------------------------------------- 1 | class Metrics: 2 | """Keep track of metrics over time in a dictionary. 
3 | """ 4 | def __init__(self): 5 | self.metrics = {} 6 | self.counts = {} 7 | 8 | def store(self, new_metrics): 9 | for key in new_metrics: 10 | if key in self.metrics: 11 | self.metrics[key] += new_metrics[key] 12 | self.counts[key] += 1 13 | else: 14 | self.metrics[key] = new_metrics[key] 15 | self.counts[key] = 1 16 | 17 | def average(self): 18 | average = {k: v / self.counts[k] for k, v in self.metrics.items()} 19 | self.metrics, self.counts = {}, {} 20 | return average -------------------------------------------------------------------------------- /utils/misc.py: -------------------------------------------------------------------------------- 1 | import random 2 | from random import shuffle 3 | from sklearn.metrics import roc_auc_score, accuracy_score 4 | 5 | import torch 6 | from torch.nn.utils.rnn import pad_sequence 7 | 8 | 9 | def set_random_seeds(seed): 10 | torch.manual_seed(seed) 11 | torch.cuda.manual_seed_all(seed) 12 | random.seed(seed) 13 | 14 | 15 | def get_data(df, train_split=0.8): 16 | num_items = df["item_id"].nunique() 17 | data = [(torch.tensor(u_df["item_id"].values, dtype=torch.long), 18 | torch.tensor(u_df["correct"].values, dtype=torch.long)) 19 | for _, u_df in df.groupby("user_id")] 20 | data = [(torch.cat((torch.zeros(1, dtype=torch.long), item_ids + labels * num_items + 1))[:-1], item_ids, labels) 21 | for (item_ids, labels) in data] 22 | shuffle(data) 23 | 24 | # Train-test split across users 25 | train_size = int(train_split * len(data)) 26 | train_data, val_data = data[:train_size], data[train_size:] 27 | return train_data, val_data 28 | 29 | 30 | def prepare_batches(data, batch_size): 31 | """Prepare batches grouping padded sequences. 32 | 33 | Arguments: 34 | data (list of tuples of torch Tensor) 35 | batch_size (int): number of sequences per batch 36 | 37 | Output: 38 | batches (list of tuples of torch Tensor) 39 | """ 40 | shuffle(data) 41 | batches = [] 42 | 43 | for k in range(0, len(data), batch_size): 44 | batch = data[k:k + batch_size] 45 | inputs, item_ids, labels = zip(*batch) 46 | 47 | inputs = pad_sequence(inputs, batch_first=True, padding_value=0) # Pad with 0 48 | item_ids = pad_sequence(item_ids, batch_first=True, padding_value=0) # Don't care 49 | labels = pad_sequence(labels, batch_first=True, padding_value=-1) # Pad with -1 50 | 51 | batches.append([inputs, item_ids, labels]) 52 | 53 | return batches 54 | 55 | 56 | def compute_auc(preds, item_ids, labels): 57 | labels = labels.view(-1) 58 | item_ids = item_ids.view(-1)[labels >= 0] 59 | preds = preds.view(-1, preds.shape[-1])[labels >= 0] 60 | preds = preds[torch.arange(preds.shape[0]), item_ids] 61 | labels = labels[labels >= 0].float() 62 | 63 | if len(torch.unique(labels)) == 1: # Only one class 64 | auc = accuracy_score(labels, preds.round()) 65 | else: 66 | auc = roc_auc_score(labels, preds) 67 | return auc 68 | 69 | 70 | def compute_loss(preds, item_ids, labels, criterion): 71 | labels = labels.view(-1) 72 | item_ids = item_ids.view(-1)[labels >= 0] 73 | preds = preds.view(-1, preds.shape[-1])[labels >= 0] 74 | preds = preds[torch.arange(preds.shape[0]), item_ids] 75 | labels = labels[labels >= 0].float() 76 | return criterion(preds, labels) --------------------------------------------------------------------------------