├── .vscode ├── .ropeproject │ ├── config.py │ └── objectdb └── settings.json ├── DKT ├── DKT.py ├── README.md ├── prepare_sequences.py ├── prepare_sequences_test.py └── train_DKT.py ├── FeedForwardNetwork ├── FFN.py ├── encode_ffw.py ├── encode_ffw_test.py └── train_ffw.py ├── README.md ├── SAKT ├── model_sakt.py ├── train_sakt.py └── utils_sakt │ ├── logger.py │ ├── metrics.py │ └── misc.py ├── das3h ├── README.md ├── clean.py ├── clean_test.py ├── das3h.py ├── encode.py ├── encode_test.py ├── print_df_infos.py └── results_analysis.py ├── prepare_data.py ├── requirements.txt └── utils ├── logger.py ├── metrics.py └── misc.py /.vscode/.ropeproject/config.py: -------------------------------------------------------------------------------- 1 | # The default ``config.py`` 2 | # flake8: noqa 3 | 4 | 5 | def set_prefs(prefs): 6 | """This function is called before opening the project""" 7 | 8 | # Specify which files and folders to ignore in the project. 9 | # Changes to ignored resources are not added to the history and 10 | # VCSs. Also they are not returned in `Project.get_files()`. 11 | # Note that ``?`` and ``*`` match all characters but slashes. 12 | # '*.pyc': matches 'test.pyc' and 'pkg/test.pyc' 13 | # 'mod*.pyc': matches 'test/mod1.pyc' but not 'mod/1.pyc' 14 | # '.svn': matches 'pkg/.svn' and all of its children 15 | # 'build/*.o': matches 'build/lib.o' but not 'build/sub/lib.o' 16 | # 'build//*.o': matches 'build/lib.o' and 'build/sub/lib.o' 17 | prefs['ignored_resources'] = ['*.pyc', '*~', '.ropeproject', 18 | '.hg', '.svn', '_svn', '.git', '.tox'] 19 | 20 | # Specifies which files should be considered python files. It is 21 | # useful when you have scripts inside your project. Only files 22 | # ending with ``.py`` are considered to be python files by 23 | # default. 24 | # prefs['python_files'] = ['*.py'] 25 | 26 | # Custom source folders: By default rope searches the project 27 | # for finding source folders (folders that should be searched 28 | # for finding modules). You can add paths to that list. Note 29 | # that rope guesses project source folders correctly most of the 30 | # time; use this if you have any problems. 31 | # The folders should be relative to project root and use '/' for 32 | # separating folders regardless of the platform rope is running on. 33 | # 'src/my_source_folder' for instance. 34 | # prefs.add('source_folders', 'src') 35 | 36 | # You can extend python path for looking up modules 37 | # prefs.add('python_path', '~/python/') 38 | 39 | # Should rope save object information or not. 40 | prefs['save_objectdb'] = True 41 | prefs['compress_objectdb'] = False 42 | 43 | # If `True`, rope analyzes each module when it is being saved. 44 | prefs['automatic_soa'] = True 45 | # The depth of calls to follow in static object analysis 46 | prefs['soa_followed_calls'] = 0 47 | 48 | # If `False` when running modules or unit tests "dynamic object 49 | # analysis" is turned off. This makes them much faster. 50 | prefs['perform_doa'] = True 51 | 52 | # Rope can check the validity of its object DB when running. 53 | prefs['validate_objectdb'] = True 54 | 55 | # How many undos to hold? 56 | prefs['max_history_items'] = 32 57 | 58 | # Shows whether to save history across sessions. 59 | prefs['save_history'] = True 60 | prefs['compress_history'] = False 61 | 62 | # Set the number spaces used for indenting. According to 63 | # :PEP:`8`, it is best to use 4 spaces. Since most of rope's 64 | # unit-tests use 4 spaces it is more reliable, too. 
65 | prefs['indent_size'] = 4 66 | 67 | # Builtin and c-extension modules that are allowed to be imported 68 | # and inspected by rope. 69 | prefs['extension_modules'] = [] 70 | 71 | # Add all standard c-extensions to extension_modules list. 72 | prefs['import_dynload_stdmods'] = True 73 | 74 | # If `True` modules with syntax errors are considered to be empty. 75 | # The default value is `False`; When `False` syntax errors raise 76 | # `rope.base.exceptions.ModuleSyntaxError` exception. 77 | prefs['ignore_syntax_errors'] = False 78 | 79 | # If `True`, rope ignores unresolvable imports. Otherwise, they 80 | # appear in the importing namespace. 81 | prefs['ignore_bad_imports'] = False 82 | 83 | # If `True`, rope will insert new module imports as 84 | # `from import ` by default. 85 | prefs['prefer_module_from_imports'] = False 86 | 87 | # If `True`, rope will transform a comma list of imports into 88 | # multiple separate import statements when organizing 89 | # imports. 90 | prefs['split_imports'] = False 91 | 92 | # If `True`, rope will remove all top-level import statements and 93 | # reinsert them at the top of the module when making changes. 94 | prefs['pull_imports_to_top'] = True 95 | 96 | # If `True`, rope will sort imports alphabetically by module name instead 97 | # of alphabetically by import statement, with from imports after normal 98 | # imports. 99 | prefs['sort_imports_alphabetically'] = False 100 | 101 | # Location of implementation of 102 | # rope.base.oi.type_hinting.interfaces.ITypeHintingFactory In general 103 | # case, you don't have to change this value, unless you're an rope expert. 104 | # Change this value to inject you own implementations of interfaces 105 | # listed in module rope.base.oi.type_hinting.providers.interfaces 106 | # For example, you can add you own providers for Django Models, or disable 107 | # the search type-hinting in a class hierarchy, etc. 108 | prefs['type_hinting_factory'] = ( 109 | 'rope.base.oi.type_hinting.factory.default_type_hinting_factory') 110 | 111 | 112 | def project_opened(project): 113 | """This function is called after opening the project""" 114 | # Do whatever you like here! 115 | -------------------------------------------------------------------------------- /.vscode/.ropeproject/objectdb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thosgt/edm_main_algorithms/e678dcd0342f251d120af3ea5946cc8a1eae4771/.vscode/.ropeproject/objectdb -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "python.pythonPath": "/Users/tho_sergent/.local/share/virtualenvs/edm_main_algorithms-MrmaA6op/bin/python", 3 | "python.testing.pytestArgs": [ 4 | "src" 5 | ], 6 | "python.testing.unittestEnabled": false, 7 | "python.testing.nosetestsEnabled": false, 8 | "python.testing.pytestEnabled": true 9 | } -------------------------------------------------------------------------------- /DKT/DKT.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | 6 | class DKT(nn.Module): 7 | def __init__(self, num_items, num_skills, hid_size, num_hid_layers, drop_prob, 8 | item_in, skill_in, item_out, skill_out): 9 | """Deep knowledge tracing. 
10 | Arguments: 11 | num_items (int): number of items 12 | num_skills (int): number of skills 13 | hid_size (int): hidden layer dimension 14 | num_hid_layers (int): number of hidden layers 15 | drop_prob (float): dropout probability 16 | item_in (bool): if True, use items as inputs 17 | skill_in (bool): if True, use skills as inputs 18 | item_out (bool): if True, use items as outputs 19 | skill_out (bool): if True, use skills as outputs 20 | """ 21 | super(DKT, self).__init__() 22 | self.num_items = num_items 23 | self.num_skills = num_skills 24 | self.item_in = item_in 25 | self.skill_in = skill_in 26 | self.item_out = item_out 27 | self.skill_out = skill_out 28 | self.input_size = (2 * num_items + 1) * item_in + (2 * num_skills + 1) * skill_in 29 | self.output_size = num_items * item_out + num_skills * skill_out 30 | 31 | self.lstm = nn.LSTM(self.input_size, hid_size, num_hid_layers, batch_first=True) 32 | self.dropout = nn.Dropout(p=drop_prob) 33 | self.out = nn.Linear(hid_size, self.output_size) 34 | 35 | def forward(self, item_inputs, skill_inputs, hidden=None): 36 | # Pad inputs with 0, this explains the +1 37 | if (item_inputs is not None) and (skill_inputs is not None): 38 | items_onehot = F.one_hot(item_inputs, 2 * self.num_items + 1).float() 39 | skills_onehot = F.one_hot(skill_inputs, 2 * self.num_skills + 1).float() 40 | input = torch.cat((items_onehot, skills_onehot), -1) 41 | elif (item_inputs is not None): 42 | input = F.one_hot(item_inputs, 2 * self.num_items + 1).float() 43 | elif (skill_inputs is not None): 44 | input = F.one_hot(skill_inputs, 2 * self.num_skills + 1).float() 45 | else: 46 | raise ValueError("Use at least one of skills or items as input") 47 | 48 | output, hidden = self.lstm(input, hx=hidden) 49 | return self.out(self.dropout(output)), hidden 50 | 51 | def repackage_hidden(self, hidden): 52 | # Return detached hidden for TBPTT 53 | return tuple((v.detach() for v in hidden)) 54 | 55 | -------------------------------------------------------------------------------- /DKT/README.md: -------------------------------------------------------------------------------- 1 | This is an Pytorch implementation of [Deep Knowledge Tracing](https://stanford.edu/~cpiech/bio/papers/deepKnowledgeTracing.pdf) for a dataset like Lalilo's. 2 | 3 | ### Some context 4 | 5 | Knowledge Tracing (KT) is measuring the evolving knowledge of a student over time. Usually this knowledge is captured in a vector of numbers. Several algorithms have been tried over the years like Item Response Theory (IRT), Bayesian Knowledge Tracing (BKT) etc. 6 | 7 | Recently, new KT models based on Neural Networks were tested. Deep Knowledge Tracing (DKT) was the first among them. 8 | 9 | ## How Deep Knowledge Tracing works 10 | 11 | ### High level description 12 | 13 | With any Machine Learning algorithm, the first question is : what do I want to predict ? 14 | 15 | Here I want to predict the probability of success of a student to an exercise. 16 | 17 | Long-Short-Term Memory neural networks (LSTMs) seem a good way to model this : they possess a hidden state supposedly able to capture the evolving student knowledge with each answer. 18 | The hidden state is comprised of ```n_hidden``` dimensions representing the estimated student knowledge state. 19 | 20 | From the student knowledge state we should then be able to predict the probability of success of a student to any exercise after any of their answer. 
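To make this more concrete, here is a minimal sketch of that idea (illustration only, not the code of this repository; `n_available_exercises`, `n_hidden` and the sequence length are made-up values):

```python
import torch
import torch.nn as nn

n_available_exercises = 50  # hypothetical number of distinct exercises
n_hidden = 64               # hypothetical size of the hidden knowledge state

# The LSTM consumes one-hot encoded (exercise, correctness) answers and keeps a hidden
# state per student; a linear layer maps that state to one logit per exercise.
lstm = nn.LSTM(input_size=2 * n_available_exercises, hidden_size=n_hidden, batch_first=True)
to_logits = nn.Linear(n_hidden, n_available_exercises)

answers = torch.zeros(1, 10, 2 * n_available_exercises)    # one student, 10 answers
knowledge_states, _ = lstm(answers)                         # (1, 10, n_hidden)
success_probs = torch.sigmoid(to_logits(knowledge_states))  # (1, 10, n_available_exercises)
```

After each answer, `success_probs` holds the estimated probability of success on every available exercise, read off the current hidden (knowledge) state.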
21 | 
22 | #### More advanced paragraph 
23 | 
24 | A central hypothesis of this algorithm is that the hidden state transitions are the same for each student. Training the model means computing the matrix governing the hidden state transitions. 
25 | 
26 | ### Lower level description - if you are familiar with Machine Learning in general 
27 | 
28 | There are two kinds of matrices that we need to distinguish here: 
29 | - the hidden state, which is specific to a given student. There is one hidden state per student. At the beginning of their exercise sequence, the hidden state of a student is a matrix of zeros. It is updated after each exercise they answer 
30 | - the weights of the network, which govern the transition between hidden states and the mapping between hidden states and predicted probabilities. These weights are the same for all students. They are updated during training so that they fit student transitions as well as possible: training the model means updating these weights 
31 | 
32 | #### What are we going to feed our network to train it? 
33 | We are going to feed our network the exercise sequence of each student, one student after the other (it is indeed a sequence, as the exercises are done one after the other and not simultaneously). Therefore, for each student: 
34 | - we select the exercises and answers of this student 
35 | - the hidden state of the student is set to a zero-like vector 
36 | - then, for each exercise of their exercise sequence: 
37 | - using the hidden state of the student and the mapping between hidden state and expected probabilities, we predict the probability that they answer the exercise they get correctly and compare it to the actual correctness 
38 | - we update the network weights so that the predicted probability is closer to the actual correctness 
39 | - the hidden state of the student is updated 
40 | 
41 | This is how we train the model to find its weights. 
42 | 
43 | #### Lowest level - if you are already familiar with recurrent neural networks in particular 
44 | 
45 | What kind of input does the algorithm take, and what kind of output does it produce? 
46 | 
47 | For each student, the input size varies, as the number of exercises each of them answered varies. We call the number of exercises a student answered ```sequence_length```. 
48 | 
49 | One central question here is: how do we represent an answer to one of the exercises? 
50 | 
51 | The solution chosen in the article is to [one-hot-encode](https://hackernoon.com/what-is-one-hot-encoding-why-and-when-do-you-have-to-use-it-e3c6186d008f) the tuple ```(exercise, correctness)```. 
52 | For each answer, the student takes one out of 2 x ```n_available_exercises``` possible actions: they get one of ```n_available_exercises``` exercises and can answer it correctly or not. Thus, for each answer, the input is a one-hot-encoded vector of size ```2 x n_available_exercises```. 
53 | 
54 | As output, we would like to have the probabilities of success on all available exercises for the *next* exercise the student does. Therefore, for each answer of the student, the LSTM outputs ```n_available_exercises``` probabilities. 
55 | 
56 | When the student hasn't answered any exercise yet, we still want probabilities from the LSTM. These would be the starting probabilities for all students, when there is no hidden state yet. However, the LSTM doesn't output anything when it doesn't have an input. The solution we found is to use a row full of zeros as the first input of the sequence, i.e. to shift the answers by one step. 
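As an illustration only (a toy sketch with made-up values, not the preprocessing code of this repository), building this shifted one-hot input for a single student could look like:

```python
import numpy as np

n_available_exercises = 3        # hypothetical
exercises = [0, 2, 2, 1]         # exercises answered by one student, in order
correctness = [1, 0, 1, 1]       # 1 = correct, 0 = incorrect

# One row per answer: one-hot encoding of the (exercise, correctness) tuple
answers_onehot = np.zeros((len(exercises), 2 * n_available_exercises))
for t, (exercise, correct) in enumerate(zip(exercises, correctness)):
    answers_onehot[t, 2 * exercise + correct] = 1

# Shift by one step: the first input is a row of zeros (no history yet), and the
# answer at step t is never part of the input used to predict step t itself.
inputs = np.vstack([np.zeros((1, 2 * n_available_exercises)), answers_onehot[:-1]])
print(inputs.shape)  # (sequence_length, 2 * n_available_exercises)
```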
57 | 58 | To sum up, the input of the LSTM for each student is of shape ```(sequence_length, 2 x n_available_exercises)```and the output is ```(sequence_length, n_available_exercises)``` 59 | 60 | Computing the loss is straightforward once we have the predictions for every available exercise and for every actual answer. For each anwer, we need to select the prediction that is relevant (we need only one of the ```n_available_exercises``` predictions, the prediction of the exercise that was actually answered) and compute the ```log_loss``` between it and its actual correctness. 61 | 62 | After propagating the gradient and updating the weights we can go to another student sequence and update the weights once more. 63 | 64 | 65 | ### FAQ 66 | ##### Why not feeding the entire sequences of answers of all students to the neural network ? 67 | The hidden state is specific to each student so it has to be zeroed between students. That is why the network is fed sequence by sequence. -------------------------------------------------------------------------------- /DKT/prepare_sequences.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | 4 | from sklearn.preprocessing import LabelEncoder 5 | 6 | 7 | def prepare_df(df: pd.DataFrame)-> pd.DataFrame: 8 | label_encoder = LabelEncoder() 9 | df_copy = df.copy() 10 | df_copy["exercise_code_level_lesson"] = label_encoder.fit_transform( 11 | df_copy["exercise_code_level_lesson"] 12 | ) 13 | df_copy["concat_exercise_correctness"] = ( 14 | df_copy["exercise_code_level_lesson"].map(str) 15 | + "_" 16 | + df_copy["correctness"].map(str) 17 | ) 18 | df_to_feed_network = pd.get_dummies( 19 | df_copy[ 20 | ["correctness", "student_id", "exercise_code_level_lesson", "concat_exercise_correctness"] 21 | ], 22 | columns=["concat_exercise_correctness"], 23 | sparse=True, 24 | ) 25 | n_expected_columns = df_copy.exercise_code_level_lesson.unique().shape[0] 26 | expected_columns = [] 27 | for i in range(n_expected_columns): 28 | for correctness in (0, 1): 29 | expected_columns.append(f'concat_exercise_correctness_{i}_{correctness}') 30 | for column in expected_columns: 31 | if column not in df_to_feed_network.columns: 32 | df_to_feed_network[column] = 0 33 | return df_to_feed_network, label_encoder 34 | 35 | 36 | def prepare_sequences(df: pd.DataFrame): 37 | # idea have a generator to spare memory ? no if several epochs 38 | # idea add shuffling somewhere ? 39 | student_ids = df["student_id"].unique() 40 | exercise_sequences = [] 41 | # le.save somewhere ? 
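# One sequence (DataFrame) per student: the LSTM hidden state is specific to a student,
# so it must be reset between students and the sequences are fed to the network one by one.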
42 | for student_id in student_ids: 43 | df_of_student = df[df["student_id"] == student_id].drop(columns=["student_id"]) 44 | exercise_sequences.append(df_of_student) 45 | return exercise_sequences 46 | 47 | -------------------------------------------------------------------------------- /DKT/prepare_sequences_test.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | from prepare_sequences import prepare_df 4 | from pandas.testing import assert_frame_equal 5 | 6 | df = pd.DataFrame( 7 | { 8 | "student_id": [1, 2, 3, 1, 5, 5, 4], 9 | "exercise_code_level_lesson": [ 10 | "phono_3_lesson_102", 11 | "phono_3_lesson_101", 12 | "phono_3_lesson_103", 13 | "phono_3_lesson_101", 14 | "phono_3_lesson_102", 15 | "phono_3_lesson_103", 16 | "phono_3_lesson_103", 17 | ], 18 | "correctness": [0, 1, 1, 0, 1, 0, 1], 19 | } 20 | ) 21 | 22 | 23 | def test_that_cleaning_of_df_works(): 24 | prepared_df, _ = prepare_df(df) 25 | expected_prepared_df = pd.DataFrame( 26 | { 27 | "student_id": {0: 1, 1: 2, 2: 3, 3: 1, 4: 5, 5: 5, 6: 4}, 28 | "correctness": {0: 0, 1: 1, 2: 1, 3: 0, 4: 1, 5: 0, 6: 1}, 29 | "exercise_code_level_lesson": {0: 1, 1: 0, 2: 2, 3: 0, 4: 1, 5: 2, 6: 2}, 30 | "concat_exercise_correctness_0_0": { 31 | 0: 0, 32 | 1: 0, 33 | 2: 0, 34 | 3: 1, 35 | 4: 0, 36 | 5: 0, 37 | 6: 0, 38 | }, 39 | "concat_exercise_correctness_0_1": { 40 | 0: 0, 41 | 1: 1, 42 | 2: 0, 43 | 3: 0, 44 | 4: 0, 45 | 5: 0, 46 | 6: 0, 47 | }, 48 | "concat_exercise_correctness_1_0": { 49 | 0: 1, 50 | 1: 0, 51 | 2: 0, 52 | 3: 0, 53 | 4: 0, 54 | 5: 0, 55 | 6: 0, 56 | }, 57 | "concat_exercise_correctness_1_1": { 58 | 0: 0, 59 | 1: 0, 60 | 2: 0, 61 | 3: 0, 62 | 4: 1, 63 | 5: 0, 64 | 6: 0, 65 | }, 66 | "concat_exercise_correctness_2_0": { 67 | 0: 0, 68 | 1: 0, 69 | 2: 0, 70 | 3: 0, 71 | 4: 0, 72 | 5: 1, 73 | 6: 0, 74 | }, 75 | "concat_exercise_correctness_2_1": { 76 | 0: 0, 77 | 1: 0, 78 | 2: 1, 79 | 3: 0, 80 | 4: 0, 81 | 5: 0, 82 | 6: 1, 83 | }, 84 | } 85 | ) 86 | assert_frame_equal(prepared_df, expected_prepared_df, check_dtype=False, check_like=True) 87 | 88 | -------------------------------------------------------------------------------- /DKT/train_DKT.py: -------------------------------------------------------------------------------- 1 | # mainly from theophilee/kt-algos 2 | import argparse 3 | import pandas as pd 4 | from random import shuffle 5 | from sklearn.metrics import roc_auc_score, accuracy_score 6 | 7 | import torch.nn as nn 8 | from torch.optim import Adam 9 | from torch.nn.utils.rnn import pad_sequence 10 | from tqdm import tqdm 11 | 12 | from model_dkt import DKT 13 | from utils import * 14 | 15 | 16 | def get_data(df, item_in, skill_in, item_out, skill_out, train_split=0.8): 17 | """Extract sequences from dataframe. 
18 | Arguments: 19 | df (pandas Dataframe): output by prepare_data.py 20 | item_in (bool): if True, use items as inputs 21 | skill_in (bool): if True, use skills as inputs 22 | item_out (bool): if True, use items as outputs 23 | skill_out (bool): if True, use skills as outputs 24 | train_split (float): proportion of data to use for training 25 | """ 26 | item_ids = [torch.tensor(u_df["item_id"].values, dtype=torch.long) 27 | for _, u_df in df.groupby("user_id")] 28 | skill_ids = [torch.tensor(u_df["skill_id"].values, dtype=torch.long) 29 | for _, u_df in df.groupby("user_id")] 30 | labels = [torch.tensor(u_df["correct"].values, dtype=torch.long) 31 | for _, u_df in df.groupby("user_id")] 32 | 33 | item_inputs = [torch.cat((torch.zeros(1, dtype=torch.long), i * 2 + l + 1))[:-1] 34 | for (i, l) in zip(item_ids, labels)] 35 | skill_inputs = [torch.cat((torch.zeros(1, dtype=torch.long), s * 2 + l + 1))[:-1] 36 | for (s, l) in zip(skill_ids, labels)] 37 | 38 | item_inputs = item_inputs if item_in else [None] * len(item_inputs) 39 | skill_inputs = skill_inputs if skill_in else [None] * len(skill_inputs) 40 | item_ids = item_ids if item_out else [None] * len(item_ids) 41 | skill_ids = skill_ids if skill_out else [None] * len(skill_ids) 42 | 43 | data = list(zip(item_inputs, skill_inputs, item_ids, skill_ids, labels)) 44 | shuffle(data) 45 | 46 | # Train-test split across users 47 | train_size = int(train_split * len(data)) 48 | train_data, val_data = data[:train_size], data[train_size:] 49 | return train_data, val_data 50 | 51 | 52 | def prepare_batches(data, batch_size): 53 | """Prepare batches grouping padded sequences. 54 | Arguments: 55 | data (list of lists of torch Tensor): output by get_data 56 | batch_size (int): number of sequences per batch 57 | Output: 58 | batches (list of lists of torch Tensor) 59 | """ 60 | shuffle(data) 61 | batches = [] 62 | 63 | for k in range(0, len(data), batch_size): 64 | batch = data[k:k + batch_size] 65 | seq_lists = list(zip(*batch)) 66 | inputs_and_ids = [pad_sequence(seqs, batch_first=True, padding_value=0) 67 | if (seqs[0] is not None) else None for seqs in seq_lists[:4]] 68 | labels = pad_sequence(seq_lists[-1], batch_first=True, padding_value=-1) # Pad labels with -1 69 | batches.append([*inputs_and_ids, labels]) 70 | 71 | return batches 72 | 73 | 74 | def get_preds(preds, item_ids, skill_ids, labels): 75 | preds = preds[labels >= 0] 76 | 77 | if (item_ids is not None): 78 | item_ids = item_ids[labels >= 0] 79 | preds = preds[torch.arange(preds.size(0)), item_ids] 80 | elif (skill_ids is not None): 81 | skill_ids = skill_ids[labels >= 0] 82 | preds = preds[torch.arange(preds.size(0)), skill_ids] 83 | else: 84 | raise ValueError("Use exactly one of skills or items as output") 85 | 86 | return preds 87 | 88 | 89 | def compute_auc(preds, item_ids, skill_ids, labels): 90 | preds = get_preds(preds, item_ids, skill_ids, labels) 91 | labels = labels[labels >= 0].float() 92 | 93 | if len(torch.unique(labels)) == 1: # Only one class 94 | auc = accuracy_score(labels, torch.sigmoid(preds).round()) 95 | else: 96 | auc = roc_auc_score(labels, preds) 97 | 98 | return auc 99 | 100 | 101 | def compute_loss(preds, item_ids, skill_ids, labels, criterion): 102 | preds = get_preds(preds, item_ids, skill_ids, labels) 103 | labels = labels[labels >= 0].float() 104 | return criterion(preds, labels) 105 | 106 | 107 | def train(train_data, val_data, model, optimizer, logger, saver, num_epochs, batch_size, bptt=50): 108 | """Train DKT model. 
109 | 110 | Arguments: 111 | train_data (list of lists of torch Tensor) 112 | val_data (list of lists of torch Tensor) 113 | model (torch Module) 114 | optimizer (torch optimizer) 115 | logger: wrapper for TensorboardX logger 116 | num_epochs (int): number of epochs to train for 117 | batch_size (int) 118 | bptt (int): length of truncated backprop through time chunks 119 | savepath (str): directory where to save the trained model 120 | """ 121 | criterion = nn.BCEWithLogitsLoss() 122 | metrics = Metrics() 123 | step = 0 124 | 125 | for epoch in tqdm(range(num_epochs)): 126 | train_batches = prepare_batches(train_data, batch_size) 127 | val_batches = prepare_batches(val_data, batch_size) 128 | 129 | # Training 130 | for item_inputs, skill_inputs, item_ids, skill_ids, labels in train_batches: 131 | length = labels.size(1) 132 | preds = torch.empty(labels.size(0), length, model.output_size) 133 | if item_inputs is not None: 134 | item_inputs.to(device=args.device) 135 | if skill_inputs is not None: 136 | skill_inputs.to(device=args.device) 137 | preds.to(device=args.device) 138 | 139 | # Truncated backprop through time 140 | for i in range(0, length, bptt): 141 | item_inp = item_inputs[:, i:i + bptt] if item_inputs is not None else None 142 | skill_inp = skill_inputs[:, i:i + bptt] if skill_inputs is not None else None 143 | if i == 0: 144 | pred, hidden = model(item_inp, skill_inp) 145 | else: 146 | hidden = model.repackage_hidden(hidden) 147 | pred, hidden = model(item_inp, skill_inp, hidden) 148 | preds[:, i:i + bptt] = pred 149 | 150 | loss = compute_loss(preds, item_ids, skill_ids, labels.to(device=args.device), criterion) 151 | train_auc = compute_auc(preds.detach().cpu(), item_ids, skill_ids, labels) 152 | 153 | model.zero_grad() 154 | loss.backward() 155 | optimizer.step() 156 | step += 1 157 | metrics.store({'loss/train': loss.item()}) 158 | metrics.store({'auc/train': train_auc}) 159 | 160 | # Logging 161 | if step % 20 == 0: 162 | logger.log_scalars(metrics.average(), step) 163 | #weights = {"weight/" + name: param for name, param in model.named_parameters()} 164 | #grads = {"grad/" + name: param.grad 165 | # for name, param in model.named_parameters() if param.grad is not None} 166 | #logger.log_histograms(weights, step) 167 | #logger.log_histograms(grads, step) 168 | 169 | # Validation 170 | model.eval() 171 | for item_inputs, skill_inputs, item_ids, skill_ids, labels in val_batches: 172 | with torch.no_grad(): 173 | if item_inputs is not None: 174 | item_inputs.to(device=args.device) 175 | if skill_inputs is not None: 176 | skill_inputs.to(device=args.device) 177 | preds, _ = model(item_inputs, skill_inputs) 178 | val_auc = compute_auc(preds.cpu(), item_ids, skill_ids, labels) 179 | metrics.store({'auc/val': val_auc}) 180 | model.train() 181 | 182 | # Save model 183 | average_metrics = metrics.average() 184 | logger.log_scalars(average_metrics, step) 185 | stop = saver.save(average_metrics['auc/val'], model) 186 | if stop: 187 | break 188 | 189 | 190 | if __name__ == "__main__": 191 | parser = argparse.ArgumentParser(description='Train DKT.') 192 | parser.add_argument('--dataset', type=str) 193 | parser.add_argument('--logdir', type=str, default='runs/dkt') 194 | parser.add_argument('--savedir', type=str, default='save/dkt') 195 | parser.add_argument('--item_in', action='store_true', 196 | help='If True, use items as inputs.') 197 | parser.add_argument('--skill_in', action='store_true', 198 | help='If True, use skills as inputs.') 199 | parser.add_argument('--item_out', 
action='store_true', 200 | help='If True, use items as outputs.') 201 | parser.add_argument('--skill_out', action='store_true', 202 | help='If True, use skills as outputs.') 203 | parser.add_argument('--hid_size', type=int, default=200) 204 | parser.add_argument('--num_hid_layers', type=int, default=1) 205 | parser.add_argument('--drop_prob', type=float, default=0.5) 206 | parser.add_argument('--batch_size', type=int, default=100) 207 | parser.add_argument('--lr', type=float, default=1e-2) 208 | parser.add_argument('--num_epochs', type=int, default=100) 209 | args = parser.parse_args() 210 | 211 | assert (args.item_in or args.skill_in) # Use at least one of skills or items as input 212 | assert (args.item_out != args.skill_out) # Use exactly one of skills or items as output 213 | 214 | df = pd.read_csv(os.path.join('data', args.dataset, 'preprocessed_data.csv'), sep="\t") 215 | 216 | train_data, val_data = get_data(df, args.item_in, args.skill_in, args.item_out, args.skill_out) 217 | 218 | num_items = int(df["item_id"].max() + 1) + 1 219 | num_skills = int(df["skill_id"].max() + 1) + 1 220 | 221 | model = DKT(num_items, num_skills, args.hid_size, args.num_hid_layers, args.drop_prob, 222 | args.item_in, args.skill_in, args.item_out, args.skill_out) 223 | model = nn.DataParallel(model) 224 | model.to(device=args.device) 225 | optimizer = Adam(model.parameters(), lr=args.lr) 226 | 227 | logger.close() -------------------------------------------------------------------------------- /FeedForwardNetwork/FFN.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | class FeedForwardNetwork(nn.Module): 6 | def __init__(self, n_skills, n_items, n_counters, hidden_dim, drop_prob): 7 | super(FeedForwardNetwork, self).__init__() 8 | self.lin_features_to_hidden = nn.Linear(n_counters * (n_items + n_skills), hidden_dim) 9 | self.lin_hidden_to_output = nn.Linear(hidden_dim, n_items) 10 | self.dropout = nn.Dropout(p=drop_prob) 11 | 12 | def forward(self, input): 13 | hidden_state = F.relu(self.lin_features_to_hidden(input)) 14 | output = self.lin_hidden_to_output(self.dropout(hidden_state)) 15 | return output 16 | -------------------------------------------------------------------------------- /FeedForwardNetwork/encode_ffw.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import pandas as pd 4 | import numpy as np 5 | 6 | from scipy import sparse 7 | from scipy.sparse import csr_matrix, hstack, vstack 8 | from tqdm import tqdm 9 | from sklearn.preprocessing import OneHotEncoder 10 | 11 | 12 | COUNTERS = ("attempts", "wins") 13 | 14 | 15 | def encode_df(df, Q_mat, skill_counters=True): 16 | """Build sparse dataset from dense dataset and q-matrix. 
17 | 18 | Arguments: 19 | df (pandas DataFrame): output by prepare_data.py 20 | Q_mat (sparse array): q-matrix, output by prepare_data.py 21 | skill_counters: if we want to include the counters of skill as well 22 | 23 | Output: 24 | sparse_df (sparse array): sparse dataset where first 4 columns are the same as in df 25 | """ 26 | n_items = Q_mat.shape[0] 27 | onehot_items = OneHotEncoder(categories=[range(n_items)]) 28 | onehot_items.fit_transform(df["item_id"].values.reshape(-1, 1)) 29 | 30 | features = [] 31 | for user_id in tqdm(df["user_id"].unique()): 32 | df_user = df[df["user_id"] == user_id] 33 | user_features = encode_user_ffw( 34 | df_user, 35 | onehot_items=onehot_items, 36 | Q_mat=Q_mat, 37 | skill_counters=skill_counters, 38 | ) 39 | user_features = hstack((csr_matrix(df_user.values), user_features)) 40 | features.append(user_features) 41 | return vstack(features) 42 | 43 | 44 | def encode_user_ffw(df_user, Q_mat, onehot_items, skill_counters=True): 45 | labels = csr_matrix(df_user["correct"].values.reshape(-1, 1)) 46 | item_ids = df_user["item_id"].values.reshape(-1, 1) 47 | item_ids_onehot = onehot_items.transform(item_ids) 48 | 49 | skill_ids_onehot = Q_mat[item_ids.flatten()] 50 | 51 | all_counters = [] 52 | for counter in COUNTERS: 53 | user_item_counter = get_user_counter(item_ids_onehot, labels, counter=counter) 54 | all_counters.append(user_item_counter) 55 | if skill_counters: 56 | user_skill_counter = get_user_counter( 57 | skill_ids_onehot, labels, counter=counter 58 | ) 59 | all_counters.append(user_skill_counter) 60 | return hstack(all_counters) 61 | 62 | 63 | def get_user_counter(feature_id_onehot, labels, counter): 64 | array_to_accumulate = feature_id_onehot.toarray() 65 | if counter == "attempts": 66 | pass 67 | elif counter == "wins": 68 | array_to_accumulate *= labels.toarray() 69 | counts = accumulate(array_to_accumulate) 70 | counter = phi(counts) 71 | return counter 72 | 73 | 74 | def accumulate(x): 75 | return vstack((csr_matrix((1, x.shape[1])), csr_matrix(np.cumsum(x, 0))))[:-1] 76 | 77 | 78 | def phi(x): 79 | return x.log1p() 80 | 81 | 82 | if __name__ == "__main__": 83 | parser = argparse.ArgumentParser( 84 | description="Encode feature matrix for feedforward network baseline." 
85 | ) 86 | parser.add_argument("--dataset", type=str) 87 | parser.add_argument("--n_traces", type=int, default=20000) 88 | 89 | args = parser.parse_args() 90 | 91 | data_path = os.path.join("data", args.dataset) 92 | 93 | df = pd.read_csv(os.path.join(data_path, "preprocessed_data.csv"), sep="\t") 94 | Q_mat = sparse.load_npz(os.path.join(data_path, "q_mat.npz")) 95 | X = encode_df(df[-args.n_traces :], Q_mat) 96 | sparse.save_npz(os.path.join(data_path, f"X-ffw-{args.n_traces}-traces"), X) 97 | 98 | -------------------------------------------------------------------------------- /FeedForwardNetwork/encode_ffw_test.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | from scipy import sparse 4 | 5 | from pandas.testing import assert_frame_equal 6 | from numpy.testing import assert_array_equal 7 | from encode_ffw import encode_user_ffw, get_user_counter, encode_df 8 | from sklearn.preprocessing import OneHotEncoder 9 | 10 | 11 | def test_attempts_counter(): 12 | df_exercise_tuple = pd.DataFrame( 13 | { 14 | "phono_3_lesson_102": [1, 1, 0, 1, 0, 1, 1], 15 | "phono_3_lesson_103": [0, 0, 1, 0, 1, 0, 0], 16 | "correct": [0, 1, 1, 0, 1, 0, 1], 17 | } 18 | ) 19 | feature_id_onehot = sparse.csr_matrix( 20 | df_exercise_tuple[["phono_3_lesson_102", "phono_3_lesson_103"]].values 21 | ) 22 | labels = sparse.csr_matrix(df_exercise_tuple["correct"].values.reshape(-1, 1)) 23 | counter = "attempts" 24 | user_attempts = get_user_counter(feature_id_onehot, labels, counter).toarray() 25 | expected_array = np.log( 26 | 1 + np.array([[0, 0], [1, 0], [2, 0], [2, 1], [3, 1], [3, 2], [4, 2]]) 27 | ) 28 | assert_array_equal(user_attempts, expected_array) 29 | 30 | 31 | def test_wins_counter(): 32 | df_exercise_tuple = pd.DataFrame( 33 | { 34 | "phono_3_lesson_102": [1, 1, 0, 1, 0, 1, 1], 35 | "phono_3_lesson_103": [0, 0, 1, 0, 1, 0, 0], 36 | "correct": [0, 1, 1, 0, 1, 0, 1], 37 | } 38 | ) 39 | feature_id_onehot = sparse.csr_matrix( 40 | df_exercise_tuple[["phono_3_lesson_102", "phono_3_lesson_103"]].values 41 | ) 42 | labels = sparse.csr_matrix(df_exercise_tuple["correct"].values.reshape(-1, 1)) 43 | counter = "wins" 44 | user_attempts = get_user_counter(feature_id_onehot, labels, counter).toarray() 45 | expected_array = np.log( 46 | 1 + np.array([[0, 0], [0, 0], [1, 0], [1, 1], [1, 1], [1, 2], [1, 2]]) 47 | ) 48 | assert_array_equal(user_attempts, expected_array) 49 | 50 | 51 | def test_encoding_counter(): 52 | df_user = pd.DataFrame( 53 | { 54 | "item_id": [0, 0, 1, 0, 1, 0, 0], 55 | "skill_id": [0, 0, 0, 0, 0, 0, 0], 56 | "correct": [0, 1, 1, 0, 1, 0, 1], 57 | } 58 | ) 59 | Q_mat = sparse.csr_matrix([[1], [1]]) 60 | onehot_items = OneHotEncoder() 61 | onehot_items.fit(df_user["item_id"].values.reshape(-1, 1)) 62 | 63 | user_ffw_encoding = encode_user_ffw( 64 | df_user, onehot_items=onehot_items, Q_mat=Q_mat, skill_counters=True 65 | ).toarray() 66 | expected_attempts_array = np.array( 67 | [[0, 0, 0], [1, 0, 1], [2, 0, 2], [2, 1, 3], [3, 1, 4], [3, 2, 5], [4, 2, 6]] 68 | ) 69 | expected_wins_array = np.array( 70 | [[0, 0, 0], [0, 0, 0], [1, 0, 1], [1, 1, 2], [1, 1, 2], [1, 2, 3], [1, 2, 3]] 71 | ) 72 | expected_array = np.concatenate( 73 | (expected_attempts_array, expected_wins_array), axis=1 74 | ) 75 | expected_array = np.log(1 + expected_array) 76 | assert_array_equal(user_ffw_encoding, expected_array) 77 | 78 | 79 | def test_encoding_counter_two_users(): 80 | df_user = pd.DataFrame( 81 | { 82 | "user_id": [0, 0, 1, 0, 1, 
0, 0], 83 | "item_id": [0, 0, 1, 0, 1, 0, 0], 84 | "skill_id": [0, 0, 0, 0, 0, 0, 0], 85 | "correct": [0, 1, 1, 0, 1, 0, 1], 86 | } 87 | ) 88 | Q_mat = sparse.csr_matrix([[1], [1]]) # useless here as skill_counters=False 89 | user_ffw_encoding = encode_df(df_user, Q_mat, skill_counters=False).toarray() 90 | expected_attempts_array_0 = np.array([[0, 0], [1, 0], [2, 0], [3, 0], [4, 0]]) 91 | expected_wins_array_0 = np.array([[0, 0], [0, 0], [1, 0], [1, 0], [1, 0]]) 92 | expected_attempts_array_1 = np.array([[0, 0], [0, 1]]) 93 | expected_wins_array_1 = np.array([[0, 0], [0, 1]]) 94 | expected_array_0 = np.concatenate( 95 | (expected_attempts_array_0, expected_wins_array_0), axis=1 96 | ) 97 | expected_array_1 = np.concatenate( 98 | (expected_attempts_array_1, expected_wins_array_1), axis=1 99 | ) 100 | expected_array = np.concatenate((expected_array_0, expected_array_1), axis=0) 101 | expected_array = np.log(1 + expected_array) 102 | 103 | expected_array = np.hstack( 104 | (df_user.sort_values(by="user_id").values, expected_array) 105 | ) 106 | assert_array_equal(user_ffw_encoding, expected_array) 107 | -------------------------------------------------------------------------------- /FeedForwardNetwork/train_ffw.py: -------------------------------------------------------------------------------- 1 | # mainly inspired by Théophile Gervet 2 | # https://github.com/theophilee/kt-algos/blob/master/train_ffw.py 3 | import sys 4 | import os 5 | import argparse 6 | import numpy as np 7 | from scipy.sparse import load_npz, csr_matrix 8 | import torch 9 | import torch.nn as nn 10 | from torch.optim import Adam 11 | 12 | sys.path.append(".") 13 | 14 | from utils.logger import Logger 15 | from utils.metrics import Metrics 16 | from FFN import FeedForwardNetwork 17 | from utils.misc import * 18 | from tqdm import tqdm 19 | 20 | 21 | def get_tensors(sparse): 22 | dense = torch.tensor(sparse.toarray()) 23 | inputs = dense[:, 4:].float() 24 | item_ids = dense[:, 1].long() 25 | labels = dense[:, 3].float() 26 | return inputs, item_ids, labels 27 | 28 | 29 | def train(X_train, X_val, model, optimizer, logger, num_epochs, batch_size): 30 | """Train FFW model. 
31 | Arguments: 32 | X (sparse matrix): output by encode_ffw.py 33 | model (torch Module) 34 | optimizer (torch optimizer) 35 | logger: wrapper for TensorboardX logger 36 | num_epochs (int): number of epochs to train for 37 | batch_size (int) 38 | """ 39 | criterion = nn.BCEWithLogitsLoss() 40 | metrics = Metrics() 41 | train_idxs = np.arange(X_train.shape[0]) 42 | val_idxs = np.arange(X_val.shape[0]) 43 | step = 0 44 | 45 | for epoch in tqdm(range(num_epochs)): 46 | shuffle(train_idxs) 47 | shuffle(val_idxs) 48 | 49 | # Training 50 | for k in range(0, len(train_idxs), batch_size): 51 | inputs, item_ids, labels = get_tensors( 52 | X_train[train_idxs[k : k + batch_size]] 53 | ) 54 | inputs = inputs.to(device=args.device) 55 | preds = model(inputs) 56 | relevant_preds = preds[ 57 | torch.arange(preds.shape[0]), item_ids.to(device=args.device) 58 | ] 59 | loss = criterion(relevant_preds, labels.to(device=args.device)) 60 | 61 | train_auc = compute_auc(preds.detach().cpu(), item_ids, labels) 62 | 63 | model.zero_grad() 64 | loss.backward() 65 | optimizer.step() 66 | step += 1 67 | metrics.store({"loss/train": loss.item()}) 68 | metrics.store({"auc/train": train_auc}) 69 | 70 | # Logging 71 | if step % 20 == 0: 72 | logger.log_scalars(metrics.average(), step * batch_size) 73 | 74 | # Validation 75 | model.eval() 76 | for k in range(0, len(val_idxs), batch_size): 77 | inputs, item_ids, labels = get_tensors(X_val[val_idxs[k : k + batch_size]]) 78 | inputs = inputs.to(device=args.device) 79 | with torch.no_grad(): 80 | preds = model(inputs) 81 | val_auc = compute_auc(preds.cpu(), item_ids, labels) 82 | metrics.store({"auc/val": val_auc}) 83 | model.train() 84 | 85 | 86 | def student_level_split(X): 87 | user_ids = X[:, 0].toarray().flatten() 88 | users = np.unique(user_ids) 89 | np.random.shuffle(users) 90 | split = int(0.8 * len(users)) 91 | users_train, users_val = users[:split], users[split:] 92 | return ( 93 | X[np.where(np.isin(user_ids, users_train))], 94 | X[np.where(np.isin(user_ids, users_val))], 95 | ) 96 | 97 | 98 | def get_number_of_items_and_skills(dataset): 99 | data_path = os.path.join("data", dataset) 100 | Q_mat = load_npz(os.path.join(data_path, "q_mat.npz")) 101 | return Q_mat.shape[0], Q_mat.shape[1] 102 | 103 | 104 | if __name__ == "__main__": 105 | parser = argparse.ArgumentParser( 106 | description="Train feedforward neural network on dense feature matrix." 
107 | ) 108 | parser.add_argument("X_file", type=str) 109 | parser.add_argument("--dataset", type=str) 110 | parser.add_argument("--logdir", type=str, default="runs/ffw") 111 | parser.add_argument("--hid_size", type=int, default=200) 112 | parser.add_argument("--drop_prob", type=float, default=0.2) 113 | parser.add_argument("--batch_size", type=int, default=500) 114 | parser.add_argument("--lr", type=float, default=1e-3) 115 | parser.add_argument("--num_epochs", type=int, default=25) 116 | parser.add_argument("--disable-cuda", action="store_true", help="Disable CUDA") 117 | args = parser.parse_args() 118 | args.device = None 119 | if not args.disable_cuda and torch.cuda.is_available(): 120 | args.device = torch.device("cuda") 121 | else: 122 | args.device = torch.device("cpu") 123 | 124 | # First four columns are original dataset 125 | # then previous interaction encodings and wins/attempts statistics 126 | X = csr_matrix(load_npz(args.X_file)) 127 | 128 | # Student-level train-val split 129 | X_train, X_val = student_level_split(X) 130 | 131 | n_items, n_skills = get_number_of_items_and_skills(dataset=args.dataset) 132 | n_counters = 2 133 | 134 | model = FeedForwardNetwork( 135 | n_skills=n_skills, 136 | n_items=n_items, 137 | n_counters=n_counters, 138 | hidden_dim=args.hid_size, 139 | drop_prob=args.drop_prob, 140 | ).to(device=args.device) 141 | optimizer = Adam(model.parameters(), lr=args.lr) 142 | 143 | param_str = f"{args.dataset}" 144 | logger = Logger(os.path.join(args.logdir, param_str)) 145 | 146 | train(X_train, X_val, model, optimizer, logger, args.num_epochs, args.batch_size) 147 | 148 | logger.close() 149 | 150 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | This repository contains some of the main EDM models for Knowledge Tracing. 
2 | 3 | For now it contains DAS3H model by Choffin et al., a basic FeedForwardModel, DKT and SAKT 4 | 5 | ## Setup 6 | 7 | 8 | 9 | Create a new conda environment with python 3 10 | ``` 11 | conda create --name python3-env python=3.7 12 | ``` 13 | Activate conda env 14 | ``` 15 | conda activate python3-env 16 | ``` 17 | 18 | Install [PyTorch](https://pytorch.org) and the remaining requirements: 19 | 20 | ``` 21 | pip install -r requirements.txt 22 | ``` 23 | 24 | To use a dataset, download the data from one of the links above and: 25 | - place the main file under `data//data.csv` for an ASSISTments dataset 26 | - place the main file under `data//data.txt` for a KDDCup dataset 27 | 28 | ``` 29 | python prepare_data.py --dataset --remove_nan_skills 30 | ``` 31 | 32 | ## Training 33 | 34 | #### Deep Knowledge Tracing 35 | 36 | To train a DKT model: 37 | 38 | ``` 39 | python train_dkt.py --dataset 40 | ``` 41 | 42 | #### Self-Attentive Knowledge Tracing 43 | 44 | To train a SAKT model: 45 | 46 | ``` 47 | python train_sakt.py --dataset 48 | ``` 49 | -------------------------------------------------------------------------------- /SAKT/model_sakt.py: -------------------------------------------------------------------------------- 1 | """ Embeddings module from ONMT""" 2 | import math 3 | import warnings 4 | import numpy as np 5 | 6 | import torch 7 | import torch.nn as nn 8 | 9 | 10 | def future_mask(seq_length): 11 | future_mask = np.triu(np.ones((1, seq_length, seq_length)), k=0).astype("bool") 12 | return torch.from_numpy(future_mask) 13 | 14 | 15 | class PositionalEncoding(nn.Module): 16 | """Sinusoidal positional encoding for non-recurrent neural networks. 17 | Args: 18 | dropout (float): dropout parameter 19 | dim (int): embedding size 20 | """ 21 | 22 | def __init__(self, dropout, dim, max_len=5000): 23 | if dim % 2 != 0: 24 | raise ValueError( 25 | "Cannot use sin/cos positional encoding with " 26 | "odd dim (got dim={:d})".format(dim) 27 | ) 28 | pe = torch.zeros(max_len, dim) 29 | position = torch.arange(0, max_len).unsqueeze(1) 30 | div_term = torch.exp( 31 | (torch.arange(0, dim, 2, dtype=torch.float) * -(math.log(10000.0) / dim)) 32 | ) 33 | pe[:, 0::2] = torch.sin(position.float() * div_term) 34 | pe[:, 1::2] = torch.cos(position.float() * div_term) 35 | pe = pe.unsqueeze(1) 36 | super(PositionalEncoding, self).__init__() 37 | self.register_buffer("pe", pe) 38 | self.dropout = nn.Dropout(p=dropout) 39 | self.dim = dim 40 | 41 | def forward(self, emb): 42 | """Embed interactions. 
43 | Args: 44 | emb (FloatTensor): Sequence of word vectors 45 | ``(seq_len, batch_size, self.dim)`` 46 | """ 47 | 48 | # emb = emb * math.sqrt(self.dim) 49 | emb = emb + self.pe[: emb.size(0)] 50 | emb = self.dropout(emb) 51 | return emb 52 | 53 | """ Multi-Head Attention module from ONMT""" 54 | 55 | 56 | class MultiHeadedAttention(nn.Module): 57 | def __init__(self, head_count, model_dim, dropout=0.1): 58 | assert model_dim % head_count == 0 59 | self.dim_per_head = model_dim // head_count 60 | self.model_dim = model_dim 61 | 62 | super(MultiHeadedAttention, self).__init__() 63 | self.head_count = head_count 64 | 65 | self.linear_keys = nn.Linear(model_dim, head_count * self.dim_per_head) 66 | self.linear_values = nn.Linear(model_dim, head_count * self.dim_per_head) 67 | self.linear_query = nn.Linear(model_dim, head_count * self.dim_per_head) 68 | self.softmax = nn.Softmax(dim=-1) 69 | self.dropout = nn.Dropout(dropout) 70 | self.final_linear = nn.Linear(model_dim, model_dim) 71 | 72 | def forward(self, key, value, query, mask=None, layer_cache=None): 73 | """ 74 | Compute the context vector and the attention vectors. 75 | Args: 76 | key (FloatTensor): set of `key_len` 77 | key vectors ``(batch, key_len, dim)`` 78 | value (FloatTensor): set of `key_len` 79 | value vectors ``(batch, key_len, dim)`` 80 | query (FloatTensor): set of `query_len` 81 | query vectors ``(batch, query_len, dim)`` 82 | mask: binary mask 1/0 indicating which keys have 83 | zero / non-zero attention ``(batch, query_len, key_len)`` 84 | Returns: 85 | (FloatTensor, FloatTensor): 86 | * output context vectors ``(batch, query_len, dim)`` 87 | * one of the attention vectors ``(batch, query_len, key_len)`` 88 | """ 89 | # CHECKS 90 | batch, k_len, d = key.size() 91 | batch_, k_len_, d_ = value.size() 92 | assert batch_ == batch 93 | assert k_len == k_len 94 | assert d == d_ 95 | batch_, q_len, d_ = query.size() 96 | assert batch_ == batch 97 | assert d == d_ 98 | 99 | # aeq(self.model_dim % 8, 0) 100 | if mask is not None: 101 | batch_, q_len_, k_len_ = mask.size() 102 | # assert batch_ == batch mask will be broadcasted 103 | assert k_len_ == k_len 104 | assert q_len_ == q_len 105 | # END CHECKS 106 | batch_size = key.size(0) 107 | dim_per_head = self.dim_per_head 108 | head_count = self.head_count 109 | key_len = key.size(1) 110 | query_len = query.size(1) 111 | 112 | def shape(x): 113 | """Projection.""" 114 | return x.view(batch_size, -1, head_count, dim_per_head).transpose(1, 2) 115 | 116 | def unshape(x): 117 | """Compute context.""" 118 | return ( 119 | x.transpose(1, 2) 120 | .contiguous() 121 | .view(batch_size, -1, head_count * dim_per_head) 122 | ) 123 | 124 | # 1) Project key, value, and query. 125 | key = self.linear_keys(key) 126 | value = self.linear_values(value) 127 | query = self.linear_query(query) 128 | key = shape(key) 129 | value = shape(value) 130 | query = shape(query) 131 | 132 | key_len = key.size(2) 133 | query_len = query.size(2) 134 | 135 | # 2) Calculate and scale scores. 136 | query = query / math.sqrt(dim_per_head) 137 | # batch x heads x query_len x key_len 138 | query_key = torch.matmul(query, key.transpose(2, 3)) 139 | 140 | scores = query_key 141 | scores = scores.float() 142 | 143 | if mask is not None: 144 | mask = mask.unsqueeze(1) # [B, 1, 1 (?), T_values] 145 | scores = scores.masked_fill(mask, -1e18) 146 | 147 | # 3) Apply attention dropout and compute context vectors. 
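# scores has shape (batch, heads, query_len, key_len); the softmax normalizes over the
# key dimension, and dropout is applied to the attention weights before they weight the values.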
148 | attn = self.softmax(scores).to(query.dtype) 149 | drop_attn = self.dropout(attn) 150 | 151 | context_original = torch.matmul(drop_attn, value) 152 | 153 | context = unshape(context_original) 154 | output = self.final_linear(context) 155 | top_attn = attn.view(batch_size, head_count, query_len, key_len)[ 156 | :, 0, :, : 157 | ].contiguous() 158 | 159 | return output, top_attn 160 | 161 | def update_dropout(self, dropout): 162 | self.dropout.p = dropout 163 | 164 | 165 | class PositionwiseFeedForward(nn.Module): 166 | """ A two-layer Feed-Forward-Network with residual layer norm. 167 | Args: 168 | d_model (int): the size of input for the first-layer of the FFN. 169 | d_ff (int): the hidden layer size of the second-layer 170 | of the FNN. 171 | dropout (float): dropout probability in :math:`[0, 1)`. 172 | """ 173 | 174 | def __init__(self, d_model, d_ff, dropout=0.1): 175 | super(PositionwiseFeedForward, self).__init__() 176 | self.w_1 = nn.Linear(d_model, d_ff) 177 | self.w_2 = nn.Linear(d_ff, d_model) 178 | self.layer_norm = nn.LayerNorm(d_model, eps=1e-6) 179 | self.dropout_1 = nn.Dropout(dropout) 180 | self.relu = nn.ReLU() 181 | self.dropout_2 = nn.Dropout(dropout) 182 | 183 | def forward(self, x): 184 | """Layer definition. 185 | Args: 186 | x: ``(batch_size, input_len, model_dim)`` 187 | Returns: 188 | (FloatTensor): Output ``(batch_size, input_len, model_dim)``. 189 | """ 190 | 191 | inter = self.dropout_1(self.relu(self.w_1(self.layer_norm(x)))) 192 | output = self.dropout_2(self.w_2(inter)) 193 | return output + x 194 | 195 | def update_dropout(self, dropout): 196 | self.dropout_1.p = dropout 197 | self.dropout_2.p = dropout 198 | 199 | 200 | class TransformerEncoderLayer(nn.Module): 201 | """ 202 | A single layer of the transformer encoder. 203 | Args: 204 | d_model (int): the dimension of keys/values/queries in 205 | MultiHeadedAttention, also the input size of 206 | the first-layer of the PositionwiseFeedForward. 207 | heads (int): the number of head for MultiHeadedAttention. 208 | d_ff (int): the second-layer of the PositionwiseFeedForward. 209 | dropout (float): dropout probability(0-1.0). 210 | """ 211 | 212 | def __init__(self, d_model, heads, d_ff, dropout, attention_dropout): 213 | super(TransformerEncoderLayer, self).__init__() 214 | 215 | self.self_attn = MultiHeadedAttention(heads, d_model, dropout=attention_dropout) 216 | self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout) 217 | self.layer_norm = nn.LayerNorm(d_model, eps=1e-6) 218 | self.dropout = nn.Dropout(dropout) 219 | 220 | def forward(self, interaction_embeds, item_embeds, mask): 221 | """ 222 | Args: 223 | interaction_embeds (FloatTensor): ``(batch_size, src_len, model_dim)`` 224 | mask (LongTensor): ``(batch_size, 1, src_len)`` pourquoi mask est de cette taille ?? 225 | Returns: 226 | (FloatTensor): 227 | * outputs ``(batch_size, src_len, model_dim)`` 228 | """ 229 | context, _ = self.self_attn( 230 | interaction_embeds, interaction_embeds, item_embeds, mask=mask 231 | ) 232 | out = self.dropout(context) + item_embeds 233 | return self.feed_forward(out) 234 | 235 | def update_dropout(self, dropout, attention_dropout): 236 | self.self_attn.update_dropout(attention_dropout) 237 | self.feed_forward.update_dropout(dropout) 238 | self.dropout.p = dropout 239 | 240 | 241 | class SAKT(nn.Module): 242 | """Self-attentive knowledge tracing. 
243 | 244 | Arguments: 245 | num_items (int): Number of items 246 | hid_size (int): Attention dot-product dimension 247 | heads (int): Number of parallel attention heads 248 | encode_pos (bool): If True, add positional encoding 249 | dropout (float): Dropout probability 250 | """ 251 | 252 | def __init__( 253 | self, num_items, hid_size=512, heads=8, dropout=0.2, position_encoding=True 254 | ): 255 | super(SAKT, self).__init__() 256 | self.num_items = num_items 257 | self.interaction_embedding = nn.Embedding( 258 | 2 * num_items, hid_size 259 | ) # maybe padding is needed 260 | self.item_embedding = nn.Embedding( 261 | num_items, hid_size 262 | ) # maybe padding is needed 263 | self.position_encoding = position_encoding 264 | if self.position_encoding: 265 | self.pe = PositionalEncoding(dropout, hid_size) 266 | self.encoder_layer = TransformerEncoderLayer( 267 | hid_size, heads, hid_size, dropout, dropout 268 | ) 269 | self.layer_norm = nn.LayerNorm(hid_size, eps=1e-6) 270 | self.out = nn.Linear(hid_size, 1) 271 | 272 | def forward(self, interactions, items): 273 | # intercations and items must be batch first 274 | item_embeds = self.item_embedding(items) 275 | interaction_embeds = self.interaction_embedding(interactions) 276 | mask = future_mask(interactions.size(1)) 277 | if interactions.is_cuda: 278 | mask = mask.cuda() 279 | if self.position_encoding: 280 | interaction_embeds = self.pe(interaction_embeds) # idea do a concatenate instead of just adding the position embeds 281 | 282 | out = self.encoder_layer(interaction_embeds, item_embeds, mask) 283 | return self.out(out).squeeze(2) 284 | -------------------------------------------------------------------------------- /SAKT/train_sakt.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import pandas as pd 4 | import sys 5 | 6 | import torch.nn as nn 7 | from torch.optim import Adam 8 | 9 | 10 | sys.path.append(".") 11 | from model_sakt import SAKT 12 | from utils_sakt.logger import Logger 13 | from utils_sakt.metrics import Metrics 14 | from utils_sakt.misc import * 15 | from tqdm import tqdm 16 | 17 | 18 | def train(df, model, optimizer, logger, num_epochs, batch_size): 19 | """Train SAKT model. 
20 | 21 | Arguments: 22 | df (pandas DataFrame): output by prepare_data.py 23 | model (torch Module) 24 | optimizer (torch optimizer) 25 | logger: wrapper for TensorboardX logger 26 | num_epochs (int): number of epochs to train for 27 | batch_size (int) 28 | """ 29 | train_data, val_data = get_data(df) 30 | 31 | criterion = nn.BCEWithLogitsLoss() 32 | brier_score = nn.MSELoss() 33 | m = nn.Sigmoid() 34 | metrics = Metrics() 35 | step = 0 36 | print(args.device) 37 | 38 | for epoch in tqdm(range(num_epochs)): 39 | train_batches = prepare_batches(train_data, batch_size) 40 | val_batches = prepare_batches(val_data, batch_size) 41 | 42 | # Training 43 | for inputs, item_ids, labels in train_batches: 44 | inputs = inputs.to(device=args.device) 45 | item_ids = item_ids.to(device=args.device) 46 | preds = model(inputs, item_ids) 47 | loss = compute_loss( 48 | preds.float(), labels.to(device=args.device).float(), criterion 49 | ) 50 | # loss = compute_loss(preds, item_ids, labels, criterion) 51 | train_auc = compute_auc(preds, labels) 52 | 53 | model.zero_grad() 54 | loss.backward() 55 | optimizer.step() 56 | step += 1 57 | metrics.store({"loss/train": loss.item()}) 58 | metrics.store({"auc/train": train_auc}) 59 | 60 | # Logging 61 | if step % 20 == 0: 62 | logger.log_scalars(metrics.average(), step) 63 | #weights = {"weight/" + name: param for name, param in model.named_parameters()} 64 | #grads = {"grad/" + name: param.grad 65 | # for name, param in model.named_parameters() if param.grad is not None} 66 | #logger.log_histograms(weights, step) 67 | #logger.log_histograms(grads, step) 68 | 69 | # Validation 70 | model.eval() 71 | for inputs, item_ids, labels in val_batches: 72 | inputs = inputs.to(device=args.device) 73 | with torch.no_grad(): 74 | preds = model(inputs, item_ids.to(device=args.device)) 75 | val_loss = compute_loss(preds.float().cpu(), labels.float(), criterion) 76 | val_brier_score = compute_loss( 77 | m(preds).float().cpu(), labels.float(), brier_score 78 | ) 79 | val_auc = compute_auc(preds, labels) 80 | metrics.store({"brier_score/val": val_brier_score.item()}) 81 | metrics.store({"auc/val": val_auc}) 82 | metrics.store({"loss/val": val_loss.item()}) 83 | model.train() 84 | 85 | 86 | 87 | if __name__ == "__main__": 88 | parser = argparse.ArgumentParser(description="Train SAKT.") 89 | parser.add_argument("--dataset", type=str) 90 | parser.add_argument("--logdir", type=str, default="runs/sakt") 91 | parser.add_argument("--embed_inputs", action="store_true") 92 | parser.add_argument("--embed_size", type=int, default=100) 93 | parser.add_argument("--hid_size", type=int, default=100) 94 | parser.add_argument("--num_heads", type=int, default=5) 95 | parser.add_argument("--encode_pos", action="store_true") 96 | parser.add_argument("--drop_prob", type=float, default=0.2) 97 | parser.add_argument("--batch_size", type=int, default=100) 98 | parser.add_argument("--lr", type=float, default=1e-3) 99 | parser.add_argument("--num_epochs", type=int, default=25) 100 | parser.add_argument("--n_traces", type=int, default=20000) 101 | parser.add_argument("--disable-cuda", action="store_true", help="Disable CUDA") 102 | args = parser.parse_args() 103 | if not args.disable_cuda: 104 | args.device = torch.device("cuda") 105 | else: 106 | args.device = torch.device("cpu") 107 | 108 | df = pd.read_csv( 109 | os.path.join("data", args.dataset, "preprocessed_data.csv"), sep="\t" 110 | )[-args.n_traces :] 111 | 112 | num_items = int(df["item_id"].max() + 1) 113 | model = SAKT( 114 | num_items, 
args.hid_size, args.num_heads, args.encode_pos, args.drop_prob 115 | ) 116 | model = nn.DataParallel(model) 117 | model.to(device=args.device) 118 | print("Let's use", torch.cuda.device_count(), "GPUs!") 119 | 120 | optimizer = Adam(model.parameters(), lr=args.lr) 121 | 122 | param_str = ( 123 | f"{args.dataset}, embed={args.embed_inputs}, dropout={args.drop_prob}, batch_size={args.batch_size} " 124 | f"embed_size={args.embed_size}, hid_size={args.hid_size}, encode_pos={args.encode_pos}, n_traces={args.n_traces}" 125 | ) 126 | logger = Logger(os.path.join(args.logdir, param_str)) 127 | 128 | train(df, model, optimizer, logger, args.num_epochs, args.batch_size) 129 | 130 | logger.close() 131 | -------------------------------------------------------------------------------- /SAKT/utils_sakt/logger.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | from tensorboardX import SummaryWriter 4 | 5 | 6 | class Logger: 7 | """Logging with TensorboardX. 8 | """ 9 | 10 | def __init__(self, logdir, verbose=True): 11 | if not os.path.exists(logdir): 12 | os.makedirs(logdir) 13 | try: 14 | shutil.rmtree(logdir) 15 | except FileNotFoundError: 16 | pass 17 | 18 | self.verbose = verbose 19 | self.writer = SummaryWriter(logdir) 20 | 21 | def log_histograms(self, dic, step): 22 | """Log dictionary of tensors as histograms. 23 | """ 24 | for k, v in dic.items(): 25 | self.writer.add_histogram(k, v, step) 26 | 27 | def log_scalars(self, dic, step): 28 | """Log dictionary of scalar values. 29 | """ 30 | for k, v in dic.items(): 31 | self.writer.add_scalar(k, v, step) 32 | 33 | if self.verbose: 34 | print(f"Step {step}, {dic}") 35 | 36 | def close(self): 37 | self.writer.close() -------------------------------------------------------------------------------- /SAKT/utils_sakt/metrics.py: -------------------------------------------------------------------------------- 1 | class Metrics: 2 | """Keep track of metrics over time in a dictionary. 
3 | """ 4 | def __init__(self): 5 | self.metrics = {} 6 | self.counts = {} 7 | 8 | def store(self, new_metrics): 9 | for key in new_metrics: 10 | if key in self.metrics: 11 | self.metrics[key] += new_metrics[key] 12 | self.counts[key] += 1 13 | else: 14 | self.metrics[key] = new_metrics[key] 15 | self.counts[key] = 1 16 | 17 | def average(self): 18 | average = {k: v / self.counts[k] for k, v in self.metrics.items()} 19 | self.metrics, self.counts = {}, {} 20 | return average -------------------------------------------------------------------------------- /SAKT/utils_sakt/misc.py: -------------------------------------------------------------------------------- 1 | import random 2 | import numpy as np 3 | from random import shuffle 4 | from sklearn.metrics import roc_auc_score, accuracy_score 5 | 6 | import torch 7 | from torch.nn.utils.rnn import pad_sequence 8 | 9 | 10 | def set_random_seeds(seed): 11 | torch.manual_seed(seed) 12 | torch.cuda.manual_seed_all(seed) 13 | random.seed(seed) 14 | 15 | 16 | def get_data(df, train_split=0.8): 17 | num_items = df["item_id"].nunique() 18 | data = [(torch.tensor(u_df["item_id"].values, dtype=torch.long), 19 | torch.tensor(u_df["correct"].values, dtype=torch.long)) 20 | for _, u_df in df.groupby("user_id")] 21 | data = [(item_ids + labels * num_items, item_ids, labels) 22 | for (item_ids, labels) in data] 23 | shuffle(data) 24 | 25 | # Train-test split across users 26 | train_size = int(train_split * len(data)) 27 | train_data, val_data = data[:train_size], data[train_size:] 28 | return train_data, val_data 29 | 30 | 31 | def prepare_batches(data, batch_size): 32 | """Prepare batches grouping padded sequences. 33 | 34 | Arguments: 35 | data (list of tuples of torch Tensor) 36 | batch_size (int): number of sequences per batch 37 | 38 | Output: 39 | batches (list of tuples of torch Tensor) 40 | """ 41 | shuffle(data) 42 | 43 | batches = [] 44 | for k in range(0, len(data), batch_size): 45 | batch = data[k:k + batch_size] 46 | inputs, item_ids, labels = zip(*batch) 47 | 48 | inputs = pad_sequence(inputs, batch_first=True, padding_value=0) # Pad with 0 49 | item_ids = pad_sequence(item_ids, batch_first=True, padding_value=0) # Don't care 50 | labels = pad_sequence(labels, batch_first=True, padding_value=-1) # Pad with -1 51 | 52 | batches.append([inputs, item_ids, labels]) 53 | 54 | return batches 55 | 56 | 57 | def compute_auc(preds, labels): 58 | labels = labels.view(-1) 59 | preds = preds.view(-1)[labels >= 0].detach().cpu().numpy() 60 | labels = labels[labels >= 0].detach().cpu().numpy() 61 | 62 | 63 | if len(np.unique(labels)) == 1: # Only one class 64 | auc = accuracy_score(labels, preds.round()) 65 | else: 66 | auc = roc_auc_score(labels, preds) 67 | return auc 68 | 69 | def compute_loss(preds, labels, criterion): 70 | labels = labels.view(-1) 71 | preds = preds.view(-1)[labels >= 0] 72 | labels = labels[labels >= 0] 73 | return criterion(preds, labels) -------------------------------------------------------------------------------- /das3h/README.md: -------------------------------------------------------------------------------- 1 | ## DAS3H 2 | 3 | This folder contains Python code of [_DAS3H: Modeling Student Learning and Forgetting for 4 | Optimally Scheduling Distributed Practice of Skills_](https://arxiv.org/abs/1905.06873). Authors: [Benoît Choffin](https://github.com/BenoitChoffin), [Fabrice Popineau](https://github.com/fpopineau), Yolaine Bourda, and [Jill-Jênn Vie](https://github.com/jilljenn). 
5 | 
 6 | It is different from the [implementation used in the article](https://github.com/BenoitChoffin/das3h) as it is tailored to a dataset shaped like Lalilo's. 
 7 | It also makes heavier use of the pandas library (e.g. the `rolling` function) to create the features. 
 8 | 
 9 | ### What is DAS3H? 
10 | It is a model of student learning with 5 kinds of parameters to learn (same notation as in the article): 
11 | - α: level of a student 
12 | - δ: difficulty of an exercise 
13 | - β: difficulty of a knowledge component (not used in our implementation, as we haven't tagged exercises with KCs yet) 
14 | - θwins, exercise, time-window (>0): speed with which a student learns (?) a given type of exercise in a given time window 
15 | - θattempts, exercise, time-window (>0): speed with which a student forgets (?) a given type of exercise in a given time window 
16 | 
17 | Let's say we have a dataset looking like this. 
18 | #### Original dataset 
19 | | trace_id | date | student_id | exercise_id | correctness | 
20 | |:-:|:-:|:-:|:-:|:-:| 
21 | | 1 | 1 january | 1 | 1 | 1 | 
22 | | 2 | 1 january | 1 | 1 | 0 | 
23 | | 3 | 1 january | 1 | 1 | 0 | 
24 | | 4 | 1 january | 2 | 1 | 0 | 
25 | | 5 | 1 january | 2 | 1 | 1 | 
26 | | 6 | **3 january** | 2 | 1 | 1 | 
27 | 
28 | In the simplest version of the model (not using Factorization Machines), the parameters to learn are those of a logistic regression on a dataset looking like this: 
29 | 
30 | #### Encoded dataset 
31 | | trace_id | student_1 (α1) | student_2 (α2) | exercise_1 (δ1) | exercise_2 (δ2) | wins_on_exo_1_in_the_past_day (θwins, exo_1, one-day) | attempts_on_exo_1_in_the_past_day (θattempts, exo_1, one-day) | wins_on_exo_1_in_the_past_week (θwins, exo_1, one-week) | attempts_on_exo_1_in_the_past_week (θattempts, exo_1, one-week) | other columns like θ parameters on ex 2 | 
32 | |:-:|:-:|:-----:|:-----:|:------:|:----:|:----:|:-:|:-:|:-:| 
33 | | 1 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | ... | 
34 | | 2 | 1 | 0 | 1 | 0 | 1 | 1 | 1 | 1 | ... | 
35 | | 3 | 1 | 0 | 1 | 0 | 1 | 2 | 1 | 2 | ... | 
36 | | 4 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | ... | 
37 | | 5 | 0 | 1 | 1 | 0 | 0 | 1 | 0 | 1 | ... | 
38 | | 6 | 0 | 1 | 1 | 0 | 0 | 0 | 1 | 2 | ... | 
39 | 
40 | To get this encoded dataset, we one-hot-encode the student_id and exercise_id, and add the number of previous wins and attempts the student had on that exercise within each time window. 
41 | 
42 | Try writing out on a piece of paper what you get by doing that, and compare it to the encoded dataset above. See the *Important* note below for further details, and the sketch right after it for a runnable version of this encoding. 
43 | 
44 | #### Logistic Regression 
45 | 
46 | As stated in the [sklearn documentation](https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression), (L2-penalized) Logistic Regression is the optimization of 
47 | `min over (w, c) of ½ wᵀw + C Σᵢ log(1 + exp(−yᵢ (Xᵢᵀ w + c)))` 
48 | Here, the parameters α, δ, β, and θ are concatenated in *w*. 
49 | 
50 | *Important:* 
51 | As you may have noticed, the features in the *encoded* dataset seem to "lag" one trace behind the original dataset. This is intentional: it prevents data leakage, so that the answer at time T is never used to predict itself. If this is not clear, please tell me. 
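The snippet below is a minimal, self-contained sketch of that encoding, written to match the toy tables above. It is *not* the repository's `encode.py`/`das3h.py`: the timestamps, column names and the `add_counters` helper are invented for the example, and only the ideas it illustrates (left-closed rolling windows, the `log(1 + x)` scaling, one-hot encoding, then a plain scikit-learn `LogisticRegression`) come from the code and the description above.

``` python
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression

# Toy dataset from the "Original dataset" table (timestamps invented so the windows make sense).
df = pd.DataFrame({
    "timestamp": pd.to_datetime([
        "2019-01-01 10:00", "2019-01-01 10:05", "2019-01-01 10:10",
        "2019-01-01 11:00", "2019-01-01 11:05", "2019-01-03 11:00",
    ]),
    "student_id": [1, 1, 1, 2, 2, 2],
    "exercise_id": [1, 1, 1, 1, 1, 1],
    "correctness": [1, 0, 0, 0, 1, 1],
})

def add_counters(df, time_windows=("1d", "7d")):
    """Add lagged, log-scaled win/attempt counters per (student, exercise) and time window."""
    df = df.sort_values("timestamp").copy()
    for win in time_windows:
        attempts = pd.Series(0.0, index=df.index)
        wins = pd.Series(0.0, index=df.index)
        for _, g in df.groupby(["student_id", "exercise_id"]):
            # closed="left" excludes the current trace from its own window: this is the "lag"
            # from the Important note, which keeps the answer at time T out of its own features.
            rolled = g.rolling(win, on="timestamp", closed="left")["correctness"]
            attempts.loc[g.index] = rolled.count().fillna(0).values
            wins.loc[g.index] = rolled.sum().fillna(0).values
        df[f"attempts_in_the_past_{win}"] = np.log(1 + attempts)  # scaling function from the article
        df[f"wins_in_the_past_{win}"] = np.log(1 + wins)
    return df

encoded = pd.get_dummies(add_counters(df), columns=["student_id", "exercise_id"])

X = encoded.drop(columns=["timestamp", "correctness"])
y = encoded["correctness"]
model = LogisticRegression(max_iter=1000).fit(X, y)
print(pd.Series(model.coef_[0], index=X.columns))  # alphas, deltas and thetas, concatenated in w
```

On this toy example the counter columns reproduce the encoded table above (before the `log(1 + x)` scaling); on a real dataset you would keep one counter pair per exercise type and time window, as `encode.py` does.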
52 | 53 | *Note :* 54 | Actually the number of wins and number of attempts are not fed directly to the model, instead they go through a scaling function : 55 | ``` python 56 | lambda x: log(1 + x) 57 | ``` 58 | in the article -------------------------------------------------------------------------------- /das3h/clean.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | 4 | from print_df_infos import print_cleaned_df_with_information 5 | 6 | 7 | def clean_df( 8 | df, 9 | exercise_code_level=False, 10 | drop_level=True, 11 | date_to_timestamp=True, 12 | exercise_code_level_lesson=True, 13 | drop_learning_object=True, 14 | verbose=True 15 | ) -> pd.DataFrame: 16 | df = df.drop(columns=["Unnamed: 0", "id"]) 17 | if exercise_code_level: 18 | df = add_exercise_code_level(df) 19 | if date_to_timestamp: 20 | df = change_date_to_timestamp(df) 21 | df = df.drop(columns=["created_at"]) 22 | if exercise_code_level_lesson: 23 | df = add_exercise_code_level_lesson(df) 24 | df = df.drop(columns=["lesson_id"]) 25 | if drop_level: 26 | df = df.drop(columns=["level"]) 27 | if drop_learning_object: 28 | try: 29 | df = df.drop(columns=["learning_object"]) 30 | except: 31 | pass 32 | df["student_id"] = np.unique(df["student_id"], return_inverse=True)[1] 33 | df = df[df["correctness"].isin((True, False))] 34 | df["correctness"] = df["correctness"].astype(int) 35 | if verbose: 36 | print_cleaned_df_with_information(df) 37 | return df 38 | 39 | 40 | def add_exercise_code_level(df) -> pd.DataFrame: 41 | dataset = df.copy() 42 | dataset["exercise_code_level"] = ( 43 | dataset["exercise_code"].map(str) + "_" + dataset["level"].map(str) 44 | ) 45 | return dataset 46 | 47 | 48 | def add_exercise_code_level_lesson(df) -> pd.DataFrame: 49 | dataset = df.copy() 50 | dataset["exercise_code_level_lesson"] = ( 51 | dataset["exercise_code"].map(str) 52 | + "_" 53 | + dataset["level"].map(str) 54 | + "_lesson_" 55 | + dataset["lesson_id"].map(str) 56 | ) 57 | return dataset 58 | 59 | 60 | def change_date_to_timestamp(df) -> pd.DataFrame: 61 | df["timestamp"] = df["created_at"] 62 | df["timestamp"] = pd.to_datetime(df["timestamp"]) 63 | return df 64 | -------------------------------------------------------------------------------- /das3h/clean_test.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | 4 | from pandas.testing import assert_frame_equal 5 | from clean import clean_df 6 | 7 | 8 | def test_that_clean_df_cleans_properly(): 9 | df = pd.DataFrame( 10 | { 11 | "Unnamed: 0": [1, 2, 3], 12 | "id": [1, 2, 3], 13 | "created_at": [ 14 | "2019-03-01 00:00:01", 15 | "2019-03-01 00:00:02", 16 | "2019-03-01 00:00:03", 17 | ], 18 | "student_id": [0, 0, 0], 19 | "exercise_code": ["grapho", "phono", "discovery"], 20 | "level": [3, 4, 1], 21 | "lesson_id": [101, 101, 101], 22 | "correctness": [True, False, None], 23 | } 24 | ) 25 | 26 | cleaned_df = clean_df(df) 27 | expected_df = pd.DataFrame( 28 | { 29 | "timestamp": ["2019-03-01 00:00:01", "2019-03-01 00:00:02"], 30 | "student_id": [0, 0], 31 | "exercise_code": ["grapho", "phono"], 32 | "exercise_code_level_lesson": ["grapho_3_lesson_101", "phono_4_lesson_101"], 33 | "correctness": [1, 0], 34 | } 35 | ) 36 | expected_df["timestamp"] = pd.to_datetime(expected_df["timestamp"]) 37 | assert_frame_equal(cleaned_df, expected_df, check_like=True) # ignore column order 38 | 39 | 
-------------------------------------------------------------------------------- /das3h/das3h.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from datetime import date 3 | 4 | from sklearn.model_selection import KFold 5 | from sklearn.metrics import roc_auc_score, accuracy_score, log_loss 6 | from sklearn.linear_model import LogisticRegression 7 | from encode import encode_df 8 | from clean import clean_df 9 | from results_analysis import ( 10 | get_coefs_in_dataframe, 11 | get_students_alphas, 12 | get_exercise_code_betas, 13 | get_exercise_gammas_of_one_exercise_code, 14 | get_available_exercise_codes, 15 | ) 16 | 17 | csv_to_use = "shorter_training" 18 | # put the dataset you want in a folder with the right path 19 | dataset = pd.read_csv(f"data/lalilo_datasets/{csv_to_use}.csv") 20 | 21 | # select end of the dataset to test if the model is running properly 22 | last_n_traces = 500 23 | if last_n_traces: 24 | dataset = dataset[-last_n_traces:] 25 | 26 | # clean and encode dataset 27 | cleaned_dataset = clean_df(dataset) 28 | encoded_dataset = encode_df(cleaned_dataset) 29 | 30 | # X and y, X has to be sparse when training a huge dataset 31 | X = encoded_dataset.drop(columns=["correctness", "timestamp"]) 32 | X_sparse_df = X.astype(pd.SparseDtype("float", 0.0)) 33 | X_sparse_array = X_sparse_df.sparse.to_coo() 34 | 35 | y = encoded_dataset["correctness"] 36 | 37 | model = LogisticRegression(solver="lbfgs", max_iter=800) 38 | model.fit(X, y) 39 | 40 | coefficients = get_coefs_in_dataframe(model, X) 41 | 42 | # saving the coefficients, you will have to create a 'results' folder somewhere 43 | save_coeffs = False 44 | if save_coeffs: 45 | today = date.today().strftime("%Y-%m-%d") 46 | coefficients.to_csv(f"das3h/results/coefficients_of_{csv_to_use}_done_{today}.csv") 47 | 48 | print("Printing students alphas") 49 | print(get_students_alphas(coefficients)) 50 | print("") 51 | print("Printing exercise_code betas") 52 | print(get_exercise_code_betas(coefficients)) 53 | 54 | for exercise_code in get_available_exercise_codes(coefficients): 55 | print("") 56 | print(f"Printing coefs of {exercise_code} for all its levels and lessons") 57 | print(get_exercise_gammas_of_one_exercise_code(coefficients, exercise_code)) 58 | 59 | -------------------------------------------------------------------------------- /das3h/encode.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | 4 | from typing import Optional 5 | from tqdm import tqdm 6 | 7 | TIME_WINDOWS_DEFAULT = ("60s", "1h", "1d", "5d", "30d", "365d") 8 | COUNTERS_DEFAULT = ("attempts", "wins") 9 | 10 | 11 | def encode_df( 12 | df: pd.DataFrame, 13 | exercises: Optional[tuple] = None, 14 | counters: tuple = COUNTERS_DEFAULT, 15 | time_windows: tuple = TIME_WINDOWS_DEFAULT, 16 | ) -> pd.DataFrame: 17 | """ 18 | Get the wanted counters of each student and 19 | dummifies the categorical variables of the dataset. 
20 | """ 21 | if exercises is None: 22 | exercises = set(df["exercise_code"].unique()) 23 | # exercises done by only one student must be removed 24 | exercises_to_keep = remove_exercises_done_by_only_one_student(df, exercises) 25 | df = add_counters_for_all_exercises( 26 | df, exercises_to_keep, counters, time_windows 27 | ).fillna(0) 28 | df = pd.get_dummies( 29 | df, columns=["student_id", "exercise_code_level_lesson", "exercise_code"], sparse=True 30 | ) 31 | return df 32 | 33 | 34 | def remove_exercises_done_by_only_one_student(df, exercises): 35 | """ 36 | Groupby function is not working with them for now 37 | """ 38 | nb_student_by_exo = ( 39 | df[["student_id", "exercise_code"]] 40 | .drop_duplicates() 41 | .groupby("exercise_code", as_index=False) 42 | .count() 43 | ) 44 | exercises_to_keep = exercises 45 | exercises_to_remove = nb_student_by_exo[nb_student_by_exo["student_id"] == 1][ 46 | "exercise_code" 47 | ].values 48 | for exercise in exercises_to_remove: 49 | exercises_to_keep.remove(exercise) 50 | return exercises_to_keep 51 | 52 | 53 | def add_counters_for_all_exercises( 54 | df: pd.DataFrame, exercises: tuple, counters: tuple, time_windows: tuple 55 | ) -> pd.DataFrame: 56 | """ 57 | Adds a column for all given exercises, all given counters 58 | of a student and all given time windows. 59 | """ 60 | for exercise_code in tqdm( 61 | exercises, desc=f"Adding the counters for {len(exercises)} exercises" 62 | ): 63 | df = add_exercise_code_counters_for_each_time_window( 64 | df, exercise_code, counters, time_windows 65 | ) 66 | return df 67 | 68 | 69 | def add_exercise_code_counters_for_each_time_window( 70 | df: pd.DataFrame, exercise_code: str, counters: tuple, time_windows: tuple 71 | ) -> pd.DataFrame: 72 | """ 73 | For a given exercise_code, adds a column for all given counters 74 | of a student and for all given time windows. 75 | """ 76 | for time_window in tqdm( 77 | time_windows, 78 | desc=f"Adding the counters for the time windows of {exercise_code}", 79 | ): 80 | df = add_exercise_code_counters_in_one_time_window( 81 | df, exercise_code, counters, time_window 82 | ) 83 | return df 84 | 85 | 86 | def add_exercise_code_counters_in_one_time_window( 87 | df: pd.DataFrame, exercise_code: str, counters: tuple, time_window: str 88 | ) -> pd.DataFrame: 89 | """ 90 | For a given exercise_code, adds a column for all given counters 91 | of a student and for one given time window. 
92 | """ 93 | for counter in counters: 94 | df = add_one_exercise_code_counter_in_one_time_window( 95 | df, exercise_code, counter, time_window 96 | ) 97 | return df 98 | 99 | 100 | def add_one_exercise_code_counter_in_one_time_window( 101 | df: pd.DataFrame, exercise_code: str, counter: str, time_window: str 102 | ) -> pd.DataFrame: 103 | df_copy = df.copy() 104 | filtered_df = df[df["exercise_code"] == exercise_code] 105 | filtered_df_and_timestamp_index = filtered_df.set_index("timestamp") 106 | counter_in_the_time_window = filtered_df_and_timestamp_index.groupby( 107 | by=["student_id"], as_index=False 108 | ).rolling(time_window, closed="left")["correctness"] 109 | assert counter in ("wins", "attempts") 110 | if counter == "attempts": 111 | counter_in_the_time_window = counter_in_the_time_window.count() 112 | elif counter == "wins": 113 | counter_in_the_time_window = counter_in_the_time_window.sum() 114 | exercise_code_counter = ( 115 | counter_in_the_time_window.reset_index() 116 | .fillna(0) 117 | .sort_values(by=["timestamp", "correctness"]) 118 | ) 119 | exercise_code_counter["index"] = filtered_df.index 120 | exercise_code_counter = exercise_code_counter.set_index("index") 121 | df_copy[f"{exercise_code}_{counter}_in_the_past_{time_window}"] = scaling_function( 122 | exercise_code_counter["correctness"] 123 | ) 124 | return df_copy 125 | 126 | 127 | def scaling_function(x, how="log"): 128 | if how == "log": 129 | return np.log(1 + x) 130 | else: 131 | return np.nan 132 | -------------------------------------------------------------------------------- /das3h/encode_test.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | from pandas.testing import assert_frame_equal 4 | from encode import ( 5 | scaling_function, 6 | add_one_exercise_code_counter_in_one_time_window, 7 | encode_df, 8 | ) 9 | 10 | # What I noticed : 11 | # - tests won't pass if there is only one student, I don't know if is good. 
I think it is caused by the groupby function 12 | # - timestamp must be different from student to student or else the sorting by counter value and timestamp yields a false result 13 | 14 | 15 | def test_add_one_exercise_code_wins_in_one_time_window(): 16 | exercise_code = "phono" 17 | time_window = "1d" 18 | counter = "wins" 19 | df = pd.DataFrame( 20 | { 21 | "timestamp": [ 22 | "2019-03-01 00:00:01", 23 | "2019-03-01 00:00:02", 24 | "2019-03-01 00:00:03", 25 | "2019-03-01 00:00:04", 26 | "2019-03-01 00:00:05", 27 | ], 28 | "student_id": [1, 1, 2, 2, 1], 29 | "exercise_code": ["phono", "phono", "phono", "grapho", "phono"], 30 | "correctness": [1, 0, 1, 1, 1], 31 | } 32 | ) 33 | df["timestamp"] = pd.to_datetime(df["timestamp"]) 34 | df_with_one_more_column = add_one_exercise_code_counter_in_one_time_window( 35 | df, exercise_code, counter, time_window 36 | ) 37 | expected_df = pd.DataFrame( 38 | { 39 | "timestamp": [ 40 | "2019-03-01 00:00:01", 41 | "2019-03-01 00:00:02", 42 | "2019-03-01 00:00:03", 43 | "2019-03-01 00:00:04", 44 | "2019-03-01 00:00:05", 45 | ], 46 | "student_id": [1, 1, 2, 2, 1], 47 | "exercise_code": ["phono", "phono", "phono", "grapho", "phono"], 48 | "correctness": [1, 0, 1, 1, 1], 49 | "phono_wins_in_the_past_1d": [0, 1, 0, None, 1], 50 | } 51 | ) 52 | expected_df["phono_wins_in_the_past_1d"] = expected_df[ 53 | "phono_wins_in_the_past_1d" 54 | ].apply(scaling_function) 55 | expected_df["timestamp"] = pd.to_datetime(expected_df["timestamp"]) 56 | assert_frame_equal(expected_df, df_with_one_more_column, check_like=True) 57 | 58 | 59 | def test_add_one_exercise_code_attempts_in_one_time_window(): 60 | exercise_code = "phono" 61 | time_window = "1d" 62 | counter = "attempts" 63 | df = pd.DataFrame( 64 | { 65 | "timestamp": [ 66 | "2019-03-01 00:00:01", 67 | "2019-03-01 00:00:02", 68 | "2019-03-01 00:00:03", 69 | "2019-03-01 00:00:04", 70 | "2019-03-01 00:00:05", 71 | ], 72 | "student_id": [1, 1, 2, 2, 1], 73 | "exercise_code": ["phono", "phono", "phono", "grapho", "phono"], 74 | "correctness": [1, 0, 1, 1, 1], 75 | } 76 | ) 77 | df["timestamp"] = pd.to_datetime(df["timestamp"]) 78 | df_with_one_more_column = add_one_exercise_code_counter_in_one_time_window( 79 | df, exercise_code, counter, time_window 80 | ) 81 | expected_df = pd.DataFrame( 82 | { 83 | "timestamp": [ 84 | "2019-03-01 00:00:01", 85 | "2019-03-01 00:00:02", 86 | "2019-03-01 00:00:03", 87 | "2019-03-01 00:00:04", 88 | "2019-03-01 00:00:05", 89 | ], 90 | "student_id": [1, 1, 2, 2, 1], 91 | "exercise_code": ["phono", "phono", "phono", "grapho", "phono"], 92 | "correctness": [1, 0, 1, 1, 1], 93 | "phono_attempts_in_the_past_1d": [0, 1, 0, None, 2], 94 | } 95 | ) 96 | expected_df["phono_attempts_in_the_past_1d"] = expected_df[ 97 | "phono_attempts_in_the_past_1d" 98 | ].apply(scaling_function) 99 | expected_df["timestamp"] = pd.to_datetime(expected_df["timestamp"]) 100 | assert_frame_equal(expected_df, df_with_one_more_column, check_like=True) 101 | 102 | 103 | def test_add_one_exercise_code_wins_only_use_traces_in_the_given_time_window(): 104 | exercise_code = "phono" 105 | time_window = "1d" 106 | counter = "wins" 107 | df = pd.DataFrame( 108 | { 109 | "timestamp": [ 110 | "2019-03-01 00:00:01", 111 | "2019-03-01 00:00:02", 112 | "2019-03-01 00:00:03", 113 | "2019-03-01 00:00:04", 114 | "2019-03-03 00:00:05", 115 | ], 116 | "student_id": [1, 1, 2, 2, 1], 117 | "exercise_code": ["phono", "phono", "phono", "grapho", "phono"], 118 | "correctness": [1, 0, 1, 1, 1], 119 | } 120 | ) 121 | df["timestamp"] = 
pd.to_datetime(df["timestamp"]) 122 | df_with_one_more_column = add_one_exercise_code_counter_in_one_time_window( 123 | df, exercise_code, counter, time_window 124 | ) 125 | expected_df = pd.DataFrame( 126 | { 127 | "timestamp": [ 128 | "2019-03-01 00:00:01", 129 | "2019-03-01 00:00:02", 130 | "2019-03-01 00:00:03", 131 | "2019-03-01 00:00:04", 132 | "2019-03-03 00:00:05", 133 | ], 134 | "student_id": [1, 1, 2, 2, 1], 135 | "exercise_code": ["phono", "phono", "phono", "grapho", "phono"], 136 | "correctness": [1, 0, 1, 1, 1], 137 | "phono_wins_in_the_past_1d": [0, 1, 0, None, 0], 138 | } 139 | ) 140 | expected_df["phono_wins_in_the_past_1d"] = expected_df[ 141 | "phono_wins_in_the_past_1d" 142 | ].apply(scaling_function) 143 | expected_df["timestamp"] = pd.to_datetime(expected_df["timestamp"]) 144 | assert_frame_equal(expected_df, df_with_one_more_column, check_like=True) 145 | 146 | 147 | def test_encode_df(): 148 | counters = ("wins", "attempts") 149 | time_windows = ("1h", "7d") 150 | df = pd.DataFrame( 151 | { 152 | "timestamp": [ 153 | "2019-03-01 00:00:01", 154 | "2019-03-01 00:00:02", 155 | "2019-03-01 00:00:03", 156 | "2019-03-01 00:00:04", 157 | "2019-03-03 00:00:05", 158 | ], 159 | "student_id": [1, 1, 2, 2, 1], 160 | "exercise_code": ["phono", "phono", "phono", "grapho", "phono"], 161 | "exercise_code_level_lesson": [ 162 | "phono_3_lesson_1", 163 | "phono_3_lesson_2", 164 | "phono_3_lesson_1", 165 | "grapho_3_lesson_1", 166 | "phono_3_lesson_1", 167 | ], 168 | "correctness": [1, 0, 1, 1, 1], 169 | } 170 | ) 171 | df["timestamp"] = pd.to_datetime(df["timestamp"]) 172 | encoded_df = encode_df(df, counters=counters, time_windows=time_windows) 173 | expected_df = pd.DataFrame( 174 | { 175 | "timestamp": [ 176 | "2019-03-01 00:00:01", 177 | "2019-03-01 00:00:02", 178 | "2019-03-01 00:00:03", 179 | "2019-03-01 00:00:04", 180 | "2019-03-03 00:00:05", 181 | ], 182 | "student_id_1": [1, 1, 0, 0, 1], 183 | "student_id_2": [0, 0, 1, 1, 0], 184 | "exercise_code_phono": [1, 1, 1, 0, 1], 185 | "exercise_code_grapho": [0, 0, 0, 1, 0], 186 | "exercise_code_level_lesson_phono_3_lesson_1": [1, 0, 1, 0, 1], 187 | "exercise_code_level_lesson_phono_3_lesson_2": [0, 1, 0, 0, 0], 188 | "exercise_code_level_lesson_grapho_3_lesson_1": [0, 0, 0, 1, 0], 189 | "correctness": [1, 0, 1, 1, 1], 190 | "phono_wins_in_the_past_1h": [0, 1, 0, 0, 0], 191 | "phono_wins_in_the_past_7d": [0, 1, 0, 0, 1], 192 | "phono_attempts_in_the_past_1h": [0, 1, 0, 0, 0], 193 | "phono_attempts_in_the_past_7d": [0, 1, 0, 0, 2], 194 | } 195 | ) 196 | expected_df["timestamp"] = pd.to_datetime(expected_df["timestamp"]) 197 | expected_df["phono_wins_in_the_past_1h"] = expected_df[ 198 | "phono_wins_in_the_past_1h" 199 | ].apply(scaling_function) 200 | expected_df["phono_wins_in_the_past_7d"] = expected_df[ 201 | "phono_wins_in_the_past_7d" 202 | ].apply(scaling_function) 203 | expected_df["phono_attempts_in_the_past_1h"] = expected_df[ 204 | "phono_attempts_in_the_past_1h" 205 | ].apply(scaling_function) 206 | expected_df["phono_attempts_in_the_past_7d"] = expected_df[ 207 | "phono_attempts_in_the_past_7d" 208 | ].apply(scaling_function) 209 | assert_frame_equal(encoded_df, expected_df, check_like=True, check_dtype=False) 210 | 211 | -------------------------------------------------------------------------------- /das3h/print_df_infos.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | 4 | def print_cleaned_df_with_information(cleaned_df: pd.DataFrame): 5 | nb_students = 
len(cleaned_df["student_id"].unique()) 6 | print(f"There are {nb_students} students in this dataset.") 7 | print() 8 | trace_repartition = ( 9 | cleaned_df.groupby("exercise_code") 10 | .count() 11 | .rename(columns={"correctness": "traces_count"})["traces_count"] 12 | .sort_values(ascending=False) 13 | ) 14 | print("This is the repartition of traces grouped by exercise_code :") 15 | print(trace_repartition.plot(kind="bar")) 16 | 17 | 18 | -------------------------------------------------------------------------------- /das3h/results_analysis.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | 4 | 5 | def get_coefs_in_dataframe(model, X: pd.DataFrame): 6 | return ( 7 | pd.DataFrame(data=model.coef_[0], index=X.columns) 8 | .reset_index() 9 | .rename(columns={"index": "columns", 0: "coefs"}) 10 | ) 11 | 12 | 13 | def get_students_alphas(coefs: pd.DataFrame): 14 | return coefs[coefs["columns"].str.contains("student_id")].sort_values("coefs") 15 | 16 | 17 | def get_exercise_code_betas(coefs: pd.DataFrame): 18 | exercise_codes = [ 19 | f"exercise_code_{exercise_code}" 20 | for exercise_code in get_available_exercise_codes(coefs) 21 | ] 22 | return coefs[coefs["columns"].isin(exercise_codes)].sort_values("coefs") 23 | 24 | 25 | def get_gamma_of_exercise( 26 | coefs: pd.DataFrame, exercise_code: str, level: int, lesson: int 27 | ): 28 | return coefs[ 29 | coefs["columns"].str.contains( 30 | f"{exercise_code}_{str(level)}_lesson_{str(lesson)}" 31 | ) 32 | ].sort_values("coefs") 33 | 34 | 35 | def get_available_exercise_codes(coefs: pd.DataFrame): 36 | return np.array( 37 | list( 38 | map( 39 | lambda x: x[len("exercise_code_") :], 40 | coefs[coefs["columns"].str.startswith("exercise_code")][ 41 | ~coefs["columns"].str.startswith("exercise_code_level_lesson") 42 | ]["columns"].values, 43 | ) 44 | ) 45 | ) 46 | 47 | 48 | def get_available_exercise_code_level_lesson_tuples(coefs: pd.DataFrame): 49 | return np.array( 50 | list( 51 | map( 52 | lambda x: x[len("exercise_code_level_lesson_") :], 53 | coefs[coefs["columns"].str.startswith("exercise_code_level_lesson")][ 54 | "columns" 55 | ].values, 56 | ) 57 | ) 58 | ) 59 | 60 | 61 | def get_exercise_gammas_of_one_exercise_code(coefs: pd.DataFrame, exercise_code: str): 62 | available_exercise_codes = get_available_exercise_codes(coefs) 63 | if exercise_code not in available_exercise_codes: 64 | print('Error : exercise_code not in the dataset') 65 | return 66 | exercise_code_level_lesson_tuples = get_available_exercise_code_level_lesson_tuples( 67 | coefs 68 | ) 69 | filtered_tuples = [ 70 | f"exercise_code_level_lesson_{tuple}" 71 | for tuple in exercise_code_level_lesson_tuples 72 | if exercise_code in tuple 73 | ] 74 | return coefs[coefs["columns"].isin(filtered_tuples)].sort_values( 75 | by="coefs", ascending=False 76 | ) 77 | 78 | 79 | def get_thetas_of_one_exercise_code(coefs: pd.DataFrame, exercise_code: str): 80 | return coefs[coefs["columns"].str.startswith(exercise_code)] 81 | -------------------------------------------------------------------------------- /prepare_data.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from scipy import sparse 4 | import argparse 5 | import os 6 | from time import process_time 7 | 8 | 9 | def prepare_assistments(data_name, min_interactions_per_user, remove_nan_skills): 10 | """Preprocess ASSISTments dataset. 
11 | 12 | Arguments: 13 | data_name: "assistments09", "assistments12", "assistments15" or "assistments17" 14 | min_interactions_per_user (int): minimum number of interactions per student 15 | remove_nan_skills (bool): if True, remove interactions with no skill tag 16 | Outputs: 17 | df (pandas DataFrame): preprocessed ASSISTments dataset with user_id, item_id, 18 | timestamp and correct features 19 | Q_mat (item-skill relationships sparse array): corresponding q-matrix 20 | """ 21 | data_path = os.path.join("data", data_name) 22 | df = pd.read_csv(os.path.join(data_path, "data.csv"), encoding="ISO-8859-1") 23 | 24 | # Only 2012 and 2017 versions have timestamps 25 | if data_name == "assistments09": 26 | df = df.rename(columns={"problem_id": "item_id"}) 27 | df["timestamp"] = np.zeros(len(df), dtype=np.int64) 28 | elif data_name == "assistments12": 29 | df = df.rename(columns={"problem_id": "item_id"}) 30 | df = add_timestamp(df, "start_time") 31 | elif data_name == "assistments15": 32 | df = df.rename(columns={"sequence_id": "item_id"}) 33 | df["skill_id"] = df["item_id"] 34 | df["timestamp"] = np.zeros(len(df), dtype=np.int64) 35 | elif data_name == "assistments17": 36 | df = df.rename( 37 | columns={ 38 | "startTime": "timestamp", 39 | "studentId": "user_id", 40 | "problemId": "item_id", 41 | "skill": "skill_id", 42 | } 43 | ) 44 | df = add_timestamp(df, "timestamp") 45 | 46 | # Sort data temporally 47 | if data_name in ["assistments12", "assistments17"]: 48 | df.sort_values(by="timestamp", inplace=True) 49 | elif data_name == "assistments09": 50 | df.sort_values(by="order_id", inplace=True) 51 | elif data_name == "assistments15": 52 | df.sort_values(by="log_id", inplace=True) 53 | 54 | df = general_cleaning(df, min_interactions_per_user) 55 | df = remove_nan_skill(remove_nan_skills, df) 56 | 57 | df["user_id"] = np.unique(df["user_id"], return_inverse=True)[1] 58 | df["item_id"] = np.unique(df["item_id"], return_inverse=True)[1] 59 | df["skill_id"] = np.unique(df["skill_id"], return_inverse=True)[1] 60 | 61 | # Build Q-matrix 62 | Q_mat = np.zeros((len(df["item_id"].unique()), len(df["skill_id"].unique()))) 63 | for item_id, skill_id in df[["item_id", "skill_id"]].values: 64 | Q_mat[item_id, skill_id] = 1 65 | 66 | # Remove row duplicates due to multiple skills for one item 67 | if data_name == "assistments09": 68 | df = df.drop_duplicates("order_id") 69 | elif data_name == "assistments17": 70 | df = df.drop_duplicates(["user_id", "timestamp"]) 71 | 72 | # Get unique skill id from combination of all skill ids 73 | df["skill_id"] = np.unique(Q_mat, axis=0, return_inverse=True)[1][df["item_id"]] 74 | 75 | df = df[["user_id", "item_id", "timestamp", "correct", "skill_id"]] 76 | df.reset_index(inplace=True, drop=True) 77 | 78 | # Save data 79 | save_data(df, data_path, Q_mat) 80 | 81 | 82 | def remove_nan_skill(remove_nan_skills, df): 83 | # Filter nan skills 84 | if remove_nan_skills: 85 | df = df[~df["skill_id"].isnull()] 86 | else: 87 | df.ix[df["skill_id"].isnull(), "skill_id"] = -1 88 | return df 89 | 90 | 91 | def prepare_kddcup10( 92 | data_name, min_interactions_per_user, kc_col_name, remove_nan_skills 93 | ): 94 | """Preprocess KDD Cup 2010 dataset. 
95 | Arguments: 96 | data_name (str): "bridge_algebra06" or "algebra05" 97 | min_interactions_per_user (int): minimum number of interactions per student 98 | kc_col_name (str): Skills id column 99 | remove_nan_skills (bool): if True, remove interactions with no skill tag 100 | Outputs: 101 | df (pandas DataFrame): preprocessed KDD Cup 2010 dataset with user_id, item_id, 102 | timestamp and correct features 103 | Q_mat (item-skill relationships sparse array): corresponding q-matrix 104 | """ 105 | data_path = os.path.join("data", data_name) 106 | df = pd.read_csv(os.path.join(data_path, "data.txt"), delimiter="\t") 107 | df = df.rename( 108 | columns={ 109 | "Anon Student Id": "user_id", 110 | "Correct First Attempt": "correct", 111 | kc_col_name: "skill_id", 112 | } 113 | ) 114 | 115 | # Create item from problem and step 116 | df["item_id"] = df["Problem Name"] + ":" + df["Step Name"] 117 | 118 | df = add_timestamp(df, "First Transaction Time") 119 | df = general_cleaning(df, min_interactions_per_user) 120 | df = remove_nan_skill(remove_nan_skills, df) 121 | 122 | # Extract KCs 123 | kc_list = [] 124 | for kc_str in df["skill_id"].unique(): 125 | for kc in kc_str.split("~~"): 126 | kc_list.append(kc) 127 | kc_set = set(kc_list) 128 | kc2idx = {kc: i for i, kc in enumerate(kc_set)} 129 | 130 | df["user_id"] = np.unique(df["user_id"], return_inverse=True)[1] 131 | df["item_id"] = np.unique(df["item_id"], return_inverse=True)[1] 132 | 133 | # Build Q-matrix 134 | Q_mat = np.zeros((len(df["item_id"].unique()), len(kc_set))) 135 | for item_id, kc_str in df[["item_id", "skill_id"]].values: 136 | for kc in kc_str.split("~~"): 137 | Q_mat[item_id, kc2idx[kc]] = 1 138 | 139 | # Get unique skill id from combination of all skill ids 140 | df["skill_id"] = np.unique(Q_mat, axis=0, return_inverse=True)[1][df["item_id"]] 141 | 142 | df = df[["user_id", "item_id", "timestamp", "correct", "skill_id"]] 143 | df.reset_index(inplace=True, drop=True) 144 | 145 | # Save data 146 | save_data(df, data_path, Q_mat) 147 | 148 | 149 | def prepare_lalilo(min_interactions_per_user): 150 | """Preprocess Lalilo dataset. 
151 | 152 | Arguments: 153 | min_interactions_per_user (int): minimum number of interactions per student 154 | 155 | Outputs: 156 | df (pandas DataFrame): preprocessed Lalilo dataset with user_id, item_id, 157 | timestamp and correct features 158 | Q_mat (item-skill relationships sparse array): corresponding q-matrix 159 | """ 160 | data_path = os.path.join("data", "lalilo") 161 | df = pd.read_csv( 162 | os.path.join(data_path, "all_traces_from_2018-08-01_to_2019-04-01.csv") 163 | ) 164 | 165 | def add_exercise_code_level_lesson(df): 166 | dataset = df.copy() 167 | dataset["exercise_code_level_lesson"] = ( 168 | dataset["exercise_code"].map(str) 169 | + "_" 170 | + dataset["level"].map(str) 171 | + "_lesson_" 172 | + dataset["lesson_id"].map(str) 173 | ) 174 | return dataset 175 | 176 | df = add_exercise_code_level_lesson(df) 177 | df = df.rename( 178 | columns={ 179 | "student_id": "user_id", 180 | "created_at": "timestamp", 181 | "exercise_code": "skill_id", 182 | "exercise_code_level_lesson": "item_id", 183 | "correctness": "correct", 184 | } 185 | ) 186 | df = add_timestamp(df, "timestamp") 187 | df = general_cleaning(df, min_interactions_per_user) 188 | 189 | # Maybe we want to store the correspondence with the original dataset somewhere 190 | df["user_id"] = np.unique(df["user_id"], return_inverse=True)[1] 191 | df["item_id"] = np.unique(df["item_id"], return_inverse=True)[1] 192 | df["skill_id"] = np.unique(df["skill_id"], return_inverse=True)[1] 193 | 194 | # Build Q-matrix 195 | Q_mat = np.zeros((len(df["item_id"].unique()), len(df["skill_id"].unique()))) 196 | for item_id, skill_id in df[["item_id", "skill_id"]].values: 197 | Q_mat[item_id, skill_id] = 1 198 | 199 | df = df[["user_id", "item_id", "skill_id", "timestamp", "correct"]] 200 | df.reset_index(inplace=True, drop=True) 201 | 202 | # Save data 203 | # save_data(df, data_path, Q_mat) 204 | 205 | 206 | def general_cleaning(df, min_interactions_per_user): 207 | t1_start = process_time() 208 | # Remove continuous outcomes 209 | df = df.copy() 210 | df = df[df["correct"].isin([0, 1])] 211 | df["correct"] = df["correct"].astype(np.int32) 212 | # Drop duplicates 213 | df.drop_duplicates(subset=["user_id", "item_id", "timestamp"], inplace=True) 214 | # Filter too short sequences 215 | df = df.groupby("user_id").filter(lambda x: len(x) >= min_interactions_per_user) 216 | t1_stop = process_time() 217 | print("Elapsed time during general_cleaning in seconds:", t1_stop - t1_start) 218 | return df 219 | 220 | 221 | def add_timestamp(df, column_name): 222 | t1_start = process_time() 223 | df = df.copy() 224 | df["timestamp"] = pd.to_datetime(df[column_name]) 225 | # df.dropna(subset=["timestamp"], inplace=True) 226 | df["timestamp"] = df["timestamp"] - df["timestamp"].min() 227 | df["timestamp"] = ( 228 | df["timestamp"].apply(lambda x: x.total_seconds()).astype(np.int64) 229 | ) 230 | df.sort_values(by="timestamp", inplace=True) 231 | t1_stop = process_time() 232 | print("Elapsed time during add_timestamp in seconds:", t1_stop - t1_start) 233 | return df 234 | 235 | 236 | def save_data(df, data_path, Q_mat): 237 | sparse.save_npz(os.path.join(data_path, "q_mat.npz"), sparse.csr_matrix(Q_mat)) 238 | df.to_csv(os.path.join(data_path, "preprocessed_data.csv"), sep="\t", index=False) 239 | 240 | 241 | if __name__ == "__main__": 242 | parser = argparse.ArgumentParser(description="Prepare datasets.") 243 | parser.add_argument("--dataset", type=str, default="assistments12") 244 | parser.add_argument("--min_interactions", type=int, 
default=10) 245 | parser.add_argument("--remove_nan_skills", type=bool, default=True) 246 | args = parser.parse_args() 247 | 248 | if args.dataset in [ 249 | "assistments09", 250 | "assistments12", 251 | "assistments15", 252 | "assistments17", 253 | ]: 254 | prepare_assistments( 255 | data_name=args.dataset, 256 | min_interactions_per_user=args.min_interactions, 257 | remove_nan_skills=args.remove_nan_skills, 258 | ) 259 | elif args.dataset == "bridge_algebra06": 260 | prepare_kddcup10( 261 | data_name="bridge_algebra06", 262 | min_interactions_per_user=args.min_interactions, 263 | kc_col_name="KC(SubSkills)", 264 | remove_nan_skills=args.remove_nan_skills, 265 | ) 266 | elif args.dataset == "algebra05": 267 | prepare_kddcup10( 268 | data_name="algebra05", 269 | min_interactions_per_user=args.min_interactions, 270 | kc_col_name="KC(Default)", 271 | remove_nan_skills=args.remove_nan_skills, 272 | ) 273 | elif args.dataset == "lalilo": 274 | prepare_lalilo(min_interactions_per_user=args.min_interactions) 275 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | pandas 2 | tqdm 3 | sklearn 4 | numpy 5 | matplotlib 6 | torch 7 | tensorboardX 8 | tensorboard -------------------------------------------------------------------------------- /utils/logger.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | from tensorboardX import SummaryWriter 4 | 5 | 6 | class Logger: 7 | """Logging with TensorboardX. 8 | """ 9 | 10 | def __init__(self, logdir, verbose=True): 11 | if not os.path.exists(logdir): 12 | os.makedirs(logdir) 13 | try: 14 | shutil.rmtree(logdir) 15 | except FileNotFoundError: 16 | pass 17 | 18 | self.verbose = verbose 19 | self.writer = SummaryWriter(logdir) 20 | 21 | def log_histogram(self, tag, array, step): 22 | """Log histogram of numpy array of values. 23 | """ 24 | self.writer.add_histogram(tag, array, step) 25 | 26 | def log_scalars(self, dic, step): 27 | """Log dictionary of scalar values. 28 | """ 29 | for k, v in dic.items(): 30 | self.writer.add_scalar(k, v, step) 31 | 32 | if self.verbose: 33 | print(f"Step {step}, {dic}") 34 | 35 | def close(self): 36 | self.writer.close() -------------------------------------------------------------------------------- /utils/metrics.py: -------------------------------------------------------------------------------- 1 | class Metrics: 2 | """Keep track of metrics over time in a dictionary. 
3 | """ 4 | def __init__(self): 5 | self.metrics = {} 6 | self.counts = {} 7 | 8 | def store(self, new_metrics): 9 | for key in new_metrics: 10 | if key in self.metrics: 11 | self.metrics[key] += new_metrics[key] 12 | self.counts[key] += 1 13 | else: 14 | self.metrics[key] = new_metrics[key] 15 | self.counts[key] = 1 16 | 17 | def average(self): 18 | average = {k: v / self.counts[k] for k, v in self.metrics.items()} 19 | self.metrics, self.counts = {}, {} 20 | return average -------------------------------------------------------------------------------- /utils/misc.py: -------------------------------------------------------------------------------- 1 | import random 2 | from random import shuffle 3 | from sklearn.metrics import roc_auc_score, accuracy_score 4 | 5 | import torch 6 | from torch.nn.utils.rnn import pad_sequence 7 | 8 | 9 | def set_random_seeds(seed): 10 | torch.manual_seed(seed) 11 | torch.cuda.manual_seed_all(seed) 12 | random.seed(seed) 13 | 14 | 15 | def get_data(df, train_split=0.8): 16 | num_items = df["item_id"].nunique() 17 | data = [(torch.tensor(u_df["item_id"].values, dtype=torch.long), 18 | torch.tensor(u_df["correct"].values, dtype=torch.long)) 19 | for _, u_df in df.groupby("user_id")] 20 | data = [(torch.cat((torch.zeros(1, dtype=torch.long), item_ids + labels * num_items + 1))[:-1], item_ids, labels) 21 | for (item_ids, labels) in data] 22 | shuffle(data) 23 | 24 | # Train-test split across users 25 | train_size = int(train_split * len(data)) 26 | train_data, val_data = data[:train_size], data[train_size:] 27 | return train_data, val_data 28 | 29 | 30 | def prepare_batches(data, batch_size): 31 | """Prepare batches grouping padded sequences. 32 | 33 | Arguments: 34 | data (list of tuples of torch Tensor) 35 | batch_size (int): number of sequences per batch 36 | 37 | Output: 38 | batches (list of tuples of torch Tensor) 39 | """ 40 | shuffle(data) 41 | batches = [] 42 | 43 | for k in range(0, len(data), batch_size): 44 | batch = data[k:k + batch_size] 45 | inputs, item_ids, labels = zip(*batch) 46 | 47 | inputs = pad_sequence(inputs, batch_first=True, padding_value=0) # Pad with 0 48 | item_ids = pad_sequence(item_ids, batch_first=True, padding_value=0) # Don't care 49 | labels = pad_sequence(labels, batch_first=True, padding_value=-1) # Pad with -1 50 | 51 | batches.append([inputs, item_ids, labels]) 52 | 53 | return batches 54 | 55 | 56 | def compute_auc(preds, item_ids, labels): 57 | labels = labels.view(-1) 58 | item_ids = item_ids.view(-1)[labels >= 0] 59 | preds = preds.view(-1, preds.shape[-1])[labels >= 0] 60 | preds = preds[torch.arange(preds.shape[0]), item_ids] 61 | labels = labels[labels >= 0].float() 62 | 63 | if len(torch.unique(labels)) == 1: # Only one class 64 | auc = accuracy_score(labels, preds.round()) 65 | else: 66 | auc = roc_auc_score(labels, preds) 67 | return auc 68 | 69 | 70 | def compute_loss(preds, item_ids, labels, criterion): 71 | labels = labels.view(-1) 72 | item_ids = item_ids.view(-1)[labels >= 0] 73 | preds = preds.view(-1, preds.shape[-1])[labels >= 0] 74 | preds = preds[torch.arange(preds.shape[0]), item_ids] 75 | labels = labels[labels >= 0].float() 76 | return criterion(preds, labels) --------------------------------------------------------------------------------