├── rnn_prof ├── __init__.py ├── data │ ├── __init__.py │ ├── constants.py │ ├── split_data.py │ ├── splitting_utils.py │ ├── wrapper.py │ ├── assistments.py │ ├── kddcup.py │ └── rnn.py ├── tests │ ├── data │ │ ├── test_assist_data.csv.gz │ │ ├── test_kddcup_data.csv.gz │ │ ├── test_wrapper.py │ │ ├── test_kddcup.py │ │ ├── test_splitting_utils.py │ │ ├── test_assistments.py │ │ └── test_rnn.py │ ├── test_simple_rnn.py │ ├── irt │ │ ├── test_updaters.py │ │ ├── test_callbacks.py │ │ ├── test_irt.py │ │ ├── cpd │ │ │ └── test_ogive.py │ │ ├── test_linear_operators.py │ │ ├── test_metrics.py │ │ ├── test_online_cross_validation.py │ │ └── test_learners.py │ ├── test_run_rnn.py │ └── test_run_irt.py ├── irt │ ├── cpd │ │ ├── __init__.py │ │ └── cpd.py │ ├── constants.py │ ├── __init__.py │ ├── utils.py │ ├── linear_operators.py │ ├── updaters.py │ ├── testing_utils.py │ ├── callbacks.py │ ├── irt.py │ ├── online_cross_validation.py │ └── metrics.py ├── common.py ├── run_rnn.py ├── cliutils.py └── run_irt.py ├── MANIFEST.in ├── requirements.testing.in ├── requirements.in ├── .gitignore ├── tox.ini ├── setup.py ├── requirements.txt ├── README.md └── LICENSE /rnn_prof/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /rnn_prof/data/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include requirements.in 2 | include requirements.testing.in 3 | include README.md 4 | -------------------------------------------------------------------------------- /requirements.testing.in: -------------------------------------------------------------------------------- 1 | mock>=1.0,<1.1 2 | pylint==0.25.1 3 | pytest>=2.5,<2.6 4 | pytest-cov>=1.7,<1.8 5 | pytest-xdist>=1.11,<2 6 | -------------------------------------------------------------------------------- /rnn_prof/tests/data/test_assist_data.csv.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Knewton/edm2016/HEAD/rnn_prof/tests/data/test_assist_data.csv.gz -------------------------------------------------------------------------------- /rnn_prof/tests/data/test_kddcup_data.csv.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Knewton/edm2016/HEAD/rnn_prof/tests/data/test_kddcup_data.csv.gz -------------------------------------------------------------------------------- /requirements.in: -------------------------------------------------------------------------------- 1 | numpy>=1.8,<1.11 2 | scipy>=0.14,<0.16 3 | pandas>=0.16,<0.17 4 | ipython>=2,<3 5 | click>=4,<7 6 | theano 7 | enum34 8 | python-igraph 9 | -------------------------------------------------------------------------------- /rnn_prof/irt/cpd/__init__.py: -------------------------------------------------------------------------------- 1 | from .cpd import CPD 2 | from .gaussian import GaussianCPD 3 | from .ogive import OnePOCPD, TwoPOCPD 4 | 5 | __all__ = ('CPD', 'OnePOCPD', 'TwoPOCPD', 'GaussianCPD') 6 | -------------------------------------------------------------------------------- /rnn_prof/common.py: -------------------------------------------------------------------------------- 1 | """ 2 | A place to 
keep common data structures for various testing schemes. 3 | """ 4 | from collections import namedtuple 5 | 6 | # A structure for keeping track of basic metrics of test performance 7 | Results = namedtuple('Results', ['num_iter', 'accuracy', 'auc']) 8 | -------------------------------------------------------------------------------- /rnn_prof/irt/constants.py: -------------------------------------------------------------------------------- 1 | """ 2 | Miscellaneous constants for IRT 3 | """ 4 | 5 | TRAIN_RESPONSES_KEY = 'train_responses' 6 | TEST_RESPONSES_KEY = 'test_responses' 7 | 8 | THETAS_KEY = 'thetas' 9 | OFFSET_COEFFS_KEY = 'offset_coeffs' 10 | NONOFFSET_COEFFS_KEY = 'nonoffset_coeffs' 11 | 12 | DEFAULT_STEP_SIZE = 5e-1 13 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # compiled code 2 | *.pyc 3 | 4 | # swap files 5 | *.swp 6 | *.swo 7 | *~ 8 | *# 9 | 10 | # apple cruft 11 | .DS_Store 12 | 13 | # py.test artifacts 14 | __pycache__ 15 | 16 | # log files 17 | *.log 18 | 19 | # build files 20 | target 21 | dist/ 22 | *.egg-info/ 23 | build/ 24 | MANIFEST 25 | 26 | .tox/ 27 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | envlist=py27 3 | 4 | [testenv] 5 | deps= 6 | -rrequirements.txt 7 | commands= 8 | {envbindir}/py.test -v -n 4 {posargs} 9 | 10 | [testenv:pip-compile] 11 | ; this is used to recompile the requirements.txt file 12 | deps= 13 | pip-tools==1.5.0 14 | commands= 15 | pip-compile requirements.in requirements.testing.in -v -o requirements.txt 16 | 17 | [testenv:flake8] 18 | basepython = python2.7 19 | deps = flake8 20 | commands = flake8 rnn_prof --max-line-length=100 --ignore=E731,E241 21 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from setuptools import find_packages, setup 3 | 4 | 5 | setup( 6 | name="rnn_prof", 7 | version='0.1-DEV', 8 | url="https://github.com/Knewton/edm2016", 9 | author="knewton", 10 | author_email="help@knewton.com", 11 | license="Apache License 2.0", 12 | packages=find_packages(), 13 | install_requires=open('requirements.in', 'r').readlines(), 14 | tests_require=open('requirements.testing.in', 'r').readlines(), 15 | description="Code for our EDM 2016 submission including DKT and IRT variants", 16 | entry_points=""" 17 | [console_scripts] 18 | rnn_prof=rnn_prof.cli:main 19 | """, 20 | long_description="\n" + open('README.md', 'r').read() 21 | ) 22 | -------------------------------------------------------------------------------- /rnn_prof/irt/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | The BayesNet module contains an implementation of IRT and related models as a Probabilistic 3 | Graphical Model. Observed and latent variables are represented by nodes in the graph. Each node 4 | contains the data (state of the variable), the conditional probability distribution of the data 5 | given the parameters, links to nodes holding latent variables, and auxiliary parameters related to 6 | the optimization of its variables. 
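For example (a minimal sketch that only mirrors the constructors exercised in ``rnn_prof/tests/irt/test_irt.py``; see ``node.py`` and ``cpd/`` for the full argument lists), a latent 2-D Gaussian node and an observed Gaussian child whose mean it supplies can be declared as::

    import numpy as np
    from rnn_prof.irt.cpd import GaussianCPD
    from rnn_prof.irt.node import Node

    # latent node: its data vector is the current estimate of the variable
    node_x = Node(name='x', data=np.zeros(2), cpd=GaussianCPD(precision=np.eye(2)))
    # observed node: node_x plays the role of its Gaussian mean parameter
    node_y = Node(name='y', data=np.random.randn(2), cpd=GaussianCPD(precision=np.eye(2)),
                  param_nodes={GaussianCPD.MEAN_KEY: node_x})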
A learner object provides a thin wrapper for the probabilistic 7 | graph and learning is performed by coordinate ascent on each node's probability function. 8 | """ 9 | 10 | from . import callbacks 11 | from . import cpd 12 | from . import irt 13 | from . import learners 14 | from . import node 15 | from .constants import (TRAIN_RESPONSES_KEY, TEST_RESPONSES_KEY, THETAS_KEY, OFFSET_COEFFS_KEY, 16 | NONOFFSET_COEFFS_KEY) 17 | 18 | __all__ = ('callbacks', 'cpd', 'irt', 'learners', 'node', 'TRAIN_RESPONSES_KEY', 19 | 'TEST_RESPONSES_KEY', 'THETAS_KEY', 'OFFSET_COEFFS_KEY', 'NONOFFSET_COEFFS_KEY') 20 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # 2 | # This file is autogenerated by pip-compile 3 | # Make changes in requirements.testing.in, then run this to update: 4 | # 5 | # pip-compile requirements.in requirements.testing.in -o requirements.txt 6 | # 7 | 8 | apipkg==1.4 # via execnet 9 | click==6.2 10 | cov-core==1.15.0 # via pytest-cov 11 | coverage==4.0.3 # via cov-core 12 | enum34==1.1.2 13 | execnet==1.4.1 # via pytest-xdist 14 | gnureadline==6.3.3 # via ipython 15 | ipython==2.3.1 16 | logilab-astng==0.24.3 # via pylint 17 | logilab-common==1.1.0 # via logilab-astng, pylint 18 | mock==1.0.1 19 | numpy==1.10.4 20 | pandas==0.16.2 21 | py==1.4.31 # via pytest, pytest-xdist 22 | pylint==0.25.1 23 | pytest-cov==1.7.0 24 | pytest-xdist==1.14 25 | pytest==2.5.2 26 | python-dateutil==2.4.1 # via pandas 27 | python-igraph==0.7.1.post6 28 | pytz==2014.10 # via pandas 29 | scipy==0.15.1 30 | six==1.10.0 # via logilab-common, python-dateutil, theano 31 | theano==0.8.1 32 | 33 | # The following packages are commented out because they are 34 | # considered to be unsafe in a requirements file: 35 | # setuptools # via logilab-common 36 | -------------------------------------------------------------------------------- /rnn_prof/data/constants.py: -------------------------------------------------------------------------------- 1 | """ 2 | Key names used in data frames passed to RNN 3 | """ 4 | 5 | # Represents an "item" for the RNN. Should be a continuous sequence of 6 | # numbers in range(0, #items) 7 | ITEM_IDX_KEY = 'item_idx' 8 | 9 | # Represents a "template" for the RNN. Should be a continuous sequence of 10 | # numbers in range(0, #templates) 11 | TEMPLATE_IDX_KEY = 'template_idx' 12 | 13 | # Represents a "concept" for the RNN. Should be a continuous sequence of 14 | # numbers in range(0, #concepts) 15 | CONCEPT_IDX_KEY = 'concept_idx' 16 | 17 | # Represents a "user" for the RNN. Should be a continuous sequence of 18 | # numbers in range(0, #users) 19 | USER_IDX_KEY = 'user_idx' 20 | 21 | # Represents a temporal ordering of items for the RNN. Can be any numeric 22 | # type as it's just used for sorting. 23 | TIME_IDX_KEY = 'time_idx' 24 | 25 | # Whether the student got the item correct or not. 
Should be 0/1 or False/True 26 | CORRECT_KEY = 'correct' 27 | 28 | # If a data set supports hinting, the number of hints requested before answering 29 | HINT_COUNT_KEY = 'hint_count' 30 | 31 | # used to represent single (constant) value for concepts and templates across datasets 32 | SINGLE = 'single' 33 | 34 | # datasets 35 | ASSISTMENTS = 'assistments' 36 | KDDCUP = 'kddcup' 37 | -------------------------------------------------------------------------------- /rnn_prof/data/split_data.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys 4 | 5 | import numpy as np 6 | import pandas as pd 7 | 8 | def main(input_file, student_id, delimiter, seed=293, kept_percentage=0.8): 9 | """Split data rows based on student id and write splits to file. 10 | 11 | :param str input_file: input file name 12 | :param str student_id: column identifying students 13 | :param str delimiter: row entries delimiter 14 | :param int seed: seed for the random split 15 | :param float kept_percentage: percent of students to retain in first split 16 | """ 17 | 18 | # parse delimiter special characters 19 | delimiter = delimiter.decode('string_escape') 20 | df = pd.read_csv(input_file, delimiter=delimiter, index_col=False) 21 | user_ids = df[student_id].unique() 22 | np.random.seed(seed) 23 | np.random.shuffle(user_ids) 24 | kept_user_ids = user_ids[:int(len(user_ids) * kept_percentage)] 25 | kept_df = df[df[student_id].isin(kept_user_ids)] 26 | not_kept_df = df[~df[student_id].isin(kept_user_ids)] 27 | 28 | kept_df.to_csv('.'.join(input_file.split('.')[:-1]) + '_big.txt', index=False, sep=delimiter) 29 | not_kept_df.to_csv('.'.join(input_file.split('.')[:-1]) + '_small.txt', index=False, sep=delimiter) 30 | 31 | 32 | if __name__ == '__main__': 33 | if len(sys.argv) != 4: 34 | print "split_data.py filename id_column_name delimiter" 35 | sys.exit(1) 36 | main(sys.argv[1], sys.argv[2], sys.argv[3]) 37 | -------------------------------------------------------------------------------- /rnn_prof/data/splitting_utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | Utilities for general data pre-processing. 3 | """ 4 | import logging 5 | import numpy as np 6 | 7 | from .constants import USER_IDX_KEY 8 | 9 | _logger = logging.getLogger(__name__) 10 | 11 | 12 | def _get_fold_student_idx(student_ids, num_folds, seed=0): 13 | """ Split up unique student IDs into different folds. 14 | 15 | :param np.ndarray student_ids: set of unique student ids or indices 16 | :param int num_folds: number of folds to split the data into 17 | :param int seed: seed for the splitting 18 | :return: student ids per fold 19 | :rtype: list 20 | """ 21 | num_students = len(student_ids) 22 | fold_idx = np.arange(num_students) 23 | # randomize the order of all students to be split across folds 24 | np.random.seed(seed) 25 | np.random.shuffle(fold_idx) 26 | fold_size = num_students // num_folds 27 | 28 | return [student_ids[fold_idx[i * fold_size:min(num_students, (i + 1) * fold_size)]] 29 | for i in range(num_folds)] 30 | 31 | 32 | def split_data(data, num_folds, seed=0): 33 | """ Split all interactions into K-fold sets of training and test dataframes. Splitting is done 34 | by assigning student ids to the training or test sets.
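    For example (a sketch; ``interactions`` stands in for any dataframe that contains a
    ``USER_IDX_KEY`` column)::

        folds = list(split_data(interactions, num_folds=5, seed=0))
        train_df, test_df = folds[0]  # student ids in train_df and test_df are disjoint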
35 | 36 | :param pd.DataFrame data: all interactions 37 | :param int num_folds: number of folds 38 | :param int seed: seed for the splitting 39 | :return: a generator over (train dataframe, test dataframe) tuples 40 | :rtype: generator[(pd.DataFrame, pd.DataFrame)] 41 | """ 42 | # break up students into folds 43 | fold_student_idx = _get_fold_student_idx(np.unique(data[USER_IDX_KEY]), num_folds=num_folds, 44 | seed=seed) 45 | 46 | for fold_test_student_idx in fold_student_idx: 47 | test_idx = np.in1d(data[USER_IDX_KEY], fold_test_student_idx) 48 | train_idx = np.logical_not(test_idx) 49 | yield (data[train_idx].copy(), data[test_idx].copy()) 50 | -------------------------------------------------------------------------------- /rnn_prof/tests/data/test_wrapper.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | 3 | import os 4 | import unittest 5 | 6 | from rnn_prof.data import assistments 7 | from rnn_prof.data import wrapper as undertest 8 | from rnn_prof.data.constants import USER_IDX_KEY 9 | 10 | 11 | TESTDATA_FILENAME = os.path.join(os.path.dirname(__file__), 'test_assist_data.csv.gz') 12 | 13 | 14 | class TestWrapper(unittest.TestCase): 15 | 16 | def test_proportion_students_retained(self): 17 | data_opts = undertest.DEFAULT_DATA_OPTS 18 | 19 | raw_output = assistments.load_data( 20 | TESTDATA_FILENAME, 21 | template_id_col=data_opts.template_id_col, 22 | concept_id_col=data_opts.concept_id_col, 23 | remove_nan_skill_ids=data_opts.remove_skill_nans, 24 | max_interactions_per_user=data_opts.max_interactions_per_user, 25 | min_interactions_per_user=data_opts.min_interactions_per_user, 26 | drop_duplicates=data_opts.drop_duplicates) 27 | 28 | output = undertest.load_data(TESTDATA_FILENAME, 29 | 'assistments', 30 | data_opts=data_opts) 31 | 32 | self.assertEqual(len(raw_output[0]), len(output[0])) 33 | 34 | test_proportion_students_retained = 2 / 3 35 | data_opts = undertest.DataOpts( 36 | num_folds=2, item_id_col=None, template_id_col=None, 37 | concept_id_col=None, 38 | remove_skill_nans=False, seed=0, use_correct=True, use_hints=False, 39 | drop_duplicates=False, 40 | max_interactions_per_user=None, min_interactions_per_user=2, 41 | proportion_students_retained=test_proportion_students_retained) 42 | 43 | output = undertest.load_data(TESTDATA_FILENAME, 44 | 'assistments', 45 | data_opts=data_opts) 46 | 47 | total_users = raw_output[0][USER_IDX_KEY].nunique() 48 | retained_users = output[0][USER_IDX_KEY].nunique() 49 | self.assertAlmostEquals(retained_users / total_users, 50 | test_proportion_students_retained, 51 | 1e-5) 52 | -------------------------------------------------------------------------------- /rnn_prof/tests/data/test_kddcup.py: -------------------------------------------------------------------------------- 1 | import itertools as its 2 | import os 3 | import unittest 4 | 5 | from rnn_prof.data import kddcup as undertest 6 | 7 | 8 | TESTDATA_FILENAME = os.path.join(os.path.dirname(__file__), 'test_kddcup_data.csv.gz') 9 | 10 | 11 | class TestLoadKddData(unittest.TestCase): 12 | 13 | def test_load_data(self): 14 | max_inter_values = (None, int(1e6), 10, 2) 15 | drop_duplicates_values = (False, True) 16 | min_inter_values = (2, 3) 17 | 18 | for (max_inter, drop_duplicates, min_inter) in \ 19 | its.product(max_inter_values, drop_duplicates_values, min_inter_values): 20 | 21 | if max_inter is not None and max_inter < min_inter: 22 | # The maximum number of interactions must be greater than the minimum 23 
| continue 24 | 25 | output = undertest.load_data(TESTDATA_FILENAME, 26 | concept_id_col=undertest.KC_NAME_STARTS_WITH, 27 | template_id_col=undertest.PROBLEM_NAME, 28 | item_id_col=undertest.STEP_NAME, 29 | max_interactions_per_user=max_inter, 30 | min_interactions_per_user=min_inter, 31 | drop_duplicates=drop_duplicates) 32 | output_data = output[0] 33 | self.assertGreater(len(output_data), 0) 34 | self.assertEqual(set(output_data.columns), 35 | {undertest.USER_IDX_KEY, undertest.ITEM_IDX_KEY, 36 | undertest.CORRECT_KEY, undertest.TIME_IDX_KEY, 37 | undertest.CONCEPT_IDX_KEY, undertest.TEMPLATE_IDX_KEY}) 38 | 39 | max_interactions = max_inter or int(1e6) 40 | self.assertLessEqual(output_data.groupby(undertest.USER_IDX_KEY).size().max(), 41 | max_interactions) 42 | self.assertGreaterEqual(output_data.groupby(undertest.USER_IDX_KEY).size().min(), 43 | min_inter) 44 | 45 | if drop_duplicates: 46 | num_dupes = output_data.groupby([undertest.USER_IDX_KEY, undertest.TIME_IDX_KEY, 47 | undertest.ITEM_IDX_KEY]).size().values 48 | self.assertEqual(set(num_dupes), {1}) 49 | -------------------------------------------------------------------------------- /rnn_prof/tests/test_simple_rnn.py: -------------------------------------------------------------------------------- 1 | import StringIO 2 | import os 3 | import unittest 4 | 5 | from rnn_prof import simple_rnn 6 | from rnn_prof.data.wrapper import load_data 7 | from rnn_prof.data.rnn import build_nn_data 8 | 9 | TESTDATA_FILENAME = os.path.join(os.path.dirname(__file__), 'data', 'test_assist_data.csv.gz') 10 | 11 | 12 | class TestRnn(unittest.TestCase): 13 | 14 | def test_initialization(self): 15 | """ Just make sure initialize doesn't cause the interpreter to crash """ 16 | data, _, item_ids, _, _ = load_data(TESTDATA_FILENAME, 'assistments') 17 | num_questions = len(item_ids) 18 | nn_data = build_nn_data(data, num_questions) 19 | pivot = len(nn_data) // 2 20 | train_data = nn_data[:pivot] 21 | test_data = nn_data[pivot:] 22 | 23 | opts = simple_rnn.RnnOpts(hidden_dim=20) 24 | simple_rnn.SimpleRnn(train_data, opts, test_data=test_data) 25 | 26 | def test_dump_and_load(self): 27 | """ 28 | Test dumping and loading the SimpleRnn and make sure that all of its properties remain in 29 | shape. 
30 | """ 31 | data, _, item_ids, _, _ = load_data(TESTDATA_FILENAME, 'assistments') 32 | num_questions = len(item_ids) 33 | nn_data = build_nn_data(data, num_questions) 34 | pivot = len(nn_data) // 2 35 | train_data = nn_data[:pivot] 36 | 37 | max_compress_dim = 10 38 | hidden_dim = 20 39 | recurrent = False 40 | grad_norm_limit = 1.0 41 | first_learning_rate = 20.0 42 | decay_rate = 0.5 43 | largest_grad = 4.0 44 | batch_threshold = 0.8 45 | opts = simple_rnn.RnnOpts(max_compress_dim=max_compress_dim, hidden_dim=hidden_dim, 46 | recurrent=recurrent, grad_norm_limit=grad_norm_limit, 47 | largest_grad=largest_grad, batch_threshold=batch_threshold, 48 | first_learning_rate=first_learning_rate, decay_rate=decay_rate) 49 | original = simple_rnn.SimpleRnn(train_data, opts) 50 | 51 | dumped = StringIO.StringIO() 52 | original.dump(dumped) 53 | dumped_str = dumped.getvalue() 54 | dumped_reader = StringIO.StringIO(dumped_str) 55 | recalled = simple_rnn.SimpleRnn.load(dumped_reader) 56 | 57 | for attr in ('max_compress_dim', 'recurrent', 'grad_norm_limit', 58 | 'first_learning_rate', 'decay_rate', 'largest_grad', 'batch_threshold'): 59 | self.assertEqual(getattr(original.opts, attr), getattr(recalled.opts, attr), 60 | "%s was changed" % attr) 61 | -------------------------------------------------------------------------------- /rnn_prof/tests/data/test_splitting_utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | Tests for data splitting utilities 3 | """ 4 | import unittest 5 | import uuid 6 | 7 | import numpy as np 8 | import pandas as pd 9 | 10 | from rnn_prof.data import splitting_utils as undertest 11 | 12 | USER_KEY = 'user' 13 | QUESTION_KEY = 'question' 14 | 15 | 16 | class TestSplittingUtils(unittest.TestCase): 17 | def test_split_data(self): 18 | num_folds = 5 19 | question_ids = [str(uuid.uuid4()) for _ in range(100)] 20 | user_ids = [str(uuid.uuid4()) for _ in range(20)] 21 | num_responses = 1000 22 | data = pd.DataFrame(data={undertest.USER_IDX_KEY: np.random.choice(user_ids, num_responses), 23 | USER_KEY: np.random.choice(user_ids, num_responses), 24 | QUESTION_KEY: np.random.choice(question_ids, num_responses)}, 25 | index=np.arange(num_responses)) 26 | 27 | for train_data, test_data in undertest.split_data(data, num_folds=num_folds): 28 | # test that all students and all rows appear in train and test 29 | self.assertEqual(set.union(set(train_data[undertest.USER_IDX_KEY].values), 30 | set(test_data[undertest.USER_IDX_KEY].values)), 31 | set(data[undertest.USER_IDX_KEY].values)) 32 | self.assertEqual(set.union(set(train_data.index), set(test_data.index)), 33 | set(data.index)) 34 | # test that no students and no rows appear in both train and test 35 | self.assertEqual(set.intersection(set(train_data[undertest.USER_IDX_KEY].values), 36 | set(test_data[undertest.USER_IDX_KEY].values)), 37 | set([])) 38 | self.assertEqual(set.intersection(set(train_data.index), set(test_data.index)), 39 | set([])) 40 | 41 | # test number of folds 42 | self.assertEqual(len(list(undertest.split_data(data, num_folds=num_folds))), num_folds) 43 | 44 | # test that setting seed gives the same student partitions 45 | train1, test1 = undertest.split_data(data, num_folds=num_folds, seed=0).next() 46 | train2, test2 = undertest.split_data(data, num_folds=num_folds, seed=0).next() 47 | np.testing.assert_array_equal(train1[undertest.USER_IDX_KEY].values, 48 | train2[undertest.USER_IDX_KEY].values) 49 | 
np.testing.assert_array_equal(test1[undertest.USER_IDX_KEY].values, 50 | test2[undertest.USER_IDX_KEY].values) 51 | -------------------------------------------------------------------------------- /rnn_prof/tests/irt/test_updaters.py: -------------------------------------------------------------------------------- 1 | """ 2 | Unit tests for computing parameter update steps based on gradients and Hessians. 3 | """ 4 | from __future__ import division 5 | 6 | import logging 7 | import numpy as np 8 | import unittest 9 | from scipy.sparse.construct import csr_matrix 10 | 11 | from rnn_prof.irt import updaters as undertest 12 | 13 | LOGGER = logging.getLogger('test_irt') 14 | 15 | NUM_TRIALS = 10 16 | 17 | 18 | class TestUpdaters(unittest.TestCase): 19 | def setUp(self): 20 | """ 21 | Make random gradients and Hessians for variables that are 2D numpy arrays. 22 | """ 23 | def rand_posdef_matrix(dim): 24 | """random positive definite, well-conditioned matrix""" 25 | A = np.random.randn(dim, dim) 26 | u, s, v = np.linalg.svd(A) 27 | return u.dot(np.diag(np.clip(s, 0.1, np.inf))).dot(u.T) 28 | shapes = [(np.random.randint(1, 10), np.random.randint(1, 10)) for _ in range(NUM_TRIALS)] 29 | # make sure at least one trial uses (1,1) matrix 30 | shapes[0] = (1, 1) 31 | self.xs = [np.random.randn(*shape) for shape in shapes] 32 | self.grads = [np.random.randn(*shape) for shape in shapes] 33 | self.hessians = [rand_posdef_matrix(shape[0]*shape[1]) for shape in shapes] 34 | 35 | def test_support(self): 36 | """ 37 | Test that the updated value is not outside the support. 38 | """ 39 | lbound = 0.1 40 | ubound = 0.15 41 | updater = undertest.NewtonRaphson() 42 | for x, grad, hessian in zip(self.xs, self.grads, self.hessians): 43 | new_val = updater(x, grad, hessian, support=(lbound, ubound)) 44 | self.assertTrue(np.all(new_val >= lbound)) 45 | self.assertTrue(np.all(new_val <= ubound)) 46 | 47 | def test_newton_raphson(self): 48 | """ Test that Newton-Raphson solves the quadratic problem. 
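        A single damped step on this quadratic model is ``x_new = x + step_size * (-H^{-1} g)``, with
        the gradient raveled in column-major ('F') order and the step reshaped back to the layout of
        ``x``; the expected estimates below are computed exactly that way and compared against the
        updater for both dense and sparse Hessians.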
""" 49 | updater = undertest.NewtonRaphson() 50 | for x, grad, hessian in zip(self.xs, self.grads, self.hessians): 51 | expected_step_vec = -np.linalg.inv(hessian).dot(grad.ravel(order='F')) 52 | expected_step_vec = expected_step_vec.reshape(grad.shape[1], grad.shape[0]).T 53 | expected_estimate = x + updater.step_size * expected_step_vec 54 | # test solution on toy matrix 55 | nr_estimate = updater(x, grad, hessian) 56 | np.testing.assert_almost_equal(expected_estimate, nr_estimate, decimal=4) 57 | # test when toy Hessian is sparse 58 | nr_sparse_estimate = updater(x, grad, csr_matrix(hessian)) 59 | np.testing.assert_almost_equal(nr_sparse_estimate, expected_estimate, decimal=4) 60 | -------------------------------------------------------------------------------- /rnn_prof/tests/test_run_rnn.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pickle 3 | import shutil 4 | import tempfile 5 | import unittest 6 | 7 | from rnn_prof import run_rnn as undertest 8 | from rnn_prof.data.constants import ASSISTMENTS 9 | from rnn_prof.data.splitting_utils import split_data 10 | from rnn_prof.data.wrapper import load_data, DEFAULT_DATA_OPTS 11 | 12 | ASSISTMENTS_TESTDATA_FILENAME = os.path.join(os.path.dirname(__file__), 'data', 13 | 'test_assist_data.csv.gz') 14 | TEST_NUM_FOLDS = 2 15 | TEST_NUM_ITERS = 2 16 | OUTPUT_PREFIX = 'output' 17 | 18 | # Columns expected in the pickled output 19 | EXPECTED_COLS = ['auc', 'fold_num', 'global', 'is_two_po', 'map'] 20 | 21 | 22 | class TestRunRNN(unittest.TestCase): 23 | 24 | @classmethod 25 | def setUpClass(cls): 26 | cls.output_dir = tempfile.mkdtemp('output') 27 | cls.output_prefix = os.path.join(cls.output_dir, OUTPUT_PREFIX) 28 | 29 | @classmethod 30 | def tearDownClass(cls): 31 | shutil.rmtree(cls.output_dir) 32 | 33 | def test_run(self): 34 | """ Make sure RNN can run on assistments data and outputs results.""" 35 | for data_file, data_source in [(ASSISTMENTS_TESTDATA_FILENAME, ASSISTMENTS)]: 36 | data, _, item_ids, _, _ = load_data(data_file, data_source) 37 | data_folds = split_data(data, num_folds=TEST_NUM_FOLDS) 38 | undertest.run(data_folds, TEST_NUM_FOLDS, len(item_ids), TEST_NUM_ITERS, 39 | DEFAULT_DATA_OPTS, output=self.output_prefix) 40 | 41 | # Check that output was dumped for each fold 42 | for i in range(1, TEST_NUM_FOLDS + 1): 43 | with open(self.output_prefix + str(i), 'rb') as outfile: 44 | output = pickle.load(outfile) 45 | self.assertTrue(len(output)) 46 | 47 | def test_run_with_output_compression(self): 48 | """ Make sure RNN can run on assistments data and outputs results.""" 49 | for data_file, data_source in [(ASSISTMENTS_TESTDATA_FILENAME, ASSISTMENTS)]: 50 | data, _, item_ids, _, _ = load_data(data_file, data_source) 51 | data_folds = split_data(data, num_folds=TEST_NUM_FOLDS) 52 | undertest.run(data_folds, TEST_NUM_FOLDS, len(item_ids), TEST_NUM_ITERS, 53 | DEFAULT_DATA_OPTS, output=self.output_prefix, 54 | output_compress_dim=20) 55 | 56 | # Check that output was dumped for each fold 57 | for i in range(1, TEST_NUM_FOLDS + 1): 58 | with open(self.output_prefix + str(i), 'rb') as outfile: 59 | output = pickle.load(outfile) 60 | self.assertTrue(len(output)) 61 | -------------------------------------------------------------------------------- /rnn_prof/tests/irt/test_callbacks.py: -------------------------------------------------------------------------------- 1 | """ 2 | Tests for the callback functionality 3 | """ 4 | 5 | import itertools as its 6 | import numpy as np 7 | 
import unittest 8 | 9 | from rnn_prof.irt.callbacks import ConvergenceCallback 10 | from rnn_prof.irt.cpd import GaussianCPD 11 | from rnn_prof.irt.irt import BayesNetLearner 12 | from rnn_prof.irt.node import Node 13 | from rnn_prof.irt.testing_utils import EPSILON 14 | 15 | NUM_NODES = 4 16 | NUM_HELD_OUT_NODES = 2 17 | 18 | 19 | class TestConvergenceCallback(unittest.TestCase): 20 | def setUp(self): 21 | nodes = [] 22 | for i in range(NUM_NODES): 23 | nodes.append(Node(name=str(i), data=np.random.randn(i + 1), cpd=GaussianCPD(dim=i + 1), 24 | held_out=i < NUM_HELD_OUT_NODES)) 25 | self.learner = BayesNetLearner(nodes=nodes) 26 | 27 | def test_call(self): 28 | """ 29 | Test callback returns correct should_continue and that error is thrown if early_stopping but 30 | no held_out nodes. 31 | """ 32 | # Test that should_continue from callback is correct 33 | for early_stopping in (False, True): 34 | callback = ConvergenceCallback(early_stopping=early_stopping) 35 | for converged_states in its.product(*((True, False),) * NUM_NODES): 36 | for held_out_log_prob_deltas in its.product(*((-EPSILON, EPSILON),) * 37 | NUM_HELD_OUT_NODES): 38 | for node, state in zip(self.learner.nodes.values(), converged_states): 39 | node.converged = state 40 | for (node, held_out_log_prob_delta) in zip( 41 | self.learner.nodes.values()[:NUM_HELD_OUT_NODES], 42 | held_out_log_prob_deltas): 43 | node.log_prob_delta = held_out_log_prob_delta 44 | 45 | expected_should_continue = not all(converged_states[NUM_HELD_OUT_NODES:]) 46 | if early_stopping and sum(held_out_log_prob_deltas) <= 0: 47 | expected_should_continue = False 48 | 49 | actual_should_continue = callback(self.learner) 50 | 51 | self.assertEqual(expected_should_continue, actual_should_continue) 52 | 53 | # Test condition check 54 | for node in self.learner.nodes.values(): 55 | node.held_out = False 56 | callback = ConvergenceCallback(early_stopping=True) 57 | with self.assertRaises(ValueError): 58 | callback(self.learner) 59 | 60 | def test_callback_interface(self): 61 | """ Test that all callback function (interfaces still work). 
""" 62 | callback = ConvergenceCallback() 63 | _ = callback(self.learner) 64 | -------------------------------------------------------------------------------- /rnn_prof/tests/test_run_irt.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | import numpy as np 3 | import os 4 | import pandas as pd 5 | import pickle 6 | import tempfile 7 | import unittest 8 | 9 | from rnn_prof import run_irt as undertest 10 | from rnn_prof.data.constants import CONCEPT_IDX_KEY, USER_IDX_KEY, ASSISTMENTS 11 | from rnn_prof.data.splitting_utils import split_data 12 | from rnn_prof.data.wrapper import load_data 13 | 14 | ASSISTMENTS_TESTDATA_FILENAME = os.path.join(os.path.dirname(__file__), 15 | 'data', 'test_assist_data.csv.gz') 16 | TEST_NUM_FOLDS = 2 17 | # Columns expected in the pickled output 18 | EXPECTED_COLS = ['auc', 'fold_num', 'global', 'is_two_po', 'map'] 19 | 20 | 21 | class TestRunIrt(unittest.TestCase): 22 | 23 | @classmethod 24 | def setUpClass(cls): 25 | with tempfile.NamedTemporaryFile(delete=False, suffix='.pckl') as f: 26 | cls.filename = f.name 27 | 28 | @classmethod 29 | def tearDownClass(cls): 30 | os.remove(cls.filename) 31 | 32 | def test_irt(self): 33 | """ Make sure IRT can run on Assistments data and outputs results.""" 34 | for data_file, data_source in [(ASSISTMENTS_TESTDATA_FILENAME, ASSISTMENTS)]: 35 | data, _, _, _, _ = load_data(data_file, data_source) 36 | for is_two_po in [True, False]: 37 | data_folds = split_data(data, num_folds=TEST_NUM_FOLDS) 38 | undertest.irt(data_folds, TEST_NUM_FOLDS, output=self.filename, is_two_po=is_two_po) 39 | 40 | with open(self.filename, 'rb') as output: 41 | output = pickle.load(output) 42 | for col in EXPECTED_COLS: 43 | self.assertTrue(col in output) 44 | self.assertTrue(np.all(output['is_two_po'].values == is_two_po)) 45 | 46 | def test_compute_theta_idx(self): 47 | train_data = pd.DataFrame({USER_IDX_KEY: [0, 0, 0, 1, 1, 2], 48 | CONCEPT_IDX_KEY: [0, 1, 1, 1, 1, 0]}) 49 | test_data = pd.DataFrame({USER_IDX_KEY: [3, 3, 3, 4, 5], 50 | CONCEPT_IDX_KEY: [0, 1, 2, 2, 2]}) 51 | 52 | # For single concept, there should be a single idx per user 53 | expected_single_concept = np.array([0, 0, 0, 1, 1, 2, 3, 3, 3, 4, 5]) 54 | actual_single_concept = undertest.compute_theta_idx(train_data, test_df=test_data, 55 | single_concept=True) 56 | assert np.all(expected_single_concept == actual_single_concept) 57 | 58 | actual_multi_concept = undertest.compute_theta_idx(train_data, test_df=test_data, 59 | single_concept=False) 60 | 61 | # Rearrange the input and output for assertions 62 | output_dict = defaultdict(list) 63 | for (_, row), computed in zip(pd.concat([train_data, test_data]).iterrows(), 64 | actual_multi_concept): 65 | output_dict[(row[USER_IDX_KEY], row[CONCEPT_IDX_KEY])].append(computed) 66 | values = [] 67 | for value in output_dict.itervalues(): 68 | # There is only one value per student/concept pair 69 | assert len(set(value)) == 1 70 | values.append(value) 71 | 72 | # Every value is unique per student/concept pair 73 | assert len(values) == len(set(v for vv in values for v in vv)) 74 | -------------------------------------------------------------------------------- /rnn_prof/irt/utils.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | import numpy as np 4 | 5 | 6 | def set_or_check_min(x, min_x, var_name): 7 | """ If x is None, set its value to min_x; also check that it is at least min_x. 
8 | :param int|float x: the value to check 9 | :param int|float min_x: minimum required value for x 10 | :param str var_name: name of the variable (for error logging) 11 | :return: set/checked value 12 | :rtype: int|float 13 | """ 14 | if x is None: 15 | x = min_x 16 | elif x < min_x: 17 | raise ValueError("{} ({}) must be at least {}".format(var_name, x, min_x)) 18 | return x 19 | 20 | 21 | def check_and_set_idx(ids, idx, prefix): 22 | """ Reconciles passed-in IDs and indices and returns indices, as well as unique IDs 23 | in the order specified by the indices. If only IDs supplied, returns the sort-arg 24 | as the index. If only indices supplied, returns None for IDs. If both supplied, 25 | checks that the correspondence is unique and returns unique IDs in the sort order of 26 | the associated index. 27 | :param np.ndarray ids: array of IDs 28 | :param np.ndarray[int] idx: array of indices 29 | :param str prefix: variable name (for error logging) 30 | :return: unique IDs and indices (passed in or derived from the IDs) 31 | :rtype: np.ndarray, np.ndarray 32 | """ 33 | if ids is None and idx is None: 34 | raise ValueError('Both {}_ids and {}_idx cannot be None'.format(prefix, prefix)) 35 | if ids is None: 36 | return None, np.asarray_chkfinite(idx) 37 | if idx is None: 38 | return np.unique(ids, return_inverse=True) 39 | else: 40 | ids = np.asarray(ids) 41 | idx = np.asarray_chkfinite(idx) 42 | if len(idx) != len(ids): 43 | raise ValueError('{}_ids ({}) and {}_idx ({}) must have the same length'.format( 44 | prefix, len(ids), prefix, len(idx))) 45 | uniq_idx, idx_sort_index = np.unique(idx, return_index=True) 46 | # make sure each unique index corresponds to a unique id 47 | if not all(len(set(ids[idx == i])) == 1 for i in uniq_idx): 48 | raise ValueError("Each index must correspond to a unique {}_id".format(prefix)) 49 | return ids[idx_sort_index], idx 50 | 51 | 52 | def check_positive(value, label): 53 | if math.isinf(value) or value <= 0: 54 | raise ValueError('{} ({}) must be a positive finite number'.format(value, label)) 55 | 56 | 57 | def check_nonnegative(value, label): 58 | if math.isinf(value) or value < 0: 59 | raise ValueError('{} ({}) must be a finite nonnegative number'.format(value, label)) 60 | 61 | 62 | def check_int(value, label): 63 | if value != int(value): 64 | raise TypeError('{} ({}) should be an int'.format(value, label)) 65 | 66 | 67 | def check_float(value, label): 68 | if value != float(value): 69 | raise TypeError('{} ({}) should be a float'.format(value, label)) 70 | 71 | 72 | def check_positive_float(value, label): 73 | """ Check that a value is a finite positive float. """ 74 | check_positive(value, label) 75 | check_float(value, label) 76 | 77 | 78 | def check_positive_int(value, label): 79 | """ Check that a value is a positive int. """ 80 | check_positive(value, label) 81 | check_int(value, label) 82 | 83 | 84 | def check_nonnegative_int(value, label): 85 | """ Check that a value is a nonnegative int. """ 86 | check_nonnegative(value, label) 87 | check_int(value, label) 88 | 89 | 90 | def check_nonnegative_float(value, label): 91 | """ Check that a value is a nonnegative float. """ 92 | check_nonnegative(value, label) 93 | check_float(value, label) 94 | -------------------------------------------------------------------------------- /rnn_prof/tests/irt/test_irt.py: -------------------------------------------------------------------------------- 1 | """ 2 | Tests for the BayesNetLearner abstract class and its implementations. 
3 | """ 4 | from __future__ import division 5 | 6 | import logging 7 | import numpy as np 8 | from scipy import sparse as sp 9 | import unittest 10 | 11 | from rnn_prof.irt import irt as undertest 12 | from rnn_prof.irt.cpd import GaussianCPD 13 | from rnn_prof.irt.node import Node 14 | from rnn_prof.irt.testing_utils import MockLearner 15 | 16 | LOGGER = logging.getLogger(__name__) 17 | EQUIV_DECIMAL_PLACES = 8 18 | MAX_ITER = 500 19 | NUM_TESTS = 5 20 | 21 | 22 | class TestBayesNetLearner(unittest.TestCase): 23 | def setUp(self): 24 | # use the test learner 25 | self.learner = MockLearner() 26 | 27 | def test_learn(self): 28 | """ Test that nodes get evidence from all their children (i.e. nodes are processed in 29 | topological order) and the summed log-posterior equals to the sum of the contributions from 30 | all nodes. 31 | """ 32 | self.learner.learn() 33 | 34 | # test that nodes got evidence updates in the correct order 35 | self.assertEqual(self.learner.nodes['A'].obtained_evidence_terms, 36 | {self.learner.nodes['B']: 'A'}) 37 | self.assertEqual(self.learner.nodes['B'].obtained_evidence_terms, 38 | {self.learner.nodes['C']: 'B', self.learner.nodes['D']: 'B'}) 39 | self.assertEqual(self.learner.nodes['C'].obtained_evidence_terms, 40 | {self.learner.nodes['E']: 'C'}) 41 | self.assertEqual(self.learner.nodes['D'].obtained_evidence_terms, 42 | {self.learner.nodes['E']: 'D', self.learner.nodes['F']: 'D'}) 43 | self.assertEqual(self.learner.nodes['E'].obtained_evidence_terms, {}) 44 | self.assertEqual(self.learner.nodes['F'].obtained_evidence_terms, {}) 45 | 46 | # test that log-probs from all nodes have been added 47 | self.assertAlmostEqual(self.learner.log_posterior, 48 | sum(n.log_prob for n in self.learner.nodes.values()), 49 | places=EQUIV_DECIMAL_PLACES) 50 | 51 | def test_get_posterior_hessian(self): 52 | """ Tests the computation of the log-posterior Hessian with a simple graph, 53 | X 54 | /| 55 | / | 56 | v v 57 | Y Z 58 | where all nodes contain 2D Gaussians and node X encodes the mean for nodes Y and Z 59 | """ 60 | for k in range(NUM_TESTS): 61 | prec_x = np.diag(np.random.rand(2), 0) 62 | prec_y = sp.diags(np.random.rand(2), 0) # throw in a sparse precision 63 | prec_z = np.diag(np.random.rand(2), 0) 64 | node_x = Node(name='x', data=np.random.randn(2), cpd=GaussianCPD(precision=prec_x)) 65 | node_y = Node(name='y', data=np.random.randn(2), cpd=GaussianCPD(precision=prec_y), 66 | param_nodes={GaussianCPD.MEAN_KEY: node_x}) 67 | node_z = Node(name='z', data=np.random.randn(2), cpd=GaussianCPD(precision=prec_z), 68 | param_nodes={GaussianCPD.MEAN_KEY: node_x}, held_out=True) 69 | learner = undertest.BayesNetLearner(nodes=[node_x, node_y, node_z]) 70 | np.testing.assert_almost_equal(learner.get_posterior_hessian('x', use_held_out=True), 71 | -prec_x - prec_y - prec_z) 72 | np.testing.assert_almost_equal(learner.get_posterior_hessian('x', use_held_out=False), 73 | -prec_x - prec_y) 74 | np.testing.assert_almost_equal(learner.get_posterior_hessian('x'), -prec_x - prec_y) 75 | np.testing.assert_almost_equal(learner.get_posterior_hessian('x', np.random.randn(2)), 76 | -prec_x - prec_y) 77 | -------------------------------------------------------------------------------- /rnn_prof/tests/data/test_assistments.py: -------------------------------------------------------------------------------- 1 | import itertools as its 2 | import os 3 | import unittest 4 | 5 | import numpy as np 6 | import pandas as pd 7 | 8 | from rnn_prof.data import assistments as undertest 9 | 10 | 
TESTDATA_FILENAME = os.path.join(os.path.dirname(__file__), 'test_assist_data.csv.gz') 11 | 12 | 13 | class TestLoadAssistmentsData(unittest.TestCase): 14 | 15 | def test_load_data(self): 16 | """ Test that data loads without breaking """ 17 | remove_nan_skill_ids_values = (True, False) 18 | item_id_col_values = (undertest.SKILL_ID_KEY, undertest.PROBLEM_ID_KEY) 19 | concept_id_col_values = (None, undertest.SKILL_ID_KEY, undertest.PROBLEM_ID_KEY, 20 | undertest.SINGLE) 21 | template_id_col_values = (None, undertest.TEMPLATE_ID_KEY, undertest.SINGLE) 22 | max_inter_values = (None, int(1e6), 10, 2) 23 | drop_duplicates_values = (False, True) 24 | min_inter_values = (2, 3) 25 | for (remove_nan_skill_ids, item_id_col, concept_id_col, template_id_col, 26 | max_inter, drop_duplicates, min_inter) in \ 27 | its.product(remove_nan_skill_ids_values, item_id_col_values, concept_id_col_values, 28 | template_id_col_values, max_inter_values, drop_duplicates_values, 29 | min_inter_values): 30 | 31 | if max_inter is not None and max_inter < min_inter: 32 | # The maximum number of interactions must be greater than the minimum 33 | continue 34 | 35 | output = undertest.load_data(TESTDATA_FILENAME, 36 | item_id_col=item_id_col, 37 | template_id_col=template_id_col, 38 | concept_id_col=concept_id_col, 39 | remove_nan_skill_ids=remove_nan_skill_ids, 40 | max_interactions_per_user=max_inter, 41 | min_interactions_per_user=min_inter, 42 | drop_duplicates=drop_duplicates) 43 | output_data = output[0] 44 | expected_columns = {undertest.USER_IDX_KEY, undertest.ITEM_IDX_KEY, 45 | undertest.CORRECT_KEY, undertest.TIME_IDX_KEY} 46 | if template_id_col is not None: 47 | expected_columns.add(undertest.TEMPLATE_IDX_KEY) 48 | if concept_id_col is not None: 49 | expected_columns.add(undertest.CONCEPT_IDX_KEY) 50 | self.assertEqual(set(output_data.columns), expected_columns) 51 | 52 | max_interactions = max_inter or int(1e6) 53 | self.assertLessEqual(output_data.groupby(undertest.USER_IDX_KEY).size().max(), 54 | max_interactions) 55 | self.assertGreaterEqual(output_data.groupby(undertest.USER_IDX_KEY).size().min(), 56 | min_inter) 57 | 58 | if drop_duplicates: 59 | num_dupes = output_data.groupby( 60 | [undertest.USER_IDX_KEY, undertest.ITEM_IDX_KEY, 61 | undertest.TIME_IDX_KEY]).size().values 62 | self.assertEqual(set(num_dupes), {1}) 63 | 64 | # Test that user_ids, item_ids, concept_ids match up. 65 | data = pd.DataFrame.from_csv(TESTDATA_FILENAME) 66 | col_mapping = [(undertest.USER_ID_KEY, undertest.USER_IDX_KEY), 67 | (item_id_col, undertest.ITEM_IDX_KEY), 68 | (template_id_col, undertest.TEMPLATE_IDX_KEY), 69 | (concept_id_col, undertest.CONCEPT_IDX_KEY)] 70 | for i, (key, val) in enumerate(col_mapping): 71 | if key == undertest.SINGLE: 72 | self.assertEqual(output_data[val].nunique(), 1) 73 | elif key is not None: 74 | self.assertGreaterEqual(set(np.unique(data[key])), set(output[i + 1])) 75 | self.assertEqual(output_data[val].nunique(), len(set(output[i + 1]))) 76 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # IRT and DKT implementation 2 | 3 | This library contains implementations of IRT models and 4 | [Deep Knowledge Tracing (DKT)](http://papers.nips.cc/paper/5654-deep-knowledge-tracing.pdf) that reproduces the results reported in "Back to the Basics: Bayesian extensions of IRT outperform neural networks for proficiency estimation" (Wilson, Karklin, Han, Ekanadham EDM2016). 
5 | 6 | # Implemented models 7 | 8 | ## IRT 9 | 10 | 11 | Bayesian versions of one and two parameter Item Response Theory models. The likelihood is given by the ogive item response function, and priors on student and item parameters are standard normal distributions. 12 | 13 | ## Hierarchical IRT 14 | 15 | Implementation of an IRT model that extends the model above with a Gaussian hyper-prior on item difficulties. 16 | 17 | ## DKT 18 | 19 | Recurrent neural network implemented using Theano. 20 | 21 | # Requirements (see `requirements.in`) 22 | - python 23 | - theano 24 | - numpy 25 | - scipy 26 | - ipython 27 | - pandas 28 | - igraph 29 | 30 | # Data 31 | 32 | The ASSISTments data set may be found [here](https://sites.google.com/site/assistmentsdata/home/assistment-2009-2010-data/skill-builder-data-2009-2010). Note that the authors of the data set have since removed several duplicates from the original data set which we used. However, as we explain in the paper, our preprocessing steps involved removing these duplicates as well. Thus, while we used the original data set, both the original and the corrected versions should duplicate our results. 33 | 34 | The KDD Cup data set may be found [here](https://pslcdatashop.web.cmu.edu/KDDCup/downloads.jsp). We used the Bridge to Algebra 2006-2007 data set, and specifically the training data set. 35 | 36 | # Usage 37 | ``` 38 | Usage: rnn_prof [OPTIONS] COMMAND [ARGS]... 39 | 40 | Collection of scripts for evaluating RNN proficiency models 41 | 42 | Options: 43 | -h, --help Show this message and exit. 44 | 45 | Commands: 46 | irt Run IRT to get item parameters and compute... 47 | naive Just report the percent correct across all... 48 | rnn RNN based proficiency estimation :param str... 49 | ``` 50 | 51 | 52 | # To reproduce results in the EDM2016 paper: 53 | 54 | 1. construct the 20/80 split data sets (20% for model parameter selection, e.g., 55 | prior parameters, RNN layer sizes; 80% for train/test) using `data/split_data.py`, 56 | `python split_data.py bridge_to_algebra_2006_2007_train.txt "Anon Student Id" "\t"`, 57 | `python split_data.py skill_builder_data.csv user_id ","` 58 | 59 | 2. 
execute the following commands: 60 | 61 | #### IRT 62 | rnn_prof irt assistments skill_builder_data_big.txt --onepo \ 63 | --drop-duplicates --no-remove-skill-nans --num-folds 5 \ 64 | --item-id-col problem_id --concept-id-col single 65 | 66 | rnn_prof irt kddcup bridge_to_algebra_2006_2007_train_big.txt \ 67 | --onepo --drop-duplicates --no-remove-skill-nans --num-folds 5 \ 68 | --item-id-col 'Step Name' --concept-id-col single 69 | 70 | #### HIRT 71 | rnn_prof irt assistments skill_builder_data_big.txt --onepo \ 72 | --drop-duplicates --no-remove-skill-nans --num-folds 5 \ 73 | --item-precision 4.0 --template-precision 2.0 \ 74 | --template-id-col template_id --item-id-col problem_id \ 75 | --concept-id-col single 76 | 77 | rnn_prof irt kddcup bridge_to_algebra_2006_2007_train_big.txt --onepo \ 78 | --drop-duplicates --no-remove-skill-nans --num-folds 5 \ 79 | --item-precision 2.0 --template-precision 4.0 -m 5000 \ 80 | --template-id-col template_id --item-id-col problem_id \ 81 | --concept-id-col single 82 | 83 | #### DKT 84 | rnn_prof rnn assistments skill_builder_data_big.txt \ 85 | --no-remove-skill-nans --drop-duplicates --num-folds 5 \ 86 | --item-id-col problem_id --num-iters 50 --dropout-prob 0.25 \ 87 | --first-learning-rate 5.0 --compress-dim 50 --hidden-dim 100 88 | 89 | rnn_prof rnn kddcup bridge_to_algebra_2006_2007_train_big.txt \ 90 | --no-remove-skill-nans --drop-duplicates --num-folds 5 --item-id-col KC \ 91 | --num-iters 50 --dropout-prob 0.25 --first-learning-rate 5.0 \ 92 | --compress-dim 50 --hidden-dim 100 93 | 94 | 95 | 96 | -------------------------------------------------------------------------------- /rnn_prof/irt/linear_operators.py: -------------------------------------------------------------------------------- 1 | """ 2 | Classes that implement linear projection operators and their transposes. 3 | """ 4 | import numpy as np 5 | import scipy.sparse as sp 6 | from scipy.sparse.linalg import LinearOperator 7 | from scipy.sparse.linalg.interface import MatrixLinearOperator 8 | 9 | 10 | def get_subset_lin_op(lin_op, sub_idx): 11 | """ Subset a linear operator to the indices in `sub_idx`. Equivalent to A' = A[sub_idx, :] 12 | :param LinearOperator lin_op: input linear operator 13 | :param np.ndarray[int] sub_idx: subset index 14 | :return: the subset linear operator 15 | :rtype: LinearOperator 16 | """ 17 | if lin_op is None: 18 | return None 19 | if type(lin_op) is IndexOperator: 20 | # subsetting IndexOperator yields a new IndexOperator 21 | return IndexOperator(lin_op.index_map[sub_idx], dim_x=lin_op.dim_x) 22 | elif isinstance(lin_op, MatrixLinearOperator): 23 | # subsetting a matrix multiplication operation yields a new matrix 24 | return MatrixLinearOperator(lin_op.A[sub_idx, :]) 25 | # in the general case, append a sub-indexing operator 26 | return IndexOperator(sub_idx, dim_x=lin_op.shape[0]) * lin_op 27 | 28 | 29 | def rmatvec_nd(lin_op, x): 30 | """ 31 | Project a 1D or 2D numpy or sparse array using rmatvec. This is different from rmatvec 32 | because it applies rmatvec to each row and column. If x is n x n and lin_op is n x k, 33 | the result will be k x k. 
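    For example (a small illustrative sketch using the ``IndexOperator`` defined later in this
    module), summing the rows and columns of a 3 x 3 array into 2 x 2 index groups::

        import numpy as np
        op = IndexOperator(np.array([0, 0, 1]), dim_x=2)  # maps 2 parameters to 3 observations
        rmatvec_nd(op, np.ones((3, 3)))
        # array([[ 4.,  2.],
        #        [ 2.,  1.]])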
34 | 35 | :param LinearOperator lin_op: The linear operator to apply to x 36 | :param np.ndarray|sp.spmatrix x: array/matrix to be projected 37 | :return: the projected array 38 | :rtype: np.ndarray|sp.spmatrix 39 | """ 40 | if x is None or lin_op is None: 41 | return x 42 | if isinstance(x, sp.spmatrix): 43 | y = x.toarray() 44 | elif np.isscalar(x): 45 | y = np.array(x, ndmin=1) 46 | else: 47 | y = np.copy(x) 48 | proj_func = lambda z: lin_op.rmatvec(z) 49 | for j in range(y.ndim): 50 | if y.shape[j] == lin_op.shape[0]: 51 | y = np.apply_along_axis(proj_func, j, y) 52 | return y 53 | 54 | 55 | class IndexOperator(LinearOperator): 56 | """ 57 | A linear one-to-many operator equivalent to ``y_j = A_jx = x_{index_j}``, 58 | i.e. ``y = x[index]``. 59 | The inverse operation is ``x_i = A_i^T y = \sum_j y_j \delta(index_j - i)``. 60 | When computing the inverse ``x = A^T y``, it is not assumed that all x's were used to generate 61 | y, so the operator can be initialized with ``max_idx = len(x) - 1``, in which case the inverse 62 | operator will produce a vector of size len(x). 63 | """ 64 | 65 | def _index(self, x): 66 | return x[self.index_map] 67 | 68 | def _reverse_index(self, x): 69 | """ Helper method for summing over elements with shared indices.""" 70 | count_func = lambda z: np.bincount(self.index_map, weights=z, minlength=self.dim_x) 71 | return np.apply_along_axis(count_func, 0, x) 72 | 73 | def __init__(self, index_map, dim_x=None): 74 | """ Set up the linear transform and its transpose with the index map 75 | :param np.ndarray[int] index_map: indicates, for each element of the output, which input 76 | element is applied 77 | :param int|None dim_x: dimension of the projected array x 78 | """ 79 | if dim_x is not None and dim_x < np.max(index_map) - 1: 80 | raise ValueError("dim_x ({}) must be None or at least max(index_map)+1 ({})".format( 81 | dim_x, np.max(index_map) + 1)) 82 | self.dim_x = dim_x or np.max(index_map) + 1 83 | self.index_map = index_map.astype(int) 84 | if index_map.dtype != int: 85 | raise ValueError("Index map must be a numpy array of integers") 86 | if np.any(index_map < 0): 87 | raise ValueError("Index map must be positive") 88 | super(IndexOperator, self).__init__(shape=(len(index_map), self.dim_x), 89 | matvec=self._index, 90 | rmatvec=self._reverse_index, 91 | dtype=bool) 92 | -------------------------------------------------------------------------------- /rnn_prof/irt/cpd/cpd.py: -------------------------------------------------------------------------------- 1 | """ 2 | A module containing the abstract base class for a conditional probability distribution (CPD) in a 3 | Bayes Net. 4 | """ 5 | from __future__ import division 6 | from abc import ABCMeta, abstractmethod 7 | from collections import namedtuple 8 | 9 | import numpy as np 10 | 11 | 12 | FLOAT_DTYPE = np.dtype('float64') 13 | 14 | FunctionInfo = namedtuple('FunctionInfo', ['value', 'gradient', 'hessian']) 15 | 16 | 17 | class CPDTerms(object): 18 | """A class containing the possible outputs of a conditional probability distribution: 19 | the value (log_probability), the gradients and Hessians w.r.t. the data and the parameters.""" 20 | def __init__(self, log_prob, wrt=None): 21 | """ 22 | :param float log_prob: the value of the CPD at the input data and parameters 23 | :param dict[str, FunctionInfo] wrt: gradients and Hessian w.r.t. 
data/params indicated by 24 | the keys 25 | """ 26 | if wrt is not None and not isinstance(wrt, dict): 27 | raise TypeError("wrt must be a dict") 28 | self.log_prob = log_prob 29 | self.wrt = wrt or {} 30 | 31 | 32 | class CPD(object): 33 | """ 34 | The abstract base class for a conditional probability distribution Pr(data|{params}). 35 | Parameters may be passed in during initialization (if storing or pre-computing intermediate 36 | quantities is desirable) or at call-time. 37 | """ 38 | __metaclass__ = ABCMeta 39 | 40 | # Keys in ``terms_to_compute`` input argument and ``CPDTerms.wrt`` output structure. The data 41 | # key is reserved and cannot be one of parameter keys. 42 | DATA_KEY = 'data' 43 | PARAM_KEYS = () 44 | support = None 45 | 46 | @abstractmethod 47 | def __call__(self, data, params, terms_to_compute=None): 48 | """ 49 | :param data: the data point at which to evaluate the CPD and its gradients 50 | :param params: keyword arguments for distribution parameters 51 | :param dict[str, UpdateTerms] terms_to_compute: which data/parameter gradients and Hessians 52 | to compute 53 | :return: the value and gradients of the CPD 54 | :rtype: CPDTerms 55 | """ 56 | raise NotImplementedError 57 | 58 | def _validate_param_keys(self, input_keys, param_term_keys): 59 | """ 60 | Check that all required parameters are available (from init or args) and check that 61 | requested gradients are for parameters that exist. 62 | 63 | :param list|None input_keys: keys of parameters passed into the CPD 64 | :param list|None param_term_keys: keys of parameters for which gradients/Hessians are 65 | requested 66 | """ 67 | input_keys = input_keys or {} 68 | param_term_keys = param_term_keys or {} 69 | 70 | for par_key in self.PARAM_KEYS: 71 | if par_key == self.DATA_KEY: 72 | raise ValueError("{} is a reserved key, cannot be a parameter".format( 73 | self.DATA_KEY)) 74 | # check that all the parameters have been initialized or passed in 75 | if getattr(self, par_key, None) is None and par_key not in input_keys: 76 | raise ValueError("must initialize with %s or pass it in as a parameter" % par_key) 77 | 78 | # check that only valid param gradients are requested 79 | for par_key in param_term_keys: 80 | if par_key != self.DATA_KEY and par_key not in self.PARAM_KEYS: 81 | raise ValueError("Terms requested for non-parameter {}".format(par_key)) 82 | 83 | 84 | class DistributionInfo(object): 85 | """ 86 | Base data structure for distributions. The main usage is a base class for priors. 87 | """ 88 | __metaclass__ = ABCMeta 89 | 90 | def __init__(self): 91 | """ The dimension variable represents the number of variables this distribution is over and 92 | must be set in the subclass. It is just initialized here. """ 93 | self.dim = 0 94 | 95 | @abstractmethod 96 | def log_prob(self, x): 97 | """ Return a FunctionInfo object containing the value, gradient, 98 | and hessian of the log probability of x according to this distribution. 99 | Subclasses should implement this method. The returned hessian can be None, 100 | in which case only gradient will be used. 101 | 102 | :param numpy.ndarray x: A 1D numpy array at which to evaluate the distribution 103 | :rtype: FunctionInfo 104 | """ 105 | raise NotImplementedError 106 | 107 | @abstractmethod 108 | def sample(self, num_samples): 109 | """ Draw samples from the distribution. Subclasses may implement this method. 110 | :param int num_samples: number of samples to draw 111 | :return: samples from the distribution. 
112 | :rtype: np.ndarray 113 | """ 114 | raise NotImplementedError 115 | -------------------------------------------------------------------------------- /rnn_prof/data/wrapper.py: -------------------------------------------------------------------------------- 1 | from collections import namedtuple 2 | import logging 3 | 4 | import numpy as np 5 | 6 | from . import assistments, kddcup 7 | from .constants import USER_IDX_KEY, ASSISTMENTS, KDDCUP 8 | 9 | LOGGER = logging.getLogger(__name__) 10 | 11 | DataOpts = namedtuple('DataOpts', ['num_folds', 'item_id_col', 'template_id_col', 'concept_id_col', 12 | 'remove_skill_nans', 'seed', 13 | 'use_correct', 'use_hints', 'drop_duplicates', 14 | 'max_interactions_per_user', 'min_interactions_per_user', 15 | 'proportion_students_retained']) 16 | DEFAULT_DATA_OPTS = DataOpts(num_folds=2, item_id_col=None, template_id_col=None, 17 | concept_id_col=None, 18 | remove_skill_nans=False, seed=0, use_correct=True, use_hints=False, 19 | drop_duplicates=False, 20 | max_interactions_per_user=None, min_interactions_per_user=2, 21 | proportion_students_retained=1.0) 22 | 23 | 24 | def load_data(interaction_file, data_source, data_opts=DEFAULT_DATA_OPTS): 25 | """ A wrapper for loading Assistments or KDD Cup data. 26 | 27 | :param str interaction_file: The location of the interactions 28 | :param str data_source: Should be either 'assistments' or 'kddcup' 29 | :param DataOpts data_opts: options for processing data. Includes fields: 30 | - `num_folds`: number of folds. Default is 2. 31 | - `item_id_col`: Which column should be used for the item id? Should be an element of 32 | `.data.assistments.SKILL_ID_KEY` or `.data.assistments.PROBLEM_ID_KEY` for Assistments 33 | - `concept_id_col`: Which column should be used for the concept id? Should be an element of 34 | `.data.assistments.SKILL_ID_KEY` or `.data.assistments.PROBLEM_ID_KEY` for Assistments 35 | - `remove_skill_nans`: Remove items which have a NaN skill_id (only relevant for 36 | Assistments). Default is False. 37 | - `seed`: seed used for data splitting (in other functions) 38 | - `use_correct`: whether to use correctness (or just question identity) for training RNN. 39 | Default is True. 40 | - `use_hints`: whether to use ternary (hint-informed) data representation. Used for RNN 41 | only. Default is False. 42 | - `drop_duplicates`: whether to drop duplicate interactions. Default is False. 
43 | - `max_interactions_per_user`: How many interactions to retain per user 44 | - `min_interactions_per_user`: Minimum number of interactions required to retain a user 45 | - `proportion_students_retained`: Proportion of students to retain in the data set 46 | (for testing sensitivity to number of data points) 47 | :return: processed data, unique user ids, unique question ids, unique concept ids 48 | :rtype: (pd.DataFrame, list, list, list, list) 49 | """ 50 | item_id_col = data_opts.item_id_col 51 | template_id_col = data_opts.template_id_col 52 | concept_id_col = data_opts.concept_id_col 53 | 54 | # Build initial data 55 | if data_source.lower() in (ASSISTMENTS, KDDCUP): 56 | if data_source.lower() == ASSISTMENTS: 57 | relevant_module = assistments 58 | default_item_col_id = assistments.SKILL_ID_KEY 59 | else: 60 | relevant_module = kddcup 61 | default_item_col_id = kddcup.PROBLEM_NAME 62 | 63 | if data_opts.template_id_col is None: 64 | item_id_col = item_id_col or default_item_col_id 65 | 66 | LOGGER.info("Using %s data with %s for item_id_col, %s for template_id_col, " 67 | "and %s for concept_id_col", 68 | relevant_module.__name__, item_id_col, template_id_col, concept_id_col) 69 | data, user_ids, item_ids, template_ids, concept_ids = relevant_module.load_data( 70 | interaction_file, 71 | item_id_col=item_id_col, 72 | template_id_col=template_id_col, 73 | concept_id_col=concept_id_col, 74 | remove_nan_skill_ids=data_opts.remove_skill_nans, 75 | drop_duplicates=data_opts.drop_duplicates, 76 | max_interactions_per_user=data_opts.max_interactions_per_user, 77 | min_interactions_per_user=data_opts.min_interactions_per_user) 78 | else: 79 | raise ValueError('Unknown data_source %s' % data_source) 80 | 81 | num_students = len(user_ids) 82 | num_rows = len(data) 83 | 84 | np.random.seed(data_opts.seed) 85 | chosen_user_ids = np.random.choice( 86 | num_students, size=int(data_opts.proportion_students_retained * num_students), 87 | replace=False) 88 | data = data[data[USER_IDX_KEY].isin(chosen_user_ids)] 89 | 90 | LOGGER.info(("After removing students, {now_rows:3,d}/{orig_rows:3,d} rows and " 91 | "{now_students:3,d}/{orig_students:3,d} students remain").format( 92 | now_rows=len(data), orig_rows=num_rows, 93 | now_students=len(chosen_user_ids), orig_students=num_students)) 94 | 95 | return data, user_ids, item_ids, template_ids, concept_ids 96 | -------------------------------------------------------------------------------- /rnn_prof/run_rnn.py: -------------------------------------------------------------------------------- 1 | """ 2 | Script that constructs an RNN to predict student performance. 3 | """ 4 | from __future__ import division 5 | 6 | import logging 7 | 8 | import numpy as np 9 | 10 | from .data.rnn import build_nn_data 11 | from .simple_rnn import SimpleRnn, RnnOpts 12 | 13 | 14 | _logger = logging.getLogger(__name__) 15 | 16 | 17 | def run(data_folds, num_folds, num_questions, num_iters, data_opts, output=None, compress_dim=100, 18 | hidden_dim=200, test_spacing=10, recurrent=True, dropout_prob=0.0, 19 | output_compress_dim=None, first_learning_rate=30.0, decay_rate=0.99, 20 | which_fold=None): 21 | """ Train and test the neural net 22 | 23 | :param iterable data_folds: an iterator over tuples of (train, test) datasets 24 | :param int num_folds: number of folds total 25 | :param int num_questions: Total number of questions in the dataset 26 | :param int num_iters: Number of training iterations 27 | :param DataOpts data_opts: data pre-processing options. 
Contains the boolean `use_correct`, 28 | necessary for correct NN-data encoding, and all pre-processing parameters (for saving). 29 | :param str output: where to dump the current state of the RNN 30 | :param int|None compress_dim: The dimension to which to compress the data using the 31 | compressed sensing technique. If None, no compression is performed. 32 | :param int test_spacing: The number of iterations to run before running the tests 33 | :param int hidden_dim: The number of hidden units in the RNN. 34 | :param bool recurrent: Whether to use a recurrent architecture 35 | :param float dropout_prob: The probability of a node being dropped during training. 36 | Default is 0.0 (i.e., no dropout) 37 | :param int|None output_compress_dim: The dimension to which the output should be compressed. 38 | If None, no compression is performed. 39 | :param float first_learning_rate: The initial learning rate. Will be decayed at 40 | rate `decay_rate` 41 | :param float decay_rate: The rate of decay for the learning rate. 42 | :param int | None which_fold: Specify which of the folds you want to actually process. If None, 43 | process all folds. Good for naive parallelization. 44 | """ 45 | if which_fold is not None and not (1 <= which_fold <= num_folds): 46 | raise ValueError("which_fold ({which_fold}) must be between 1 " 47 | "and num_folds({num_folds})".format(which_fold=which_fold, 48 | num_folds=num_folds)) 49 | 50 | compress_dim = None if compress_dim <= 0 else compress_dim 51 | 52 | rnns = [] 53 | results = [] 54 | rnn_opts = RnnOpts(max_compress_dim=compress_dim, hidden_dim=hidden_dim, recurrent=recurrent, 55 | num_iters=num_iters, dropout_prob=dropout_prob, 56 | max_output_compress_dim=output_compress_dim, 57 | first_learning_rate=first_learning_rate, decay_rate=decay_rate) 58 | np.random.seed(data_opts.seed) 59 | 60 | for fold_num, (train_data, test_data) in enumerate(data_folds): 61 | 62 | fold_num += 1 63 | if which_fold and fold_num != which_fold: 64 | continue 65 | 66 | _logger.info("Beginning fold %d", fold_num) 67 | _, _, _, _, rnn = eval_rnn(train_data, test_data, num_questions, data_opts, 68 | rnn_opts, test_spacing, fold_num) 69 | rnns.append(rnn) 70 | results.append(rnn.results[-1]) 71 | if output: 72 | with open(output + str(fold_num), 'wb') as f: 73 | rnn.dump(f) 74 | 75 | _logger.info("Completed all %d folds", num_folds) 76 | 77 | # Print overall results 78 | acc_sum = 0 79 | auc_sum = 0 80 | for i, result in enumerate(results): 81 | _logger.info("Fold %d Acc: %.5f AUC: %.5f", i + 1, result.accuracy, result.auc) 82 | acc_sum += result.accuracy 83 | auc_sum += result.auc 84 | 85 | _logger.info("Overall %d Acc: %.5f AUC: %.5f", i + 1, acc_sum / num_folds, auc_sum / num_folds) 86 | 87 | 88 | def eval_rnn(train_data, test_data, num_questions, data_opts, rnn_opts, test_spacing, 89 | fold_num): 90 | """ Create, train, and cross-validate an RNN on a train/test split. 
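In addition to the trained ``SimpleRnn``, the final test accuracy, AUC, predicted probabilities of correctness, and correctness labels are returned (see the return statement below and the five-element unpacking in ``run`` above).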
91 | 92 | :param pd.DataFrame train_data: training data 93 | :param pd.DataFrame test_data: testing data for cross-validation (required) 94 | :param int num_questions: total number of questions in data 95 | :param DataOpts data_opts: data options 96 | :param RnnOpts rnn_opts: RNN options 97 | :param int test_spacing: test the RNN every this many iterations 98 | :param int fold_num: fold number (for logging and recording results only) 99 | :return: the trained RNN 100 | :rtype: SimpleRnn 101 | """ 102 | _logger.info("Training RNN, fold %d", fold_num) 103 | train_nn_data = build_nn_data(train_data, num_questions, 104 | use_correct=data_opts.use_correct, 105 | use_hints=data_opts.use_hints) 106 | test_nn_data = build_nn_data(test_data, num_questions, 107 | use_correct=data_opts.use_correct, 108 | use_hints=data_opts.use_hints) 109 | rnn = SimpleRnn(train_nn_data, rnn_opts, test_data=test_nn_data, data_opts=data_opts) 110 | test_acc, test_auc, test_prob_correct, test_corrects = rnn.train_and_test( 111 | rnn_opts.num_iters, test_spacing=test_spacing) 112 | _logger.info("Fold %d: Num Interactions: %d; Test Accuracy: %.5f; Test AUC: %.5f", 113 | fold_num, len(test_data), test_acc, test_auc) 114 | 115 | return test_acc, test_auc, test_prob_correct, test_corrects, rnn 116 | -------------------------------------------------------------------------------- /rnn_prof/tests/data/test_rnn.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | import numpy as np 4 | import pandas as pd 5 | 6 | from rnn_prof.data import rnn 7 | from rnn_prof.data.constants import ITEM_IDX_KEY, USER_IDX_KEY, TIME_IDX_KEY, CORRECT_KEY 8 | 9 | 10 | class TestRnnDataTransforms(unittest.TestCase): 11 | common_data = [rnn.UserData(length=3, history=[0, 1, 2], next_answer=[1, 2, 3], 12 | truth=[1, 1, 0]), 13 | rnn.UserData(length=2, history=[3, 1], next_answer=[1, 2], truth=[0, 0]), 14 | rnn.UserData(length=1, history=[4], next_answer=[5], truth=[1])] 15 | 16 | def test_build_nn_data(self): 17 | # Build fake data 18 | data = pd.DataFrame({ 19 | USER_IDX_KEY: [0, 0, 0, 0, 1, 1, 1, 2, 2], 20 | ITEM_IDX_KEY: [0, 1, 2, 3, 2, 1, 3, 4, 5], 21 | TIME_IDX_KEY: [1, 2, 3, 4, 6, 5, 4, 4, 5], 22 | CORRECT_KEY: [0, 1, 1, 0, 0, 0, 0, 1, 1] 23 | }) 24 | num_questions = data[ITEM_IDX_KEY].max() + 1 25 | 26 | # Run it through the builder 27 | transformed = rnn.build_nn_data(data, num_questions) 28 | 29 | # What data we expect 30 | expected = [ 31 | rnn.UserData( 32 | length=3, 33 | history=[0, 1 + num_questions, 2 + num_questions], 34 | next_answer=[1, 2, 3], 35 | truth=[1, 1, 0] 36 | ), 37 | rnn.UserData( 38 | length=2, 39 | history=[3, 1], 40 | next_answer=[1, 2], 41 | truth=[0, 0], 42 | ), 43 | rnn.UserData( 44 | length=1, 45 | history=[4 + num_questions], 46 | next_answer=[5], 47 | truth=[1] 48 | ) 49 | ] 50 | 51 | # Assert! 52 | transformed.sort(key=lambda x: -x.length) 53 | for expected_entry, actual_entry in zip(expected, transformed): 54 | assert expected_entry == actual_entry 55 | 56 | # What if use_correct is False? 
Everything should be the same *except* history 57 | expected = [rnn.UserData(length=datum.length, 58 | history=[q % num_questions for q in datum.history], 59 | next_answer=datum.next_answer, 60 | truth=datum.truth) for datum in expected] 61 | transformed = rnn.build_nn_data(data, num_questions, use_correct=False) 62 | transformed.sort(key=lambda x: -x.length) 63 | for expected_entry, actual_entry in zip(expected, transformed): 64 | assert expected_entry == actual_entry 65 | 66 | def test__batch_dimension_list(self): 67 | 68 | # If the threshold is high, creates a different batch per group 69 | high_threshold_output = rnn._batch_dimension_list(self.common_data, threshold=1.1) 70 | assert high_threshold_output == [(3, 1), (2, 1), (1, 1)] 71 | 72 | # If the threshold is low, creates one batch 73 | low_threshold_output = rnn._batch_dimension_list(self.common_data, threshold=-0.1) 74 | assert low_threshold_output == [(3, 3)] 75 | 76 | def test_build_batches(self): 77 | num_questions = 6 78 | 79 | # If we use the stacked identity basis then we get back the questions 80 | basis = np.vstack([np.eye(num_questions), np.eye(num_questions)]) 81 | output = rnn.build_batches(self.common_data, num_questions, basis, threshold=-0.1) 82 | 83 | assert len(output) == 1, "You should only have one batch with a negative threshold" 84 | 85 | output = output[0] 86 | # Test that the x's are correct. Note that our choice of basis means that the one-hot 87 | # position should be the number of the question 88 | for user_idx, datum in enumerate(self.common_data): 89 | for history_idx, actual_question_idx in enumerate(datum.history): 90 | for question_idx in range(num_questions): 91 | if question_idx == actual_question_idx: 92 | assert (output.history[history_idx, 93 | user_idx, 94 | question_idx % num_questions] == 1) 95 | else: 96 | assert (output.history[history_idx, 97 | user_idx, 98 | question_idx % num_questions] == 0) 99 | 100 | # Test that the y's are correct 101 | for user_idx, datum in enumerate(self.common_data): 102 | for history_idx, actual_question_idx in enumerate(datum.next_answer): 103 | for question_idx in range(num_questions): 104 | if question_idx == actual_question_idx: 105 | assert output.next_answer[history_idx, user_idx, question_idx] == 1 106 | else: 107 | assert output.next_answer[history_idx, user_idx, question_idx] == 0 108 | 109 | # Now for the truths t 110 | for user_idx, datum in enumerate(self.common_data): 111 | for history_idx, correct in enumerate(datum.truth): 112 | assert output.truth[history_idx, user_idx] == correct 113 | 114 | # Finally, make sure that we are masking the appropriate part of the history 115 | for user_idx, datum in enumerate(self.common_data): 116 | data_len = datum.length 117 | for history_idx in range(output.mask.shape[0]): 118 | if history_idx < data_len: 119 | assert output.mask[history_idx, user_idx] == 1 120 | else: 121 | assert output.mask[history_idx, user_idx] == 0 122 | -------------------------------------------------------------------------------- /rnn_prof/cliutils.py: -------------------------------------------------------------------------------- 1 | """ 2 | Utility functions for the cli to unclutter the actual definition. 
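The callbacks defined here are meant to be attached to ``click`` options. A minimal illustrative
sketch (the option and command names below are made up and are not the real CLI)::

    @click.command()
    @click.option('--log-level', default='INFO', callback=logging_callback)
    @click.option('--output', default=None, callback=ensure_directory_callback)
    def example_command(log_level, output):
        pass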
3 | """ 4 | import logging 5 | import os 6 | 7 | import click 8 | 9 | 10 | LOGGER_FORMAT_STRING = "%(asctime)s [%(levelname)s] %(name)s: %(message)s" 11 | 12 | 13 | class State(object): 14 | """ An object that holds the values of common options """ 15 | pass 16 | 17 | 18 | class CommonOptionGroup(object): 19 | """ 20 | Generate common options and decorate the command with the resulting object. 21 | Use thus:: 22 | 23 | >>> common_options = CommonOptionGroup() 24 | >>> common_options.add('--some-option', '-s', type=int, default=10, nargs=1, 25 | ... help="Here's an interesting option.", 26 | ... extra_callback=lambda ctx, param, value: value + 10) 27 | >>> 28 | >>> @click.argument('anotherarg') 29 | ... @common_options 30 | ... def some_command(common_state, anotherarg): 31 | ... print common_state.some_option, anotherarg 32 | """ 33 | def __init__(self): 34 | self.options = [] 35 | self.pass_state = click.make_pass_decorator(State, ensure=True) 36 | 37 | def add(self, *args, **kwargs): 38 | """ 39 | To a common option group, add a new common option. All args and kwargs are 40 | passed to a click.option. 41 | 42 | WARNING: If you pass a `callback` or `expose_value` option, it will be 43 | overwritten. If you want a specific callback, use the option `extra_callback`. 44 | The value returned by `extra_callback` will replace the passed value. 45 | """ 46 | def decorator(f): 47 | """ The actual decorator that will decorate the command """ 48 | def callback(ctx, param, value): 49 | """ 50 | The click callback that will be executed. Saves the `value` in the `param` 51 | attribute of a common `State` object after executing any `extra_callback`s passed, 52 | which may, in particular, modify the passed value. 53 | """ 54 | if 'extra_callback' in kwargs: 55 | # Execute the extra_callback if passed 56 | value = kwargs['extra_callback'](ctx, param, value) 57 | # Get the singleton State object 58 | state = ctx.ensure_object(State) 59 | 60 | # Set the param attribute to value 61 | setattr(state, param.name, value) 62 | # Return the value 63 | return value 64 | 65 | # Setup the callback for the click decorator 66 | kwargs['callback'] = callback 67 | 68 | # Don't expose the value to the command function; it will be stored in `State` 69 | kwargs['expose_value'] = False 70 | 71 | # Decorate the command 72 | return click.option(*args, 73 | **{k: v for k, v in kwargs.iteritems() if k != 'extra_callback'})(f) 74 | self.options.append(decorator) 75 | 76 | def __call__(self, f): 77 | """ Decorate the command with all the common options added """ 78 | for option in self.options: 79 | f = option(f) 80 | return self.pass_state(f) 81 | 82 | 83 | def ensure_directory_callback(ctx, param, value): 84 | """ 85 | This callback ensures that the dirname of the passed value is created. If not, 86 | it creates it. 87 | 88 | :param ctx: The current click context 89 | :param param: The parameter name as determined by click 90 | :param str value: The directory whose existence we're ensuring 91 | :return: value 92 | :rtype: str 93 | """ 94 | if not value: 95 | return value 96 | 97 | dirname = os.path.dirname(value) 98 | if not dirname: 99 | return value 100 | 101 | if not os.path.isdir(dirname): 102 | os.makedirs(dirname) 103 | 104 | return value 105 | 106 | 107 | def logging_callback(ctx, param, value): 108 | """ 109 | A callback that sets the level of the root logger to the passed level. 
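It also attaches a ``StreamHandler`` formatted with ``LOGGER_FORMAT_STRING`` to the root logger before setting the level.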
110 | 111 | :param ctx: The current click context 112 | :param param: The parameter name as determined by click 113 | :param value: The desired logger level. Can be any type that 114 | logging.getLogger().setLevel(...) accepts 115 | :return: value 116 | """ 117 | root_logger = logging.getLogger() 118 | formatter = logging.Formatter(fmt=LOGGER_FORMAT_STRING) 119 | handler = logging.StreamHandler() 120 | handler.formatter = formatter 121 | root_logger.addHandler(handler) 122 | root_logger.setLevel(value.upper()) 123 | return value 124 | 125 | 126 | def valid_which_fold(ctx, param, value): 127 | """ 128 | A click callback that checks if the --which-fold argument is between 129 | 1 and --num-folds. If not, raises a click.BadParameter 130 | """ 131 | if value is not None: 132 | if not (1 <= value <= ctx.obj.num_folds): 133 | raise click.BadParameter(("--which-fold ({which_fold}) must be between 1 and " 134 | "--num-folds ({num_folds}) inclusive").format( 135 | which_fold=value, num_folds=ctx.obj.num_folds)) 136 | return value 137 | 138 | 139 | def require_value_callback(valid_options=None): 140 | """ Raise exception if input is not supplied. 141 | :param tuple|None valid_options: valid options for the parameter. If None, anything goes 142 | """ 143 | def callback(ctx, param, value): 144 | if value is None: 145 | raise click.BadParameter("you must supply " + param.name) 146 | elif valid_options is not None and value not in valid_options: 147 | raise click.BadParameter("value must be one of %s" % str(valid_options)) 148 | return value 149 | return callback 150 | -------------------------------------------------------------------------------- /rnn_prof/tests/irt/cpd/test_ogive.py: -------------------------------------------------------------------------------- 1 | """ 2 | Tests for item ogive response functions 3 | """ 4 | import unittest 5 | 6 | import numpy as np 7 | from scipy import stats as st 8 | 9 | from rnn_prof.irt.constants import THETAS_KEY, OFFSET_COEFFS_KEY, NONOFFSET_COEFFS_KEY 10 | from rnn_prof.irt.cpd import ogive as undertest 11 | from rnn_prof.irt.updaters import UpdateTerms 12 | from rnn_prof.irt.testing_utils import finite_diff_grad, finite_diff_hessian_diag, EPSILON 13 | 14 | NUM_TESTS = 10 15 | NUM_ASSESS_ITEMS = 2 16 | NUM_STUDENTS = 2 17 | NUM_RESPONSES = 10 18 | DECIMALS = 12 19 | SEED = 0 20 | 21 | 22 | class TestOgive(unittest.TestCase): 23 | def test_compute_logli(self): 24 | """ 25 | Test the compute_logli function. 26 | """ 27 | for _ in range(NUM_TESTS): 28 | for avg in (False, True): 29 | dim = np.random.randint(1, 10) 30 | trues = np.random.rand(dim) > 0.5 31 | probs = np.random.rand(dim) 32 | actual = undertest.OgiveCPD.bernoulli_logli(trues, probs, avg) 33 | expected = np.sum(trues * np.log(probs) + (1 - trues) * np.log(1 - probs)) 34 | if avg: 35 | expected /= dim 36 | self.assertAlmostEqual(actual, expected, DECIMALS) 37 | 38 | def test_ogive_cpd(self): 39 | """ 40 | Test that the ogive CPDs return the correct log-probabilities, the correct gradients w.r.t. 41 | all parameters, and the correct set of requested parameter gradients. 
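Analytic gradients and Hessian diagonals are checked against central finite differences computed by ``finite_diff_grad`` and ``finite_diff_hessian_diag`` from ``testing_utils``.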
42 | """ 43 | np.random.seed(SEED) 44 | for _ in range(NUM_TESTS): 45 | # set up data 46 | num_latent = np.random.random_integers(1, 3) 47 | 48 | item_idx = np.random.choice(NUM_ASSESS_ITEMS, size=NUM_RESPONSES) 49 | student_idx = np.random.choice(NUM_STUDENTS, size=NUM_RESPONSES) 50 | 51 | for cpd_class in (undertest.OnePOCPD, undertest.TwoPOCPD): 52 | if cpd_class == undertest.OnePOCPD: 53 | correct = np.random.rand(NUM_RESPONSES) > 0.5 54 | params = {THETAS_KEY: np.random.randn(NUM_STUDENTS, 1), 55 | OFFSET_COEFFS_KEY: np.random.randn(NUM_ASSESS_ITEMS, 1)} 56 | irf_arg = (params[THETAS_KEY][student_idx] + 57 | params[OFFSET_COEFFS_KEY][item_idx]).ravel() 58 | expected_prob_correct = st.norm.cdf(irf_arg) 59 | cpd = cpd_class(theta_idx=student_idx, item_idx=item_idx) 60 | else: 61 | correct = np.random.rand(NUM_RESPONSES) > 0.5 62 | params = {THETAS_KEY: np.random.randn(NUM_STUDENTS, num_latent), 63 | OFFSET_COEFFS_KEY: np.random.randn(NUM_ASSESS_ITEMS, 1), 64 | NONOFFSET_COEFFS_KEY: np.random.randn(NUM_ASSESS_ITEMS, num_latent)} 65 | irf_arg = (np.sum(params[NONOFFSET_COEFFS_KEY][item_idx] * 66 | params[THETAS_KEY][student_idx], axis=1) + 67 | params[OFFSET_COEFFS_KEY][item_idx].ravel()) 68 | expected_prob_correct = st.norm.cdf(irf_arg) 69 | cpd = cpd_class(theta_idx=student_idx, item_idx=item_idx) 70 | # test prob correct method 71 | np.testing.assert_array_almost_equal(cpd.compute_prob_correct(**params), 72 | expected_prob_correct) 73 | expected_log_prob = cpd_class.bernoulli_logli(correct, expected_prob_correct) 74 | 75 | for par_key in cpd_class.PARAM_KEYS: 76 | def grad_helper(key_to_update, new_param): 77 | """ Replace `key_to_update` with `new_param` and return the log-prob""" 78 | new_params = {k: new_param if k == key_to_update else v 79 | for k, v in params.iteritems()} 80 | return cpd(correct, **new_params).log_prob 81 | 82 | def hess_helper(key_to_update, new_param): 83 | """ Replace `key_to_update` with `new_param` and return the gradient""" 84 | new_params = {k: new_param if k == key_to_update else v 85 | for k, v in params.iteritems()} 86 | return cpd(correct, 87 | terms_to_compute={key_to_update: UpdateTerms.grad_and_hess}, 88 | **new_params).wrt[key_to_update].gradient 89 | cpd_terms = cpd(correct, terms_to_compute={par_key: UpdateTerms.grad_and_hess}, 90 | **params) 91 | # test that log-probability is computed correctly 92 | self.assertAlmostEqual(cpd_terms.log_prob, expected_log_prob, places=6) 93 | 94 | # test that only the desired gradients are returned 95 | self.assertEqual(cpd_terms.wrt.keys(), [par_key]) 96 | 97 | # test that gradient and Hessian w.r.t. the requested parameters are correct 98 | actual_grad = cpd_terms.wrt[par_key].gradient 99 | actual_hess = cpd_terms.wrt[par_key].hessian.ravel() 100 | expected_grad = finite_diff_grad(params[par_key], 101 | lambda x: grad_helper(par_key, x)) 102 | expected_hess = finite_diff_hessian_diag(params[par_key], 103 | lambda x: hess_helper(par_key, 104 | x)).ravel() 105 | np.testing.assert_allclose(actual_grad, expected_grad, rtol=EPSILON, 106 | atol=EPSILON) 107 | np.testing.assert_allclose(actual_hess, expected_hess, rtol=EPSILON, 108 | atol=EPSILON) 109 | -------------------------------------------------------------------------------- /rnn_prof/irt/updaters.py: -------------------------------------------------------------------------------- 1 | """ 2 | Classes for computing an update step direction based on gradients (and optionally the Hessian). 
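A minimal usage sketch (a toy one-dimensional problem, not taken from the training code)::

    import numpy as np

    # minimize f(x) = 2 * (x + 0.5) ** 2, whose gradient is 4 * (x + 0.5) and Hessian is 4
    pars = SolverPars(learn=True, num_steps=1, updater=NewtonRaphson(step_size=1.0))
    x = np.array([0.0])
    grad = np.array([2.0])    # f'(0)
    hess = np.array([4.0])    # f''(0)
    x_new = pars.updater(x, grad, hess)   # one full Newton step lands on the minimizer, -0.5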
3 | """ 4 | from enum import IntEnum 5 | import logging 6 | 7 | import numpy as np 8 | from scipy.sparse.linalg import spsolve 9 | 10 | import utils 11 | 12 | LOGGER = logging.getLogger(__name__) 13 | 14 | 15 | FLOAT_DTYPE = np.dtype('float64') 16 | THETAS_KEY = 'thetas' 17 | OFFSET_COEFFS_KEY = 'offset_coeffs' 18 | NONOFFSET_COEFFS_KEY = 'nonoffset_coeffs' 19 | INSTR_COEFFS_KEY = 'instructional_coeffs' 20 | THETA_OFFSETS_KEY = 'theta_offsets' 21 | OPTIMIZED_VARS = (THETAS_KEY, NONOFFSET_COEFFS_KEY, OFFSET_COEFFS_KEY, INSTR_COEFFS_KEY, 22 | THETA_OFFSETS_KEY) 23 | SHOULD_TERMINATE_KEY = 'should_terminate' 24 | 25 | _logger = logging.getLogger(__name__) 26 | 27 | 28 | class SolverPars(object): 29 | """ 30 | Class holding generic parameters for iterative updates on a parameter. It attempts to prevent 31 | the user from setting attributes that are not ``learn``, ``num_steps``, ``grad_tol``, 32 | ``diff_tol``, or ``updater``. 33 | """ 34 | __slots__ = ['learn', 'num_steps', 'grad_tol', 'diff_tol', 'updater'] 35 | 36 | def __init__(self, learn=True, num_steps=None, grad_tol=1e-2, diff_tol=1e-2, 37 | updater=None): 38 | """ 39 | :param bool learn: True if the parameters associated with this instance are to be learned 40 | If True, step_size and num_steps must be greater than zero. 41 | :param int num_steps: number of steps to take in this parameter before updating the next. 42 | :param float grad_tol: stopping tolerance for gradient 43 | :param float diff_tol: stopping tolerance for value change 44 | :param ParameterUpdater updater: a function that takes the gradient and the Hessian of coeff 45 | log-priors, and returns a step direction 46 | """ 47 | if not learn: 48 | if num_steps is not None and num_steps != 0: 49 | _logger.warn("The argument learn=False was set along with non-zero values of " 50 | "num_steps={0}! This may represent a " 51 | "misunderstanding of the user. We will not learn the parameters " 52 | "associated with these SolverPars and are setting num_steps" 53 | "to zero as a precaution.".format(num_steps)) 54 | # To be safe, we set these to zero, even though they should never be used: 55 | num_steps = 0 56 | else: 57 | if num_steps is None: 58 | num_steps = 1 59 | if updater is None: 60 | updater = NewtonRaphson() 61 | utils.check_positive_float(grad_tol, 'grad_tol') 62 | utils.check_positive_float(diff_tol, 'diff_tol') 63 | utils.check_nonnegative_int(num_steps, 'num_steps') 64 | if not isinstance(updater, NewtonRaphson): 65 | raise TypeError('updater must be a NewtonRaphson') 66 | if learn and num_steps == 0: 67 | raise ValueError("num_steps must be greater than zero if learn=True") 68 | self.learn = learn 69 | self.num_steps = num_steps 70 | self.grad_tol = grad_tol 71 | self.diff_tol = diff_tol 72 | self.updater = updater 73 | 74 | def copy(self): 75 | """ 76 | Make a copy of this object. This is trivially a deep copy. 77 | 78 | :return: A (deep) copy. 79 | :rtype: SolverPars 80 | """ 81 | return SolverPars(learn=self.learn, num_steps=self.num_steps, 82 | grad_tol=self.grad_tol, diff_tol=self.diff_tol, updater=self.updater) 83 | 84 | 85 | class UpdateTerms(IntEnum): 86 | """Indicates which log-probability terms (gradient, Hessian) are required by an updater.""" 87 | none = 0 88 | grad = 1 89 | grad_and_hess = 2 90 | 91 | 92 | class NewtonRaphson(object): 93 | """ 94 | Newton Raphson update that solves the quadratic problem x = Hessian^-1 grad. 
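Concretely, ``__call__`` forms the step ``-hessian^{-1} * gradient`` (treating a hessian with the same size as the gradient as a stored diagonal, and using a sparse solve for sparse hessians) and returns ``x + step_size * step``, optionally clipped to ``support``.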
95 | """ 96 | def __init__(self, step_size=1e-1, ravel_order='F'): 97 | """ 98 | :param float step_size: step size 99 | :param str ravel_order: the ravel order for gradient and Hessian reshaping. Used for 100 | backward-compatibility with IRTLearner which uses order='F', whereas BayesNet uses 101 | order='C'. 102 | """ 103 | utils.check_nonnegative_float(step_size, 'step_size') 104 | self.step_size = step_size 105 | self.ravel_order = ravel_order 106 | 107 | def __call__(self, x, gradient, hessian, support=None): 108 | """ 109 | :param np.ndarray x: current estimate of the parameter 110 | :param np.ndarray gradient: parameter gradient. 111 | :param np.ndarray hessian: parameter Hessian. 112 | :param tuple(float) support: the bounds of the variable being updated, used to truncate 113 | the updated value 114 | :return: the new estimate after moving in the direction of the Newton step. 115 | :rtype: np.ndarray 116 | """ 117 | if hessian is None: 118 | raise ValueError('Hessian required for second order methods') 119 | else: 120 | if np.isscalar(hessian): 121 | step_vec = -gradient / hessian 122 | elif isinstance(hessian, np.ndarray): 123 | # dense matrix 124 | if hessian.size == gradient.size: 125 | # assume Hessian diagonal is stored 126 | step_vec = -gradient / np.asarray(hessian) 127 | else: 128 | step_vec = -np.linalg.solve(hessian, gradient.ravel(order=self.ravel_order)) 129 | else: 130 | # sparse matrix 131 | if hessian.shape[0] == 1: 132 | # sp.linalg.spsolve cannot handle 1D matrices 133 | step_vec = -gradient / hessian.toarray() 134 | else: 135 | step_vec = -spsolve(hessian, gradient.ravel(order=self.ravel_order)) 136 | self.step = step_vec.reshape(x.shape, order=self.ravel_order) 137 | value = x + self.step_size * self.step 138 | if np.any(~np.isfinite(value)): 139 | raise RuntimeError("Newly computed values are not all finite!") 140 | if support is not None: 141 | np.clip(value, support[0], support[1], out=value) 142 | return value 143 | -------------------------------------------------------------------------------- /rnn_prof/data/assistments.py: -------------------------------------------------------------------------------- 1 | """ 2 | Functions for loading the Assistments data. Originally from 3 | https://sites.google.com/site/assistmentsdata/home/assistment-2009-2010-data/skill-builder-data-2009-2010 4 | """ 5 | import logging 6 | 7 | import numpy as np 8 | import pandas as pd 9 | 10 | from .constants import (ITEM_IDX_KEY, TEMPLATE_IDX_KEY, CONCEPT_IDX_KEY, USER_IDX_KEY, 11 | TIME_IDX_KEY, CORRECT_KEY, SINGLE) 12 | 13 | SKILL_ID_KEY = 'skill_id' 14 | PROBLEM_ID_KEY = 'problem_id' 15 | TEMPLATE_ID_KEY = 'template_id' 16 | USER_ID_KEY = 'user_id' 17 | 18 | LOGGER = logging.getLogger(__name__) 19 | 20 | 21 | def load_data(file_path, item_id_col=SKILL_ID_KEY, template_id_col=None, concept_id_col=None, 22 | remove_nan_skill_ids=False, max_interactions_per_user=None, 23 | drop_duplicates=False, min_interactions_per_user=2): 24 | """ Load the Assistments dataset as a pandas dataframe, filter out students with only a single 25 | interaction, and optionally truncate student histories. The columns used for item and concept 26 | identifiers can be specified in the input arguments. 27 | 28 | Note that multiple skill ids associated with an interaction will result in the first skill 29 | name lexicographically being retained. 
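For example, a `skill_id` entry of `'7,12,3'` is reduced to skill `3`, the smallest id after integer conversion (see the skill-id fix-up below).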
30 | 31 | :param str file_path: path to the skill builder file 32 | :param str item_id_col: indicates column of csv file to use for item ids 33 | :param str template_id_col: Set a particular column to represent a template id for hierarchical 34 | IRT. If 'single', assumes a dummy single hierarchical level; if None, no column is retained 35 | for templates. 36 | :param str concept_id_col: indicates column of csv file to use for concept ids. If 'single', 37 | assumes a dummy single concept. If None, concept column is not retained. 38 | :param bool remove_nan_skill_ids: whether to filter out interactions with NaN skill ids 39 | :param int max_interactions_per_user: number of interactions to keep per user (default is to 40 | keep all) 41 | :param int min_interactions_per_user: The minimum amount of history that is required to retain a 42 | student history. 43 | :param bool drop_duplicates: Whether to keep only the first of rows with duplicate order_id 44 | fields 45 | :return: processed data, student ids corresponding to the student indices, item ids 46 | corresponding to the item indices, template ids corresponding to the template indices, and 47 | concept ids corresponding to the concept indices 48 | :rtype: (pd.DataFrame, np.ndarray[int], np.ndarray[int], np.ndarray[int]) 49 | """ 50 | data = pd.DataFrame.from_csv(file_path) 51 | LOGGER.info("Read {:3,d} rows from file".format(len(data))) 52 | 53 | # Get the time index 54 | data[TIME_IDX_KEY] = data.index.values 55 | 56 | # fix up skill ids 57 | if data[SKILL_ID_KEY].dtype == 'object': 58 | # In this case, we have a string of skill ids like '1,54,3' 59 | # Keep only the first skill for now 60 | data[SKILL_ID_KEY] = data[SKILL_ID_KEY].apply( 61 | lambda x: sorted(map(int, x.split(',')))[0]).values 62 | nan_skill = data[SKILL_ID_KEY].apply(np.isnan) 63 | if remove_nan_skill_ids: 64 | data = data[~nan_skill] 65 | LOGGER.info("Removed {:3,d} rows with NaN skill_id".format(np.sum(nan_skill))) 66 | else: 67 | data.loc[nan_skill, SKILL_ID_KEY] = -1 68 | data[SKILL_ID_KEY] = data[SKILL_ID_KEY].astype(int) 69 | 70 | # sort by user, time, item, and concept id (if available) 71 | sort_keys = [USER_ID_KEY, TIME_IDX_KEY, item_id_col, SKILL_ID_KEY] 72 | data.sort(columns=sort_keys, inplace=True) 73 | 74 | if drop_duplicates: 75 | old_data_len = len(data) 76 | data = data.groupby(data.index).head(1) 77 | LOGGER.info("Removed {:3,d} duplicate rows ({:3,d} rows remaining)".format( 78 | old_data_len - len(data), len(data))) 79 | 80 | # filter for students with >= min_history_length interactions; 81 | # must be done after removing nan skillz 82 | data = data.groupby(USER_ID_KEY).filter(lambda x: len(x) >= min_interactions_per_user) 83 | LOGGER.info("Removed students with <{} interactions ({:3,d} rows remaining)".format( 84 | min_interactions_per_user, len(data))) 85 | 86 | # limit to first `max_interactions_per_user` 87 | if max_interactions_per_user is not None: 88 | old_data_len = len(data) 89 | data = data.groupby([USER_ID_KEY]).head(max_interactions_per_user) 90 | LOGGER.info("Filtered for {} max interactions per student ({:3,d} rows removed)".format( 91 | max_interactions_per_user, old_data_len - len(data))) 92 | 93 | # to be safe, sort again 94 | data.sort(columns=sort_keys, inplace=True) 95 | 96 | # attach question index 97 | item_ids, data[ITEM_IDX_KEY] = np.unique(data[item_id_col], return_inverse=True) 98 | user_ids, data[USER_IDX_KEY] = np.unique(data[USER_ID_KEY], return_inverse=True) 99 | 100 | cols_to_keep = [USER_IDX_KEY, ITEM_IDX_KEY, 
CORRECT_KEY, TIME_IDX_KEY] 101 | if concept_id_col is None: 102 | LOGGER.info('concept_id_col not supplied, not using concepts') 103 | concept_ids = None 104 | else: 105 | if concept_id_col == SINGLE: 106 | LOGGER.info('Using dummy single concept.') 107 | data[concept_id_col] = '0' 108 | elif concept_id_col not in data: 109 | raise ValueError('concept_id_col %s not found in data columns %s' % (concept_id_col, 110 | data.columns)) 111 | concept_ids, data[CONCEPT_IDX_KEY] = np.unique(data[concept_id_col], return_inverse=True) 112 | cols_to_keep.append(CONCEPT_IDX_KEY) 113 | 114 | if template_id_col is None: 115 | LOGGER.info('template_id_col not supplied, not using templates') 116 | template_ids = None 117 | else: 118 | if template_id_col == SINGLE: 119 | LOGGER.info('Using dummy single template.') 120 | data[template_id_col] = '0' 121 | elif template_id_col not in data: 122 | raise ValueError('template_id_col %s not found', template_id_col) 123 | template_ids, data[TEMPLATE_IDX_KEY] = np.unique(data[template_id_col], return_inverse=True) 124 | cols_to_keep.append(TEMPLATE_IDX_KEY) 125 | 126 | LOGGER.info("Processed data: {:3,d} interactions, {:3,d} students; {:3,d} items, " 127 | "{:3,d} templates, {:3,d} concepts" 128 | .format(len(data), len(user_ids), len(item_ids), 129 | len(template_ids) if template_ids is not None else 0, 130 | len(concept_ids) if concept_ids is not None else 0)) 131 | 132 | return data[cols_to_keep], user_ids, item_ids, template_ids, concept_ids 133 | -------------------------------------------------------------------------------- /rnn_prof/tests/irt/test_linear_operators.py: -------------------------------------------------------------------------------- 1 | """ 2 | Unit tests for linear operators 3 | """ 4 | import numpy as np 5 | import scipy.sparse as sp 6 | import unittest 7 | 8 | from rnn_prof.irt import linear_operators as undertest 9 | from rnn_prof.irt.testing_utils import finite_diff_grad 10 | 11 | NUM_TESTS = 5 12 | 13 | 14 | class TestLinearOperators(unittest.TestCase): 15 | @staticmethod 16 | def subset_test(lin_op): 17 | """ Test that subsetting a linear operator produces the correct outputs. 
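Both directions are exercised: the subsetted forward map is compared against ``(lin_op * x)[sub_idx, :]``, and ``rmatvec`` on the subset is compared against the full operator applied to a zero-padded input.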
18 | :param LinearOperator lin_op: the linear operator 19 | """ 20 | sub_idx = np.random.rand(lin_op.shape[0], 1) > 0.5 21 | # make sure at least one element included 22 | sub_idx[np.random.randint(0, len(sub_idx))] = True 23 | sub_idx = np.flatnonzero(sub_idx) 24 | sub_lin_op = undertest.get_subset_lin_op(lin_op, sub_idx) 25 | 26 | # test projection to subset of indices 27 | x = np.random.randn(lin_op.shape[1], np.random.randint(1, 3)) 28 | np.testing.assert_array_almost_equal(sub_lin_op * x, (lin_op * x)[sub_idx, :]) 29 | 30 | # test back projection from subset of indices 31 | y = np.random.randn(len(sub_idx), np.random.randint(1, 3)) 32 | z = np.zeros((lin_op.shape[0], y.shape[1])) 33 | z[sub_idx] = y 34 | np.testing.assert_array_almost_equal(sub_lin_op.rmatvec(y), lin_op.rmatvec(z)) 35 | 36 | def test_linearity(self): 37 | """Test the linearity and back projection properties of all operators """ 38 | def finite_diff_matrix_grad(f, x, dim_y): 39 | return np.array([finite_diff_grad(x, lambda z: f(z)[i]) for i in np.arange(dim_y)]) 40 | 41 | for _ in range(NUM_TESTS): 42 | dim_x = np.random.randint(2, 20) 43 | index_map = np.random.randint(dim_x, size=np.random.randint(1, 20)) 44 | group_idx = np.random.randint(np.random.randint(1, 20), size=dim_x) 45 | split_idx = np.unique(np.random.randint(1, dim_x, 5)) 46 | mask_idx = np.random.rand(len(index_map)) > 0.5 47 | while not np.sum(mask_idx): 48 | mask_idx = np.random.rand(len(index_map)) > 0.5 49 | skip_index_map = index_map[:int(np.sum(mask_idx))] 50 | # for masked index operator, make the output array larger than last skip index 51 | si_dim_y = np.random.randint(1, 5) 52 | if np.sum(mask_idx): 53 | si_dim_y += np.flatnonzero(mask_idx)[-1] 54 | lin_op = undertest.IndexOperator(index_map, dim_x) 55 | dim_y = lin_op.shape[0] 56 | 57 | # test the gradients at multiple points are the same 58 | grad0 = finite_diff_matrix_grad(lambda z: lin_op * z, np.random.randn(dim_x), dim_y) 59 | grad1 = finite_diff_matrix_grad(lambda z: lin_op * z, np.random.randn(dim_x), dim_y) 60 | np.testing.assert_array_almost_equal(grad0, grad1) 61 | lin_op.rmatvec(np.random.randn(dim_y)) 62 | # test the gradient of the back projection is the transpose of the forward one 63 | back_grad = finite_diff_matrix_grad(lambda z: lin_op.rmatvec(z), 64 | np.random.randn(dim_y), dim_x) 65 | np.testing.assert_array_almost_equal(grad0, back_grad.T) 66 | 67 | def test_index_operator(self): 68 | """ Test the indexing operator. 
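``IndexOperator`` maps ``x`` to ``x[index_map, :]``; its adjoint accumulates rows back into place, which the ``rev_index`` helper below reproduces with ``np.bincount``.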
""" 69 | def rev_index(x, idx, n): 70 | y = np.empty(shape=(n, x.shape[1]), dtype=x.dtype) 71 | for k in range(y.shape[1]): 72 | y[:, k] = np.bincount(idx, weights=x[:, k], minlength=n) 73 | return y 74 | 75 | for _ in range(NUM_TESTS): 76 | shape_x = (np.random.randint(1, 20), np.random.randint(1, 20)) 77 | dim_y = np.random.randint(1, 20) 78 | x = np.random.randn(*shape_x) 79 | index_map = np.random.randint(shape_x[0], size=dim_y) 80 | lin_op = undertest.IndexOperator(index_map=index_map, dim_x=shape_x[0]) 81 | self.assertEqual(lin_op.dim_x, shape_x[0]) 82 | proj = lin_op * x 83 | backproj = lin_op.rmatvec(proj) 84 | expected_proj = x[index_map, :] 85 | expected_backproj = rev_index(proj, index_map, shape_x[0]) 86 | np.testing.assert_array_equal(proj, expected_proj) 87 | np.testing.assert_array_equal(backproj, expected_backproj) 88 | 89 | self.subset_test(lin_op) 90 | 91 | def test_rmatvec_nd(self): 92 | """ Test that given an n x k linear operator and n x n matrix rmatvec_nd yields a k x k 93 | matrix""" 94 | 95 | def rev_index(index_map, x, output_dim): 96 | intermediate = np.empty((output_dim, x.shape[1])) 97 | final = np.empty((output_dim, output_dim)) 98 | for i in range(x.shape[1]): 99 | intermediate[:, i] = np.bincount(index_map, weights=x[:, i], minlength=output_dim) 100 | for i in range(output_dim): 101 | final[i, :] = np.bincount(index_map, weights=intermediate[i, :], 102 | minlength=output_dim) 103 | return final 104 | 105 | n = 10 106 | x = np.random.randn(n, n) 107 | k = np.random.randint(1, 5) 108 | index_map = np.random.randint(k, size=n) 109 | lin_op = undertest.IndexOperator(index_map=index_map, dim_x=k) 110 | actual = undertest.rmatvec_nd(lin_op, x) 111 | expected_backproj = rev_index(index_map, x, k) 112 | np.testing.assert_array_equal(actual, expected_backproj) 113 | 114 | # Sparse, non-diagonal 115 | x_sp = sp.csr_matrix(x) 116 | actual = undertest.rmatvec_nd(lin_op, x_sp) 117 | np.testing.assert_array_equal(actual, expected_backproj) 118 | 119 | # Sparse diagonal 120 | x_sp_diag = sp.diags(np.diag(x), 0) 121 | actual = undertest.rmatvec_nd(lin_op, x_sp_diag) 122 | self.assertEqual(actual.shape, (k, k)) 123 | expected_backproj = np.diag(np.bincount(index_map, weights=np.diag(x), minlength=k)) 124 | np.testing.assert_array_equal(actual, expected_backproj) 125 | 126 | # Non-sparse diagonal 127 | x_diag = np.diag(np.random.randn(n)) 128 | actual = undertest.rmatvec_nd(lin_op, x_diag) 129 | self.assertEqual(actual.shape, (k, k)) 130 | # The result should also be sparse and diagonal 131 | expected_backproj = np.diag(np.bincount(index_map, weights=np.diag(x_diag), minlength=k)) 132 | np.testing.assert_array_equal(actual, expected_backproj) 133 | 134 | # scalar 135 | x = 1.3 136 | k = 5 137 | index_map = np.random.randint(k, size=1) 138 | lin_op = undertest.IndexOperator(index_map=index_map, dim_x=k) 139 | actual = undertest.rmatvec_nd(lin_op, x) 140 | expected_backproj = np.zeros(k) 141 | expected_backproj[index_map] = x 142 | np.testing.assert_array_equal(actual, expected_backproj) 143 | -------------------------------------------------------------------------------- /rnn_prof/irt/testing_utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | Utilities for testing 3 | """ 4 | from collections import namedtuple 5 | 6 | import numpy as np 7 | 8 | from .callbacks import ConvergenceCallback 9 | from .cpd import GaussianCPD 10 | from .irt import BayesNetLearner 11 | from .node import Node 12 | 13 | EPSILON = 1e-3 14 | NUM_TRIALS 
= 3 15 | 16 | NUM_ITEMS = 50 17 | NUM_INSTR_ITEMS = 0 18 | NUM_LATENT = 1 19 | NUM_CHOICES = 2 20 | NUM_RESPONSES = 500 21 | NUM_STUDENTS = 20 22 | PROB_CORRECT = 0.5 23 | 24 | THETA_MU = 0.0 25 | THETA_SIGMA = 1.0 26 | 27 | NONOFFSET_COEFF_SHAPE_GEN = 100.0 28 | NONOFFSET_COEFF_SCALE_GEN = 0.01 29 | 30 | NONOFFSET_COEFF_SHAPE = 2.0 31 | NONOFFSET_COEFF_SCALE = 0.5 32 | 33 | OFFSET_COEFF_MU = 0.0 34 | OFFSET_COEFF_SIGMA = 1.0 35 | 36 | INSTR_COEFF_MU = 0.05 37 | INSTR_COEFF_SIGMA = 0.025 38 | 39 | THETA_OFFSETS_SIGMA = 0.05 40 | 41 | ResponseData = namedtuple('ResponseData', ['correct', 'student_idx', 'item_idx']) 42 | 43 | 44 | def log_norm_pdf(x, mu=0.0, var=1.0): 45 | """ Evaluate the log normal pdf. 46 | :param np.ndarray|float x: point at which to evaluate the log norm pdf 47 | :param np.ndarray|float mu: mean of the normal distribution 48 | :param np.ndarray|float var: variance of the normal distribution. 49 | """ 50 | return -0.5 * np.log(2. * np.pi * var) - 0.5 / var * (x - mu) ** 2 51 | 52 | 53 | FINITE_DIFF_EPSILON = 1e-6 54 | ALMOST_EQUAL_EPSILON = 1e-4 55 | 56 | 57 | def finite_diff_grad(x, func, epsilon=FINITE_DIFF_EPSILON): 58 | """ Approximate the derivative of a function using finite difference. 59 | :param np.ndarray x: point at which to evaluate derivative 60 | :param function func: function with which to take finite differences. 61 | """ 62 | fwd_x = np.copy(x) 63 | bwd_x = np.copy(x) 64 | fwd_xx = fwd_x.ravel() 65 | bwd_xx = bwd_x.ravel() 66 | y = np.zeros(x.shape) 67 | yy = y.ravel() 68 | for i in xrange(x.size): 69 | fwd_xx[i] += epsilon 70 | bwd_xx[i] -= epsilon 71 | yy[i] = (func(fwd_x) - func(bwd_x)) / 2.0 / epsilon 72 | fwd_xx[i] -= epsilon 73 | bwd_xx[i] += epsilon 74 | return y 75 | 76 | 77 | def finite_diff_hessian(x, grad, epsilon=FINITE_DIFF_EPSILON): 78 | """ Approximate the Hessian of a function using finite difference in the partial gradient. 79 | :param np.ndarray x: point at which to evaluate derivative 80 | :param function grad: function that returns the gradient 81 | """ 82 | fwd_x = np.copy(x) 83 | bwd_x = np.copy(x) 84 | fwd_xx = fwd_x.ravel() 85 | bwd_xx = bwd_x.ravel() 86 | y = np.zeros((x.size, x.size)) 87 | for i in xrange(x.size): 88 | for j in xrange(x.size): 89 | fwd_xx[i] += epsilon 90 | bwd_xx[i] -= epsilon 91 | y[i, j] = (grad(fwd_x).ravel()[j] - grad(bwd_x).ravel()[j]) / 2.0 / epsilon 92 | fwd_xx[i] -= epsilon 93 | bwd_xx[i] += epsilon 94 | return y 95 | 96 | 97 | def finite_diff_hessian_diag(x, grad, epsilon=FINITE_DIFF_EPSILON): 98 | """ Approximate the diagonal of the Hessian of a function using finite difference in the 99 | partial gradient. 100 | :param np.ndarray x: point at which to evaluate derivative 101 | :param function grad: function that returns the gradient 102 | """ 103 | fwd_x = np.copy(x) 104 | bwd_x = np.copy(x) 105 | fwd_xx = fwd_x.ravel() 106 | bwd_xx = bwd_x.ravel() 107 | y = np.zeros(x.shape) 108 | yy = y.ravel() 109 | for i in xrange(x.size): 110 | fwd_xx[i] += epsilon 111 | bwd_xx[i] -= epsilon 112 | yy[i] = (grad(fwd_x).ravel()[i] - grad(bwd_x).ravel()[i]) / 2.0 / epsilon 113 | fwd_xx[i] -= epsilon 114 | bwd_xx[i] += epsilon 115 | return y 116 | 117 | 118 | def generate_data(num_students=NUM_STUDENTS, 119 | num_items=NUM_ITEMS, 120 | num_responses=NUM_RESPONSES, 121 | prob_correct=PROB_CORRECT): 122 | """ Simulate student response data (independently of any parameters). 123 | 124 | :param int num_students: Number of unique student ids. 
125 | :param int num_items: number of assessment items 126 | :param int num_responses: number of responses to generate 127 | :param float prob_correct: probability of correct (probability of choosing first choice when 128 | num_choices > 1; probability of other choices are all equal) 129 | :return: the response data 130 | :rtype: ResponseData 131 | """ 132 | correct = np.random.rand(num_responses) < prob_correct 133 | num_responses_per_student, remainder = divmod(num_responses, num_students) 134 | unique_student_ids = range(num_students) 135 | student_idx = [reg_id for reg_id in unique_student_ids for _ in 136 | range(num_responses_per_student)] 137 | # If num_responses can't be perfectly divided into students, add the remaining responses 138 | # to the last student id: 139 | student_idx.extend([unique_student_ids[-1]] * remainder) 140 | student_idx = np.array(student_idx) 141 | 142 | item_idx = np.random.random_integers(low=0, high=num_items-1, size=num_responses) 143 | np.random.shuffle(student_idx) 144 | 145 | return ResponseData(correct, student_idx, item_idx) 146 | 147 | 148 | class MockNode(Node): 149 | """ 150 | A test node class that stores the evidence terms passed into it and does nothing with them, 151 | and whose update method returns a dictionary with param node names 152 | """ 153 | 154 | def __init__(self, *args, **kwargs): 155 | super(MockNode, self).__init__(*args, **kwargs) 156 | self.obtained_evidence_terms = {} 157 | 158 | def update(self, evidence_terms=None): 159 | """ An update function that stores all the evidence infos passed to it, and sets its 160 | log_prob to a random Gaussian value 161 | 162 | :param list evidence_terms: evidence information passed into the node 163 | :return: the names of all param nodes 164 | :rtype: dict[Node, str] 165 | """ 166 | if evidence_terms is not None: 167 | self.obtained_evidence_terms.update(evidence_terms) 168 | self.log_prob = np.random.randn() 169 | return {v: v.name for k, v in self.param_nodes.iteritems()} 170 | 171 | 172 | class MockLearner(BayesNetLearner): 173 | """ 174 | A learner with the following graph of TestNodes (directed edges pointing down): 175 | 176 | A 177 | | 178 | B 179 | / \ 180 | C D 181 | \ / \ 182 | E F 183 | """ 184 | 185 | def __init__(self): 186 | cpd = GaussianCPD(dim=1) 187 | node_a = MockNode(name='A', data=None, cpd=cpd) 188 | node_b = MockNode(name='B', data=None, cpd=cpd, param_nodes={'mean': node_a}) 189 | node_c = MockNode(name='C', data=None, cpd=cpd, param_nodes={'mean': node_b}) 190 | node_d = MockNode(name='D', data=None, cpd=cpd, param_nodes={'mean': node_b}) 191 | node_e = MockNode(name='E', data=None, cpd=cpd, param_nodes={'mean': node_c, 192 | 'precision': node_d}) 193 | node_f = MockNode(name='F', data=None, cpd=cpd, param_nodes={'mean': node_d}) 194 | super(MockLearner, self).__init__(nodes=[node_a, node_b, node_c, node_d, node_e, node_f], 195 | max_iterations=1, callback=ConvergenceCallback()) 196 | -------------------------------------------------------------------------------- /rnn_prof/tests/irt/test_metrics.py: -------------------------------------------------------------------------------- 1 | """ 2 | Unit tests for BayesNet prediction accuracy metrics 3 | """ 4 | from collections import defaultdict 5 | import itertools as its 6 | import logging 7 | import numpy as np 8 | import unittest 9 | import uuid 10 | 11 | from rnn_prof.irt import metrics as undertest 12 | from rnn_prof.irt.constants import TRAIN_RESPONSES_KEY, THETAS_KEY, OFFSET_COEFFS_KEY 13 | from 
rnn_prof.irt.learners import OnePOLearner 14 | 15 | LOGGER = logging.getLogger(__name__) 16 | EPS_DECIMAL = 6 17 | 18 | 19 | class TestMetrics(unittest.TestCase): 20 | def setUp(self): 21 | num_responses = 100 22 | num_students = 10 23 | num_items = 10 24 | self.correct = np.random.rand(num_responses) > 0.5 25 | item_idx = np.random.choice(num_items, size=num_responses) 26 | theta_idx = np.random.choice(num_students, size=num_responses) 27 | self.learner = OnePOLearner(self.correct, theta_idx, item_idx) 28 | 29 | # set non-trivial thetas and offsets 30 | self.learner.nodes[THETAS_KEY].data = np.random.randn(num_students, 1) 31 | self.learner.nodes[OFFSET_COEFFS_KEY].data = np.random.randn(num_items, 1) 32 | 33 | params = self.learner.nodes[TRAIN_RESPONSES_KEY].param_data 34 | self.prob_correct = self.learner.nodes[TRAIN_RESPONSES_KEY].cpd.compute_prob_true(**params) 35 | 36 | def test_compute_logli(self): 37 | """ Test that the log-likelihood metric normalizes by the size of the node's data. """ 38 | self.assertAlmostEqual(self.learner.nodes[TRAIN_RESPONSES_KEY].metrics.compute_logli(), 39 | self.learner.nodes[TRAIN_RESPONSES_KEY].compute_log_prob(), 40 | places=EPS_DECIMAL) 41 | self.assertAlmostEqual(self.learner.nodes[TRAIN_RESPONSES_KEY].metrics.compute_logli(False), 42 | self.learner.nodes[TRAIN_RESPONSES_KEY].compute_log_prob(), 43 | places=EPS_DECIMAL) 44 | self.assertAlmostEqual(self.learner.nodes[TRAIN_RESPONSES_KEY].metrics.compute_logli(True), 45 | (self.learner.nodes[TRAIN_RESPONSES_KEY].compute_log_prob() / 46 | len(self.correct)), 47 | places=EPS_DECIMAL) 48 | 49 | def test_compute_naive(self): 50 | """ Test the Naive (predict most frequent response values) metric.""" 51 | fraction_correct = np.mean(self.correct) 52 | expected = max(fraction_correct, 1 - fraction_correct) 53 | actual = self.learner.nodes[TRAIN_RESPONSES_KEY].metrics.compute_naive() 54 | self.assertAlmostEqual(actual, expected, places=EPS_DECIMAL) 55 | 56 | def test_map_accuracy(self): 57 | """ Test the MAP accuracy metric.""" 58 | expected = np.mean((self.prob_correct > 0.5) == 59 | self.learner.nodes[TRAIN_RESPONSES_KEY].data) 60 | actual = self.learner.nodes[TRAIN_RESPONSES_KEY].metrics.compute_map_accuracy() 61 | self.assertAlmostEqual(actual, expected, places=EPS_DECIMAL) 62 | 63 | def test_d_prime(self): 64 | """ Test the d-prime statistic""" 65 | pc_correct = self.prob_correct[self.correct] 66 | pc_incorrect = self.prob_correct[np.logical_not(self.correct)] 67 | expected = (np.mean(pc_correct) - np.mean(pc_incorrect)) / \ 68 | np.sqrt(0.5 * np.var(pc_correct) + 0.5 * np.var(pc_incorrect)) 69 | actual = self.learner.nodes[TRAIN_RESPONSES_KEY].metrics.compute_d_prime() 70 | self.assertAlmostEqual(actual, expected, places=EPS_DECIMAL) 71 | 72 | def test_auc_helper(self): 73 | """ Test the math meat of the AUC metric computation. 
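The reference value is computed the slow way: every response probability is swept as a threshold, true/false positive rates are accumulated, and the resulting ROC curve is integrated with ``np.trapz``. ``Metrics.auc_helper`` must agree, including the degenerate cases where all probabilities are 0 or 1 (AUC 0.5) and a perfectly separable case (AUC 1.0).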
""" 74 | num_responses = 50 75 | num_trials = 100 76 | for trial in range(num_trials): 77 | # Create random response correctnesses 78 | correct_prob = 0.1 + 0.8 * np.random.rand() 79 | corrects = np.zeros(num_responses, dtype=bool) 80 | # Make sure there's at least 1 correct and 1 incorrect 81 | while np.sum(corrects) in (0, num_responses): 82 | corrects = np.random.rand(num_responses) < correct_prob 83 | incorrects = np.logical_not(corrects) 84 | 85 | # Create some random response probabilities 86 | rps = np.random.rand(num_responses) 87 | num_correct = float(np.sum(corrects)) 88 | num_incorrect = float(np.sum(incorrects)) 89 | 90 | # Compute AUC the slow way by iterating through thresholds 91 | tprs = np.zeros(len(rps) + 2) 92 | fprs = np.zeros(len(tprs)) 93 | for i, threshold in enumerate(np.r_[-1., np.sort(rps), 2.]): 94 | tprs[i] = np.sum(np.logical_and(corrects, rps > threshold)) / num_correct 95 | fprs[i] = np.sum(np.logical_and(incorrects, rps > threshold)) / num_incorrect 96 | expected_auc = np.trapz(tprs[::-1], fprs[::-1]) 97 | actual_auc = undertest.Metrics.auc_helper(corrects, rps) 98 | self.assertAlmostEqual(expected_auc, actual_auc) 99 | 100 | # Now compute some edge cases (all rps are 0 or all rps are 1) 101 | self.assertAlmostEqual(0.5, undertest.Metrics.auc_helper(corrects, 102 | np.zeros(num_responses))) 103 | self.assertAlmostEqual(0.5, undertest.Metrics.auc_helper(corrects, 104 | np.ones(num_responses))) 105 | 106 | # Now construct a case where a perfect threshold is possible 107 | sorted_rps = np.sort(rps) 108 | corrects = rps > sorted_rps[np.random.randint(1, num_responses-1)] 109 | self.assertAlmostEqual(1.0, undertest.Metrics.auc_helper(corrects, rps)) 110 | 111 | def test_compute_per_student_naive(self): 112 | num_students = 10 113 | num_responses = 1000 114 | unique_reg_ids = sorted([uuid.uuid4() for _ in range(num_students)]) 115 | reg_ids = np.random.choice(unique_reg_ids, size=num_responses) 116 | corrects = np.random.rand(num_responses) > 0.5 117 | is_held_out = np.random.rand(num_responses) < 0.2 118 | # make sure first reg_id appears only in train set 119 | is_held_out[reg_ids == unique_reg_ids[0]] = False 120 | # make sure last reg_id appears only in test set 121 | is_held_out[reg_ids == unique_reg_ids[-1]] = True 122 | # test per-student naive using a naive implementation 123 | per_student_num_correct = defaultdict(float) 124 | per_student_num_resp = defaultdict(int) 125 | for (reg_id, correct, held_out) in its.izip(reg_ids, corrects, is_held_out): 126 | if held_out: 127 | continue 128 | per_student_num_correct[reg_id] += float(correct) 129 | per_student_num_resp[reg_id] += 1 130 | 131 | preds = {reg_id: (num_correct / per_student_num_resp[reg_id]) >= 0.5 132 | for reg_id, num_correct in per_student_num_correct.iteritems()} 133 | train_psn = np.mean([c == preds[r] 134 | for r, c in its.izip(reg_ids[~is_held_out], corrects[~is_held_out])]) 135 | test_psn = np.mean([c == preds[r] if r in preds else c 136 | for r, c in its.izip(reg_ids[is_held_out], corrects[is_held_out])]) 137 | actual_psn = undertest.Metrics.compute_per_student_naive(reg_ids, corrects, is_held_out) 138 | self.assertEqual((train_psn, test_psn), actual_psn) 139 | -------------------------------------------------------------------------------- /rnn_prof/data/kddcup.py: -------------------------------------------------------------------------------- 1 | import itertools as its 2 | import logging 3 | 4 | import numpy as np 5 | import pandas as pd 6 | 7 | from .constants import (ITEM_IDX_KEY, 
TEMPLATE_IDX_KEY, CONCEPT_IDX_KEY, USER_IDX_KEY, 8 | TIME_IDX_KEY, CORRECT_KEY, SINGLE) 9 | 10 | 11 | LOGGER = logging.getLogger(__name__) 12 | 13 | TIME_ID_KEY = 'First Transaction Time' 14 | USER_ID_KEY = 'Anon Student Id' 15 | ORIG_CORRECT_KEY = 'Correct First Attempt' 16 | PROBLEM_NAME = 'Problem Name' 17 | STEP_NAME = 'Step Name' 18 | KC_NAME_STARTS_WITH = 'KC' 19 | IS_TEST = 'is_test' 20 | 21 | 22 | def load_data(file_path, item_id_col=PROBLEM_NAME, template_id_col=None, concept_id_col=None, 23 | remove_nan_skill_ids=False, max_interactions_per_user=None, 24 | drop_duplicates=False, min_interactions_per_user=2, test_file_path=None): 25 | """ Load data from KDD Cup data sets. 26 | 27 | :param str file_path: The location of the data 28 | :param str item_id_col: The column to be used for item_ids in interactions. Likely one of 29 | PROBLEM_NAME, STEP_NAME, or KC_NAME_STARTS_WITH 30 | :param str template_id_col: Set a particular column to represent a template id for hierarchical 31 | IRT. If 'single', assumes a dummy single hierarchical level; if None, no column is retained 32 | for templates. 33 | :param str|None concept_id_col: The column to be used for concept_ids in interactions. 34 | Likely KC_NAME_STARTS_WITH or 'single', in the latter case, all problems are given the same 35 | concept_id. If None, no concept column is retained. 36 | :param bool remove_nan_skill_ids: Whether to remove interactions where the KC column is NaN 37 | :param int|None max_interactions_per_user: Retain only the first (in time order) 38 | `max_interactions_per_user` per user. If None, then there is no limit. 39 | :param bool drop_duplicates: Drop (seemingly) duplicate interactions 40 | :param int min_interactions_per_user: The minimum number of interactions required to retain 41 | a user 42 | :param str|None test_file_path: The KDD Cup data sets break themselves up into a (very large) 43 | training set and a (very small) test set. This allows you to combine the two files if 44 | specified. Will be specified in output with an IS_TEST column, which can be used if 45 | desired by downstream actors. 46 | :return: processed data, student ids corresponding to the student indices, item ids 47 | corresponding to the item indices, template ids corresponding to the template indices, and 48 | concept ids corresponding to the concept indices 49 | :rtype: (pd.DataFrame, np.ndarray[str], np.ndarray[str], np.ndarray[str]) 50 | """ 51 | 52 | data = pd.read_csv(file_path, delimiter='\t') 53 | 54 | LOGGER.info("Read {:3,d} rows from file".format(len(data))) 55 | 56 | if test_file_path: 57 | test_data = pd.read_csv(test_file_path, delimiter='\t') 58 | test_data[IS_TEST] = True 59 | data[IS_TEST] = False 60 | data = pd.concat([data, test_data]) 61 | 62 | LOGGER.info("After test inclusion have {:3,d} rows".format(len(data))) 63 | 64 | data[TIME_IDX_KEY] = np.unique(data[TIME_ID_KEY], return_inverse=True)[1] 65 | data[CORRECT_KEY] = data[ORIG_CORRECT_KEY] == 1 66 | 67 | # Step names aren't universally unique. Prepend with the problem name to fix this problem. 
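    # (e.g. a problem 'P1' with step 'Step 2' becomes 'P1:Step 2'; illustrative values only)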
68 | data[STEP_NAME] = [':'.join(x) for x in its.izip(data[PROBLEM_NAME], data[STEP_NAME])] 69 | 70 | kc_name = [column for column in data.columns if column.startswith(KC_NAME_STARTS_WITH)][0] 71 | if item_id_col and item_id_col.startswith(KC_NAME_STARTS_WITH): 72 | item_id_col = kc_name 73 | if template_id_col and template_id_col.startswith(KC_NAME_STARTS_WITH): 74 | template_id_col = kc_name 75 | if concept_id_col and concept_id_col.startswith(KC_NAME_STARTS_WITH): 76 | concept_id_col = kc_name 77 | if remove_nan_skill_ids: 78 | data = data[~data[kc_name].isnull()] 79 | else: 80 | data.ix[data[kc_name].isnull(), kc_name] = 'NaN' 81 | 82 | # Turn skills into single names. Take the first lexicographically if there's more than 83 | # one, though this can be modified. Only do for non nan skills. 84 | data[kc_name] = data[kc_name].apply(lambda x: sorted(x.split('~~'))[0]) 85 | 86 | LOGGER.info("Total of {:3,d} rows remain after removing NaN skills".format(len(data))) 87 | 88 | # sort by user, time, item, and concept id (if available) 89 | sort_keys = [USER_ID_KEY, TIME_IDX_KEY, item_id_col] 90 | if concept_id_col: 91 | if concept_id_col == SINGLE: 92 | LOGGER.info('Using dummy single concept.') 93 | data[concept_id_col] = '0' 94 | elif concept_id_col not in data: 95 | raise ValueError('concept_id_col %s not found in data columns %s' % (concept_id_col, 96 | data.columns)) 97 | sort_keys.append(concept_id_col) 98 | 99 | data = data.sort(sort_keys) 100 | if drop_duplicates: 101 | data = data.drop_duplicates(sort_keys) 102 | 103 | # filter for students with >= min_history_length interactions; 104 | # must be done after removing nan skillz 105 | data = data.groupby(USER_ID_KEY).filter(lambda x: len(x) >= min_interactions_per_user) 106 | LOGGER.info("Removed students with <{} interactions ({:3,d} rows remaining)".format( 107 | min_interactions_per_user, len(data))) 108 | 109 | # limit to first `max_interactions_per_user` 110 | if max_interactions_per_user is not None: 111 | old_data_len = len(data) 112 | data = data.groupby([USER_ID_KEY]).head(max_interactions_per_user) 113 | LOGGER.info("Filtered for {} max interactions per student ({:3,d} rows removed)".format( 114 | max_interactions_per_user, old_data_len - len(data))) 115 | 116 | user_ids, data[USER_IDX_KEY] = np.unique(data[USER_ID_KEY], return_inverse=True) 117 | item_ids, data[ITEM_IDX_KEY] = np.unique(data[item_id_col], return_inverse=True) 118 | user_ids = user_ids.astype(str) 119 | item_ids = item_ids.astype(str) 120 | 121 | # TODO (yan): refactor the below to avoid code duplication across data sets 122 | cols_to_keep = [USER_IDX_KEY, ITEM_IDX_KEY, CORRECT_KEY, TIME_IDX_KEY] 123 | if template_id_col is None: 124 | LOGGER.info('template_id_col not supplied, not using templates') 125 | template_ids = None 126 | else: 127 | if template_id_col == SINGLE: 128 | LOGGER.info('Using dummy single template.') 129 | data[template_id_col] = '0' 130 | elif template_id_col not in data: 131 | raise ValueError('template_id_col %s not found', template_id_col) 132 | template_ids, data[TEMPLATE_IDX_KEY] = np.unique(data[template_id_col], return_inverse=True) 133 | cols_to_keep.append(TEMPLATE_IDX_KEY) 134 | 135 | if concept_id_col is None: 136 | LOGGER.info('concept_id_col not supplied, not using concepts') 137 | concept_ids = None 138 | else: 139 | concept_ids, data[CONCEPT_IDX_KEY] = np.unique(data[concept_id_col], return_inverse=True) 140 | cols_to_keep.append(CONCEPT_IDX_KEY) 141 | 142 | if test_file_path: 143 | cols_to_keep.append(IS_TEST) 144 | 145 | 
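    # `cols_to_keep` now lists the columns that will be returned: the integer index
    # columns (user/item, plus template/concept when requested), CORRECT_KEY,
    # TIME_IDX_KEY, and IS_TEST if a test file was merged in. The *_ids arrays map the
    # integer indices back to the original identifiers.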
LOGGER.info("Processed data: {:3,d} interactions, {:3,d} students; {:3,d} items, " 146 | "{:3,d} templates, {:3,d} concepts" 147 | .format(len(data), len(user_ids), len(item_ids), 148 | len(template_ids) if template_ids is not None else 0, 149 | len(concept_ids) if concept_ids is not None else 0)) 150 | 151 | return data[cols_to_keep], user_ids, item_ids, template_ids, concept_ids 152 | -------------------------------------------------------------------------------- /rnn_prof/irt/callbacks.py: -------------------------------------------------------------------------------- 1 | """ 2 | Basic callbacks for the Bayes Net IRT learners. 3 | """ 4 | import logging 5 | import numpy as np 6 | 7 | from .cpd.ogive import OgiveCPD 8 | from .metrics import LOGLI_KEY, MAP_ACCURACY_KEY, AUC_KEY, METRICS_KEYS 9 | 10 | LOGGER = logging.getLogger(__name__) 11 | TRAIN_LOG_POST_KEY = 'train log posterior' 12 | ITER_KEY = 'iteration' 13 | TEST_SUFFIX = '_TEST' 14 | 15 | MAX_HIST_LEN = 400 16 | MAX_HIST_SAMPLES = 50 17 | 18 | HIST_COLOR = np.asarray([.7, .7, .7]) 19 | 20 | DEFAULT_METRICS = (ITER_KEY, TRAIN_LOG_POST_KEY, LOGLI_KEY, MAP_ACCURACY_KEY, AUC_KEY) 21 | 22 | 23 | class ConvergenceCallback(object): 24 | """ 25 | Basic callback that checks if convergence conditions on all the learner's node have been met. 26 | Optionally, print or log info statements related to convergence. 27 | """ 28 | def __init__(self, early_stopping=False, log_freq=0, print_freq=100, logger=None): 29 | """ 30 | :param int print_freq: print frequency (if 0, do not print) 31 | :param int log_freq: log frequency (if 0, do not log) 32 | :param bool early_stopping: Whether to stop inference if the sum of held_out nodes' 33 | log-prob_delta's is not positive 34 | :param Logger|None logger: optional logger to use; if not specified, use this module's 35 | """ 36 | self.early_stopping = early_stopping 37 | self.print_freq = print_freq 38 | self.log_freq = log_freq 39 | self.logger = logger or LOGGER 40 | 41 | def __call__(self, learner, metrics=None): 42 | """ 43 | :param BayesNetLearner learner: the learner 44 | :param dict|None metrics: Metrics dictionary of depth 1 or 2 45 | (generally structured as: {metric name: array of values}) to log/print. Logs/prints 46 | the last element in the array of values. 
47 | :return: whether to continue learning 48 | :rtype: bool 49 | """ 50 | def get_msg_vals(): 51 | msg_string = 'Iter %d: Log-Posterior: %.04f, Log10Grad: %0.4f, Log10Diff: %0.4f' 52 | msg_vars = [learner.iter, learner.log_posterior, max_grad, max_diff] 53 | if metrics is not None: 54 | for mkey, mval in metrics.iteritems(): 55 | if isinstance(mval, dict): 56 | for node_name, node_metric_val in mval.iteritems(): 57 | msg_string += ', %s %s: %%0.4f' % (mkey, node_name) 58 | msg_vars.append(node_metric_val[-1]) 59 | else: 60 | msg_string += ', %s: %%0.4f' % mkey 61 | msg_vars.append(mval[-1]) 62 | return msg_string, tuple(msg_vars) 63 | max_grad, max_diff = None, None 64 | if self.print_freq > 0 and not learner.iter % self.print_freq: 65 | max_grad, max_diff = self.compute_stats(learner) 66 | print_msg, print_vars = get_msg_vals() 67 | print_msg = '\r' + print_msg 68 | print print_msg % print_vars, 69 | if self.log_freq > 0 and not learner.iter % self.log_freq: 70 | if max_grad is None: 71 | # compute stats if it hasn't been done yet 72 | max_grad, max_diff = self.compute_stats(learner) 73 | log_string, log_vars = get_msg_vals() 74 | self.logger.info(log_string, *log_vars) 75 | return self.is_converged(learner) 76 | 77 | def is_converged(self, learner): 78 | """ 79 | :param BayesNetLearner learner: the learner 80 | :return: whether to continue learning 81 | :rtype: bool 82 | """ 83 | should_continue = not all([n.converged for n in learner.nodes.values() if not n.held_out]) 84 | if should_continue and self.early_stopping: 85 | held_out_nodes = [n for n in learner.nodes.values() if n.held_out] 86 | if len(held_out_nodes) == 0: 87 | raise ValueError('There are no held out nodes so early stopping cannot work.') 88 | log_prob_deltas = [n.log_prob_delta for n in held_out_nodes 89 | if n.log_prob_delta is not None] 90 | if len(log_prob_deltas) > 0: 91 | should_continue = sum(log_prob_deltas) > 0 92 | return should_continue 93 | 94 | @staticmethod 95 | def compute_stats(learner): 96 | """ Compute the gradient and difference changes across a learner's nodes 97 | 98 | :param BayesNetLearner learner: the IRT learner 99 | :return: the maximum of the gradients and the maximum of the iteration-to-iteration diffs 100 | :rtype: float, float 101 | """ 102 | grad_diffs = [np.abs(n.max_grad) for n in learner.nodes.values() if n.max_grad is not None] 103 | diff_diffs = [np.abs(n.max_diff) for n in learner.nodes.values() if n.max_diff is not None] 104 | max_grad = np.log10(np.max(grad_diffs)) if len(grad_diffs) else np.nan 105 | max_diff = np.log10(np.max(diff_diffs)) if len(diff_diffs) else np.nan 106 | return max_grad, max_diff 107 | 108 | 109 | class RecordingCallback(ConvergenceCallback): 110 | """ Callback function that records basic learning metrics. """ 111 | def __init__(self, metrics_to_record=DEFAULT_METRICS, **kwargs): 112 | super(RecordingCallback, self).__init__(**kwargs) 113 | self.metrics = {m: None for m in metrics_to_record} 114 | 115 | def __call__(self, learner): 116 | self.record_metrics(learner) 117 | return super(RecordingCallback, self).__call__(learner, metrics=self.metrics) 118 | 119 | def record_metrics(self, learner): 120 | """ Record the performance metrics: iteration count, global learner log-posterior, and 121 | the metrics specified at initialization (e.g., log-likelihood, test MAP accuracy) for 122 | all OgiveCPD nodes. 123 | NOTE: The latter performance metrics are dictionaries two levels deep, and should be 124 | accessed as `callback.metrics[AUC_KEY][test_response_node.name]`. 
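        For example, once a learner built with held-out responses has been trained with
        this callback (a sketch only; OnePOLearner and the key names come from
        rnn_prof.irt.learners, .constants, and .metrics):
            callback = RecordingCallback()
            learner = OnePOLearner(correct, student_idx, item_idx,
                                   is_held_out=is_held_out, callback=callback)
            learner.learn()
            latest_test_auc = callback.metrics[AUC_KEY][TEST_RESPONSES_KEY][-1]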
125 | """ 126 | def append_metric(new_value, metric_key, node_key=None, dtype=None): 127 | """ Helper function for appending to (possibly uninitialized) dictionary of metrics, 128 | one (iteration count, log-posterior) or two (e.g., AUC for particular node) levels 129 | deep.""" 130 | # initialize dicts/arrays if necessary 131 | dtype = dtype or np.float64 132 | if self.metrics[metric_key] is None: 133 | init_vals = np.nan * np.empty(MAX_HIST_LEN, dtype=dtype) 134 | self.metrics[metric_key] = init_vals if node_key is None else {node_key: init_vals} 135 | elif node_key is not None and node_key not in self.metrics[metric_key]: 136 | init_vals = np.nan * np.empty(MAX_HIST_LEN, dtype=dtype) 137 | self.metrics[metric_key][node_key] = init_vals 138 | # get dictionary element and append 139 | if node_key is None: 140 | metric = self.metrics[metric_key] 141 | else: 142 | metric = self.metrics[metric_key][node_key] 143 | return np.append(metric[1:], new_value) 144 | 145 | for mkey in self.metrics: 146 | if mkey == ITER_KEY: 147 | # write iteration count 148 | self.metrics[mkey] = append_metric(learner.iter, mkey, dtype=int) 149 | elif mkey == TRAIN_LOG_POST_KEY: 150 | # write global learner log-posterior 151 | self.metrics[mkey] = append_metric(learner.log_posterior, mkey) 152 | elif mkey in METRICS_KEYS: 153 | # for all other metrics, record values for each node with an OgiveCPD 154 | for node in learner.nodes.itervalues(): 155 | if isinstance(node.cpd, OgiveCPD): 156 | metric = node.metrics.compute_metric(mkey) 157 | self.metrics[mkey][node.name] = append_metric(metric, mkey, node.name) 158 | -------------------------------------------------------------------------------- /rnn_prof/irt/irt.py: -------------------------------------------------------------------------------- 1 | """ 2 | Module containing the base class IRT learner that uses a Bayes net to specify model structure 3 | """ 4 | from collections import defaultdict, OrderedDict 5 | import logging 6 | 7 | from igraph import Graph, Vertex 8 | 9 | from .callbacks import ConvergenceCallback 10 | from .node import Node 11 | from .updaters import UpdateTerms 12 | 13 | LOGGER = logging.getLogger(__name__) 14 | 15 | 16 | class BayesNetGraph(Graph): 17 | """ A wrapper of igraph's Graph that contains nodes in a BayesNet and implements some utility 18 | methods. """ 19 | def __init__(self, nodes): 20 | """ Build a graph from Node objects. Edges are parent node (param) to child node (data). 21 | :param dict[str, Node] nodes: nodes in the BayesNet 22 | """ 23 | super(BayesNetGraph, self).__init__(directed=True) 24 | # add vertices 25 | for name, node in nodes.iteritems(): 26 | self.add_vertex(name=name, node=node) 27 | # add edges 28 | for node_name, node in nodes.iteritems(): 29 | for par_key, param_node in node.param_nodes.iteritems(): 30 | if param_node not in nodes.values(): 31 | raise ValueError("{}'s {} not in nodes".format(node_name, par_key)) 32 | self.add_edge(nodes.values().index(param_node), node_name) 33 | if not self.is_dag(): 34 | raise ValueError("Bayes Net is not a DAG") 35 | if len(self.components(mode='weak')) > 1: 36 | LOGGER.warn("Bayes Net is not fully connected") 37 | 38 | def topological_sorting(self, mode='IN'): 39 | """ Return a topological sorting of nodes in the graph. 
40 | 41 | :param mode: whether to sort 42 | :return: nodes in topologically sorted order 43 | :rtype: list[Node] 44 | """ 45 | sorted_node_idx = super(BayesNetGraph, self).topological_sorting(mode=mode) 46 | sorted_nodes = [self.vs[idx]['node'] for idx in sorted_node_idx] 47 | LOGGER.debug("Model topological sort: %s", [n.name for n in sorted_nodes]) 48 | return sorted_nodes 49 | 50 | @property 51 | def nodes(self): 52 | return [v['node'] for v in self.vs] 53 | 54 | @property 55 | def training_nodes(self): 56 | """ Get the nodes in the graph used for training (not held out for testing). 57 | 58 | :return: training nodes 59 | :rtype: list[Node] 60 | """ 61 | return [v['node'] for v in self.vs if not v['node'].held_out] 62 | 63 | @property 64 | def held_out_nodes(self): 65 | """ Get the held-out nodes in the graph used for testing. 66 | 67 | :return: held out nodes 68 | :rtype: list[Node] 69 | """ 70 | return [v['node'] for v in self.vs if v['node'].held_out] 71 | 72 | @property 73 | def training_subgraph(self): 74 | """ Get the subgraph that includes only the training (non held-out) nodes and their 75 | predecessors. 76 | :return: training subgraph 77 | :rtype: BayesNetGraph 78 | """ 79 | training_nodes = [v for v in self.vs if not v['node'].held_out] 80 | return self._predecessor_subgraph(training_nodes) 81 | 82 | @property 83 | def held_out_subgraph(self): 84 | """ Get the subgraph that includes only the held-out nodes and their predecessors. 85 | :return: held-out subgraph 86 | :rtype: BayesNetGraph 87 | """ 88 | held_out_nodes = [v for v in self.vs if v['node'].held_out] 89 | return self._predecessor_subgraph(held_out_nodes) 90 | 91 | def _predecessor_subgraph(self, nodes): 92 | """ Get the subgraph containing only the passed-in nodes and their predecessors 93 | :param Vertex|list[Vertex] leaf_nodes: 94 | :return: the predecessor subgraph 95 | :rtype: BayesNetGraph 96 | """ 97 | if isinstance(nodes, Vertex): 98 | nodes = [Vertex] 99 | preds = self.neighborhood(nodes, order=len(self.vs), mode='IN') 100 | nodes = list(set(self.vs[idx]['node'] for ns in preds for idx in ns)) 101 | nodes = {node.name: node for node in nodes} 102 | return BayesNetGraph(nodes) 103 | 104 | 105 | class BayesNetLearner(object): 106 | """ 107 | Base class for fitting an IRT model specified by a Bayes Net graphical model. 108 | """ 109 | def __init__(self, nodes, callback=None, max_iterations=1000): 110 | """ Initialize a learner with a list of nodes 111 | 112 | :param list[Node] nodes: Bayes Net nodes 113 | :param callback: a callback function, executed at the end of each iteration. Should return 114 | a boolean flag indicating whether learning should continue. 115 | :param int max_iterations: maximum number of iterations for learning 116 | """ 117 | if callback is None: 118 | callback = ConvergenceCallback() 119 | for node in nodes: 120 | if not isinstance(node, Node): 121 | raise TypeError("Node {} is not of type Node".format(node.name)) 122 | self.callback = callback 123 | self.max_iterations = max_iterations 124 | self._check_and_add_nodes(nodes) 125 | self.log_posterior = 0. 
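        # iteration counter; learn() increments this at the start of every pass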
126 | self.iter = 0 127 | 128 | def _check_and_add_nodes(self, nodes): 129 | """ 130 | Check that nodes have unique names, then add them as self.nodes OrderedDict, {name: node} 131 | 132 | :param list[Node] nodes: Bayes Net nodes 133 | """ 134 | if len(set([node.name for node in nodes])) != len(nodes): 135 | raise ValueError('nodes have non-unique names') 136 | 137 | self.nodes = OrderedDict() 138 | for node in nodes: 139 | self.nodes[node.name] = node 140 | 141 | def learn(self): 142 | """ 143 | Iterate over all nodes in topological order starting with the leaves and perform updates 144 | on each node. Collects and stores the sum of the log-probabilities. 145 | """ 146 | graph = BayesNetGraph(self.nodes) 147 | sorted_nodes = graph.topological_sorting() 148 | 149 | # initialize a dictionary for passing evidence, keyed on Node objects 150 | # (first key: node that will consume the evidence, second key: source node of the evidence) 151 | LOGGER.info("Beginning learning") 152 | while self.iter < self.max_iterations: 153 | self.iter += 1 154 | # clear terms 155 | update_terms = defaultdict(dict) 156 | self.log_posterior = 0. 157 | for node in sorted_nodes: 158 | LOGGER.debug("Updating %s", node.name) 159 | evidence_terms = node.update(evidence_terms=update_terms[node]) 160 | 161 | if not node.held_out: 162 | for target_node, evidence_term in evidence_terms.iteritems(): 163 | update_terms[target_node][node] = evidence_term 164 | 165 | self.log_posterior += node.log_prob 166 | 167 | LOGGER.debug("Finished iteration %4.d, log-posterior = %f", self.iter, 168 | self.log_posterior) 169 | if not self.callback(self): 170 | LOGGER.debug("Stopping updates because of callback termination condition") 171 | break 172 | LOGGER.info("Learning finished at iteration %d, log-posterior = %f", self.iter, 173 | self.log_posterior) 174 | 175 | def get_posterior_hessian(self, node_name, x=None, use_held_out=False): 176 | """ Get the Hessian of the log-posterior of a node at x. If x is not passed in, use 177 | the stored data. 
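        The result is the node's log-prior Hessian plus the log-likelihood Hessians
        contributed, with respect to this node's variables, by each child node (held-out
        children are skipped unless use_held_out is set).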
178 | 179 | :param str node_name: the node of interest 180 | :param np.ndarray x: the point at which to evaluate the Hessian (same dimensionality as 181 | the node's data) 182 | :param bool use_held_out: Whether to use held-out nodes for the computing likelihood terms 183 | :return: the Hessian 184 | :rtype: np.ndarray|sp.spmatrix 185 | """ 186 | # get node of interest (and fail fast if it's not there) 187 | post_node = self.nodes[node_name] 188 | 189 | # get log prior Hessian 190 | data_terms = {post_node.cpd.DATA_KEY: UpdateTerms.grad_and_hess} 191 | hessian = post_node.cpd(x if x is not None else post_node.data, 192 | terms_to_compute=data_terms, 193 | **post_node.param_data).wrt[post_node.cpd.DATA_KEY].hessian 194 | 195 | # collect log-likelihood Hessians from all children 196 | for node in self.nodes.itervalues(): 197 | if node.held_out and not use_held_out: 198 | # skip log-likelihood terms from held-out nodes 199 | continue 200 | # look through this node's parent param nodes for our target node 201 | for par_key, par_node in node.param_nodes.iteritems(): 202 | if par_node is post_node: 203 | terms_to_compute = {par_key: UpdateTerms.grad_and_hess} 204 | param_data = node.param_data 205 | if x is not None: # if evaluating at x, replace stored data with x 206 | param_data[par_key] = x 207 | hessian = hessian + node.cpd(node.data, terms_to_compute=terms_to_compute, 208 | **param_data).wrt[par_key].hessian 209 | return hessian 210 | -------------------------------------------------------------------------------- /rnn_prof/data/rnn.py: -------------------------------------------------------------------------------- 1 | from collections import namedtuple 2 | import itertools as its 3 | 4 | import numpy as np 5 | from scipy import sparse as sp 6 | 7 | from .constants import ITEM_IDX_KEY, USER_IDX_KEY, TIME_IDX_KEY, CORRECT_KEY, HINT_COUNT_KEY 8 | 9 | 10 | # A namedtuple representing the data for a single user 11 | UserData = namedtuple('UserData', ['length', 'history', 'next_answer', 'truth']) 12 | 13 | # A namedtuple representing a batch of users' data 14 | Batch = namedtuple('Batch', ['length', 'history', 'next_answer', 'truth', 'mask', 15 | 'num_interactions']) 16 | 17 | 18 | def build_nn_data(data, num_questions, use_correct=True, use_hints=False): 19 | """ 20 | Build data ready for RNN input. 21 | 22 | :param DataFrame data: User interactions for all users in DataFrame format as 23 | returned by loading functions in this package. 24 | :param int num_questions: number of questions in the full dataset 25 | :param bool use_correct: If True, records responses (before compression) as a 26 | 2 * num_questions one-hot vector where one dimension corresponds to correct 27 | and one dimension corresponds to incorrect. If False, records responses 28 | (before compression) as a num_questions one-hot vector where each dimension 29 | corresponds to having *answered* a question, whether correctly or incorrectly. 30 | :param bool use_hints: If True, records responses ternarily: Correct, Wrong with 31 | No Hints, and Used a Hint. 32 | :return: list of all users data ready for RNN input. 
33 | :rtype: list[UserData] 34 | """ 35 | all_users_data = [] 36 | data.sort([USER_IDX_KEY, TIME_IDX_KEY], inplace=True) 37 | 38 | # use_hints => use_correct 39 | use_correct = use_correct or use_hints 40 | 41 | for user_id, user in data.groupby(USER_IDX_KEY): 42 | 43 | x = [] # Input X denoting position for one hot 44 | y = [] # Mask Y to mask the probabilities all questions except the next one 45 | t = [] # The truth about the correctness of the next question 46 | 47 | xiter, yiter = its.tee(user[ITEM_IDX_KEY].values) 48 | next(yiter, None) 49 | this_correct_iter, next_correct_iter = its.tee(user[CORRECT_KEY].values) 50 | next(next_correct_iter, None) 51 | if use_hints: 52 | hints_iter = user[HINT_COUNT_KEY].values 53 | else: 54 | hints_iter = its.cycle([0]) 55 | for this_skill, next_skill, this_correct, next_correct, hint in its.izip( 56 | xiter, yiter, this_correct_iter, next_correct_iter, hints_iter): 57 | # The first num_questions dimensions refer to incorrect responses, the 58 | # second num_questions dimensions to correct responses. *Unless* 59 | # use_correct is False, in which case, only num_questions dimensions 60 | # are used, one for answering (correctly or incorrectly) each question 61 | x.append(this_skill + num_questions * this_correct * (hint == 0) * use_correct + 62 | 2 * num_questions * (hint > 0) * use_hints) 63 | y.append(next_skill) 64 | t.append(next_correct) 65 | 66 | # Append it to a list 67 | all_users_data.append(UserData(length=len(x), history=x, next_answer=y, truth=t)) 68 | 69 | return all_users_data 70 | 71 | 72 | def _batch_dimension_list(user_data, threshold=0.9): 73 | """ 74 | A helper function for ..py:function`build_batches` which returns a list of areas of 75 | the rectangles which will represent student history. 76 | 77 | :param list[UserData] user_data: The output of ..py:function`build_nn_data`. Must be 78 | sorted from largest to shortest history length *before* being passed. 79 | :return: list[(int, int)] batch_list: A list of rectangle dimensions for each batch 80 | """ 81 | if len(user_data) <= 0: 82 | return [] 83 | 84 | width = user_data[0].length # Width of rectangle (user with max interactions within a batch) 85 | area_actual = 0 # Actual area within rectangle 86 | area_rect = 0 # Area bounded by rectangle 87 | height = 0 # Height of rectangle (num users within a batch) 88 | dimension_list = [] # List of rectangle dimensions 89 | 90 | for i, user in enumerate(user_data): 91 | num_interactions = user.length 92 | 93 | # Calculate size of new area 94 | area_actual += num_interactions 95 | area_rect += width 96 | 97 | # Package the previous batch (not including the current one) 98 | # Note that we say height > 0 on the off chance that double rounding messes up 99 | # when area_actual "==" area_rect 100 | if area_actual / area_rect < threshold and height > 0: 101 | dimension_list.append((width, height)) 102 | width = num_interactions 103 | height = 0 104 | area_actual = width 105 | area_rect = width 106 | height += 1 107 | 108 | # Append the final batch 109 | dimension_list.append((width, height)) 110 | return dimension_list 111 | 112 | 113 | def build_batches(user_data, num_questions, basis, threshold=0.9, mask_type=np.int8, 114 | num_type=np.float32, output_basis=None, compress_dim=None): 115 | """ 116 | Each student response history has a different length, so we will have to pad 117 | to train in the neural network. However, if we pad too much, then our padding 118 | will engulf the training. 
So we will break the history into many batches, each 119 | of which have approximately the same length. That is, if the response history 120 | looks like:: 121 | 122 | interactions --> 123 | u >>>>>>>>>>>>>>>>>>>>>>>> 124 | s >>>>>>>>>>>>>>>>>>>> 125 | e >>>>>>>>>>>>>>>> 126 | r >>>>>>>>>> 127 | | >>>>>>> 128 | | >>> 129 | v >>> 130 | 131 | then we will break it up along the lines:: 132 | 133 | interactions --> 134 | u >>>>>>>>>>>>>>>>>>>>>>>> 135 | s >>>>>>>>>>>>>>>>>>>>XXXX 136 | -------------------------------- 137 | e >>>>>>>>>>>>>>>> 138 | -------------------------------- 139 | r >>>>>>>>>> 140 | | >>>>>>>XXX 141 | -------------------------------- 142 | | >>> 143 | v >>> 144 | 145 | where the X's represent padding. 146 | 147 | :param list[UserData] user_data: User data with positions of one hot encoding. The 148 | output of ..py:function`build_nn_data`. 149 | :param int num_questions: The number of questions in the data set. Cannot be 150 | inferred because the actual input size of the basis may differ based on 151 | what data you are recording. See basis for more information. 152 | :param np.ndarray basis: Compressed sensing matrix, which should be of size 153 | (dimension of actual input, lower dimension) 154 | Note that the size of the actual input may vary, e.g., if you are recording 155 | whether or not a student answered a question correctly versus whether a student 156 | simply saw the question 157 | :param dtype mask_type: The type to be used for boolean values when running in Theano. 158 | :param dtype num_type: The type to be used for float values when running in Theano. 159 | :param np.ndarray|None output_basis: The compression basis for the output of the 160 | RNN. If None, no compression is assumed, i.e., implicitly it is the 161 | num_questions x num_questions identity matrix. 162 | :param int|None compress_dim: If basis is None, that is we are using the identity 163 | matrix, then you will need to specify this to indicate the dimension of 164 | the input. 165 | :return: Batches of user data with actual one hot encoding in numpy matrix 166 | :rtype: list[Batch] 167 | """ 168 | 169 | # Sort user data by interaction in reverse order 170 | user_data.sort(key=lambda u: u.length, reverse=True) 171 | dimension_list = _batch_dimension_list(user_data, threshold=threshold) 172 | 173 | all_batches = [] 174 | user_index = 0 175 | if (compress_dim is None) == (basis is None): 176 | if basis is not None: 177 | if basis.shape[1] != compress_dim: 178 | raise ValueError("If both basis and compress_dim are specified, then " 179 | "basis.shape[1] ({}) must match compress_dim ({})" 180 | .format(basis.shape[1], compress_dim)) 181 | compress_dim = compress_dim or basis.shape[1] 182 | 183 | if basis is None: 184 | eye_basis = sp.eye(compress_dim).tocsc() 185 | if output_basis is None: 186 | eye_output_basis = sp.eye(num_questions).tocsc() 187 | 188 | for (width, height) in dimension_list: 189 | # Input X 190 | batch_x = np.zeros((width, height, compress_dim), dtype=num_type) 191 | 192 | # Mask Y on probabilities for next question if uncompressed, or 193 | # a vector to dot with if compressed. 
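        # All per-batch arrays are indexed [time step][user within batch]: width is the
        # longest history in this batch, height the number of users packed into it.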
194 | batch_y = np.zeros((width, height, num_questions), dtype=num_type) 195 | # Truth on correctness of next question 196 | batch_t = np.zeros((width, height), dtype=mask_type) 197 | # Mask on actual user interaction within a rectangle batch of users 198 | batch_m = np.zeros((width, height), dtype=mask_type) 199 | 200 | for j in xrange(height): 201 | user = user_data[user_index] 202 | 203 | # Compressed sensing on input X 204 | if basis is not None: 205 | x = basis[user.history, :] 206 | else: 207 | x = eye_basis[user.history, :].toarray() 208 | 209 | if output_basis is not None: 210 | y = output_basis[user.next_answer, :] 211 | else: 212 | y = eye_output_basis[user.next_answer, :].toarray() 213 | 214 | # Construct the numpy matrix for user 215 | for i in xrange(user.length): 216 | np.copyto(batch_x[i][j], x[i]) 217 | np.copyto(batch_y[i][j], y[i]) 218 | batch_t[i][j] = user.truth[i] 219 | batch_m[i][j] = 1 220 | 221 | # Next user 222 | user_index += 1 223 | 224 | all_batches.append( 225 | Batch(history=batch_x, next_answer=batch_y, truth=batch_t, mask=batch_m, 226 | length=height, num_interactions=batch_m.sum())) 227 | return all_batches 228 | -------------------------------------------------------------------------------- /rnn_prof/tests/irt/test_online_cross_validation.py: -------------------------------------------------------------------------------- 1 | """ 2 | Tests for online cross-validation. 3 | """ 4 | import unittest 5 | 6 | import numpy as np 7 | 8 | from rnn_prof.irt.online_cross_validation import get_online_rps, _idx_to_occurrence_ordinal 9 | from rnn_prof.irt.constants import THETAS_KEY, TEST_RESPONSES_KEY, OFFSET_COEFFS_KEY 10 | from rnn_prof.irt.cpd import OnePOCPD 11 | from rnn_prof.irt.learners import OnePOLearner 12 | from rnn_prof.irt.metrics import Metrics 13 | 14 | 15 | class TestOnlinePrediction(unittest.TestCase): 16 | @staticmethod 17 | def set_up_learner(zero_difficulties, learn_diffs, theta_variance): 18 | """ Make student interactions and train a 1PO learner. Interactions are not sorted by 19 | student. 20 | :param bool zero_difficulties: whether item difficulties should be 0 (otherwise, drawn 21 | from standard Normal. 
22 | :param bool learn_diffs: whether to learn item difficulties 23 | :param float theta_variance: theta prior variance 24 | :return: the trained learner and the student indices for the test interactions 25 | :rtype: BayesNetLearner, np.ndarray 26 | """ 27 | held_out_frac = 0.2 28 | num_students = 50 29 | num_items = 10 30 | num_interactions = 100 31 | 32 | # make all interactions 33 | student_idx = np.random.randint(0, num_students, num_interactions) 34 | item_idx = np.random.randint(0, num_items, num_interactions) 35 | difficulties = np.zeros(num_items) if zero_difficulties else np.random.randn(num_items) 36 | thetas = np.linspace(-2, 2, num_students) 37 | prob_correct = OnePOCPD._prob_correct_from_irf_arg(thetas[student_idx] - 38 | difficulties[item_idx]) 39 | correct = np.random.rand(num_interactions) < prob_correct 40 | held_out_students = np.arange(num_students)[np.random.rand(num_students) < held_out_frac] 41 | held_out_idx = np.in1d(student_idx, held_out_students) 42 | held_out_student_idx = student_idx[held_out_idx] 43 | 44 | learner = OnePOLearner(correct, student_idx, item_idx, is_held_out=held_out_idx, 45 | max_iterations=10) 46 | # make weak theta prior and no item difficulty learning 47 | learner.nodes[THETAS_KEY].cpd.precision /= theta_variance 48 | learner.nodes[OFFSET_COEFFS_KEY].solver_pars.learn = learn_diffs 49 | learner.nodes[THETAS_KEY].solver_pars.grad_tol = 1e-12 50 | # train learner on training data 51 | learner.learn() 52 | return learner, held_out_student_idx 53 | 54 | def test_prediction(self): 55 | """ Test that online prediction yields the same probabilities as running percent correct 56 | when item parameters are default and priors are weak. """ 57 | learner, held_out_student_idx = self.set_up_learner(True, False, 1e9) 58 | 59 | # store all node's data and reference IDs for CPDs and param_node dicts 60 | orig_datas = {key: node.data for key, node in learner.nodes.iteritems()} 61 | orig_cpd_ids = {key: id(node.cpd) for key, node in learner.nodes.iteritems()} 62 | orig_param_node_ids = {key: id(node.param_nodes) 63 | for key, node in learner.nodes.iteritems()} 64 | orig_fields = {} 65 | for field in ('callback', 'max_iterations', 'log_posterior', 'iter'): 66 | orig_fields[field] = getattr(learner, field) 67 | 68 | prob_correct = get_online_rps(learner, held_out_student_idx, max_iterations=1000) 69 | 70 | # get the test node with all the appended test responses 71 | test_correct = learner.nodes[TEST_RESPONSES_KEY].data 72 | 73 | valid_idx = np.isfinite(prob_correct) 74 | num_nan_rp = len(prob_correct) - np.sum(valid_idx) 75 | # check that number of NaN RPs equals total number of students 76 | self.assertEqual(num_nan_rp, len(np.unique(held_out_student_idx))) 77 | online_pc = Metrics.online_perc_correct(test_correct, held_out_student_idx) 78 | np.testing.assert_array_almost_equal(prob_correct[valid_idx], online_pc[valid_idx], 79 | decimal=6) 80 | 81 | # test that the original quantities are not modified 82 | for key in orig_datas: 83 | self.assertTrue(learner.nodes[key].data is orig_datas[key]) 84 | np.testing.assert_array_equal(learner.nodes[key].data, orig_datas[key]) 85 | self.assertTrue(id(learner.nodes[key].cpd) == orig_cpd_ids[key]) 86 | self.assertTrue(id(learner.nodes[key].param_nodes) == orig_param_node_ids[key]) 87 | for field, value in orig_fields.iteritems(): 88 | self.assertTrue(getattr(learner, field) is value) 89 | 90 | # test that running online prediction again yields the same result; this time modify learner 91 | prob_correct_mod = 
get_online_rps(learner, held_out_student_idx, max_iterations=1000, 92 | copy_learner=False) 93 | np.testing.assert_equal(prob_correct_mod, prob_correct) 94 | # original responses should not have been modified, but thetas should have been 95 | for key in orig_datas: 96 | if key == THETAS_KEY: 97 | self.assertFalse(learner.nodes[key].data is orig_datas[key]) 98 | else: 99 | self.assertTrue(learner.nodes[key].data is orig_datas[key]) 100 | self.assertTrue(id(learner.nodes[key].cpd) == orig_cpd_ids[key]) 101 | self.assertTrue(id(learner.nodes[key].param_nodes) == orig_param_node_ids[key]) 102 | 103 | def test_first_interaction_rps(self): 104 | """ Test that the predicted RP for students' first interactions is equal to the item 105 | offset parameter passed through the IRF. """ 106 | learner, held_out_student_idx = self.set_up_learner(False, True, 1.0) 107 | prob_correct = get_online_rps(learner, held_out_student_idx, max_iterations=100, 108 | compute_first_interaction_rps=True) 109 | # check that there are no NaNs in RPs 110 | self.assertTrue(np.all(np.isfinite(prob_correct))) 111 | 112 | # figure out which interactions are a student's first 113 | seen_student_idx = set() 114 | first_int_idx = np.zeros(len(held_out_student_idx), dtype=bool) 115 | for k, idx in enumerate(held_out_student_idx): 116 | if idx not in seen_student_idx: 117 | first_int_idx[k] = True 118 | seen_student_idx.add(idx) 119 | 120 | # test that RPs are offset coeffs passed through the IRF 121 | test_offsets = learner.params_per_response()[TEST_RESPONSES_KEY][OFFSET_COEFFS_KEY] 122 | np.testing.assert_array_almost_equal(prob_correct[first_int_idx], 123 | OnePOCPD._prob_correct_from_irf_arg(test_offsets[first_int_idx]).ravel()) 124 | 125 | def test_online_pred_contemporaneous_events(self): 126 | """ Test that online prediction for events with the same item parameters occurring at 127 | the same labeled "time" yield the same probability of correct (i.e. thetas do not change 128 | from one event to another). 129 | """ 130 | learner, held_out_student_idx = self.set_up_learner(False, False, 1.0) 131 | num_test_interactions = len(held_out_student_idx) 132 | unique_student_idx = set(held_out_student_idx) 133 | 134 | # test without providing times 135 | prob_correct = get_online_rps(learner, held_out_student_idx, max_iterations=1000, 136 | compute_first_interaction_rps=True) 137 | for student_idx in unique_student_idx: 138 | # expect all predicted RPs to be different 139 | student_prob_correct = prob_correct[held_out_student_idx == student_idx] 140 | self.assertTrue(np.all(np.diff(student_prob_correct))) 141 | 142 | # test with all unique times 143 | unique_times = np.arange(num_test_interactions) 144 | prob_correct = get_online_rps(learner, held_out_student_idx, max_iterations=1000, 145 | compute_first_interaction_rps=True, 146 | test_student_time_idx=unique_times) 147 | for student_idx in unique_student_idx: 148 | # expect all predicted RPs to be different 149 | student_prob_correct = prob_correct[held_out_student_idx == student_idx] 150 | self.assertTrue(np.all(np.diff(student_prob_correct))) 151 | 152 | # test with contemporaneous interactions. 
last two of each student's interactions share time 153 | contemp_times = np.arange(num_test_interactions) 154 | for student_idx in unique_student_idx: 155 | contemp_times[np.flatnonzero(held_out_student_idx == student_idx)[-2:]] = -1 156 | prob_correct = get_online_rps(learner, held_out_student_idx, max_iterations=1000, 157 | compute_first_interaction_rps=True, 158 | test_student_time_idx=contemp_times) 159 | for student_idx in unique_student_idx: 160 | # expect last two predicted RPs to be same 161 | student_prob_correct = prob_correct[held_out_student_idx == student_idx] 162 | if len(student_prob_correct) > 1: 163 | self.assertFalse(np.any(np.diff(student_prob_correct[-2:]))) 164 | 165 | def test_idx_to_occurrence_ordinal(self): 166 | """ 167 | Test helper method for setting the ordinal based on student and time ids. 168 | """ 169 | students = np.array([1, 0, 1, 2, 3, 3, 0, 1, 1]) 170 | expected_ordinals = [0, 0, 1, 0, 0, 1, 1, 2, 3] 171 | np.testing.assert_array_equal(expected_ordinals, _idx_to_occurrence_ordinal(students)) 172 | 173 | # same with string identifiers 174 | students = np.array(['B', 'A', 'B', 'C', 'D', 'D', 'A', 'B', 'B']) 175 | np.testing.assert_array_equal(expected_ordinals, _idx_to_occurrence_ordinal(students)) 176 | 177 | # all unique times 178 | times = np.array([4, 1, 3, 2, 6, 5, 0, 8, 7]) 179 | np.testing.assert_array_equal(expected_ordinals, 180 | _idx_to_occurrence_ordinal(students, times)) 181 | # non-unique times, but unique for each student 182 | times = np.array([4, 1, 1, 1, 1, 5, 0, 6, 3]) 183 | np.testing.assert_array_equal(expected_ordinals, 184 | _idx_to_occurrence_ordinal(students, times)) 185 | # bundle the middle 2 interactions for student 'B' 186 | times = np.array([4, 1, 1, 1, 1, 5, 0, 1, 3]) 187 | expected_ordinals = [0, 0, 1, 0, 0, 1, 1, 1, 2] 188 | np.testing.assert_array_equal(expected_ordinals, 189 | _idx_to_occurrence_ordinal(students, times)) 190 | -------------------------------------------------------------------------------- /rnn_prof/irt/online_cross_validation.py: -------------------------------------------------------------------------------- 1 | """ 2 | Helper utilities for taking a trained BayesNet model with a test responses node, and computing 3 | predicted probability of correct in an online setting by training new learners on subsequences 4 | of interactions. 5 | """ 6 | import logging 7 | 8 | import numpy as np 9 | 10 | from .callbacks import ConvergenceCallback 11 | from .constants import TRAIN_RESPONSES_KEY, TEST_RESPONSES_KEY, THETAS_KEY, OFFSET_COEFFS_KEY 12 | from .irt import BayesNetLearner, BayesNetGraph 13 | from .node import Node 14 | from .updaters import SolverPars 15 | 16 | LOGGER = logging.getLogger(__name__) 17 | LOGGER.setLevel(logging.INFO) 18 | PRINT_FREQ = 100 19 | 20 | 21 | def get_online_rps(learner, test_student_idx, test_student_time_idx=None, 22 | learn_node_keys=(THETAS_KEY, ), compute_first_interaction_rps=False, 23 | copy_learner=True, **test_learner_kwargs): 24 | """Compute the probability of correct of the `test response` node's interactions with an online 25 | training scheme (train learners on interactions up to but not including interaction i, and make 26 | a prediction of probability of correct for i. 27 | NOTE: We assume that for each student, interactions are sorted by time. 28 | 29 | :param BayesNetLearner learner: The learner with the original train and test response nodes. 
30 | :param np.ndarray test_student_idx: Index indicating the student associated with each 31 | interaction in learner.nodes['test responses']. 32 | :param np.ndarray|None test_student_time_idx: unique event identifiers for each interaction 33 | in the test set. If None, it is assumed that each interaction occurs at a new time, and 34 | can be used to predict all following interactions. If supplied, adjacent interactions 35 | with identical identifiers will both be in the test or the train set, and never split 36 | during online validation. 37 | :param tuple(str) learn_node_keys: The keys of learner nodes whose variables should be adapted 38 | at each iteration. 39 | :param bool compute_first_interaction_rps: Whether to compute the RP of a student's first 40 | interaction (thetas optimized under the prior only). Default is no, in which case NaNs 41 | are returned for first interaction RPs. 42 | :param bool copy_learner: whether to operate on a copy of the learner, which avoids mutating 43 | the learner but incurs a memory cost of copying all the nodes' data. Set to False if the 44 | learner is disposable, in which case the data of all nodes in ``learn_node_keys`` will be 45 | modified, and the theta node will be pruned in-place for efficient optimization. 46 | :param test_learner_kwargs: Optional keyword arguments that will be passed to the constructor 47 | of BayesNetLearner for the test learner. 48 | :return: Probabilities of correct (RPs) for each interaction (not including the first, which is 49 | set to np.nan) derived from a learner trained on the previous interactions. 50 | :rtype: np.ndarray 51 | """ 52 | if not (test_learner_kwargs and 'callback' in test_learner_kwargs): 53 | test_learner_kwargs['callback'] = ConvergenceCallback() 54 | 55 | # get learner logger to set desired levels 56 | learner_logger = logging.getLogger('rnn_prof.irt') 57 | 58 | # get iteration count, an array indicating the online validation iteration associated with each 59 | # interaction in the set of test responses 60 | iteration_count = _idx_to_occurrence_ordinal(test_student_idx, test_student_time_idx) 61 | max_interactions = np.max(iteration_count) + 1 62 | 63 | # get corrects and parameter indices that will be sub-indexed during online validation 64 | correct = learner.nodes[TEST_RESPONSES_KEY].data 65 | theta_idx = learner.nodes[TEST_RESPONSES_KEY].cpd.index_map(THETAS_KEY) 66 | item_idx = learner.nodes[TEST_RESPONSES_KEY].cpd.index_map(OFFSET_COEFFS_KEY) 67 | cpd_class = learner.nodes[TEST_RESPONSES_KEY].cpd.__class__ 68 | 69 | num_items = learner.nodes[OFFSET_COEFFS_KEY].cpd.dim 70 | # initialize arrays for storing all online validation prob corrects 71 | prob_correct = np.nan * np.empty_like(iteration_count, dtype=float) 72 | 73 | if copy_learner: 74 | # make copies of the nodes 75 | new_nodes = [] 76 | for node in BayesNetGraph(learner.nodes).topological_sorting(): 77 | # insert a copy 78 | LOGGER.debug("adding {}".format(node.name)) 79 | new_nodes.append(node.copy()) 80 | 81 | # replace child-parent references in the previously inserted nodes to point to this one 82 | for prev_node in new_nodes: 83 | for par_key, par_node in prev_node.param_nodes.iteritems(): 84 | if par_node is node: 85 | prev_node.param_nodes[par_key] = new_nodes[-1] 86 | LOGGER.debug("relinking %s's node's %s param", 87 | prev_node.name, par_key) 88 | 89 | test_learner = BayesNetLearner(new_nodes, **test_learner_kwargs) 90 | else: 91 | test_learner = learner 92 | 93 | # turn off learning for nodes that should be constant 94 | 
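    # (only the nodes named in learn_node_keys, by default just the thetas, keep learning)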
for node in test_learner.nodes.itervalues(): 95 | if node.name not in learn_node_keys: 96 | LOGGER.debug("node {} parameters will not be learned".format(node.name)) 97 | node.solver_pars.learn = False 98 | node.converged = True 99 | 100 | theta_node = test_learner.nodes[THETAS_KEY] 101 | # get the thetas that depend (directly or through the prior precision) on the interactions 102 | # in orig_test_node 103 | thetas_to_keep = theta_node.cpd.get_dependent_vars(np.unique(theta_idx)) 104 | # trim theta node in place and remap theta_idx to the newly trimmed cpd 105 | theta_idx = theta_node.subset(thetas_to_keep, inplace=True)[theta_idx] 106 | num_thetas = theta_node.cpd.dim 107 | 108 | # quiet the online learner logger from INFO to WARNING (leave DEBUG alone) 109 | orig_log_level = learner_logger.getEffectiveLevel() 110 | if orig_log_level == logging.INFO: 111 | learner_logger.setLevel(logging.WARNING) 112 | 113 | for k in np.arange(0 if compute_first_interaction_rps else 1, max_interactions): 114 | test_idx = (iteration_count == k) 115 | train_idx = (iteration_count < k) 116 | # remove from train index students not in test_idx (whose interactions are all processed) 117 | train_idx &= np.in1d(test_student_idx, test_student_idx[test_idx]) 118 | 119 | test_learner = BayesNetLearner(test_learner.nodes.values(), **test_learner_kwargs) 120 | # make new train/test nodes by splitting the original test node's correct into train/test 121 | for node_name, idx in ((TRAIN_RESPONSES_KEY, train_idx), (TEST_RESPONSES_KEY, test_idx)): 122 | if k == 0 and node_name == TRAIN_RESPONSES_KEY: 123 | # when on first interaction, make training node not empty (to avoid errors); it 124 | # will be labeled held-out below 125 | idx = test_idx 126 | param_nodes = test_learner.nodes[node_name].param_nodes 127 | test_learner.nodes[node_name] = Node(name=node_name, 128 | data=correct[idx], 129 | solver_pars=SolverPars(learn=False), 130 | cpd=cpd_class(item_idx=item_idx[idx], 131 | theta_idx=theta_idx[idx], 132 | num_thetas=num_thetas, 133 | num_items=num_items), 134 | param_nodes=param_nodes, 135 | held_out=((node_name == TEST_RESPONSES_KEY) or 136 | k == 0)) 137 | 138 | # run this iteration's learner and save the probability of correct for test responses 139 | test_learner.learn() 140 | iter_test_node = test_learner.nodes[TEST_RESPONSES_KEY] 141 | prob_correct[test_idx] = iter_test_node.cpd.compute_prob_true(**iter_test_node.param_data) 142 | 143 | if np.any(np.isnan(prob_correct[test_idx])): 144 | LOGGER.warn("NaN value in prob correct; iteration=%d" % k) 145 | if not k % PRINT_FREQ: 146 | num_train_interactions = np.sum(train_idx) 147 | num_test_interactions = np.sum(test_idx) 148 | msg = "Processed histories up to length %d (max=%d): %d train and %d test interactions." 149 | LOGGER.info(msg % (k, max_interactions, num_train_interactions, num_test_interactions)) 150 | 151 | # reset the learner logger 152 | learner_logger.setLevel(orig_log_level) 153 | 154 | return prob_correct 155 | 156 | 157 | def _idx_to_occurrence_ordinal(student_ids, time_ids=None): 158 | """ 159 | Convert student index and unique time identifiers into the ordinal of the student's 160 | interaction. The values of the time index are not important, only whether the value is 161 | equal to the previous value for the student. 
For example, the result of: 162 | idx_to_occurrence_ordinal(['s1', 's1', 's1', 's2', 's3', 's3', 's1'], 163 | ['t2', 't3', 't3', 't1', 't1', 't1', 't2']) 164 | is [0, 1, 1, 0, 0, 1, 2] 165 | Note that the last interaction (s1, t2) has a repeat time identifier, but 166 | because it is different than the student's previous time identifier at interaction 3 (s1, t3), 167 | this event is considered non-repeat. 168 | 169 | :param np.ndarray student_ids: unique identifiers for each student 170 | :param None|np.ndarray time_ids: identifiers for the time of each interaction (NOTE: 171 | this method only tests equality between adjacent events for each student; i.e., 172 | these identifiers are unique if sorted, but not necessarily unique if unsorted.) 173 | :return: interaction ordinal for each student 174 | :rtype: np.ndarray[int] 175 | """ 176 | check_times = time_ids is not None 177 | student_idx = np.unique(student_ids, return_inverse=True)[1] 178 | counter = -np.ones(np.max(student_idx) + 1, dtype=int) 179 | idx_ordinal = np.zeros_like(student_idx) 180 | if check_times: 181 | prev_times = np.zeros_like(counter) 182 | for i, k in enumerate(student_idx): 183 | if check_times: 184 | this_time = time_ids[i] 185 | if counter[k] == -1: 186 | new_time = 1 187 | else: 188 | new_time = int(prev_times[k] != this_time) 189 | prev_times[k] = this_time 190 | else: 191 | new_time = 1 192 | counter[k] += new_time 193 | idx_ordinal[i] = counter[k] 194 | return idx_ordinal 195 | -------------------------------------------------------------------------------- /rnn_prof/run_irt.py: -------------------------------------------------------------------------------- 1 | """ 2 | Script for running basic online IRT 3 | """ 4 | import logging 5 | 6 | import numpy as np 7 | import pandas as pd 8 | from scipy import sparse as sp 9 | 10 | from .data.constants import (ITEM_IDX_KEY, TEMPLATE_IDX_KEY, USER_IDX_KEY, CORRECT_KEY, 11 | CONCEPT_IDX_KEY) 12 | from .data.wrapper import DEFAULT_DATA_OPTS 13 | from .irt import TEST_RESPONSES_KEY, OFFSET_COEFFS_KEY 14 | from .irt.callbacks import ConvergenceCallback 15 | from .irt.learners import OnePOLearner, TwoPOLearner, OnePOHighRT, HIGHER_OFFSET_KEY 16 | from .irt.metrics import Metrics 17 | from .irt.online_cross_validation import get_online_rps 18 | 19 | LOGGER = logging.getLogger(__name__) 20 | 21 | 22 | def get_metrics(correct, rps): 23 | """ Compute global PC, MAP Accuracy, AUC validation metrics. 24 | :param np.ndarray[bool] correct: correctnesses 25 | :param np.ndarray[float] rps: probability of correct 26 | :return: global percent correct, MAP accuracy, AUC 27 | :rtype: dict 28 | """ 29 | correct_hats = rps >= 0.5 30 | global_acc = np.mean(np.array(correct, dtype=float)) 31 | map_acc = np.mean(np.array(correct_hats == correct, dtype=float)) 32 | auc = Metrics.auc_helper(correct, rps) 33 | return {'global': global_acc, 'map': map_acc, 'auc': auc} 34 | 35 | 36 | def compute_theta_idx(train_df, test_df=None, single_concept=True): 37 | """ 38 | Compute theta indices. If single_concept is True, then there is one theta 39 | per user, and if it is false, there is one theta per user/concept pair. 40 | 41 | Training and testing users are assumed disjoint and consecutive. 42 | 43 | :param pd.DataFrame train_df: The DataFrame of training data. Should have 44 | columns labeled `USER_IDX_KEY` and `CONCEPT_IDX_KEY`. 45 | :param pd.DataFrame|None test_df: The DataFrame of testing data. Should have 46 | columns labeled `USER_IDX_KEY` and `CONCEPT_IDX_KEY`. 
Can be None and if 47 | so is simply ignored. 48 | :param bool single_concept: Should there be one theta per user (True) or 49 | one theta per user/concept pair (False) 50 | :return: Theta indices whose order corresponds to the order of the passed 51 | data. Training comes before testing. 52 | :rtype: np.ndarray 53 | """ 54 | if single_concept: 55 | if test_df is None: 56 | return train_df[USER_IDX_KEY].values 57 | else: 58 | return np.concatenate([train_df[USER_IDX_KEY].values, test_df[USER_IDX_KEY].values]) 59 | else: 60 | num_users = train_df[USER_IDX_KEY].max() + 1 61 | if test_df is None: 62 | train_idx = train_df[USER_IDX_KEY].values + train_df[CONCEPT_IDX_KEY].values * num_users 63 | return train_idx 64 | 65 | num_users = max(num_users, test_df[USER_IDX_KEY].max() + 1) 66 | train_idx = train_df[USER_IDX_KEY].values + train_df[CONCEPT_IDX_KEY].values * num_users 67 | test_idx = test_df[USER_IDX_KEY].values + test_df[CONCEPT_IDX_KEY].values * num_users 68 | return np.concatenate([train_idx, test_idx]) 69 | 70 | 71 | def get_irt_learner(train_df, test_df=None, is_two_po=True, 72 | single_concept=True, template_precision=None, item_precision=None): 73 | """ Make a 1PO or 2PO learner. 74 | 75 | :param pd.DataFrame train_df: Train data 76 | :param pd.DataFrame test_df: Optional test data 77 | :param bool is_two_po: Whether to make a 2PO learner 78 | :param bool single_concept: Should we train with a single theta per user (True) 79 | or a single theta per user per concept (False) 80 | :param float template_precision: The hierarchical IRT model has a model 81 | item_difficulty ~ N(template_difficulty, 1.0/item_precision) and 82 | template_difficulty ~ N(0, 1.0/template_precision). None just ignores 83 | templates. 84 | :param float|None item_precision: The precision of the Gaussian prior around items in a 85 | non-templated model. Or see `template_precision` for the templated case. If None, uses 1.0. 
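    Example (illustrative only; train_df/test_df are DataFrames shaped like those
    returned by the loaders in rnn_prof.data):
        learner = get_irt_learner(train_df, test_df, is_two_po=False, item_precision=2.0)
        learner.learn()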
86 | :return: The learner 87 | :rtype: BayesNetLearner 88 | """ 89 | correct = train_df[CORRECT_KEY].values.astype(bool) 90 | item_idx = train_df[ITEM_IDX_KEY].values 91 | is_held_out = np.zeros(len(train_df), dtype=bool) 92 | if test_df is not None: 93 | correct = np.concatenate((correct, test_df[CORRECT_KEY].values.astype(bool))) 94 | item_idx = np.concatenate((item_idx, test_df[ITEM_IDX_KEY].values)) 95 | is_held_out = np.concatenate((is_held_out, np.ones(len(test_df), dtype=bool))) 96 | 97 | student_idx = compute_theta_idx(train_df, test_df=test_df, single_concept=single_concept) 98 | if not template_precision: 99 | learner_class = TwoPOLearner if is_two_po else OnePOLearner 100 | learner = learner_class(correct, student_idx=student_idx, item_idx=item_idx, 101 | is_held_out=is_held_out, max_iterations=1000, 102 | callback=ConvergenceCallback()) 103 | for node in learner.nodes.itervalues(): 104 | node.solver_pars.updater.step_size = 0.5 105 | if item_precision is not None: 106 | learner.nodes[OFFSET_COEFFS_KEY].cpd.precision = \ 107 | item_precision * sp.eye(learner.nodes[OFFSET_COEFFS_KEY].data.size) 108 | LOGGER.info("Made a 1PO IRT learner with item precision %f", item_precision) 109 | else: 110 | LOGGER.info("Made a 1PO IRT learner with default item precision") 111 | else: 112 | template_idx = train_df[TEMPLATE_IDX_KEY] 113 | if test_df is not None: 114 | template_idx = np.concatenate((template_idx, test_df[TEMPLATE_IDX_KEY].values)) 115 | problem_to_template = {item: template for item, template in zip(item_idx, template_idx)} 116 | problem_to_template = sorted(problem_to_template.items()) 117 | template_idx = np.array([x for _, x in problem_to_template]) 118 | learner = OnePOHighRT(correct, student_idx, item_idx, template_idx, 119 | is_held_out=is_held_out, max_iterations=1000, 120 | higher_precision=item_precision, 121 | callback=ConvergenceCallback()) 122 | if item_precision is not None: 123 | learner.nodes[HIGHER_OFFSET_KEY].cpd.precision = \ 124 | template_precision * sp.eye(learner.nodes[HIGHER_OFFSET_KEY].data.size) 125 | for node in learner.nodes.itervalues(): 126 | node.solver_pars.updater.step_size = 0.5 127 | LOGGER.info("Made a hierarchical IRT learner with item precision %f and template " 128 | "precision %f", item_precision, template_precision) 129 | return learner 130 | 131 | 132 | def irt(data_folds, num_folds, output=None, data_opts=DEFAULT_DATA_OPTS, is_two_po=True, 133 | single_concept=True, template_precision=None, which_fold=None, 134 | item_precision=None): 135 | """ Run 1PO/2PO IRT and print test-set metrics. 136 | 137 | :param iterable data_folds: An iterator over (train, test) data tuples 138 | :param int num_folds: number of folds 139 | :param str output: where to store the pickled output of the results 140 | :param DataOpts data_opts: data pre-processing parameters, to be saved (in the future) with IRT 141 | outputs. See `data.wrapper` for details and default values. 142 | :param bool is_two_po: Whether to use the 2PO IRT model 143 | :param bool single_concept: Should we train with a single concept per user (True) 144 | or a single concept per user per concept (False) 145 | :param float template_precision: the precision of the higher-order template variable 146 | specifying the mean of the item difficulties 147 | :param int | None which_fold: Specify which of the folds you want to actually process. If None, 148 | process all folds. Good for naive parallelization. 
149 | :param float|None item_precision: The precision of the Gaussian prior around items in a 150 | non-templated model. If None, uses 1.0. 151 | """ 152 | if which_fold is not None and not (1 <= which_fold <= num_folds): 153 | raise ValueError("which_fold ({which_fold}) must be between 1 " 154 | "and num_folds({num_folds})".format(which_fold=which_fold, 155 | num_folds=num_folds)) 156 | 157 | np.random.seed(data_opts.seed) 158 | metrics = pd.DataFrame() 159 | for fold_num, (train_data, test_data) in enumerate(data_folds): 160 | fold_num += 1 161 | if which_fold and fold_num != which_fold: 162 | continue 163 | fold_metrics, _, _ = eval_learner(train_data, test_data, is_two_po, fold_num, 164 | single_concept=single_concept, 165 | template_precision=template_precision, 166 | item_precision=item_precision) 167 | metrics = metrics.append(pd.DataFrame(index=[len(metrics)], data=fold_metrics)) 168 | 169 | if output: 170 | metrics.to_pickle(output) 171 | 172 | # Print overall results 173 | LOGGER.info("Overall Acc: %.5f AUC: %.5f", metrics['map'].mean(), metrics['auc'].mean()) 174 | 175 | 176 | def eval_learner(train_data, test_data, is_two_po, fold_num, 177 | single_concept=True, template_precision=None, item_precision=None): 178 | """ Create, train, and cross-validate an IRT learner on a train/test split. 179 | 180 | :param pd.DataFrame train_data: training data 181 | :param pd.DataFrame test_data: testing data for cross-validation (required) 182 | :param bool is_two_po: Whether to use the 2PO IRT model 183 | :param int fold_num: fold number (for logging and recording results only) 184 | :param float template_precision: The hierarchical IRT model has a model 185 | item_difficulty ~ N(template_difficulty, 1.0/template_precision). None just ignores 186 | templates. 187 | :param bool single_concept: Should we train with a single concept per user (True) 188 | or a single concept per user per concept (False) 189 | :param float|None item_precision: The precision of the Gaussian prior around items in a 190 | non-templated model. If None, uses 1.0. 191 | :return: the validation metrics, predicted RP's, and boolean corrects on the test set 192 | :rtype: dict, np.ndarray[float], np.ndarray[bool] 193 | """ 194 | LOGGER.info("Training %s model, fold %d, (single concept = %s)", 195 | '2PO' if is_two_po else '1PO', fold_num, single_concept) 196 | learner = get_irt_learner(train_data, test_data, is_two_po=is_two_po, 197 | single_concept=single_concept, 198 | template_precision=template_precision, 199 | item_precision=item_precision) 200 | learner.learn() 201 | LOGGER.info("Performing online cross-validation") 202 | prob_correct = get_online_rps(learner, test_data[USER_IDX_KEY].values, 203 | compute_first_interaction_rps=True) 204 | 205 | test_correct = learner.nodes[TEST_RESPONSES_KEY].data 206 | metrics = get_metrics(test_correct, prob_correct) 207 | metrics['is_two_po'] = is_two_po 208 | metrics['fold_num'] = fold_num 209 | metrics['num_test_interactions'] = len(test_correct) 210 | LOGGER.info("Fold %d: Num Interactions: %d; Test Accuracy: %.5f; Test AUC: %.5f", 211 | fold_num, metrics['num_test_interactions'], metrics['map'], metrics['auc']) 212 | return metrics, prob_correct, test_correct 213 | -------------------------------------------------------------------------------- /rnn_prof/tests/irt/test_learners.py: -------------------------------------------------------------------------------- 1 | """ 2 | Tests for simple ogive and logistic learners. 
3 | 4 | @author Chaitu Ekanadham - chaitu@knewton.com 5 | 6 | 06/03/2015 7 | """ 8 | from __future__ import division 9 | import itertools as its 10 | import uuid 11 | 12 | import numpy as np 13 | import unittest 14 | 15 | from rnn_prof.irt import learners as undertest 16 | from rnn_prof.irt.callbacks import RecordingCallback, ITER_KEY, TRAIN_LOG_POST_KEY 17 | from rnn_prof.irt.constants import (TRAIN_RESPONSES_KEY, TEST_RESPONSES_KEY, THETAS_KEY, 18 | OFFSET_COEFFS_KEY, NONOFFSET_COEFFS_KEY) 19 | from rnn_prof.irt.cpd import OnePOCPD, TwoPOCPD 20 | from rnn_prof.irt.testing_utils import EPSILON, generate_data 21 | 22 | NUM_TRIALS = 3 23 | # try combinations of a few (possibly) problematic scenarios 24 | NUM_THETAS, NUM_ITEMS, NUM_RESPONSES, PROB_CORRECT = zip(*its.product((1, 10), 25 | (1, 50), 26 | (100,), 27 | (0.5,))) 28 | 29 | 30 | class TestOgiveLearners(unittest.TestCase): 31 | @staticmethod 32 | def gen_data(trial): 33 | return generate_data(num_students=NUM_THETAS[trial], 34 | num_items=NUM_ITEMS[trial], 35 | num_responses=NUM_RESPONSES[trial], 36 | prob_correct=PROB_CORRECT[trial]) 37 | 38 | def learn_and_assert(self, learner): 39 | """ Run the learner and make sure log posteriors are finite and increasing. 40 | 41 | :param BayesNetLearner learner: learner to optimize parameters 42 | """ 43 | learner.learn() 44 | iter_indices = np.isfinite(learner.callback.metrics[ITER_KEY]) 45 | log_posteriors = learner.callback.metrics[TRAIN_LOG_POST_KEY][iter_indices] 46 | self.assertTrue(np.all(np.isfinite(log_posteriors))) 47 | np.testing.assert_array_less(-EPSILON, np.diff(log_posteriors)) 48 | 49 | def test_irf(self): 50 | """ 51 | Test that the correct IRF function is used in the learners. 52 | """ 53 | data = self.gen_data(0) 54 | for (learner_class, cpd_class) in ((undertest.OnePOLearner, OnePOCPD), 55 | (undertest.TwoPOLearner, TwoPOCPD)): 56 | learner = learner_class(data.correct, student_idx=data.student_idx, 57 | item_idx=data.item_idx, num_students=NUM_THETAS[0], 58 | num_items=NUM_ITEMS[0], callback=RecordingCallback(), 59 | max_iterations=10) 60 | self.assertEqual(learner.nodes[TRAIN_RESPONSES_KEY].cpd.__class__, cpd_class) 61 | 62 | def test_irt_learning(self): 63 | """ 64 | Test that log posteriors are finite and increasing for various trials for the 1PO/2PO model. 
65 | """ 66 | for trial in range(len(NUM_THETAS)) * NUM_TRIALS: 67 | data = self.gen_data(trial) 68 | for learner_class in (undertest.OnePOLearner, undertest.TwoPOLearner): 69 | learner = learner_class(data.correct, student_idx=data.student_idx, 70 | item_idx=data.item_idx, num_students=NUM_THETAS[trial], 71 | num_items=NUM_ITEMS[trial], callback=RecordingCallback(), 72 | max_iterations=10) 73 | self.learn_and_assert(learner) 74 | 75 | # test that params_per_response() returns the correct values 76 | pars = learner.params_per_response() 77 | resp_node = learner.nodes[TRAIN_RESPONSES_KEY] 78 | for key in (OFFSET_COEFFS_KEY, THETAS_KEY, NONOFFSET_COEFFS_KEY): 79 | if key in learner.nodes: 80 | expected_par = resp_node.cpd.lin_operators[key] * learner.nodes[key].data 81 | np.testing.assert_array_equal(pars[TRAIN_RESPONSES_KEY][key], 82 | expected_par) 83 | # test that posterior hessian computation does not break, Hessians are of right size 84 | self.assertEqual(learner.get_posterior_hessian(OFFSET_COEFFS_KEY).shape, 85 | (NUM_ITEMS[trial], 1)) 86 | self.assertEqual(learner.get_posterior_hessian(THETAS_KEY).shape, 87 | (NUM_THETAS[trial], 1)) 88 | 89 | def test_train_test_split(self): 90 | """ Test that the classes split train and test data properly. """ 91 | for trial in range(len(NUM_THETAS)) * NUM_TRIALS: 92 | if NUM_RESPONSES[trial] < 100: 93 | continue 94 | data = self.gen_data(trial) 95 | 96 | # test OnePO and TwoPO 97 | for learner_class in (undertest.OnePOLearner, undertest.TwoPOLearner): 98 | is_held_out = np.random.rand(NUM_RESPONSES[trial]) > 0.5 99 | train_idx = np.logical_not(is_held_out) 100 | learner = learner_class(data.correct, student_idx=data.student_idx, 101 | item_idx=data.item_idx, is_held_out=is_held_out, 102 | num_students=NUM_THETAS[trial], num_items=NUM_ITEMS[trial]) 103 | # test that correctnesses are split up correctly 104 | np.testing.assert_array_equal(learner.nodes[TRAIN_RESPONSES_KEY].data, 105 | data.correct[train_idx]) 106 | np.testing.assert_array_equal(learner.nodes[TEST_RESPONSES_KEY].data, 107 | data.correct[is_held_out]) 108 | # test that each response node's cpd references the right thetas and items 109 | for (cpd, idx) in ((learner.nodes[TRAIN_RESPONSES_KEY].cpd, train_idx), 110 | (learner.nodes[TEST_RESPONSES_KEY].cpd, is_held_out)): 111 | np.testing.assert_array_equal(cpd.index_map(THETAS_KEY), 112 | data.student_idx[idx]) 113 | np.testing.assert_array_equal(cpd.index_map(OFFSET_COEFFS_KEY), 114 | data.item_idx[idx]) 115 | if learner_class is undertest.TwoPOLearner: 116 | np.testing.assert_array_equal(cpd.index_map(NONOFFSET_COEFFS_KEY), 117 | data.item_idx[idx]) 118 | # test that train and test node's held_out flag is set correctly 119 | self.assertFalse(learner.nodes[TRAIN_RESPONSES_KEY].held_out) 120 | self.assertTrue(learner.nodes[TEST_RESPONSES_KEY].held_out) 121 | 122 | def test_get_ids(self): 123 | for learner_class in (undertest.OnePOLearner, undertest.TwoPOLearner): 124 | # test the case with no IDs 125 | num_items = 50 126 | num_students = 20 127 | data = generate_data(num_items=num_items, num_students=num_students) 128 | learner = learner_class(data.correct, student_idx=data.student_idx, 129 | item_idx=data.item_idx) 130 | for bogus_id in ('bogus_id', data.item_idx[0], 0): 131 | with self.assertRaises(ValueError): 132 | _ = learner.get_difficulty(bogus_id) 133 | 134 | uniq_item_ids = np.array([str(uuid.uuid4()) for _ in range(num_items)]) 135 | item_ids = uniq_item_ids[data.item_idx] 136 | uniq_student_ids = np.array([str(uuid.uuid4()) for 
_ in range(num_students)]) 137 | student_ids = uniq_student_ids[data.student_idx] 138 | 139 | # test that non-unique IDs for a single item raises an error 140 | bad_item_ids = [x for x in item_ids] 141 | # find an item that occurs more than once 142 | recurrent_item_idx, item_count = np.unique(data.item_idx, return_counts=True) 143 | recurrent_item_idx = recurrent_item_idx[(item_count > 1)][0] 144 | # set the first two of its occurrences to have different IDs 145 | occ_idx = np.flatnonzero(np.array(data.item_idx) == recurrent_item_idx) 146 | bad_item_ids[occ_idx[0]] = uniq_item_ids[0] 147 | bad_item_ids[occ_idx[1]] = uniq_item_ids[1] 148 | with self.assertRaises(ValueError): 149 | _ = learner_class(data.correct, student_idx=data.student_idx, 150 | item_idx=data.item_idx, item_ids=bad_item_ids) 151 | 152 | # test that mismatched ID/index lengths raise an error 153 | bad_item_ids = item_ids[:-1] 154 | with self.assertRaises(ValueError): 155 | _ = learner_class(data.correct, student_idx=data.student_idx, 156 | item_idx=data.item_idx, item_ids=bad_item_ids) 157 | 158 | learner = learner_class(data.correct, student_idx=data.student_idx, 159 | student_ids=student_ids, item_idx=data.item_idx, 160 | item_ids=item_ids, max_iterations=10) 161 | learner.learn() 162 | for _ in range(NUM_TRIALS): 163 | query_len = np.random.randint(1, 3) 164 | 165 | # test item parameters 166 | query_item_id = np.random.choice(uniq_item_ids, size=query_len, replace=False) 167 | # unique item IDs in sort order of item_idx 168 | sorted_ids = item_ids[np.unique(data.item_idx, return_index=True)[1]] 169 | # find interactions that match query_item_ids 170 | idx = [k for x in query_item_id for k, sid in enumerate(sorted_ids) if x == sid] 171 | 172 | actual_diffs = learner.get_difficulty(query_item_id) 173 | if learner_class is undertest.OnePOLearner: 174 | expected_diffs = -learner.nodes[OFFSET_COEFFS_KEY].data[idx] 175 | else: 176 | expected_diffs = -(learner.nodes[OFFSET_COEFFS_KEY].data[idx] / 177 | learner.nodes[NONOFFSET_COEFFS_KEY].data[idx]) 178 | np.testing.assert_array_equal(actual_diffs, expected_diffs) 179 | 180 | actual_offsets = learner.get_offset_coeff(query_item_id) 181 | expected_offsets = learner.nodes[OFFSET_COEFFS_KEY].data[idx] 182 | np.testing.assert_array_equal(actual_offsets, expected_offsets) 183 | 184 | if learner_class is undertest.TwoPOLearner: 185 | actual_nonoffsets = learner.get_nonoffset_coeff(query_item_id) 186 | expected_nonoffsets = learner.nodes[NONOFFSET_COEFFS_KEY].data[idx] 187 | np.testing.assert_array_equal(actual_nonoffsets, expected_nonoffsets) 188 | 189 | # test student parameters 190 | query_student_id = np.random.choice(uniq_student_ids, size=query_len, replace=False) 191 | # unique student IDs in sort order of student_idx 192 | sorted_ids = student_ids[np.unique(data.student_idx, return_index=True)[1]] 193 | # find interactions that match query_student_ids 194 | idx = [k for x in query_student_id for k, sid in enumerate(sorted_ids) if x == sid] 195 | 196 | actual_thetas = learner.get_theta(query_student_id) 197 | expected_thetas = learner.nodes[THETAS_KEY].data[idx] 198 | np.testing.assert_array_equal(actual_thetas, expected_thetas) 199 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. 
Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. 
Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 
179 |
180 |    To apply the Apache License to your work, attach the following
181 |    boilerplate notice, with the fields enclosed by brackets "{}"
182 |    replaced with your own identifying information. (Don't include
183 |    the brackets!) The text should be enclosed in the appropriate
184 |    comment syntax for the file format. We also recommend that a
185 |    file or class name and description of purpose be included on the
186 |    same "printed page" as the copyright notice for easier
187 |    identification within third-party archives.
188 |
189 |    Copyright {yyyy} {name of copyright owner}
190 |
191 |    Licensed under the Apache License, Version 2.0 (the "License");
192 |    you may not use this file except in compliance with the License.
193 |    You may obtain a copy of the License at
194 |
195 |        http://www.apache.org/licenses/LICENSE-2.0
196 |
197 |    Unless required by applicable law or agreed to in writing, software
198 |    distributed under the License is distributed on an "AS IS" BASIS,
199 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 |    See the License for the specific language governing permissions and
201 |    limitations under the License.
202 |
--------------------------------------------------------------------------------
/rnn_prof/irt/metrics.py:
--------------------------------------------------------------------------------
1 | """
2 | Class for computing various metrics on a data set with a BayesNet Node object.
3 | """
4 | import logging
5 |
6 | import numpy as np
7 |
8 | from .cpd.ogive import OgiveCPD
9 |
10 | EPSILON = 1e-16
11 |
12 | MAP_ACCURACY_KEY = 'map_accuracy'
13 | AUC_KEY = 'auc'
14 | LOGLI_KEY = 'logli'
15 | D_PRIME_KEY = 'd_prime'
16 | NAIVE_KEY = 'naive'
17 | METRICS_KEYS = {NAIVE_KEY, LOGLI_KEY, MAP_ACCURACY_KEY, AUC_KEY, D_PRIME_KEY}
18 |
19 | LOGGER = logging.getLogger(__name__)
20 |
21 |
22 | class Metrics(object):
23 |     """ Class for computing various performance metrics based on the data and the parameters in a
24 |     Node object. """
25 |
26 |     def __init__(self, node):
27 |         """ Initialize this object with a reference to a BayesNet Node object that holds the
28 |         CPD, the data, and the parameters."""
29 |         self.node = node
30 |
31 |     def _check_binary_cpd(self, func_name):
32 |         if not isinstance(self.node.cpd, OgiveCPD):
33 |             raise TypeError("{} only defined for OgiveCPDs, not {}".format(
34 |                 func_name, self.node.cpd.__class__))
35 |
36 |     @classmethod
37 |     def _check_finite(cls, prob_true, *args):
38 |         """ Check that all probabilities are finite; if not, remove those elements and corresponding
39 |         elements from other positional args.
40 |         :param np.ndarray prob_true: array to check for finiteness
41 |         :param args: optional arguments to subselect based on isfinite(prob_true)
42 |         :return: np.ndarray|tuple[np.ndarray]
43 |         """
44 |         if not np.all(np.isfinite(prob_true)):
45 |             valid_idx = np.isfinite(prob_true)
46 |             LOGGER.warn("%d non-finite prob corrects found; ignoring these interactions",
47 |                         np.sum(~valid_idx))
48 |             prob_true = prob_true[valid_idx]
49 |             args = tuple([arg[valid_idx] for arg in args])
50 |         if not len(args):
51 |             return prob_true
52 |         else:
53 |             return (prob_true,) + args
54 |
55 |     @staticmethod
56 |     def compute_per_student_naive(reg_ids, corrects, is_held_out):
57 |         """ Compute the per-student naive metrics on the training and test sets, based on predicting
58 |         correct if the student had more corrects in the training set. If no data in the training
59 |         set exist for the student, predict correct.
60 |
61 |         :param np.ndarray reg_ids: unique student identifier for each interaction
62 |         :param np.ndarray[bool] corrects: correctness values for each interaction
63 |         :param np.ndarray[bool] is_held_out: indicator whether an interaction is in the test set
64 |         :return: per student naive on the training and test sets
65 |         :rtype: float, float
66 |         """
67 |         if len(corrects) != len(reg_ids) or len(is_held_out) != len(reg_ids):
68 |             raise ValueError("reg_ids (%d), corrects (%d), is_held_out (%d) must have same length"
69 |                              % (len(reg_ids), len(corrects), len(is_held_out)))
70 |         uniq_regids, reg_idxs = np.unique(reg_ids, return_inverse=True)
71 |         num_reg_ids = len(uniq_regids)
72 |         train_reg_idxs = reg_idxs[~is_held_out]
73 |         test_reg_idxs = reg_idxs[is_held_out]
74 |         train_corrects = corrects[~is_held_out]
75 |         test_corrects = corrects[is_held_out]
76 |         per_student_num_correct = np.bincount(train_reg_idxs, weights=train_corrects,
77 |                                               minlength=num_reg_ids)
78 |         per_student_num_responses = np.bincount(train_reg_idxs, minlength=num_reg_ids)
79 |         pred_correct = (2 * per_student_num_correct >= per_student_num_responses)
80 |         train_per_student_naive = np.mean(pred_correct[train_reg_idxs] == train_corrects)
81 |         test_per_student_naive = np.mean(pred_correct[test_reg_idxs] == test_corrects)
82 |         return train_per_student_naive, test_per_student_naive
83 |
84 |     def compute_metric(self, metric_key, *args, **kwargs):
85 |         """ Compute metric specified by the supplied key.
86 |         :param str metric_key: key specifying the metric
87 |         :return: the value of the metric
88 |         :rtype: float
89 |         """
90 |         return getattr(self, 'compute_' + metric_key)(*args, **kwargs)
91 |
92 |     def compute_naive(self):
93 |         """ Compute the accuracy of predicting always correct or always incorrect,
94 |         whichever is higher. Defined for binary CPDs only.
95 |
96 |         :return: a number between 0 and 1 specifying prediction accuracy
97 |         :rtype: float
98 |         """
99 |         self._check_binary_cpd("Naive metric")
100 |         fraction_correct = np.mean(np.array(self.node.data, dtype=float))
101 |         return max(fraction_correct, 1. - fraction_correct)
102 |
103 |     def compute_logli(self, avg=False):
104 |         """ Compute the response log-likelihood (the value of the node's CPD given the stored data
105 |         and parameters).
106 |
107 |         :param bool avg: whether to normalize the log-likelihood by the size of the node's data
108 |         :return: the sum of the log-likelihoods over the data points.
109 |         :rtype: float
110 |         """
111 |         log_li = self.node.compute_log_prob()
112 |         if avg:
113 |             log_li /= self.node.data.size
114 |         return log_li
115 |
116 |     def compute_map_accuracy(self):
117 |         """ Compute the MAP accuracy (fraction of data points predicted correctly at the maximum
118 |         of the binary probability distribution). Defined for binary CPDs only.
119 |
120 |         :return: MAP accuracy
121 |         :rtype: float
122 |         """
123 |         self._check_binary_cpd("MAP accuracy")
124 |         prob_true = self.node.cpd.compute_prob_true(**self.node.param_data)
125 |         prob_true, data = self._check_finite(prob_true, self.node.data)
126 |         return np.mean((prob_true > 0.5) == data)
127 |
128 |     def compute_d_prime(self):
129 |         """ Compute the d-prime statistic measuring separation between response probabilities
130 |         conditioned on true (positive) and false (negative) data points.
131 |         Defined for binary CPDs only.
132 |
133 |         :return: the d-prime statistic of distribution separation
134 |         :rtype: float
135 |         """
136 |         self._check_binary_cpd("D prime")
137 |         prob_true = self.node.cpd.compute_prob_true(**self.node.param_data)
138 |         return self.d_prime_helper(self.node.data, prob_true)
139 |
140 |     def compute_auc(self):
141 |         """ Compute the area under curve (AUC) for the task of predicting binary labels
142 |         based on the probabilities computed by some model. The curve is the Receiver Operating
143 |         Characteristic (ROC) curve, which plots the true positive rate vs. the false positive rate
144 |         as one varies the threshold on the probabilities given by the model. AUC is also equal to
145 |         the probability that the model will yield a higher probability for a randomly chosen
146 |         positive data point than for a randomly chosen negative data point. Defined for binary
147 |         CPDs only.
148 |
149 |         NOTE: this assumes at least one positive and one negative data point (otherwise
150 |         the notions of true positive rate and false positive rate do not make
151 |         sense).
152 |
153 |         :return: a number between 0 and 1 specifying area under the ROC curve
154 |         :rtype: float
155 |         """
156 |         self._check_binary_cpd("AUC")
157 |         prob_true = self.node.cpd.compute_prob_true(**self.node.param_data)
158 |         return self.auc_helper(self.node.data, prob_true)
159 |
160 |     @staticmethod
161 |     def d_prime_helper(data, prob_true):
162 |         """ Compute the d-prime metric (of the separation of probabilities associated with positive
163 |         data labels and negative data labels).
164 |
165 |         :param np.ndarray[bool] data: binary data values (positive/negative class labels).
166 |         :param np.ndarray[float] prob_true: probability of positive label
167 |         :return: d-prime metric
168 |         :rtype: float
169 |         """
170 |         if len(prob_true) != len(data):
171 |             raise ValueError('prob_true and data must have the same length')
172 |         prob_true, data = Metrics._check_finite(prob_true, data)
173 |         pc_correct = prob_true[data]
174 |         pc_incorrect = prob_true[np.logical_not(data)]
175 |         mean_sep = np.mean(pc_correct) - np.mean(pc_incorrect)
176 |         norm_const = np.sqrt(0.5 * (np.var(pc_correct) + np.var(pc_incorrect)))
177 |         return mean_sep / norm_const
178 |
179 |     @staticmethod
180 |     def auc_helper(data, prob_true):
181 |         """ Compute AUC (area under ROC curve) as a function of binary data values and predicted
182 |         probabilities. If data includes only positive or only negative labels, returns np.nan.
183 |
184 |         :param np.ndarray[bool] data: binary data values (positive/negative class labels).
185 |         :param np.ndarray[float] prob_true: probability of positive label
186 |         :return: area under ROC curve
187 |         :rtype: float
188 |         """
189 |         if len(prob_true) != len(data):
190 |             raise ValueError('prob_true and data must have the same length')
191 |
192 |         prob_true, data = Metrics._check_finite(prob_true, data)
193 |         sorted_idx = np.argsort(prob_true)[::-1]
194 |         sorted_prob_true = prob_true[sorted_idx]
195 |         unique_prob_true_idx = np.append(np.flatnonzero(np.diff(sorted_prob_true)),
196 |                                          len(sorted_prob_true) - 1)
197 |         x = data[sorted_idx]
198 |         not_x = np.logical_not(x)
199 |
200 |         # Compute cumulative sums of true positives and false positives.
201 |         tp = np.cumsum(x)[unique_prob_true_idx].astype(float)
202 |         fp = np.cumsum(not_x)[unique_prob_true_idx].astype(float)
203 |
204 |         # The i'th element of tp (fp) is the number of true (false) positives
205 |         # resulting from using the i'th largest rp as a threshold. That is,
206 |         # we predict correct if a response's rp is >= sorted_prob_true[i].
207 |         # We want the first element to correspond to a threshold sufficiently
208 |         # high to yield no predictions of correct. The highest rp qualifies
209 |         # as this highest threshold if its corresponding response is incorrect.
210 |         # Otherwise, we need to add an artificial "highest threshold" at the
211 |         # beginning that yields 0 true positives and 0 false positives.
212 |         if tp[0] != 0.0:
213 |             tp = np.append(0.0, tp)
214 |             fp = np.append(0.0, fp)
215 |
216 |         # Calculate true positive rate and false positive rate.
217 |         # This requires at least 1 correct and 1 incorrect response.
218 |         if not tp[-1]:
219 |             return np.nan
220 |         tpr = tp / tp[-1]
221 |
222 |         if not fp[-1]:
223 |             return np.nan
224 |         fpr = fp / fp[-1]
225 |
226 |         return np.trapz(tpr, fpr)
227 |
228 |     @staticmethod
229 |     def online_perc_correct(correct, student_idx):
230 |         """ For each interaction, compute the percent correct for the student's previous
231 |         interactions. The returned array will contain NaNs for each student's first interaction.
232 |         :param np.ndarray[bool] correct: correctness value for each interaction
233 |         :param np.ndarray[int] student_idx: student index for each interaction
234 |         :return: percent correct on previous interactions for this student
235 |         :rtype: np.ndarray[float]
236 |         """
237 |         student_num_correct = np.zeros(np.max(student_idx) + 1)
238 |         student_num_answered = np.zeros(np.max(student_idx) + 1)
239 |         online_pc = np.nan * np.empty_like(correct, dtype=float)
240 |         for i, c in enumerate(correct):
241 |             j = student_idx[i]
242 |             if student_num_answered[j]:
243 |                 online_pc[i] = student_num_correct[j] / float(student_num_answered[j])
244 |             student_num_answered[j] += 1
245 |             student_num_correct[j] += int(c)
246 |         return online_pc
247 |
--------------------------------------------------------------------------------
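Note: the snippet below is an illustrative sketch, not a file in the repository. It shows how the static helpers defined in rnn_prof/irt/metrics.py above (Metrics.online_perc_correct, Metrics.auc_helper, Metrics.d_prime_helper) can be exercised on hand-made data; the arrays, the expected output, and the assumption that the package is importable as rnn_prof are all invented for demonstration.

import numpy as np

from rnn_prof.irt.metrics import Metrics

# Three students answering a few items each; True marks a correct response.
correct = np.array([True, False, True, True, False, True, False, True])
student_idx = np.array([0, 0, 0, 1, 1, 1, 2, 2])

# Percent correct on each student's previous interactions; by construction,
# every student's first interaction comes back as NaN.
online_pc = Metrics.online_perc_correct(correct, student_idx)
print(online_pc)  # expected (up to print formatting): nan, 1.0, 0.5, nan, 1.0, 0.5, nan, 0.0

# AUC and d-prime of some made-up predicted probabilities against the labels.
prob_correct = np.array([0.9, 0.2, 0.7, 0.8, 0.4, 0.6, 0.3, 0.55])
print(Metrics.auc_helper(correct, prob_correct))
print(Metrics.d_prime_helper(correct, prob_correct))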