├── .github
│   ├── ISSUE_TEMPLATE
│   │   ├── bug_report.md
│   │   ├── documentation.md
│   │   └── feature_request.md
│   ├── PULL_REQUEST_TEMPLATE.md
│   └── workflows
│       ├── python-publish.yml
│       └── python-test.yml
├── .gitignore
├── .travis.yml
├── AUTHORS.md
├── CHANGE.txt
├── CONTRIBUTE.md
├── CONTRIBUTE_CH.md
├── EduCDM
│   ├── DINA
│   │   ├── EM
│   │   │   ├── DINA.py
│   │   │   └── __init__.py
│   │   ├── GD
│   │   │   ├── DINA.py
│   │   │   └── __init__.py
│   │   └── __init__.py
│   ├── FuzzyCDF
│   │   ├── FuzzyCDF.py
│   │   ├── __init__.py
│   │   └── modules.py
│   ├── ICD
│   │   ├── ICD.py
│   │   ├── __init__.py
│   │   ├── etl
│   │   │   ├── __init__.py
│   │   │   ├── etl.py
│   │   │   └── utils.py
│   │   ├── metrics
│   │   │   ├── __init__.py
│   │   │   └── metrics.py
│   │   ├── sym
│   │   │   ├── __init__.py
│   │   │   ├── fit_eval.py
│   │   │   ├── net
│   │   │   │   ├── __init__.py
│   │   │   │   ├── dtn.py
│   │   │   │   ├── mirt.py
│   │   │   │   ├── ncd.py
│   │   │   │   └── net.py
│   │   │   └── pos_linear.py
│   │   └── utils.py
│   ├── IRR
│   │   ├── DINA.py
│   │   ├── IRT.py
│   │   ├── MIRT.py
│   │   ├── NCDM.py
│   │   ├── __init__.py
│   │   ├── etl
│   │   │   ├── __init__.py
│   │   │   ├── pair_etl.py
│   │   │   ├── point_etl.py
│   │   │   └── utils.py
│   │   └── loss.py
│   ├── IRT
│   │   ├── EM
│   │   │   ├── IRT.py
│   │   │   └── __init__.py
│   │   ├── GD
│   │   │   ├── IRT.py
│   │   │   └── __init__.py
│   │   ├── __init__.py
│   │   └── irt.py
│   ├── KaNCD
│   │   ├── KaNCD.py
│   │   └── __init__.py
│   ├── MCD
│   │   ├── MCD.py
│   │   └── __init__.py
│   ├── MIRT
│   │   ├── MIRT.py
│   │   └── __init__.py
│   ├── NCDM
│   │   ├── NCDM.py
│   │   └── __init__.py
│   ├── __init__.py
│   └── meta.py
├── LICENSE
├── Makefile
├── README.md
├── docs
│   ├── DINA.md
│   ├── FuzzyCDF.md
│   ├── ICD.md
│   ├── IRR.md
│   ├── IRT.md
│   ├── KaNCD.md
│   ├── MCD.md
│   ├── MIRT.md
│   ├── NCDM.md
│   └── _static
│       ├── DINA.png
│       ├── EduCDM.png
│       ├── FuzzyCDF.png
│       ├── IRR.png
│       ├── KDM_MF.png
│       ├── KPM_MF.png
│       ├── MCD.png
│       └── NeuralCDM.JPG
├── examples
│   ├── DINA
│   │   ├── EM
│   │   │   ├── DINA.ipynb
│   │   │   ├── DINA.py
│   │   │   └── prepare_dataset.ipynb
│   │   └── GD
│   │       ├── DINA.ipynb
│   │       ├── DINA.py
│   │       └── prepare_dataset.ipynb
│   ├── FuzzyCDF
│   │   ├── FuzzyCDF.ipynb
│   │   ├── FuzzyCDF.py
│   │   └── prepare_dataset.ipynb
│   ├── ICD
│   │   ├── ICD.py
│   │   └── prepare_dataset.ipynb
│   ├── IRR
│   │   ├── DINA.ipynb
│   │   ├── DINA.py
│   │   ├── IRT.ipynb
│   │   ├── IRT.py
│   │   ├── MIRT.ipynb
│   │   ├── MIRT.py
│   │   ├── NCDM.ipynb
│   │   ├── NCDM.py
│   │   ├── README.md
│   │   └── prepare_dataset.ipynb
│   ├── IRT
│   │   ├── EM
│   │   │   ├── IRT.ipynb
│   │   │   ├── IRT.py
│   │   │   └── prepare_dataset.ipynb
│   │   └── GD
│   │       ├── IRT.ipynb
│   │       ├── IRT.py
│   │       └── prepare_dataset.ipynb
│   ├── KaNCD
│   │   ├── KaNCD.ipynb
│   │   ├── KaNCD.py
│   │   └── prepare_dataset.ipynb
│   ├── MCD
│   │   ├── MCD.ipynb
│   │   ├── MCD.py
│   │   └── prepare_dataset.ipynb
│   ├── MIRT
│   │   ├── MIRT.ipynb
│   │   ├── MIRT.py
│   │   └── prepare_dataset.ipynb
│   └── NCDM
│       ├── NCDM.ipynb
│       ├── NCDM.py
│       └── prepare_dataset.ipynb
├── pytest.ini
├── setup.cfg
├── setup.py
└── tests
    ├── __init__.py
    ├── dina
    │   ├── __init__.py
    │   ├── em
    │   │   ├── __init__.py
    │   │   ├── conftest.py
    │   │   └── test_dina.py
    │   └── gd
    │       ├── __init__.py
    │       ├── conftest.py
    │       └── test_gddina.py
    ├── fuzzycdf
    │   ├── __init__.py
    │   ├── conftest.py
    │   └── test_fuzzycdf.py
    ├── icd
    │   ├── __init__.py
    │   ├── conftest.py
    │   ├── test_mirt.py
    │   └── test_ncd.py
    ├── irr
    │   ├── __init__.py
    │   ├── conftest.py
    │   ├── test_dina.py
    │   ├── test_irt.py
    │   ├── test_mirt.py
    │   └── test_ncdm.py
    ├── irt
    │   ├── __init__.py
    │   ├── em
    │   │   ├── __init__.py
    │   │   ├── conftest.py
    │   │   └── test_emirt.py
    │   └── gd
    │       ├── __init__.py
    │       ├── conftest.py
    │       └── test_gdirt.py
    ├── kancd
    │   ├── __init__.py
    │   ├── conftest.py
    │   └── test_kancd.py
    ├── mcd
    │   ├── __init__.py
    │   ├── conftest.py
    │   └── test_mcd.py
    ├── mirt
    │   ├── __init__.py
    │   ├── conftest.py
    │   └── test_mirt.py
    └── ncdm
        ├── __init__.py
        ├── conftest.py
        └── test_ncdm.py
/.github/ISSUE_TEMPLATE/bug_report.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Bug report
3 | about: Create a report to help us improve
4 | title: ''
5 | labels: 'Bug, needs triage'
6 |
7 | ---
8 | ## 🐛 Description
9 | (A clear and concise description of what the bug is.)
10 |
11 | ### Error Message
12 | (Paste the complete error message. Please also include stack trace by setting environment variable `DMLC_LOG_STACK_TRACE_DEPTH=100` before running your script.)
13 |
14 | ## To Reproduce
15 | (If you developed your own code, please provide a short script that reproduces the error. For existing examples, please provide a link.)
16 |
17 | ### Steps to reproduce
18 | (Paste the commands you ran that produced the error.)
19 |
20 | 1.
21 | 2.
22 |
23 | ## What have you tried to solve it?
24 |
25 | 1.
26 | 2.
27 |
28 | ## Environment
29 |
30 |
31 | Environment Information
32 |
33 | **Operating System:** ...
34 |
35 | **Python Version:** (e.g., python3.6, anaconda/python3.7, venv/python3.8)
36 |
37 |
38 |
39 | ## Additional context
40 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/documentation.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: 📚 Documentation
3 | about: Update API documentation or add data analysis
4 | ---
5 |
6 | ## 📚 Documentation
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/feature_request.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Feature request
3 | about: Suggest an idea for this project
4 | title: ''
5 | labels: 'Feature request'
6 |
7 | ---
8 |
9 | ## Description
10 | (A clear and concise description of what the feature is.)
11 | - If the proposal is about an algorithm or a model, provide mock examples if possible. In addition, you may need to carefully follow the [guidance](https://github.com/bigdata-ustc/EduCDM/blob/main/CONTRIBUTE.md)
12 |
13 | ## References
14 | - List references and related literature
15 | - List known implementations
16 |
--------------------------------------------------------------------------------
/.github/PULL_REQUEST_TEMPLATE.md:
--------------------------------------------------------------------------------
1 | Thanks for sending a pull request!
2 | Please make sure you click the link above to view the [contribution guidelines](../blob/master/CONTRIBUTE.md),
3 | then fill out the blanks below.
4 |
5 | ## Description ##
6 | (Brief description of what this PR is about)
7 |
8 | ### What does this implement/fix? Explain your changes.
9 | ...
10 |
11 | #### Pull request type
12 | - [ ] [DATASET] Add a new dataset
13 | - [ ] [BUGFIX] Bugfix
14 | - [ ] [FEATURE] New feature (non-breaking change which adds functionality)
15 | - [ ] [BREAKING] Breaking change (fix or feature that would cause existing functionality to not work as expected)
16 | - [ ] [STYLE] Code style update (formatting, renaming)
17 | - [ ] [REFACTOR] Refactoring (no functional changes, no api changes)
18 | - [ ] [BUILD] Build related changes
19 | - [ ] [DOC] Documentation content changes
20 | - [ ] [Sync] Synchronization with a repository
21 | - [ ] [OTHER] Other (please describe):
22 |
23 |
24 | #### Changes
25 | - Feature1, tests, (and when applicable, API doc)
26 | - Feature2, tests, (and when applicable, API doc)
27 |
28 | or
29 |
30 | - Fix1, tests
31 | - Fix2, tests
32 |
33 | ### Does this close any currently open issues?
34 | ...
35 |
36 | ### Any relevant logs, error output, etc?
37 | ...
38 |
39 | ## Checklist ##
40 | Before you submit a pull request, please make sure you have done the following:
41 |
42 | ### Essentials ###
43 | - [ ] PR's title starts with a category (e.g. [BUGFIX], [FEATURE], [BREAKING], [DOC], etc)
44 | - [ ] Changes are complete (i.e. I finished coding on this PR)
45 | - [ ] All changes have test coverage and all tests pass
46 | - [ ] Code is well-documented (extended the README / documentation, if necessary)
47 | - [ ] If this PR is your first one, add your name and github account to [AUTHORS.md](../blob/master/AUTHORS.md)
48 |
49 | ## Comments ##
50 | - If this is a backward-incompatible change, explain why it must be made.
51 | - Interesting edge cases to note here
52 |
--------------------------------------------------------------------------------
/.github/workflows/python-publish.yml:
--------------------------------------------------------------------------------
1 | # This workflow will upload a Python package using Twine when a release is created
2 | # For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries
3 |
4 | name: Upload Python Package
5 |
6 | on:
7 | release:
8 | types: [created]
9 |
10 | jobs:
11 | deploy:
12 |
13 | runs-on: ubuntu-latest
14 |
15 | steps:
16 | - uses: actions/checkout@v2
17 | - name: Set up Python
18 | uses: actions/setup-python@v2
19 | with:
20 | python-version: '3.x'
21 | - name: Install dependencies
22 | run: |
23 | python -m pip install --upgrade pip
24 | pip install setuptools wheel twine
25 | - name: Build and publish
26 | env:
27 | TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }}
28 | TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }}
29 | run: |
30 | python setup.py sdist bdist_wheel
31 | twine upload dist/*
32 |
--------------------------------------------------------------------------------
/.github/workflows/python-test.yml:
--------------------------------------------------------------------------------
1 |
2 | name: test
3 |
4 | on: [push, pull_request]
5 |
6 | jobs:
7 | build:
8 |
9 | runs-on: ubuntu-latest
10 | strategy:
11 | matrix:
12 | python-version: [3.7, 3.8, 3.9, '3.10']
13 |
14 | steps:
15 | - uses: actions/checkout@v2
16 | - name: Set up Python ${{ matrix.python-version }}
17 | uses: actions/setup-python@v2
18 | with:
19 | python-version: ${{ matrix.python-version }}
20 | - name: Install dependencies
21 | run: |
22 | pip install -e .[test]
23 | pip install codecov
24 | - name: Test with pytest
25 | run: |
26 | pytest
27 | codecov
28 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib64/
18 | parts/
19 | sdist/
20 | var/
21 | wheels/
22 | *.egg-info/
23 | .installed.cfg
24 | *.egg
25 | MANIFEST
26 |
27 | # PyInstaller
28 | # Usually these files are written by a python script from a template
29 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
30 | *.manifest
31 | *.spec
32 |
33 | # Installer logs
34 | pip-log.txt
35 | pip-delete-this-directory.txt
36 |
37 | # Unit test / coverage reports
38 | htmlcov/
39 | .tox/
40 | .coverage
41 | .coverage.*
42 | .cache
43 | nosetests.xml
44 | coverage.xml
45 | *.cover
46 | .hypothesis/
47 | .pytest_cache/
48 |
49 | # Translations
50 | *.mo
51 | *.pot
52 |
53 | # Django stuff:
54 | *.log
55 | local_settings.py
56 | db.sqlite3
57 |
58 | # Flask stuff:
59 | instance/
60 | .webassets-cache
61 |
62 | # Scrapy stuff:
63 | .scrapy
64 |
65 | # Sphinx documentation
66 | docs/_build/
67 | **/_build/
68 | **/_build/*
69 |
70 | # PyBuilder
71 | target/
72 |
73 | # Jupyter Notebook
74 | .ipynb_checkpoints
75 |
76 | # pyenv
77 | .python-version
78 |
79 | # celery beat schedule file
80 | celerybeat-schedule
81 |
82 | # SageMath parsed files
83 | *.sage.py
84 |
85 | # Environments
86 | .env
87 | .venv
88 | env/
89 | venv/
90 | ENV/
91 | env.bak/
92 | venv.bak/
93 |
94 | # Spyder project settings
95 | .spyderproject
96 | .spyproject
97 |
98 | # Rope project settings
99 | .ropeproject
100 |
101 | # mkdocs documentation
102 | /site
103 |
104 | # IDE
105 | .idea/
106 | .vscode/
107 | .DS_Store
108 |
109 | # Pyre type checker
110 | .pyre/
111 |
112 | # User Definition
113 | data/
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | language: python
2 |
3 | matrix:
4 | include:
5 | - python: 3.6
6 | - python: 3.7
7 | - python: 3.8
8 | - python: 3.9
9 | dist: xenial
10 | sudo: true
11 |
12 | install:
13 | - pip install -e .[test]
14 | - pip install codecov
15 |
16 | script:
17 | - pytest
18 | - codecov
19 |
--------------------------------------------------------------------------------
/AUTHORS.md:
--------------------------------------------------------------------------------
1 | # AUTHORS
2 |
3 | [Shiwei Tong](https://github.com/tswsxk)
4 |
5 | [Wei Huang](https://github.com/RandolphVI)
6 |
7 | [Jiayu Liu](https://github.com/Ljyustc)
8 |
9 | [Fei Wang](https://github.com/LegionKing)
10 |
11 | [Fangzhou Yao](https://github.com/fannazya)
12 |
13 | [Yuting Hong](https://github.com/ViviHong200709)
14 |
15 |
--------------------------------------------------------------------------------
/CHANGE.txt:
--------------------------------------------------------------------------------
1 | v1.0.1:
2 | * update version requirements of longling and pytest-flake8
3 |
4 | v1.0.0:
5 | * add KaNCD and ICD
6 |
7 | v0.0.13:
8 | * Bugfix: update dependency version of `longling`
9 | * use PosLinear to replace clipper operation
10 |
11 | v0.0.12:
12 | * limit the range of parameters in IRT and MIRT
13 |
14 | v0.0.11:
15 | * fix error in `irf`
16 |
17 | v0.0.10:
18 | * add STE operator in DINA
19 | * add Multidimensional Item Response Theory (MIRT)
20 | * add IRR-DINA, IRR-MIRT, IRR-NCDM
21 |
22 | v0.0.9:
23 | * add Item Response Ranking for Cognitive Diagnosis (IRR)
24 | * IRT
25 |
26 | v0.0.8:
27 | * add DINA model with Gradient Descent Optimization (GDDINA) and rename the previous DINA to EMDINA
28 |
29 | v0.0.7:
30 | * fix potential ModuleNotFoundError
31 |
32 | v0.0.6:
33 | * add Item Response Theory with Expectation Maximization Optimization (EMIRT)
34 |
35 | v0.0.5:
36 | * add Item Response Theory with Gradient Descent Optimization (GDIRT)
37 |
38 | v0.0.4:
39 | * add NeuralCDM (NCDM)
40 |
41 | v0.0.3:
42 | * add DINA
43 | * add FuzzyCDF
44 |
45 | v0.0.2:
46 | * add MCD
47 |
48 | v0.0.1:
49 | * Add meta class
50 |
--------------------------------------------------------------------------------
/CONTRIBUTE.md:
--------------------------------------------------------------------------------
1 | # CONTRIBUTE
2 |
3 | [中文版本](CONTRIBUTE_CH.md)
4 |
5 | ## Guidance
6 | Thank you for your interest in contributing to EduCDM!
7 | Before you begin writing code, it is important that you share your intention to contribute with the team,
8 | based on the type of contribution:
9 |
10 | 1. You want to propose a new feature and implement it.
11 | * Post about your intended feature in an issue,
12 | and we shall discuss the design and implementation.
13 | Once we agree that the plan looks good, go ahead and implement it.
14 | 2. You want to implement a feature or bug-fix for an outstanding issue.
15 | * Search for your issue in the [EduCDM issue list](https://github.com/bigdata-ustc/EduCDM/issues).
16 | * Pick an issue and comment that you'd like to work on the feature or bug-fix.
17 | * If you need more context on a particular issue, please ask and we shall provide.
18 |
19 | Once you implement and test your feature or bug-fix,
20 | please submit a Pull Request to [EduCDM](https://github.com/bigdata-ustc/EduCDM):
21 |
22 | 1. Fork this repository to your own account.
23 | 2. Modify the code. Note that we strongly recommend that you comply with our [commit format specifications](CONTRIBUTE.md#About-Commit).
24 | 3. Pass code tests and make the test coverage reach 100%. [An example](tests/mcd).
25 | 4. Submit a Pull Request to [EduCDM](https://github.com/bigdata-ustc/EduCDM). Note that we provide a standard template of Pull Request [here](https://github.com/bigdata-ustc/EduCDM/pull/7). Please fill in the information carefully.
26 |
27 | The following are some helpful guidelines for different types of contributions:
28 |
29 | ### Add new dataset
30 |
31 | If you want to add the data analysis or a new dataset, please submit a Pull Request to [EduData](https://github.com/bigdata-ustc/EduData).
32 |
33 | ### Add new CDM model
34 |
35 | The newly implemented CDM model requires:
36 |
37 | 1. Dataset processing.
38 | 2. Inherit the `class CDM` in `EduCDM/meta.py` and implement four methods in it.
39 | 3. Write the corresponding example code for the model (This refers to a demo that can be tested by others). It should include at least [notebook](examples/MCD/MCD.ipynb) and [script](examples/MCD/MCD.py). [An example](examples/MCD).
40 | 4. Write the corresponding test code for the model and make sure that the test coverage is 100%. [An example](tests/mcd).
41 |
42 | #### Dataset Processing
43 |
44 | As for the dataset preprocessing, we suggest:
45 |
46 | 1. Write a script that:
47 |    - processes and converts the raw dataset;
48 |    - partitions it into training/validation/test sets.
49 | 2. Provide or use a [CDBD](https://github.com/bigdata-ustc/EduData) dataset (which is already divided into training/validation/test sets).
50 |
51 |
52 | #### Module
53 |
54 | All models inherit from `class CDM`, which raises `NotImplementedError` if its methods are not implemented.
55 |
56 | Note that we do not constrain your neural network or algorithms (for example, the network construction, optimizer, loss function definitions, etc.).
57 |
58 | - **Train** module
59 |
60 | This module is a training module, which is used to train the model.
61 |
62 | ```python3
63 | def train(self, *args, **kwargs) -> ...:
64 | raise NotImplementedError
65 | ```
66 |
67 | - **Eval** module
68 |
69 | This module is an evaluation module, which is used to verify and test the model.
70 |
71 | ```python3
72 | def eval(self, *args, **kwargs) -> ...:
73 | raise NotImplementedError
74 | ```
75 |
76 | - **Save** module
77 |
78 | This module is a model saving module, which is used to save the trained model.
79 |
80 | ```python3
81 | def save(self, *args, **kwargs) -> ...:
82 | raise NotImplementedError
83 | ```
84 |
85 | - **Load** module
86 |
87 | This module is a model loading module, which is used to load the saved model.
88 |
89 | ```python3
90 | def load(self, *args, **kwargs) -> ...:
91 | raise NotImplementedError
92 | ```
93 |
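Putting the four methods together, here is a minimal sketch of a conforming model (the class name `MyCDM` and the pickle-based persistence are illustrative assumptions, not part of EduCDM):

```python3
# A minimal sketch of a CDM subclass; names and storage format are illustrative only.
import pickle
from EduCDM import CDM


class MyCDM(CDM):
    def __init__(self):
        self.params = {}

    def train(self, train_data, *args, **kwargs) -> ...:
        ...  # fit self.params on train_data

    def eval(self, test_data, *args, **kwargs) -> ...:
        ...  # return evaluation metrics, e.g. (auc, accuracy)

    def save(self, filepath) -> ...:
        with open(filepath, "wb") as f:
            pickle.dump(self.params, f)

    def load(self, filepath) -> ...:
        with open(filepath, "rb") as f:
            self.params = pickle.load(f)
```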
94 | #### Demo
95 |
96 | Make sure you make a demo for your model. [An example](examples/MCD).
97 |
98 | #### Docs Format
99 |
100 | The NumPy docstring format is used:
101 |
102 | ```
103 | function
104 |
105 | Parameters
106 | ----------
107 | Variable 1: type , optional or not
108 | description
109 | Variable 2: type , optional or not
110 | description
111 | ...
112 |
113 | Returns
114 | -------
115 | Variable: type
116 | description
117 |
118 | See Also (Optional)
119 | --------
120 | Similar to function():
121 |
122 | Examples (Optional)
123 | --------
124 | >>> For example:
125 | ...
126 | ```
127 |
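For instance, a docstring in this format might look like the following (the `accuracy` function is a made-up illustration, not part of EduCDM):

```python3
def accuracy(y_true, y_pred, threshold=0.5):
    """
    Compute the fraction of correctly predicted responses.

    Parameters
    ----------
    y_true: list of int
        ground-truth responses (0 or 1)
    y_pred: list of float
        predicted probabilities of a correct response
    threshold: float, optional
        probability at or above which a prediction counts as 1

    Returns
    -------
    acc: float
        proportion of predictions that match the ground truth

    Examples
    --------
    >>> accuracy([1, 0], [0.9, 0.2])
    1.0
    """
    return sum((p >= threshold) == t for t, p in zip(y_true, y_pred)) / len(y_true)
```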
128 | ### About Commit
129 |
130 | #### commit format
131 |
132 | ```
133 | [<type>](<scope>) <subject>
134 | ```
135 |
136 | #### type
137 | - `feat`: new feature.
138 | - `fix/to`: fix a bug, either one found in Q&A or one found in your own use.
139 | - `fix`: produces a diff and fixes the problem automatically. **Suitable when a single commit fixes the problem directly**.
140 | - `to`: produces only a **diff** without automatically fixing the problem. **Suitable for multiple commits**; use `fix` in the final commit that resolves the problem.
141 | - `docs`: documentation.
142 | - `style`: formatting (changes that do not affect code execution).
143 | - `refactor`: refactoring (neither a new feature nor a bug fix).
144 | - `perf`: optimization, e.g. improving code performance or user experience.
145 | - `test`: add tests.
146 | - `chore`: changes to the build process or auxiliary tools.
147 | - `revert`: roll back to the previous version.
148 | - `merge`: code merge.
149 | - `sync`: synchronize bug fixes from main or another branch.
150 | - `arch`: changes to engineering files or tools.
151 |
152 | ##### scope (optional)
153 |
154 | Scope is used to describe the impact of the commit, such as **the data layer**, **the control layer**, **the view layer**, and so on, depending on the project.
155 |
156 | For example, in Angular, it can be location, browser, compile, rootScope, ngHref, ngClick, ngView, and so on. If your changes affect more than one scope, you can use `*` instead.
157 |
158 | ##### subject (mandatory)
159 |
160 | A subject is a short description of the purpose of the commit, not more than 50 characters.
161 |
162 | There is no period or other punctuation at the end.
163 |
164 | #### Example
165 |
166 | - **[docs] update the README.md**
167 |
168 | ```sh
169 | git commit -m "[docs] update the README.md"
170 | ```
171 |
172 | ## FAQ
173 |
174 | Q: I have carefully tested the code on my local system (all tests passed), but it still fails in the online CI?
175 |
176 | A: There are two possible reasons:
177 | 1. the online CI system is different from your local system;
178 | 2. a network error caused a download in the tests to fail, which you can find in the CI log.
179 |
180 | For the second reason, all you need to do is retry the test.
181 |
182 |
--------------------------------------------------------------------------------
/CONTRIBUTE_CH.md:
--------------------------------------------------------------------------------
1 | # Contribution Guidelines
2 |
3 | [English version](CONTRIBUTE.md)
4 |
5 | ## Guidance
6 |
7 | First of all, thank you for your interest in EduCDM and for working to make it better!
8 | Before you start contributing, please note the following:
9 | 1. If you would like us to implement a new feature:
10 |     - Tell us about the feature you want in an issue, and we will promptly discuss its design and implementation.
11 |     - Once we agree that the plan looks good, you can expect the new feature to arrive soon.
12 | 2. If you want to provide a solution or bug fix for an outstanding issue:
13 |     - First search for your problem in the [EduCDM issue list](https://github.com/bigdata-ustc/CDM/issues).
14 |     - Then pick a specific issue and comment on it to offer your solution or bug fix.
15 |     - If you need more details on a particular issue, please ask us.
16 |
17 | Once you have implemented and tested your idea or bug fix, please submit it to [EduCDM](https://github.com/bigdata-ustc/CDM) via a Pull Request:
18 | 1. First, fork this repository.
19 | 2. Modify the code. Note: we strongly recommend that you follow our [commit format specifications](CONTRIBUTE_CH.md#about-commit).
20 | 3. Pass the code tests with a test coverage of 100%; see [an example](tests/mcd).
21 | 4. Submit a Pull Request to [EduCDM](https://github.com/bigdata-ustc/CDM). Note: we provide a standard PR template, which you should fill in carefully; a standard, well-formed PR can be found [here](https://github.com/bigdata-ustc/EduCDM/pull/7).
22 |
23 | The following are helpful suggestions for different types of contributions:
24 |
25 | ### Adding a new dataset or data analysis
26 |
27 | For a new dataset or data analysis, please go to [EduData](https://github.com/bigdata-ustc/EduData).
28 |
29 | ### Adding a new CDM model
30 |
31 | A newly implemented CDM model requires:
32 | 1. Dataset preprocessing.
33 | 2. Inheriting from `class CDM` in `EduCDM/meta.py` and implementing its four methods.
34 | 3. Writing example code for the model (a demo that others can run and test); see [an example](examples/MCD). It should include at least a [notebook](examples/MCD/MCD.ipynb) and a [script](examples/MCD/MCD.py).
35 | 4. Writing test code for the model with a test coverage of 100%; see [an example](tests/mcd).
36 |
37 | #### Dataset preprocessing
38 |
39 | For dataset preprocessing, we offer two suggestions:
40 |
41 | 1. Write a script that:
42 |     - processes and converts the raw dataset;
43 |     - partitions it into training/validation/test sets.
44 | 2. Provide or use a [CDBD](https://github.com/bigdata-ustc/EduData) dataset (already split into training/validation/test sets).
45 |
46 | #### Module implementation
47 |
48 | The key modules of a new CDM model should inherit from `class CDM` in `EduCDM/meta.py`.
49 | Note that we do not constrain your neural network or algorithms (e.g., network construction, optimizer, loss function definitions).
50 |
51 | - Train module
52 |
53 | This is the training module, used to train the model/algorithm.
54 |
55 | ```python3
56 | def train(self, *args, **kwargs) -> ...:
57 |     raise NotImplementedError
58 | ```
59 |
60 | - Eval module
61 |
62 | This is the evaluation module, used to validate and test the model/algorithm.
63 |
64 | ```python3
65 | def eval(self, *args, **kwargs) -> ...:
66 |     raise NotImplementedError
67 | ```
68 |
69 | - Save module
70 |
71 | This is the saving module, used to save the trained model/algorithm.
72 |
73 | ```python3
74 | def save(self, *args, **kwargs) -> ...:
75 |     raise NotImplementedError
76 | ```
77 |
78 | - Load module
79 |
80 | This is the loading module, used to load a saved model/algorithm.
81 |
82 | ```python3
83 | def load(self, *args, **kwargs) -> ...:
84 |     raise NotImplementedError
85 | ```
86 |
87 | #### Writing a demo
88 |
89 | Write example code for the model; see [an example](examples/MCD).
90 |
91 | #### Docstring style
92 |
93 | Please use the NumPy docstring style:
94 |
95 | ```
96 | what the function does
97 |
98 | Parameters
99 | ----------
100 | Variable 1: type, optional or not
101 |     description
102 | Variable 2: type, optional or not
103 |     description
104 | ...
105 |
106 | Returns
107 | -------
108 | Variable: type
109 |     description
110 |
111 | See Also (optional)
112 | --------
113 | similar_function: what the similar function does
114 |
115 | Examples (optional)
116 | --------
117 | >>> example usage
118 | ```
119 |
120 | ### About Commit
121 |
122 | #### commit format
123 |
124 | ```
125 | [<type>](<scope>) <subject>
126 | ```
127 |
128 | #### type
129 | - `feat`: new feature.
130 | - `fix/to`: fix a bug, either one found in Q&A or one found in your own use.
131 | - `fix`: produces a diff and fixes the problem automatically. **Suitable when a single commit fixes the problem directly**.
132 | - `to`: produces only a **diff** without automatically fixing the problem. **Suitable for multiple commits**; use `fix` in the final commit that resolves the problem.
133 | - `docs`: documentation.
134 | - `style`: formatting (changes that do not affect code execution).
135 | - `refactor`: refactoring (neither a new feature nor a bug fix).
136 | - `perf`: optimization, e.g. improving performance or user experience.
137 | - `test`: add tests.
138 | - `chore`: changes to the build process or auxiliary tools.
139 | - `revert`: roll back to the previous version.
140 | - `merge`: code merge.
141 | - `sync`: synchronize bug fixes from main or another branch.
142 | - `arch`: changes to engineering files or tools.
143 |
144 | #### scope (optional)
145 |
146 | Scope describes the area affected by the commit, such as the data layer, the control layer, the view layer, and so on, depending on the project.
147 |
148 | For example, in Angular, it can be location, browser, compile, rootScope, ngHref, ngClick, ngView, and so on. If your changes affect more than one scope, you can use `*` instead.
149 |
150 | #### subject (required)
151 |
152 | The subject is a short description of the purpose of the commit, no more than 50 characters.
153 |
154 | Do not end it with a period or other punctuation.
155 |
156 | #### Example
157 |
158 | - **[docs] update the README.md**
159 |
160 | ```sh
161 | git commit -m "[docs] update the README.md"
162 | ```
163 |
164 | ## FAQ
165 |
166 | Q: I have carefully tested the code locally and all checks passed, but the CI step still reports an error?
167 | A: There are two possible causes:
168 | 1. the online CI system differs from your local system;
169 | 2. a network issue, which you can check in the CI log files.
170 |
--------------------------------------------------------------------------------
/EduCDM/DINA/EM/DINA.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | # 2021/3/28 @ liujiayu
3 |
4 | import logging
5 | import numpy as np
6 | from tqdm import tqdm
7 | import pickle
8 | from EduCDM import CDM
9 |
10 |
11 | def initial_all_knowledge_state(know_num):
12 | state_num = 2 ** know_num
13 | all_states = np.zeros((state_num, know_num))
14 | for i in range(state_num):
15 | k, quotient, residue = 1, i // 2, i % 2
16 | while True:
17 | all_states[i, know_num - k] = residue
18 | if quotient <= 0:
19 | break
20 | quotient, residue = quotient // 2, quotient % 2
21 | k += 1
22 | return all_states
23 |
24 |
25 | def init_parameters(stu_num, prob_num):
26 | slip = np.zeros(shape=prob_num) + 0.2
27 | guess = np.zeros(shape=prob_num) + 0.2
28 | theta = np.zeros(shape=stu_num) # index of state
29 | return theta, slip, guess
30 |
31 |
32 | class DINA(CDM):
33 | """
34 | DINA model, training (EM) and testing methods
35 | :param R (array): response matrix, shape = (stu_num, prob_num)
36 | :param q_m (array): Q matrix, shape = (prob_num, know_num)
37 | :param stu_num (int): number of students
38 | :param prob_num (int): number of problems
39 | :param know_num (int): number of knowledge
40 | :param skip_value (int): skip value in response matrix
41 | """
42 |
43 | def __init__(self, R, q_m, stu_num, prob_num, know_num, skip_value=-1):
44 | self.R, self.q_m, self.state_num, self.skip_value = R, q_m, 2 ** know_num, skip_value
45 | self.stu_num, self.prob_num, self.know_num = stu_num, prob_num, know_num
46 | self.theta, self.slip, self.guess = init_parameters(stu_num, prob_num)
47 | self.all_states = initial_all_knowledge_state(know_num) # shape = (state_num, know_num)
48 | state_prob = np.transpose(np.sum(q_m, axis=1, keepdims=True) - np.dot(q_m, np.transpose(self.all_states)))
49 | self.eta = 1 - (state_prob > 0) # state covers knowledge of problem (1: yes), shape = (state_num, prob_num)
50 |
51 | def train(self, epoch, epsilon) -> ...:
52 | like = np.zeros(shape=(self.stu_num, self.state_num)) # likelihood
53 | post = np.zeros(shape=(self.stu_num, self.state_num)) # posterior
54 | theta, slip, guess, tmp_R = np.copy(self.theta), np.copy(self.slip), np.copy(self.guess), np.copy(self.R)
55 | tmp_R[np.where(self.R == self.skip_value)[0], np.where(self.R == self.skip_value)[1]] = 0
56 | for iteration in range(epoch):
57 | post_tmp, slip_tmp, guess_tmp = np.copy(post), np.copy(slip), np.copy(guess)
58 | answer_right = (1 - slip) * self.eta + guess * (1 - self.eta)
59 | for s in range(self.state_num):
60 | log_like = np.log(answer_right[s, :] + 1e-9) * self.R + np.log(1 - answer_right[s, :] + 1e-9) * (
61 | 1 - self.R)
62 | log_like[np.where(self.R == self.skip_value)[0], np.where(self.R == self.skip_value)[1]] = 0
63 | like[:, s] = np.exp(np.sum(log_like, axis=1))
64 | post = like / np.sum(like, axis=1, keepdims=True)
65 | i_l = np.expand_dims(np.sum(post, axis=0), axis=1) # shape = (state_num, 1)
66 | r_jl = np.dot(np.transpose(post), tmp_R) # shape = (state_num, prob_num)
67 | r_jl_0, r_jl_1 = np.sum(r_jl * (1 - self.eta), axis=0), np.sum(r_jl * self.eta, axis=0)
68 | i_jl_0, i_jl_1 = np.sum(i_l * (1 - self.eta), axis=0), np.sum(i_l * self.eta, axis=0)
69 | guess, slip = r_jl_0 / i_jl_0, (i_jl_1 - r_jl_1) / i_jl_1
70 |
71 | change = max(np.max(np.abs(post - post_tmp)), np.max(np.abs(slip - slip_tmp)),
72 | np.max(np.abs(guess - guess_tmp)))
73 | theta = np.argmax(post, axis=1)
74 | if iteration > 20 and change < epsilon:
75 | break
76 | self.theta, self.slip, self.guess = theta, slip, guess
77 |
78 | def eval(self, test_data) -> tuple:
79 | pred_score = (1 - self.slip) * self.eta + self.guess * (1 - self.eta)
80 | test_rmse, test_mae = [], []
81 | for i in tqdm(test_data, "evaluating"):
82 | stu, test_id, true_score = i['user_id'], i['item_id'], i['score']
83 | test_rmse.append((pred_score[self.theta[stu], test_id] - true_score) ** 2)
84 | test_mae.append(abs(pred_score[self.theta[stu], test_id] - true_score))
85 | return np.sqrt(np.average(test_rmse)), np.average(test_mae)
86 |
87 | def save(self, filepath):
88 | with open(filepath, 'wb') as file:
89 | pickle.dump({"theta": self.theta, "slip": self.slip, "guess": self.guess}, file)
90 | logging.info("save parameters to %s" % filepath)
91 |
92 | def load(self, filepath):
93 | with open(filepath, 'rb') as file:
94 | self.theta, self.slip, self.guess = pickle.load(file).values()
95 | logging.info("load parameters from %s" % filepath)
96 |
97 | def inc_train(self, inc_train_data, epoch, epsilon): # incremental training
98 | for i in inc_train_data:
99 | stu, test_id, true_score = i['user_id'], i['item_id'], i['score']
100 | self.R[stu, test_id] = true_score
101 | self.train(epoch, epsilon)
102 |
103 | def transform(self, records): # MLE for evaluating student's state
104 |         # max_like_id: diagnose which state among all_states the student belongs to
105 |         # dia_state: binary vector of length know_num; 0/1 indicates whether each knowledge concept is mastered
106 | answer_right = (1 - self.slip) * self.eta + self.guess * (1 - self.eta)
107 | log_like = records * np.log(answer_right + 1e-9) + (1 - records) * np.log(1 - answer_right + 1e-9)
108 | log_like[:, np.where(records == self.skip_value)[0]] = 0
109 | max_like_id = np.argmax(np.exp(np.sum(log_like, axis=1)))
110 | dia_state = self.all_states[max_like_id]
111 | return max_like_id, dia_state
112 |
--------------------------------------------------------------------------------
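A hedged usage sketch for the EM-based DINA above; the data is synthetic and purely illustrative, and the import alias follows `EduCDM/DINA/__init__.py`:

```python3
import numpy as np
from EduCDM.DINA import EMDINA as DINA

R = np.array([[1, 0, -1], [0, 1, 1]])      # 2 students x 3 problems, -1 = skipped entry
q_m = np.array([[1, 0], [0, 1], [1, 1]])   # 3 problems x 2 knowledge concepts

cdm = DINA(R, q_m, stu_num=2, prob_num=3, know_num=2, skip_value=-1)
cdm.train(epoch=30, epsilon=1e-3)                                 # EM iterations
rmse, mae = cdm.eval([{"user_id": 0, "item_id": 1, "score": 0}])
state_id, mastery = cdm.transform(R[0])                           # MLE diagnosis of student 0
```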
/EduCDM/DINA/EM/__init__.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | # 2021/6/21 @ tongshiwei
3 |
4 | from .DINA import DINA
5 |
--------------------------------------------------------------------------------
/EduCDM/DINA/GD/DINA.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | # 2021/6/21 @ tongshiwei
3 |
4 | import logging
5 | import numpy as np
6 | import torch
7 | from EduCDM import CDM
8 | from torch import nn
9 | from tqdm import tqdm
10 | from sklearn.metrics import roc_auc_score, accuracy_score
11 | import torch.autograd as autograd
12 | import torch.nn.functional as F
13 |
14 |
15 | class DINANet(nn.Module):
16 | def __init__(self, user_num, item_num, hidden_dim, max_slip=0.4, max_guess=0.4, *args, **kwargs):
17 | super(DINANet, self).__init__()
18 | self._user_num = user_num
19 | self._item_num = item_num
20 | self.step = 0
21 | self.max_step = 1000
22 | self.max_slip = max_slip
23 | self.max_guess = max_guess
24 |
25 | self.guess = nn.Embedding(self._item_num, 1)
26 | self.slip = nn.Embedding(self._item_num, 1)
27 | self.theta = nn.Embedding(self._user_num, hidden_dim)
28 |
29 | def forward(self, user, item, knowledge, *args):
30 | theta = self.theta(user)
31 | slip = torch.squeeze(torch.sigmoid(self.slip(item)) * self.max_slip)
32 | guess = torch.squeeze(torch.sigmoid(self.guess(item)) * self.max_guess)
33 | if self.training:
34 | n = torch.sum(knowledge * (torch.sigmoid(theta) - 0.5), dim=1)
35 | t, self.step = max((np.sin(2 * np.pi * self.step / self.max_step) + 1) / 2 * 100,
36 | 1e-6), self.step + 1 if self.step < self.max_step else 0
37 | return torch.sum(
38 | torch.stack([1 - slip, guess]).T * torch.softmax(torch.stack([n, torch.zeros_like(n)]).T / t, dim=-1),
39 | dim=1
40 | )
41 | else:
42 | n = torch.prod(knowledge * (theta >= 0) + (1 - knowledge), dim=1)
43 | return (1 - slip) ** n * guess ** (1 - n)
44 |
45 |
46 | class STEFunction(autograd.Function):
47 | @staticmethod
48 | def forward(ctx, input):
49 | return (input > 0).float()
50 |
51 | @staticmethod
52 | def backward(ctx, grad_output):
53 | return F.hardtanh(grad_output)
54 |
55 |
56 | class StraightThroughEstimator(nn.Module):
57 | def __init__(self):
58 | super(StraightThroughEstimator, self).__init__()
59 |
60 | def forward(self, x):
61 | x = STEFunction.apply(x)
62 | return x
63 |
64 |
65 | class STEDINANet(DINANet):
66 | def __init__(self, user_num, item_num, hidden_dim, max_slip=0.4, max_guess=0.4, *args, **kwargs):
67 | super(STEDINANet, self).__init__(user_num, item_num, hidden_dim, max_slip, max_guess, *args, **kwargs)
68 | self.sign = StraightThroughEstimator()
69 |
70 | def forward(self, user, item, knowledge, *args):
71 | theta = self.sign(self.theta(user))
72 | slip = torch.squeeze(torch.sigmoid(self.slip(item)) * self.max_slip)
73 | guess = torch.squeeze(torch.sigmoid(self.guess(item)) * self.max_guess)
74 | mask_theta = (knowledge == 0) + (knowledge == 1) * theta
75 | n = torch.prod((mask_theta + 1) / 2, dim=-1)
76 | return torch.pow(1 - slip, n) * torch.pow(guess, 1 - n)
77 |
78 |
79 | class DINA(CDM):
80 | def __init__(self, user_num, item_num, hidden_dim, ste=False):
81 | super(DINA, self).__init__()
82 | if ste:
83 | self.dina_net = STEDINANet(user_num, item_num, hidden_dim)
84 | else:
85 | self.dina_net = DINANet(user_num, item_num, hidden_dim)
86 |
87 | def train(self, train_data, test_data=None, *, epoch: int, device="cpu", lr=0.001) -> ...:
88 | self.dina_net = self.dina_net.to(device)
89 | loss_function = nn.BCELoss()
90 |
91 | trainer = torch.optim.Adam(self.dina_net.parameters(), lr)
92 |
93 | for e in range(epoch):
94 | losses = []
95 | for batch_data in tqdm(train_data, "Epoch %s" % e):
96 | user_id, item_id, knowledge, response = batch_data
97 | user_id: torch.Tensor = user_id.to(device)
98 | item_id: torch.Tensor = item_id.to(device)
99 | knowledge: torch.Tensor = knowledge.to(device)
100 | predicted_response: torch.Tensor = self.dina_net(user_id, item_id, knowledge)
101 | response: torch.Tensor = response.to(device)
102 | loss = loss_function(predicted_response, response)
103 |
104 | # back propagation
105 | trainer.zero_grad()
106 | loss.backward()
107 | trainer.step()
108 |
109 | losses.append(loss.mean().item())
110 | print("[Epoch %d] LogisticLoss: %.6f" % (e, float(np.mean(losses))))
111 |
112 | if test_data is not None:
113 | auc, accuracy = self.eval(test_data, device=device)
114 | print("[Epoch %d] auc: %.6f, accuracy: %.6f" % (e, auc, accuracy))
115 |
116 | def eval(self, test_data, device="cpu") -> tuple:
117 | self.dina_net = self.dina_net.to(device)
118 | self.dina_net.eval()
119 | y_pred = []
120 | y_true = []
121 | for batch_data in tqdm(test_data, "evaluating"):
122 | user_id, item_id, knowledge, response = batch_data
123 | user_id: torch.Tensor = user_id.to(device)
124 | item_id: torch.Tensor = item_id.to(device)
125 | knowledge: torch.Tensor = knowledge.to(device)
126 | pred: torch.Tensor = self.dina_net(user_id, item_id, knowledge)
127 | y_pred.extend(pred.tolist())
128 | y_true.extend(response.tolist())
129 |
130 | self.dina_net.train()
131 | return roc_auc_score(y_true, y_pred), accuracy_score(y_true, np.array(y_pred) >= 0.5)
132 |
133 | def save(self, filepath):
134 | torch.save(self.dina_net.state_dict(), filepath)
135 | logging.info("save parameters to %s" % filepath)
136 |
137 | def load(self, filepath):
138 | self.dina_net.load_state_dict(torch.load(filepath))
139 | logging.info("load parameters from %s" % filepath)
140 |
--------------------------------------------------------------------------------
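A hedged usage sketch for the gradient-descent DINA above; the tensors are synthetic, and the `DataLoader` mirrors the `(user_id, item_id, knowledge, response)` batch layout that `train` unpacks:

```python3
import torch
from torch.utils.data import TensorDataset, DataLoader
from EduCDM.DINA import GDDINA

user = torch.LongTensor([0, 0, 1, 1])                        # student ids
item = torch.LongTensor([0, 1, 0, 2])                        # item ids
knowledge = torch.Tensor([[1, 0], [0, 1], [1, 0], [1, 1]])   # multi-hot concepts, hidden_dim = 2
response = torch.Tensor([1, 0, 0, 1])                        # binary correctness

loader = DataLoader(TensorDataset(user, item, knowledge, response), batch_size=2)
cdm = GDDINA(user_num=2, item_num=3, hidden_dim=2, ste=True)  # ste=True uses STEDINANet
cdm.train(loader, test_data=loader, epoch=2)
auc, accuracy = cdm.eval(loader)
```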
/EduCDM/DINA/GD/__init__.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | # 2021/6/21 @ tongshiwei
3 |
4 | from .DINA import DINA
5 |
--------------------------------------------------------------------------------
/EduCDM/DINA/__init__.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | # 2021/3/28 @ liujiayu
3 |
4 |
5 | from .GD import DINA as GDDINA
6 | from .EM import DINA as EMDINA
7 |
--------------------------------------------------------------------------------
/EduCDM/FuzzyCDF/FuzzyCDF.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | # 2021/3/28 @ liujiayu
3 |
4 | import logging
5 | import numpy as np
6 | import pickle
7 | from scipy import stats
8 | from tqdm import tqdm
9 | from collections import namedtuple
10 | from EduCDM import CDM
11 | from .modules import get_LogLikelihood, cal_alpha_mastery, update_A_B, update_theta, update_slip_guess, update_variance
12 |
13 | hyper_para = namedtuple("hyperparameters",
14 | ["sig_a", "mu_a", "sig_b", "mu_b", "max_s", "min_s", "max_g", "min_g", "mu_theta", "sig_theta"])
15 | default_hyper = hyper_para(1, 0, 1, 0, 0.6, 0, 0.6, 0, 0, 1)
16 |
17 |
18 | def init_parameters(stu_num, prob_num, know_num, args): # initialize FuzzyCDF parameters
19 | a = stats.lognorm.rvs(s=args.sig_a, loc=0, scale=np.exp(args.mu_a), size=(stu_num, know_num))
20 | b = stats.norm.rvs(loc=args.mu_b, scale=args.sig_b, size=(stu_num, know_num))
21 | slip = stats.beta.rvs(a=1, b=2, size=prob_num) * (args.max_s - args.min_s) + args.min_s
22 | guess = stats.beta.rvs(a=1, b=2, size=prob_num) * (args.max_g - args.min_g) + args.min_g
23 | theta = stats.norm.rvs(loc=args.mu_theta, scale=args.sig_theta, size=stu_num)
24 | variance = 1 / stats.gamma.rvs(a=4, scale=1 / 6, size=1)
25 | return a, b, slip, guess, theta, variance
26 |
27 |
28 | class FuzzyCDF(CDM):
29 | """
30 | FuzzyCDF model, training (MCMC) and testing methods
31 | :param R (array): response matrix, shape = (stu_num, prob_num)
32 | :param q_m (array): Q matrix, shape = (prob_num, know_num)
33 | :param stu_num (int): number of students
34 | :param prob_num (int): number of problems
35 | :param know_num (int): number of knowledge
36 | :param obj_prob_index (array): index of all objective problems, shape = (number, )
37 | :param sub_prob_index (array): index of all subjective problems, shape = (number, )
38 | :param skip_value (int): skip value in response matrix
39 | :param args: all hyper-parameters
40 | """
41 |
42 | def __init__(self, R, q_m, stu_num, prob_num, know_num, obj_prob_index, sub_prob_index, skip_value=-1,
43 | args=default_hyper):
44 | self.args = args
45 | self.R, self.q_m, self.stu_num, self.prob_num, self.know_num = R, q_m, stu_num, prob_num, know_num
46 | self.a, self.b, self.slip, self.guess, self.theta, self.variance = init_parameters(stu_num, prob_num, know_num,
47 | self.args)
48 | self.obj_prob_index, self.sub_prob_index, self.skip_value = obj_prob_index, sub_prob_index, skip_value
49 |
50 | def train(self, epoch, burnin) -> ...:
51 | A, B, slip, guess = np.copy(self.a), np.copy(self.b), np.copy(self.slip), np.copy(self.guess)
52 | theta, variance = np.copy(self.theta), np.copy(self.variance)
53 | estimate_A, estimate_B, estimate_slip, estimate_guess, estimate_theta, estimate_variance = 0, 0, 0, 0, 0, 0
54 | for iteration in range(epoch):
55 | update_A_B(A, B, theta, slip, guess, variance, self.R, self.q_m, self.obj_prob_index, self.sub_prob_index,
56 | self.skip_value, self.args)
57 | update_theta(A, B, theta, slip, guess, variance, self.R, self.q_m, self.obj_prob_index, self.sub_prob_index,
58 | self.skip_value, self.args)
59 | update_slip_guess(A, B, theta, slip, guess, variance, self.R, self.q_m, self.obj_prob_index,
60 | self.sub_prob_index,
61 | self.skip_value, self.args)
62 | variance = update_variance(A, B, theta, slip, guess, variance, self.R, self.q_m, self.obj_prob_index,
63 | self.sub_prob_index,
64 | self.skip_value)
65 | if iteration >= burnin:
66 | estimate_A += A
67 | estimate_B += B
68 | estimate_slip += slip
69 | estimate_guess += guess
70 | estimate_theta += theta
71 | estimate_variance += variance
72 |         n_samples = epoch - burnin  # number of post-burn-in samples averaged over
73 |         self.a, self.b, self.slip, self.guess, self.theta, self.variance = estimate_A / n_samples, estimate_B / n_samples, \
74 |             estimate_slip / n_samples, estimate_guess / n_samples, estimate_theta / n_samples, estimate_variance / n_samples
75 |
76 | def eval(self, test_data) -> tuple:
77 | _, pred_mastery = cal_alpha_mastery(self.a, self.b, self.theta, self.q_m, self.obj_prob_index,
78 | self.sub_prob_index)
79 | pred_score = (1 - self.slip) * pred_mastery + self.guess * (1 - pred_mastery)
80 | test_rmse, test_mae = [], []
81 | for i in tqdm(test_data, "evaluating"):
82 | stu, test_id, true_score = i['user_id'], i['item_id'], i['score']
83 | test_rmse.append((pred_score[stu, test_id] - true_score) ** 2)
84 | test_mae.append(abs(pred_score[stu, test_id] - true_score))
85 | return np.sqrt(np.average(test_rmse)), np.average(test_mae)
86 |
87 | def save(self, filepath):
88 | with open(filepath, 'wb') as file:
89 | pickle.dump({"a": self.a, "b": self.b, "theta": self.theta, "slip": self.slip, "guess": self.guess}, file)
90 | logging.info("save parameters to %s" % filepath)
91 |
92 | def load(self, filepath):
93 | with open(filepath, 'rb') as file:
94 | self.a, self.b, self.theta, self.slip, self.guess = pickle.load(file).values()
95 | logging.info("load parameters from %s" % filepath)
96 |
97 | def inc_train(self, inc_train_data, epoch, burnin): # incremental training
98 | for i in inc_train_data:
99 | stu, test_id, true_score = i['user_id'], i['item_id'], i['score']
100 | self.R[stu, test_id] = true_score
101 | self.train(epoch, burnin)
102 |
--------------------------------------------------------------------------------
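A hedged usage sketch for FuzzyCDF above (synthetic data, default hyper-parameters; objective problems are scored 0/1 while subjective ones may take fractional scores):

```python3
import numpy as np
from EduCDM.FuzzyCDF import FuzzyCDF

R = np.array([[1.0, 0.0, 0.6], [0.0, 1.0, 0.3]])  # 2 students x 3 problems
q_m = np.array([[1, 0], [0, 1], [1, 1]])          # 3 problems x 2 knowledge concepts

cdm = FuzzyCDF(R, q_m, stu_num=2, prob_num=3, know_num=2,
               obj_prob_index=np.array([0, 1]),   # objective (0/1-scored) problems
               sub_prob_index=np.array([2]))      # subjective (fractionally scored) problems
cdm.train(epoch=100, burnin=50)                   # MCMC sampling, then average post-burn-in draws
rmse, mae = cdm.eval([{"user_id": 0, "item_id": 2, "score": 0.6}])
```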
/EduCDM/FuzzyCDF/__init__.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | # 2021/3/28 @ liujiayu
3 |
4 |
5 | from .FuzzyCDF import FuzzyCDF
6 |
--------------------------------------------------------------------------------
/EduCDM/FuzzyCDF/modules.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | # 2021/3/28 @ liujiayu
3 | # Modules in FuzzyCDF
4 |
5 | import numpy as np
6 | from scipy import stats
7 |
8 |
9 | def cal_alpha_mastery(A, B, theta, q_m, obj_prob_index, sub_prob_index): # calculate proficiency on knows and probs
10 | stu_num, prob_num = len(theta), q_m.shape[0]
11 | alpha = 1 / (1 + np.exp(-1.7 * A * (theta.reshape([-1, 1]) - B)))
12 | mastery = np.zeros((stu_num, prob_num))
13 | for i in range(stu_num):
14 | stu_i = alpha[i] * q_m # shape = (prob_num, know_num)
15 | if len(obj_prob_index) > 0:
16 | mastery[i][obj_prob_index] = np.min((stu_i + 2 * (1 - q_m))[obj_prob_index], axis=1)
17 | if len(sub_prob_index) > 0:
18 | mastery[i][sub_prob_index] = np.max(stu_i[sub_prob_index], axis=1)
19 | return alpha, mastery
20 |
21 |
22 | def get_LogLikelihood(A, B, theta, R, q_m, slip, guess, variance, obj_prob_index, sub_prob_index, skip_value=-1):
23 | # calculate log-likelihood for each response log
24 | _, mastery = cal_alpha_mastery(A, B, theta, q_m, obj_prob_index, sub_prob_index)
25 | stu_num, prob_num = R.shape[0], R.shape[1]
26 | x = (1 - slip) * mastery + guess * (1 - mastery)
27 | result = np.zeros((stu_num, prob_num))
28 | if len(obj_prob_index) > 0:
29 | result[:, obj_prob_index] = (np.log(x + 1e-9) * R + np.log(1 - x + 1e-9) * (1 - R))[:, obj_prob_index]
30 | if len(sub_prob_index) > 0:
31 | result[:, sub_prob_index] = np.log(stats.norm.pdf(R, loc=x, scale=variance))[:, sub_prob_index]
32 |
33 | result[np.where(R == skip_value)[0], np.where(R == skip_value)[1]] = 0 # skip logs
34 | return result # shape = (stu_num, prob_num)
35 |
36 |
37 | # ---below are updating processes in MCMC for FuzzyCDF---
38 | def update_A_B(A, B, theta, slip, guess, variance, R, q_m, obj_prob_index, sub_prob_index, skip_value, args):
39 | know_num = A.shape[1]
40 | new_A = A + 0.3 * stats.norm.rvs(size=A.shape)
41 | new_B = B + 0.3 * stats.norm.rvs(size=B.shape)
42 | for know in range(know_num):
43 | tempA = np.copy(A)
44 | tempB = np.copy(B)
45 | tempA[:, know] = np.copy(new_A[:, know])
46 | tempB[:, know] = np.copy(new_B[:, know])
47 |
48 | l_0 = get_LogLikelihood(A, B, theta, R, q_m, slip, guess, variance, obj_prob_index, sub_prob_index, skip_value)
49 | l_1 = get_LogLikelihood(tempA, tempB, theta, R, q_m, slip, guess, variance, obj_prob_index, sub_prob_index,
50 | skip_value)
51 |
52 | log_p0 = np.sum(l_0, axis=1) + np.log(stats.norm.pdf(x=B[:, know], loc=args.mu_b, scale=args.sig_b) + 1e-9) + \
53 | np.log(stats.lognorm.pdf(x=A[:, know], loc=0, scale=np.exp(args.mu_a), s=args.sig_a) + 1e-9)
54 | log_p1 = np.sum(l_1, axis=1) + np.log(stats.norm.pdf(x=tempB[:, know], loc=args.mu_b, scale=args.sig_b) + 1e-9)\
55 | + np.log(stats.lognorm.pdf(x=tempA[:, know], loc=0, scale=np.exp(args.mu_a), s=args.sig_a) + 1e-9)
56 | accept_prob = np.exp(np.minimum(log_p1 - log_p0, 0)) # avoid overflow in exp
57 | mask = accept_prob >= np.random.random(1)
58 | A[mask, know] = new_A[mask, know]
59 | B[mask, know] = new_B[mask, know]
60 |
61 |
62 | def update_theta(A, B, theta, slip, guess, variance, R, q_m, obj_prob_index, sub_prob_index, skip_value, args):
63 | new_theta = theta + 0.1 * stats.norm.rvs(size=theta.shape)
64 |
65 | l_0 = get_LogLikelihood(A, B, theta, R, q_m, slip, guess, variance, obj_prob_index, sub_prob_index, skip_value)
66 | l_1 = get_LogLikelihood(A, B, new_theta, R, q_m, slip, guess, variance, obj_prob_index, sub_prob_index, skip_value)
67 |
68 | log_p0 = np.sum(l_0, axis=1) + np.log(stats.norm.pdf(x=theta, loc=args.mu_theta, scale=args.sig_theta) + 1e-9)
69 | log_p1 = np.sum(l_1, axis=1) + np.log(stats.norm.pdf(x=new_theta, loc=args.mu_theta, scale=args.sig_theta) + 1e-9)
70 | accept_prob = np.exp(np.minimum(log_p1 - log_p0, 0)) # avoid overflow in exp
71 | mask = accept_prob >= np.random.random(1)
72 | theta[mask] = new_theta[mask]
73 |
74 |
75 | def update_slip_guess(A, B, theta, slip, guess, variance, R, q_m, obj_prob_index, sub_prob_index, skip_value, args):
76 | new_slip = np.abs(slip + 0.2 * stats.norm.rvs(size=slip.shape) - 0.1)
77 | new_guess = np.abs(guess + 0.2 * stats.norm.rvs(size=guess.shape) - 0.1)
78 |
79 | l_0 = get_LogLikelihood(A, B, theta, R, q_m, slip, guess, variance, obj_prob_index, sub_prob_index, skip_value)
80 | l_1 = get_LogLikelihood(A, B, theta, R, q_m, new_slip, new_guess, variance, obj_prob_index, sub_prob_index,
81 | skip_value)
82 |
83 | log_p0 = np.sum(l_0, axis=0) + np.log(stats.beta.pdf(x=slip / (args.max_s - args.min_s), a=1, b=2) + 1e-9) + np.log(
84 | stats.beta.pdf(x=guess / (args.max_g - args.min_g), a=1, b=2) + 1e-9)
85 | log_p1 = np.sum(l_1, axis=0) + np.log(stats.beta.pdf(x=new_slip / (args.max_s - args.min_s), a=1, b=2) + 1e-9) + \
86 | np.log(stats.beta.pdf(x=new_guess / (args.max_g - args.min_g), a=1, b=2) + 1e-9)
87 | accept_prob = np.exp(np.minimum(log_p1 - log_p0, 0)) # avoid overflow in exp
88 | mask = accept_prob >= np.random.random(1)
89 | slip[mask] = new_slip[mask]
90 | guess[mask] = new_guess[mask]
91 |
92 |
93 | def update_variance(A, B, theta, slip, guess, variance, R, q_m, obj_prob_index, sub_prob_index, skip_value):
94 | new_var = np.maximum(variance - 0.01 + 0.02 * stats.norm.rvs(size=variance.shape), 0)
95 |
96 | l_0 = get_LogLikelihood(A, B, theta, R, q_m, slip, guess, variance, obj_prob_index, sub_prob_index, skip_value)
97 | l_1 = get_LogLikelihood(A, B, theta, R, q_m, slip, guess, new_var, obj_prob_index, sub_prob_index, skip_value)
98 |
99 | l_0[:, obj_prob_index] = 0
100 | l_1[:, obj_prob_index] = 0
101 |
102 | log_p0 = np.sum(l_0) + np.log(stats.gamma.pdf(x=1 / (variance + 1e-9), a=4, scale=1 / 6) + 1e-9)
103 | log_p1 = np.sum(l_1) + np.log(stats.gamma.pdf(x=1 / (new_var + 1e-9), a=4, scale=1 / 6) + 1e-9)
104 | accept_prob = np.exp(np.minimum(log_p1 - log_p0, 0)) # avoid overflow in exp
105 | if accept_prob >= np.random.random(1):
106 | variance = new_var
107 | return variance
108 |
--------------------------------------------------------------------------------
/EduCDM/ICD/__init__.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 |
--------------------------------------------------------------------------------
/EduCDM/ICD/etl/__init__.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 |
3 | from .etl import *
4 |
--------------------------------------------------------------------------------
/EduCDM/ICD/etl/utils.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 |
3 | import torch
4 | from baize.utils import pad_sequence
5 | from torch import Tensor, LongTensor
6 |
7 |
8 | def multi_hot(ks, kn):
9 | array = [0] * kn
10 | for k in ks:
11 | array[k] = 1
12 | return array
13 |
14 |
15 | def pack_batch(batch):
16 | user_id, user_items, item_id, item_users, item_knows, response = zip(*batch)
17 | user_items_length = [len(d) for d in user_items]
18 | padded_user_items = pad_sequence(user_items)
19 | item_users_length = [len(d) for d in item_users]
20 | padded_item_users = pad_sequence(item_users)
21 | return (
22 | LongTensor(user_id), LongTensor(padded_user_items), LongTensor(user_items_length),
23 | LongTensor(item_id), LongTensor(padded_item_users), LongTensor(item_users_length), Tensor(item_knows),
24 | Tensor(response)
25 | )
26 |
--------------------------------------------------------------------------------
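A hedged illustration of the helpers above: `multi_hot` expands knowledge-concept ids into a fixed-length multi-hot vector, and `pack_batch` pads the variable-length `user_items`/`item_users` histories in a batch (the sample values below are made up, and baize's `pad_sequence` is assumed to pad plain Python lists):

```python3
assert multi_hot([0, 2], kn=4) == [1, 0, 1, 0]

# Each sample packs: (user_id, user_items, item_id, item_users, item_knows, response)
batch = [
    (0, [3, 5], 3, [0, 1], [1, 0, 1, 0], 1.0),
    (1, [3],    5, [0],    [0, 1, 1, 0], 0.0),
]
tensors = pack_batch(batch)  # histories are padded to a common length per field
```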
/EduCDM/ICD/metrics/__init__.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 |
3 | from .metrics import doa_report, stableness_report
4 |
--------------------------------------------------------------------------------
/EduCDM/ICD/metrics/metrics.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | # 2022/2/1 @ tongshiwei
3 | import pandas as pd
4 | from longling.ML.metrics import POrderedDict
5 | import numpy as np
6 | from tqdm import tqdm
7 |
8 |
9 | def doa_report(user, item, know, score, theta):
10 | df = pd.DataFrame({
11 | "user_id": user,
12 | "item_id": item,
13 | "score": score,
14 | "theta": theta,
15 | "knowledge": know
16 | })
17 | ground_truth = []
18 |
19 | for _, group_df in tqdm(df.groupby("item_id"), "formatting item df"):
20 | ground_truth.append(group_df["score"].values)
21 | ground_truth.append(1 - group_df["score"].values)
22 |
23 | knowledges = []
24 | knowledge_item = []
25 | knowledge_user = []
26 | knowledge_truth = []
27 | knowledge_theta = []
28 | for user, item, score, theta, knowledge in tqdm(
29 | df[["user_id", "item_id", "score", "theta", "knowledge"]].values,
30 | "formatting knowledge df"):
31 | if isinstance(theta, list):
32 | for i, (theta_i, knowledge_i) in enumerate(zip(theta, knowledge)):
33 | if knowledge_i == 1:
34 | knowledges.append(i)
35 | knowledge_item.append(item)
36 | knowledge_user.append(user)
37 | knowledge_truth.append(score)
38 | knowledge_theta.append(theta_i)
39 | else: # pragma: no cover
40 | for i, knowledge_i in enumerate(knowledge):
41 | if knowledge_i == 1:
42 | knowledges.append(i)
43 | knowledge_item.append(item)
44 | knowledge_user.append(user)
45 | knowledge_truth.append(score)
46 | knowledge_theta.append(theta)
47 |
48 | knowledge_df = pd.DataFrame({
49 | "knowledge": knowledges,
50 | "user_id": knowledge_user,
51 | "item_id": knowledge_item,
52 | "score": knowledge_truth,
53 | "theta": knowledge_theta
54 | })
55 | knowledge_ground_truth = []
56 | knowledge_prediction = []
57 | for _, group_df in knowledge_df.groupby("knowledge"):
58 | _knowledge_ground_truth = []
59 | _knowledge_prediction = []
60 | for _, item_group_df in group_df.groupby("item_id"):
61 | _knowledge_ground_truth.append(item_group_df["score"].values)
62 | _knowledge_prediction.append(item_group_df["theta"].values)
63 | knowledge_ground_truth.append(_knowledge_ground_truth)
64 | knowledge_prediction.append(_knowledge_prediction)
65 |
66 | return POrderedDict(doa_eval(knowledge_ground_truth, knowledge_prediction))
67 |
68 |
69 | def doa_eval(y_true, y_pred):
70 | """
71 | >>> import numpy as np
72 | >>> y_true = [
73 | ... [np.array([1, 0, 1])],
74 | ... [np.array([0, 1, 1])]
75 | ... ]
76 | >>> y_pred = [
77 | ... [np.array([.5, .4, .6])],
78 | ... [np.array([.2, .3, .5])]
79 | ... ]
80 | >>> float(doa_eval(y_true, y_pred)['doa'])
81 | 1.0
82 | >>> y_pred = [
83 | ... [np.array([.4, .5, .6])],
84 | ... [np.array([.3, .2, .5])]
85 | ... ]
86 | >>> float(doa_eval(y_true, y_pred)['doa'])
87 | 0.5
88 | """
89 | doa = []
90 | doa_support = 0
91 | z_support = 0
92 | for knowledge_label, knowledge_pred in tqdm(zip(y_true, y_pred),
93 | "doa metrics"):
94 | _doa = 0
95 | _z = 0
96 | for label, pred in zip(knowledge_label, knowledge_pred):
97 | if sum(label) == len(label) or sum(label) == 0:
98 | continue
99 | pos_idx = []
100 | neg_idx = []
101 | for i, _label in enumerate(label):
102 | if _label == 1:
103 | pos_idx.append(i)
104 | else:
105 | neg_idx.append(i)
106 | pos_pred = pred[pos_idx]
107 | neg_pred = pred[neg_idx]
108 | invalid = 0
109 | for _pos_pred in pos_pred:
110 | _doa += len(neg_pred[neg_pred < _pos_pred])
111 | invalid += len(neg_pred[neg_pred == _pos_pred])
112 | _z += (len(pos_pred) * len(neg_pred)) - invalid
113 | if _z > 0:
114 | doa.append(_doa / _z)
115 | z_support += _z
116 | doa_support += 1
117 | return {
118 | "doa": np.mean(doa),
119 | "doa_know_support": doa_support,
120 | "doa_z_support": z_support,
121 | }
122 |
123 |
124 | def stableness_report(traits: list, new_traits: list, keys: list):
125 | ret = {}
126 | a_dim = None
127 | b_dim = None
128 | for trait, new_trait, key in zip(traits, new_traits, keys):
129 | if key == "b" and b_dim is None:
130 | b_dim = trait.size()[-1] if len(trait.size()) > 1 else 1
131 | if key == "a" and a_dim is None:
132 | a_dim = trait.size()[-1]
133 |
134 | ret[key] = {}
135 | delta = (trait - new_trait).abs()
136 | ret[key]['delta'] = delta.sum().item()
137 | ret[key]['delta_ave'] = delta.mean().item()
138 | ret[key]['support'] = len(trait)
139 |
140 | ret["user"] = ret["theta"]
141 | ret["item"] = {
142 | "delta":
143 | ret["a"]["delta"] + ret["b"]["delta"],
144 | "delta_ave":
145 | (ret["a"]["delta_ave"] * a_dim + ret["b"]["delta_ave"] * b_dim) /
146 | (a_dim + b_dim),
147 | "support":
148 | ret["a"]["support"],
149 | }
150 | macro = ret["user"]["delta_ave"] + ret["item"]["delta_ave"]
151 | micro = ret["user"]["support"] * ret["user"]["delta_ave"] + ret["item"][
152 | "support"] * ret["item"]["delta_ave"]
153 | ret["macro_ave"] = macro / 2
154 | ret["micro_ave"] = micro / (ret["user"]["support"] +
155 | ret["item"]["support"])
156 | return POrderedDict(ret)
157 |
--------------------------------------------------------------------------------
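A hedged sketch of calling `doa_report` above: each record carries a per-concept `theta` list and a multi-hot `know` vector, and the degree of agreement (DOA) checks that correct responders have higher mastery than incorrect ones (the values are synthetic):

```python3
report = doa_report(
    user=[0, 1, 0, 1],
    item=[0, 0, 1, 1],
    know=[[1, 0], [1, 0], [0, 1], [0, 1]],
    score=[1, 0, 1, 0],
    theta=[[0.8, 0.1], [0.2, 0.3], [0.5, 0.9], [0.6, 0.4]],
)
print(report["doa"])  # 1.0 here: on every concept, correct answers align with higher theta
```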
/EduCDM/ICD/sym/__init__.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | # 2022/1/29 @ tongshiwei
3 |
4 | from .net import get_net, get_loss, ICD, DualICD, get_dual_loss
5 | from .fit_eval import eval_f, dual_fit_f, stableness_eval, turning_point
6 | from .pos_linear import PosLinear
7 |
--------------------------------------------------------------------------------
/EduCDM/ICD/sym/net/__init__.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 |
3 | from .net import get_net, get_loss, ICD, DualICD, get_dual_loss, EmbICD
4 |
--------------------------------------------------------------------------------
/EduCDM/ICD/sym/net/dtn.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | import torch
3 | from torch import nn
4 | from baize.torch.functional import mask_sequence
5 |
6 |
7 | class DTN(nn.Module):
8 | def __init__(self, input_dim, know_dim):
9 | self.know_dim = know_dim
10 | self.input_dim = input_dim
11 | self.fea_dim = 64
12 |
13 | super(DTN, self).__init__()
14 | self.emb = nn.Sequential(nn.Embedding(self.input_dim, self.fea_dim),
15 | # nn.Dropout(p=0.5),
16 | )
17 | # self.feature_net = nn.Sequential(
18 | # # nn.Dropout(p=0.2),
19 | # nn.Linear(self.know_dim, self.know_dim),
20 | # # nn.Dropout(p=0.5),
21 | # # nn.Linear(self.prednet_len2, self.know_dim),
22 | # )
23 | # self.atn = nn.MultiheadAttention(self.fea_dim, 4)
24 | self.feature_net = nn.Sequential(
25 | # nn.ReLU(),
26 | # nn.Dropout(p=0.5),
27 | nn.Linear(self.fea_dim, self.know_dim))
28 |
29 | def avg_pool(self, data, mask: torch.Tensor):
30 | # (batch_num, max_len, know_dim) => (batch_num, know_dim): masked mean pooling over the sequence
31 | # print(data,mask)
32 | mask_data = mask_sequence(data, mask)
33 | rs = torch.sum(mask_data.permute(0, 2, 1), dim=-1)
34 | len_mask = mask.reshape((-1, 1))
35 | len_mask = len_mask.expand(len_mask.size()[0], self.know_dim)
36 | # print(rs.size(),mask.size())
37 | rs = torch.div(rs, len_mask)
38 | return rs
39 |
40 | def forward(self, log, mask):
41 | # emb = mask_sequence(self.emb(log), mask)
42 | # att_emb = emb.permute(1, 0, 2)
43 | # att_emb, _ = self.atn(att_emb, att_emb, att_emb)
44 | # fea = self.feature_net(att_emb)
45 | # fea = fea.permute(1, 0, 2)
46 |
47 | emb = self.emb(log)
48 | fea = self.feature_net(emb)
49 |
50 | trait = self.avg_pool(fea, mask)
51 | return trait
52 |
--------------------------------------------------------------------------------
/EduCDM/ICD/sym/net/mirt.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 |
3 | import torch
4 | from torch import nn
5 | import torch.nn.functional as F
6 | from .dtn import DTN
7 | from EduCDM.MIRT.MIRT import irt2pl
8 |
9 |
10 | class MIRTNet(nn.Module):
11 | def __init__(self, trait_dim, a_range=0.1, irf_kwargs=None):
12 | super(MIRTNet, self).__init__()
13 | self.irf_kwargs = irf_kwargs if irf_kwargs is not None else {}
14 | self.l_dtn_theta = nn.Linear(trait_dim, trait_dim)
15 | self.i_dtn_a = nn.Linear(trait_dim, trait_dim)
16 | self.i_dtn_b = nn.Linear(trait_dim, 1)
17 | self.a_range = a_range
18 |
19 | def forward(self, u_trait, v_trait, *args):
20 | theta = self.u_theta(u_trait)
21 | b = self.i_difficulty(v_trait)
22 | a = self.i_discrimination(v_trait)
23 |
24 | if torch.max(theta != theta) or torch.max(a != a) or torch.max(b != b): # pragma: no cover
25 | raise ValueError('theta, a and b may contain nan! The a_range may be too large.')
26 |
27 | return self.irf(theta, a, b, **self.irf_kwargs), theta, a, b
28 |
29 | @classmethod
30 | def int_f(cls, theta, a, b, *args, **kwargs):
31 | return irt2pl(theta, a, b, F=torch)
32 |
33 | @classmethod
34 | def irf(cls, theta, a, b, **kwargs):
35 | return irt2pl(theta, a, b, F=torch)
36 |
37 | def u_theta(self, u_trait):
38 | return (torch.sigmoid(torch.squeeze(self.l_dtn_theta(u_trait), dim=-1)) - 0.5) * 6
39 |
40 | def i_difficulty(self, v_trait):
41 | return (torch.sigmoid(torch.squeeze(self.i_dtn_b(v_trait), dim=-1)) - 0.5) * 6
42 |
43 | def i_discrimination(self, v_trait):
44 | a = torch.squeeze(self.i_dtn_a(v_trait), dim=-1)
45 | if self.a_range is not None:
46 | a = self.a_range * torch.sigmoid(a)
47 | else: # pragma: no cover
48 | a = F.softplus(a)
49 | return a
50 |
--------------------------------------------------------------------------------
/EduCDM/ICD/sym/net/ncd.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | import torch
3 | from torch import nn
4 | from ..pos_linear import PosLinear
5 |
6 |
7 | class NCDMNet(nn.Module):
8 | def __init__(self, trait_dim, know_dim):
9 | super(NCDMNet, self).__init__()
10 |
11 | self.knowledge_dim = know_dim
12 | self.prednet_input_len = self.knowledge_dim
13 | self.prednet_len1, self.prednet_len2 = 512, 256 # changeable
14 | self.l_dtn_theta_fc = nn.Linear(trait_dim, self.prednet_input_len)
15 | self.i_dtn_kd_fc = nn.Linear(trait_dim, self.prednet_input_len)
16 | self.i_dtn_ed_fc = nn.Linear(trait_dim, self.prednet_input_len)
17 | self.int_fc = nn.Sequential(
18 | PosLinear(self.prednet_input_len, self.prednet_len1), nn.Sigmoid(),
19 | nn.Dropout(p=0.5), PosLinear(self.prednet_len1, self.prednet_len2),
20 | nn.Sigmoid(), nn.Dropout(p=0.5), PosLinear(self.prednet_len2, 1),
21 | nn.Sigmoid())
22 |
23 | def u_theta(self, u_trait):
24 | return torch.sigmoid(self.l_dtn_theta_fc(u_trait))
25 |
26 | def i_difficulty(self, v_trait):
27 | return torch.sigmoid(self.i_dtn_kd_fc(v_trait))
28 |
29 | def i_discrimination(self, v_trait):
30 | return torch.sigmoid(self.i_dtn_ed_fc(v_trait))
31 |
32 | def forward(self, u_trait, v_trait, v_know):
33 | theta = self.u_theta(u_trait)
34 |
35 | difficulty = self.i_difficulty(v_trait)
36 | discrimination = self.i_discrimination(v_trait)
37 |
38 | # prednet
39 | input_x = discrimination * (theta - difficulty) * v_know
40 | output_1 = self.int_fc(input_x)
41 |
42 | return output_1.view(-1), theta, discrimination, difficulty
43 |
44 | def int_f(self, theta, a, b, know):
45 | return self.int_fc(a * (theta - b) * know).view(-1)
46 |
--------------------------------------------------------------------------------
/EduCDM/ICD/sym/net/net.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 |
3 | from tqdm import tqdm
4 | import torch
5 | from torch import nn
6 | from baize.torch import loss_dict2tmt_torch_loss
7 | from longling.ML.PytorchHelper import set_device
8 | from longling.ML.PytorchHelper.toolkit.trainer import collect_params
9 |
10 | from .ncd import NCDMNet
11 | from .mirt import MIRTNet
12 | from .dtn import DTN
13 |
14 |
15 | class ICD(nn.Module):
16 | def __init__(self, user_n, item_n, know_n, cdm="ncd"):
17 | super(ICD, self).__init__()
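# input vocab size 2n + 1: presumably one token per (entity, correct/incorrect) pair plus a padding id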
18 | self.l_dtn = DTN(2 * item_n + 1, know_n)
19 | self.i_dtn = DTN(2 * user_n + 1, know_n)
20 | self.cdm_name = cdm
21 | if cdm == "ncd":
22 | self.cdm = NCDMNet(know_n, know_n)
23 | elif cdm == "mirt":
24 | self.cdm = MIRTNet(know_n)
25 | else: # pragma: no cover
26 | raise ValueError()
27 |
28 | for name, param in self.named_parameters():
29 | if 'weight' in name:
30 | nn.init.xavier_normal_(param)
31 |
32 | def forward(self, u2i, u_mask, i2u, i_mask, i2k):
33 | u_trait = self.l_dtn(u2i, u_mask)
34 | v_trait = self.i_dtn(i2u, i_mask)
35 | return self.cdm(u_trait, v_trait, i2k)
36 |
37 | def get_user_profiles(self, batches):
38 | device = next(self.parameters()).device
39 | ids = []
40 | traits = []
41 | for _id, records, r_mask in tqdm(batches, "getting user profiles"):
42 | ids.append(_id.to("cpu"))
43 | traits.append(
44 | self.cdm.u_theta(
45 | self.l_dtn(records.to(device),
46 | r_mask.to(device))).to("cpu"))
47 |
48 | obj = {"uid": torch.cat(ids), "u_trait": torch.cat(traits)}
49 | return obj
50 |
51 | def get_item_profiles(self, batches):
52 | device = next(self.parameters()).device
53 | ids = []
54 | a = []
55 | b = []
56 | for _id, records, r_mask in tqdm(batches, "getting item profiles"):
57 | v_trait = self.i_dtn(records.to(device), r_mask.to(device))
58 | ids.append(_id.cpu())
59 | a.append(self.cdm.i_discrimination(v_trait).to("cpu"))
60 | b.append(self.cdm.i_difficulty(v_trait).to("cpu"))
61 | obj = {"iid": torch.cat(ids), "ia": torch.cat(a), "ib": torch.cat(b)}
62 | return obj
63 |
64 |
65 | class DualICD(nn.Module):
66 | def __init__(self, stat_net: ICD, net: ICD, alpha=0.999):
67 | super(DualICD, self).__init__()
68 | self.stat_net = stat_net
69 | self.net = net
70 | self.alpha = alpha
71 |
72 | def momentum_weight_update(self, pre_net, train_select=None):
73 | """
74 | Momentum update of ICD
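param_now <- alpha * param_pre + (1 - alpha) * param_now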
75 | """
76 | pre_net_params = collect_params(pre_net, train_select)
77 | net_params = collect_params(self.net, train_select)
78 | for param_pre, param_now in zip(pre_net_params, net_params):
79 | param_now.data = param_pre.data * self.alpha + param_now.data * (
80 | 1. - self.alpha)
81 |
82 | def forward(self, u2i, u_mask, i2u, i_mask, i2k):
83 | output, theta, a, b = self.net(u2i, u_mask, i2u, i_mask, i2k)
84 | _, stat_theta, stat_a, stat_b = self.stat_net(u2i, u_mask, i2u, i_mask,
85 | i2k)
86 | return output, theta, a, b, stat_theta, stat_a, stat_b
87 |
88 |
89 | class EmbICD(nn.Module):
90 | def __init__(self, int_fc, weights):
91 | super(EmbICD, self).__init__()
92 | self.theta_emb = nn.Embedding(*weights[0].size(), _weight=weights[0])
93 | self.a_emb = nn.Embedding(*weights[1].size(), _weight=weights[1])
94 | if len(weights[2].size()) == 1:
95 | self.b_emb = nn.Embedding(weights[2].size()[0],
96 | 1,
97 | _weight=torch.unsqueeze(weights[2], 1))
98 | else:
99 | self.b_emb = nn.Embedding(*weights[2].size(), _weight=weights[2])
100 | self.int_fc = int_fc
101 | self._user_id2idx = {}
102 | self._item_id2idx = {}
103 |
104 | def build_user_id2idx(self, users):
105 | idx = 0
106 | for user_id in users:
107 | if user_id not in self._user_id2idx:
108 | self._user_id2idx[user_id] = idx
109 | idx += 1
110 |
111 | def build_item_id2idx(self, items):
112 | idx = 0
113 | for item_id in items:
114 | if item_id not in self._item_id2idx:
115 | self._item_id2idx[item_id] = idx
116 | idx += 1
117 |
118 | def user_id2idx(self, users):
119 | users_idx = []
120 | for user in users:
121 | users_idx.append(self._user_id2idx[user])
122 | return users_idx
123 |
124 | def item_id2idx(self, items):
125 | items_idx = []
126 | for item in items:
127 | items_idx.append(self._item_id2idx[item])
128 | return items_idx
129 |
130 | def forward(self, user_idx, item_idx, know):
131 | theta = self.theta_emb(user_idx).detach()
132 | a = self.a_emb(item_idx).detach()
133 | b = self.b_emb(item_idx).detach()
134 |
135 | theta.requires_grad_(True)
136 | a.requires_grad_(True)
137 | b.requires_grad_(True)
138 |
139 | return self.int_fc(theta, a, torch.squeeze(b),
140 | know).view(-1), theta, a, b
141 |
142 |
143 | class DeltaTraitLoss(nn.Module):
144 | def __init__(self):
145 | super(DeltaTraitLoss, self).__init__()
146 | self.mse_loss = nn.MSELoss()
147 |
148 | def forward(self, theta, a, b, stat_theta, stat_a, stat_b):
149 | return self.mse_loss(theta, stat_theta) + self.mse_loss(
150 | a, stat_a) + self.mse_loss(b, stat_b)
151 |
152 |
153 | class DualLoss(nn.Module):
154 | def __init__(self, beta=0.95, *args, **kwargs):
155 | super(DualLoss, self).__init__()
156 | self.beta = beta
157 | self.bce = nn.BCELoss(*args, **kwargs)
158 | self.delta_trait = DeltaTraitLoss()
159 |
160 | def forward(self, pred, truth, theta, a, b, stat_theta, stat_a, stat_b):
161 | return self.beta * self.bce(
162 | pred, truth) + (1. - self.beta) * self.delta_trait(
163 | theta, a, b, stat_theta, stat_a, stat_b)
164 |
165 |
166 | def get_dual_loss(ctx, beta=0.95, *args, **kwargs):
167 | return loss_dict2tmt_torch_loss({
168 | "Loss":
169 | set_device(DualLoss(beta, *args, **kwargs), ctx),
170 | "BCE":
171 | set_device(torch.nn.BCELoss(*args, **kwargs), ctx),
172 | "DTL":
173 | set_device(DeltaTraitLoss(), ctx),
174 | })
175 |
176 |
177 | def get_loss(ctx, *args, **kwargs): # pragma: no cover
178 | return loss_dict2tmt_torch_loss(
179 | {"cross entropy": set_device(torch.nn.BCELoss(*args, **kwargs), ctx)})
180 |
181 |
182 | def get_net(ctx=None, *args, **kwargs):
183 | if ctx is None: # pragma: no cover
184 | return ICD(*args, **kwargs)
185 | return set_device(ICD(*args, **kwargs), ctx)
186 |
--------------------------------------------------------------------------------
/EduCDM/ICD/sym/pos_linear.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 |
3 | import torch
4 | import torch.nn.functional as F
5 | from torch import nn
6 |
7 |
8 | class PosLinear(nn.Linear):
9 | def forward(self, input: torch.Tensor) -> torch.Tensor:
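# 2 * relu(-w) + w equals |w|: the layer always applies non-negative weights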
10 | weight = 2 * F.relu(1 * torch.neg(self.weight)) + self.weight
11 | return F.linear(input, weight, self.bias)
12 |
--------------------------------------------------------------------------------
/EduCDM/ICD/utils.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | import json
3 | import logging
4 |
5 |
6 | def output_metrics(_id, obj, wfs=None, header=None, logger=logging):
7 | logger.info("-------- %s: %s ----------" % (header, _id))
8 | logger.info("\n%s" % obj)
9 | if wfs is not None: # pragma: no cover
10 | print(json.dumps({
11 | "id": _id,
12 | "metrics": obj
13 | }),
14 | file=wfs[header],
15 | flush=True)
16 |
--------------------------------------------------------------------------------
/EduCDM/IRR/DINA.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | # 2021/7/1 @ tongshiwei
3 |
4 | import pandas as pd
5 | import numpy as np
6 | import torch
7 | from torch import nn
8 | from EduCDM import GDDINA
9 | from .loss import PairSCELoss, HarmonicLoss, loss_mask
10 | from tqdm import tqdm
11 | from longling.ML.metrics import ranking_report
12 |
13 |
14 | class DINA(GDDINA):
15 | def __init__(self, user_num, item_num, knowledge_num, ste=False, zeta=0.5):
16 | super(DINA, self).__init__(user_num, item_num, knowledge_num, ste)
17 | self.zeta = zeta
18 |
19 | def train(self, train_data, test_data=None, *, epoch: int, device="cpu", lr=0.001) -> ...:
20 | self.dina_net = self.dina_net.to(device)
21 | point_loss_function = nn.BCELoss()
22 | pair_loss_function = PairSCELoss()
23 | loss_function = HarmonicLoss(self.zeta)
24 |
25 | trainer = torch.optim.Adam(self.dina_net.parameters(), lr, weight_decay=1e-4)
26 |
27 | for e in range(epoch):
28 | point_losses = []
29 | pair_losses = []
30 | losses = []
31 | for batch_data in tqdm(train_data, "Epoch %s" % e):
32 | user_id, item_id, knowledge, score, n_samples, *neg_users = batch_data
33 | user_id: torch.Tensor = user_id.to(device)
34 | item_id: torch.Tensor = item_id.to(device)
35 | knowledge: torch.Tensor = knowledge.to(device)
36 | n_samples: torch.Tensor = n_samples.to(device)
37 | predicted_pos_score: torch.Tensor = self.dina_net(user_id, item_id, knowledge)
38 | score: torch.Tensor = score.to(device)
39 | neg_score = 1 - score
40 |
41 | point_loss = point_loss_function(predicted_pos_score, score)
42 | predicted_neg_scores = []
43 | if neg_users:
44 | for neg_user in neg_users:
45 | neg_user: torch.Tensor = neg_user.to(device)
46 | predicted_neg_score = self.dina_net(neg_user, item_id, knowledge)
47 | predicted_neg_scores.append(predicted_neg_score)
48 |
49 | # prediction loss
50 | pair_pred_loss_list = []
51 | for i, predicted_neg_score in enumerate(predicted_neg_scores):
52 | pair_pred_loss_list.append(
53 | pair_loss_function(
54 | predicted_pos_score,
55 | predicted_neg_score,
56 | score - neg_score
57 | )
58 | )
59 |
60 | pair_loss = sum(loss_mask(pair_pred_loss_list, n_samples))
61 | else:
62 | pair_loss = 0
63 |
64 | loss = loss_function(point_loss, pair_loss)
65 |
66 | # back propagation
67 | trainer.zero_grad()
68 | loss.backward()
69 | trainer.step()
70 |
71 | point_losses.append(point_loss.mean().item())
72 | pair_losses.append(pair_loss.mean().item() if not isinstance(pair_loss, int) else pair_loss)
73 | losses.append(loss.item())
74 | print(
75 | "[Epoch %d] Loss: %.6f, PointLoss: %.6f, PairLoss: %.6f" % (
76 | e, float(np.mean(losses)), float(np.mean(point_losses)), float(np.mean(pair_losses))
77 | )
78 | )
79 |
80 | if test_data is not None:
81 | eval_data = self.eval(test_data, device=device)
82 | print("[Epoch %d]\n%s" % (e, eval_data))
83 |
84 | def eval(self, test_data, device="cpu"):
85 | self.dina_net = self.dina_net.to(device)
86 | self.dina_net.eval()
87 | y_pred = []
88 | y_true = []
89 | items = []
90 | for batch_data in tqdm(test_data, "evaluating"):
91 | user_id, item_id, knowledge, response = batch_data
92 | user_id: torch.Tensor = user_id.to(device)
93 | item_id: torch.Tensor = item_id.to(device)
94 | knowledge: torch.Tensor = knowledge.to(device)
95 | pred: torch.Tensor = self.dina_net(user_id, item_id, knowledge)
96 | y_pred.extend(pred.tolist())
97 | y_true.extend(response.tolist())
98 | items.extend(item_id.tolist())
99 |
100 | df = pd.DataFrame({
101 | "item_id": items,
102 | "score": y_true,
103 | "pred": y_pred,
104 | })
105 |
106 | ground_truth = []
107 | prediction = []
108 |
109 | for _, group_df in tqdm(df.groupby("item_id"), "formatting item df"):
110 | ground_truth.append(group_df["score"].values)
111 | prediction.append(group_df["pred"].values)
112 |
113 | self.dina_net.train()
114 |
115 | return ranking_report(
116 | ground_truth,
117 | y_pred=prediction,
118 | coerce="padding"
119 | )
120 |
--------------------------------------------------------------------------------
/EduCDM/IRR/IRT.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | # 2021/6/19 @ tongshiwei
3 |
4 | import torch
5 | from torch import nn
6 | from tqdm import tqdm
7 | from EduCDM.IRT.GD import IRT as PointIRT
8 | import numpy as np
9 | import pandas as pd
10 | from .loss import PairSCELoss, HarmonicLoss, loss_mask
11 | from longling.ML.metrics import ranking_report
12 |
13 | __all__ = ["IRT"]
14 |
15 |
16 | class IRT(PointIRT):
17 | def __init__(self, user_num, item_num, knowledge_num, zeta=0.5):
18 | super(IRT, self).__init__(user_num, item_num)
19 | self.knowledge_num = knowledge_num
20 | self.zeta = zeta
21 |
22 | def train(self, train_data, test_data=None, *, epoch: int, device="cpu", lr=0.001) -> ...:
23 | self.irt_net = self.irt_net.to(device)
24 | point_loss_function = nn.BCELoss()
25 | pair_loss_function = PairSCELoss()
26 | loss_function = HarmonicLoss(self.zeta)
27 |
28 | trainer = torch.optim.Adam(self.irt_net.parameters(), lr, weight_decay=1e-4)
29 |
30 | for e in range(epoch):
31 | point_losses = []
32 | pair_losses = []
33 | losses = []
34 | for batch_data in tqdm(train_data, "Epoch %s" % e):
35 | user_id, item_id, _, score, n_samples, *neg_users = batch_data
36 | user_id: torch.Tensor = user_id.to(device)
37 | item_id: torch.Tensor = item_id.to(device)
38 | n_samples: torch.Tensor = n_samples.to(device)
39 | predicted_pos_score: torch.Tensor = self.irt_net(user_id, item_id)
40 | score: torch.Tensor = score.to(device)
41 | neg_score = 1 - score
42 |
43 | point_loss = point_loss_function(predicted_pos_score, score)
44 | predicted_neg_scores = []
45 | if neg_users:
46 | for neg_user in neg_users:
47 | neg_user: torch.Tensor = neg_user.to(device)
48 | predicted_neg_score = self.irt_net(neg_user, item_id)
49 | predicted_neg_scores.append(predicted_neg_score)
50 |
51 | # prediction loss
52 | pair_pred_loss_list = []
53 | for i, predicted_neg_score in enumerate(predicted_neg_scores):
54 | pair_pred_loss_list.append(
55 | pair_loss_function(
56 | predicted_pos_score,
57 | predicted_neg_score,
58 | score - neg_score
59 | )
60 | )
61 |
62 | pair_loss = sum(loss_mask(pair_pred_loss_list, n_samples))
63 | else:
64 | pair_loss = 0
65 |
66 | loss = loss_function(point_loss, pair_loss)
67 |
68 | # back propagation
69 | trainer.zero_grad()
70 | loss.backward()
71 | trainer.step()
72 |
73 | point_losses.append(point_loss.mean().item())
74 | pair_losses.append(pair_loss.mean().item() if not isinstance(pair_loss, int) else pair_loss)
75 | losses.append(loss.item())
76 | print(
77 | "[Epoch %d] Loss: %.6f, PointLoss: %.6f, PairLoss: %.6f" % (
78 | e, float(np.mean(losses)), float(np.mean(point_losses)), float(np.mean(pair_losses))
79 | )
80 | )
81 |
82 | if test_data is not None:
83 | eval_data = self.eval(test_data, device=device)
84 | print("[Epoch %d]\n%s" % (e, eval_data))
85 |
86 | def eval(self, test_data, device="cpu"):
87 | self.irt_net = self.irt_net.to(device)
88 | self.irt_net.eval()
89 | y_pred = []
90 | y_true = []
91 | items = []
92 | for batch_data in tqdm(test_data, "evaluating"):
93 | user_id, item_id, _, response = batch_data
94 | user_id: torch.Tensor = user_id.to(device)
95 | item_id: torch.Tensor = item_id.to(device)
96 | pred: torch.Tensor = self.irt_net(user_id, item_id)
97 | y_pred.extend(pred.tolist())
98 | y_true.extend(response.tolist())
99 | items.extend(item_id.tolist())
100 |
101 | df = pd.DataFrame({
102 | "item_id": items,
103 | "score": y_true,
104 | "pred": y_pred,
105 | })
106 |
107 | ground_truth = []
108 | prediction = []
109 |
110 | for _, group_df in tqdm(df.groupby("item_id"), "formatting item df"):
111 | ground_truth.append(group_df["score"].values)
112 | prediction.append(group_df["pred"].values)
113 |
114 | self.irt_net.train()
115 |
116 | return ranking_report(
117 | ground_truth,
118 | y_pred=prediction,
119 | coerce="padding"
120 | )
121 |
--------------------------------------------------------------------------------
/EduCDM/IRR/MIRT.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | # 2021/7/1 @ tongshiwei
3 |
4 |
5 | import torch
6 | from torch import nn
7 | from tqdm import tqdm
8 | from EduCDM import MIRT as PointMIRT
9 | import numpy as np
10 | import pandas as pd
11 | from .loss import PairSCELoss, HarmonicLoss, loss_mask
12 | from longling.ML.metrics import ranking_report
13 |
14 | __all__ = ["MIRT"]
15 |
16 |
17 | class MIRT(PointMIRT):
18 | def __init__(self, user_num, item_num, knowledge_num, latent_dim=None, zeta=0.5):
19 | latent_dim = knowledge_num if latent_dim is None else latent_dim
20 | super(MIRT, self).__init__(user_num, item_num, latent_dim)
21 | self.knowledge_num = knowledge_num
22 | self.zeta = zeta
23 |
24 | def train(self, train_data, test_data=None, *, epoch: int, device="cpu", lr=0.001) -> ...:
25 | self.irt_net = self.irt_net.to(device)
26 | point_loss_function = nn.BCELoss()
27 | pair_loss_function = PairSCELoss()
28 | loss_function = HarmonicLoss(self.zeta)
29 |
30 | trainer = torch.optim.Adam(self.irt_net.parameters(), lr, weight_decay=1e-4)
31 |
32 | for e in range(epoch):
33 | point_losses = []
34 | pair_losses = []
35 | losses = []
36 | for batch_data in tqdm(train_data, "Epoch %s" % e):
37 | user_id, item_id, _, score, n_samples, *neg_users = batch_data
38 | user_id: torch.Tensor = user_id.to(device)
39 | item_id: torch.Tensor = item_id.to(device)
40 | n_samples: torch.Tensor = n_samples.to(device)
41 | predicted_pos_score: torch.Tensor = self.irt_net(user_id, item_id)
42 | score: torch.Tensor = score.to(device)
43 | neg_score = 1 - score
44 |
45 | point_loss = point_loss_function(predicted_pos_score, score)
46 | predicted_neg_scores = []
47 | if neg_users:
48 | for neg_user in neg_users:
49 | neg_user: torch.Tensor = neg_user.to(device)
50 | predicted_neg_score = self.irt_net(neg_user, item_id)
51 | predicted_neg_scores.append(predicted_neg_score)
52 |
53 | # prediction loss
54 | pair_pred_loss_list = []
55 | for i, predicted_neg_score in enumerate(predicted_neg_scores):
56 | pair_pred_loss_list.append(
57 | pair_loss_function(
58 | predicted_pos_score,
59 | predicted_neg_score,
60 | score - neg_score
61 | )
62 | )
63 |
64 | pair_loss = sum(loss_mask(pair_pred_loss_list, n_samples))
65 | else:
66 | pair_loss = 0
67 |
68 | loss = loss_function(point_loss, pair_loss)
69 |
70 | # back propagation
71 | trainer.zero_grad()
72 | loss.backward()
73 | trainer.step()
74 |
75 | point_losses.append(point_loss.mean().item())
76 | pair_losses.append(pair_loss.mean().item() if not isinstance(pair_loss, int) else pair_loss)
77 | losses.append(loss.item())
78 | print(
79 | "[Epoch %d] Loss: %.6f, PointLoss: %.6f, PairLoss: %.6f" % (
80 | e, float(np.mean(losses)), float(np.mean(point_losses)), float(np.mean(pair_losses))
81 | )
82 | )
83 |
84 | if test_data is not None:
85 | eval_data = self.eval(test_data, device=device)
86 | print("[Epoch %d]\n%s" % (e, eval_data))
87 |
88 | def eval(self, test_data, device="cpu"):
89 | self.irt_net = self.irt_net.to(device)
90 | self.irt_net.eval()
91 | y_pred = []
92 | y_true = []
93 | items = []
94 | for batch_data in tqdm(test_data, "evaluating"):
95 | user_id, item_id, _, response = batch_data
96 | user_id: torch.Tensor = user_id.to(device)
97 | item_id: torch.Tensor = item_id.to(device)
98 | pred: torch.Tensor = self.irt_net(user_id, item_id)
99 | y_pred.extend(pred.tolist())
100 | y_true.extend(response.tolist())
101 | items.extend(item_id.tolist())
102 |
103 | df = pd.DataFrame({
104 | "item_id": items,
105 | "score": y_true,
106 | "pred": y_pred,
107 | })
108 |
109 | ground_truth = []
110 | prediction = []
111 |
112 | for _, group_df in tqdm(df.groupby("item_id"), "formatting item df"):
113 | ground_truth.append(group_df["score"].values)
114 | prediction.append(group_df["pred"].values)
115 |
116 | self.irt_net.train()
117 |
118 | return ranking_report(
119 | ground_truth,
120 | y_pred=prediction,
121 | coerce="padding"
122 | )
123 |
--------------------------------------------------------------------------------
/EduCDM/IRR/NCDM.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | # 2021/7/1 @ tongshiwei
3 |
4 | import pandas as pd
5 | import numpy as np
6 | import torch
7 | from torch import nn
8 | from EduCDM import NCDM as PointNCDM
9 | from .loss import PairSCELoss, HarmonicLoss, loss_mask
10 | from tqdm import tqdm
11 | from longling.ML.metrics import ranking_report
12 |
13 |
14 | class NCDM(PointNCDM):
15 | def __init__(self, user_num, item_num, knowledge_num, zeta=0.5):
16 | super(NCDM, self).__init__(knowledge_num, item_num, user_num)
17 | self.zeta = zeta
18 |
19 | def train(self, train_data, test_data=None, epoch=10, device="cpu", lr=0.002, silence=False) -> ...:
20 | self.ncdm_net = self.ncdm_net.to(device)
21 | point_loss_function = nn.BCELoss()
22 | pair_loss_function = PairSCELoss()
23 | loss_function = HarmonicLoss(self.zeta)
24 |
25 | trainer = torch.optim.Adam(self.ncdm_net.parameters(), lr, weight_decay=1e-4)
26 |
27 | for e in range(epoch):
28 | point_losses = []
29 | pair_losses = []
30 | losses = []
31 | for batch_data in tqdm(train_data, "Epoch %s" % e):
32 | user_id, item_id, knowledge, score, n_samples, *neg_users = batch_data
33 | user_id: torch.Tensor = user_id.to(device)
34 | item_id: torch.Tensor = item_id.to(device)
35 | knowledge: torch.Tensor = knowledge.to(device)
36 | n_samples: torch.Tensor = n_samples.to(device)
37 | predicted_pos_score: torch.Tensor = self.ncdm_net(user_id, item_id, knowledge)
38 | score: torch.Tensor = score.to(device)
39 | neg_score = 1 - score
40 |
41 | point_loss = point_loss_function(predicted_pos_score, score)
42 | predicted_neg_scores = []
43 | if neg_users:
44 | for neg_user in neg_users:
45 | neg_user: torch.Tensor = neg_user.to(device)
46 | predicted_neg_score = self.ncdm_net(neg_user, item_id, knowledge)
47 | predicted_neg_scores.append(predicted_neg_score)
48 |
49 | # prediction loss
50 | pair_pred_loss_list = []
51 | for i, predicted_neg_score in enumerate(predicted_neg_scores):
52 | pair_pred_loss_list.append(
53 | pair_loss_function(
54 | predicted_pos_score,
55 | predicted_neg_score,
56 | score - neg_score
57 | )
58 | )
59 |
60 | pair_loss = sum(loss_mask(pair_pred_loss_list, n_samples))
61 | else:
62 | pair_loss = 0
63 |
64 | loss = loss_function(point_loss, pair_loss)
65 |
66 | # back propagation
67 | trainer.zero_grad()
68 | loss.backward()
69 | trainer.step()
70 |
71 | point_losses.append(point_loss.mean().item())
72 | pair_losses.append(pair_loss.mean().item() if not isinstance(pair_loss, int) else pair_loss)
73 | losses.append(loss.item())
74 | print(
75 | "[Epoch %d] Loss: %.6f, PointLoss: %.6f, PairLoss: %.6f" % (
76 | e, float(np.mean(losses)), float(np.mean(point_losses)), float(np.mean(pair_losses))
77 | )
78 | )
79 |
80 | if test_data is not None:
81 | eval_data = self.eval(test_data, device=device)
82 | print("[Epoch %d]\n%s" % (e, eval_data))
83 |
84 | def eval(self, test_data, device="cpu"):
85 | self.ncdm_net = self.ncdm_net.to(device)
86 | self.ncdm_net.eval()
87 | y_pred = []
88 | y_true = []
89 | items = []
90 | for batch_data in tqdm(test_data, "evaluating"):
91 | user_id, item_id, knowledge, response = batch_data
92 | user_id: torch.Tensor = user_id.to(device)
93 | item_id: torch.Tensor = item_id.to(device)
94 | knowledge: torch.Tensor = knowledge.to(device)
95 | pred: torch.Tensor = self.ncdm_net(user_id, item_id, knowledge)
96 | y_pred.extend(pred.tolist())
97 | y_true.extend(response.tolist())
98 | items.extend(item_id.tolist())
99 |
100 | df = pd.DataFrame({
101 | "item_id": items,
102 | "score": y_true,
103 | "pred": y_pred,
104 | })
105 |
106 | ground_truth = []
107 | prediction = []
108 |
109 | for _, group_df in tqdm(df.groupby("item_id"), "formatting item df"):
110 | ground_truth.append(group_df["score"].values)
111 | prediction.append(group_df["pred"].values)
112 |
113 | self.ncdm_net.train()
114 |
115 | return ranking_report(
116 | ground_truth,
117 | y_pred=prediction,
118 | coerce="padding"
119 | )
120 |
--------------------------------------------------------------------------------
/EduCDM/IRR/__init__.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | # 2021/6/19 @ tongshiwei
3 |
4 | from .IRT import IRT
5 | from .DINA import DINA
6 | from .MIRT import MIRT
7 | from .NCDM import NCDM
8 | from .etl import point_etl, pair_etl, extract_item
9 |
--------------------------------------------------------------------------------
/EduCDM/IRR/etl/__init__.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | # 2021/6/19 @ tongshiwei
3 |
4 | from .utils import extract_item
5 | from .point_etl import etl as point_etl
6 | from .pair_etl import etl as pair_etl
7 |
--------------------------------------------------------------------------------
/EduCDM/IRR/etl/pair_etl.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | # 2021/6/19 @ tongshiwei
3 | import torch
4 | import os
5 | from longling import print_time, iterwrap
6 | import pandas as pd
7 | import numpy as np
8 |
9 | from longling.ML.toolkit.dataset import ItemSpecificSampler
10 |
11 | __all__ = ["etl"]
12 |
13 |
14 | def extract(data_src, params):
15 | with print_time("loading data from %s" % os.path.abspath(data_src), params.logger):
16 | df = pd.read_csv(data_src, dtype={"user_id": "int64", "item_id": "int64", "score": "float32"})
17 | sampler = ItemSpecificSampler(
18 | ItemSpecificSampler.rating2triplet(
19 | df, query_field="item_id", key_field="user_id", value_field="score"
20 | ),
21 | query_field="item_id", user_id_range=[1, params.hyper_params["user_num"]],
22 | )
23 | return df, sampler
24 |
25 |
26 | @iterwrap()
27 | def transform(raw_data, knowledge, params):
28 | # Define the data transformation interface
29 | # raw_data --> batch_data
30 |
31 | batch_size = params.batch_size
32 | n_neg = params.n_neg
33 | n_imp = params.n_imp
34 | df: pd.DataFrame = raw_data[0]
35 | sampler: ItemSpecificSampler = raw_data[1]
36 |
37 | for start in range(0, len(df), batch_size):
38 | _df = df.iloc[start: start + batch_size]
39 | n_sample, sample = sampler(
40 | _df["item_id"], n_neg, neg=_df["score"] != 0.0, return_column=True, padding=True,
41 | split_sample_to_column=True, verbose=False, padding_implicit=False,
42 | fast_implicit=True, with_n_implicit=n_imp
43 | ) if (n_neg + n_imp) > 0 else ([0] * _df.shape[0], [])
44 | _knowledge = np.stack([knowledge[int(item)] for item in _df["item_id"]]).astype("float32")
45 | yield [
46 | torch.tensor(array if not isinstance(array, pd.Series) else array.values) for array in
47 | [_df["user_id"], _df["item_id"], _knowledge, _df["score"],
48 | n_sample, *sample]
49 | ]
50 |
51 |
52 | @iterwrap()
53 | def load(transformed_data, params):
54 | return transformed_data
55 |
56 |
57 | def etl(filepath, knowledge, params):
58 | raw_data = extract(filepath, params)
59 | transformed_data = transform(raw_data, knowledge, params)
60 | return load(transformed_data, params), raw_data[0]
61 |
--------------------------------------------------------------------------------
/EduCDM/IRR/etl/point_etl.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | # 2021/6/19 @ tongshiwei
3 |
4 | import os
5 | import numpy as np
6 | import pandas as pd
7 | from longling import print_time
8 |
9 | import torch
10 | from torch.utils.data import TensorDataset, DataLoader
11 |
12 |
13 | def extract(data_src, params):
14 | with print_time("loading data from %s" % os.path.abspath(data_src), params.logger):
15 | df = pd.read_csv(data_src, dtype={"user_id": "int64", "item_id": "int64", "score": "float32"})
16 | return df
17 |
18 |
19 | def transform(df, knowledge, *args):
20 | # Define the data transformation interface
21 | # raw_data --> batch_data
22 | dataset = TensorDataset(
23 | torch.tensor(df["user_id"]),
24 | torch.tensor(df["item_id"]),
25 | torch.tensor(np.stack([knowledge[int(item)] for item in df["item_id"]])),
26 | torch.tensor(df["score"], dtype=torch.float)
27 | )
28 | return dataset
29 |
30 |
31 | def load(transformed_data, params):
32 | batch_size = params.batch_size
33 |
34 | return DataLoader(transformed_data, batch_size=batch_size)
35 |
36 |
37 | def etl(filepath, knowledge, params):
38 | raw_data = extract(filepath, params)
39 | transformed_data = transform(raw_data, knowledge, params)
40 | return load(transformed_data, params), raw_data
41 |
--------------------------------------------------------------------------------
/EduCDM/IRR/etl/utils.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | # 2021/6/19 @ tongshiwei
3 |
4 | from tqdm import tqdm
5 | import os
6 | import pandas as pd
7 | from longling import print_time
8 | import numpy as np
9 |
10 |
11 | def extract_item(data_src, knowledge_num, params):
12 | with print_time("loading data from %s" % os.path.abspath(data_src), params.logger):
13 | knowledge = {}
14 | for record in tqdm(pd.read_csv(data_src).to_dict("records"), "reading records from %s" % data_src):
15 | knowledge_code_vector = [0] * knowledge_num
16 | for code in eval(record["knowledge_code"]):
17 | assert code >= 1
18 | knowledge_code_vector[code - 1] = 1
19 | knowledge[record["item_id"]] = np.asarray(knowledge_code_vector)
20 | return knowledge
21 |
--------------------------------------------------------------------------------
/EduCDM/IRR/loss.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | # 2021/6/19 @ tongshiwei
3 |
4 | import torch
5 | from torch import nn
6 |
7 |
8 | def loss_mask(loss_list, n_samples):
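# element-wise mask: the i-th pairwise loss is zeroed for batch entries with fewer than i sampled pairs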
9 | return [(i <= n_samples) * loss for i, loss in enumerate(loss_list)]
10 |
11 |
12 | class PairSCELoss(nn.Module):
13 | def __init__(self):
14 | super(PairSCELoss, self).__init__()
15 | self._loss = nn.CrossEntropyLoss()
16 |
17 | def forward(self, pred1, pred2, sign=1, *args):
18 | """
19 | sign is either 1 or -1
20 | could be seen as predicting the sign based on the pred1 and pred2
21 | 1: pred1 should be greater than pred2
22 | -1: otherwise
23 | """
24 | pred = torch.stack([pred1, pred2], dim=1)
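# (1 - sign) / 2 maps sign=1 -> class 0 (pred1 should win) and sign=-1 -> class 1 (pred2 should win)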
25 | return self._loss(pred, ((torch.ones(pred1.shape[0], device=pred.device) - sign) / 2).long())
26 |
27 |
28 | class HarmonicLoss(object):
29 | def __init__(self, zeta: (int, float) = 0.):
30 | self.zeta = zeta
31 |
32 | def __call__(self, point_wise_loss, pair_pred_loss, *args, **kwargs):
33 | return ((1 - self.zeta) * point_wise_loss + self.zeta * pair_pred_loss).mean()
34 |
--------------------------------------------------------------------------------
/EduCDM/IRT/EM/__init__.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | # 2021/5/2 @ liujiayu
3 |
4 | from .IRT import IRT
5 |
--------------------------------------------------------------------------------
/EduCDM/IRT/GD/IRT.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | # 2021/4/23 @ tongshiwei
3 |
4 | import logging
5 | import numpy as np
6 | import torch
7 | from EduCDM import CDM
8 | from torch import nn
9 | import torch.nn.functional as F
10 | from tqdm import tqdm
11 | from ..irt import irt3pl
12 | from sklearn.metrics import roc_auc_score, accuracy_score
13 |
14 |
15 | class IRTNet(nn.Module):
16 | def __init__(self, user_num, item_num, value_range, a_range, irf_kwargs=None):
17 | super(IRTNet, self).__init__()
18 | self.user_num = user_num
19 | self.item_num = item_num
20 | self.irf_kwargs = irf_kwargs if irf_kwargs is not None else {}
21 | self.theta = nn.Embedding(self.user_num, 1)
22 | self.a = nn.Embedding(self.item_num, 1)
23 | self.b = nn.Embedding(self.item_num, 1)
24 | self.c = nn.Embedding(self.item_num, 1)
25 | self.value_range = value_range
26 | self.a_range = a_range
27 |
28 | def forward(self, user, item):
29 | theta = torch.squeeze(self.theta(user), dim=-1)
30 | a = torch.squeeze(self.a(item), dim=-1)
31 | b = torch.squeeze(self.b(item), dim=-1)
32 | c = torch.squeeze(self.c(item), dim=-1)
33 | c = torch.sigmoid(c)
34 | if self.value_range is not None:
35 | theta = self.value_range * (torch.sigmoid(theta) - 0.5)
36 | b = self.value_range * (torch.sigmoid(b) - 0.5)
37 | if self.a_range is not None:
38 | a = self.a_range * torch.sigmoid(a)
39 | else:
40 | a = F.softplus(a)
41 | if torch.max(theta != theta) or torch.max(a != a) or torch.max(b != b): # pragma: no cover
42 | raise ValueError('theta, a and b may contain nan! The value_range or a_range may be too large.')
43 | return self.irf(theta, a, b, c, **self.irf_kwargs)
44 |
45 | @classmethod
46 | def irf(cls, theta, a, b, c, **kwargs):
47 | return irt3pl(theta, a, b, c, F=torch, **kwargs)
48 |
49 |
50 | class IRT(CDM):
51 | def __init__(self, user_num, item_num, value_range=None, a_range=None):
52 | super(IRT, self).__init__()
53 | self.irt_net = IRTNet(user_num, item_num, value_range, a_range)
54 |
55 | def train(self, train_data, test_data=None, *, epoch: int, device="cpu", lr=0.001) -> ...:
56 | self.irt_net = self.irt_net.to(device)
57 | loss_function = nn.BCELoss()
58 |
59 | trainer = torch.optim.Adam(self.irt_net.parameters(), lr)
60 |
61 | for e in range(epoch):
62 | losses = []
63 | for batch_data in tqdm(train_data, "Epoch %s" % e):
64 | user_id, item_id, response = batch_data
65 | user_id: torch.Tensor = user_id.to(device)
66 | item_id: torch.Tensor = item_id.to(device)
67 | predicted_response: torch.Tensor = self.irt_net(user_id, item_id)
68 | response: torch.Tensor = response.to(device)
69 | loss = loss_function(predicted_response, response)
70 |
71 | # back propagation
72 | trainer.zero_grad()
73 | loss.backward()
74 | trainer.step()
75 |
76 | losses.append(loss.mean().item())
77 | print("[Epoch %d] LogisticLoss: %.6f" % (e, float(np.mean(losses))))
78 |
79 | if test_data is not None:
80 | auc, accuracy = self.eval(test_data, device=device)
81 | print("[Epoch %d] auc: %.6f, accuracy: %.6f" % (e, auc, accuracy))
82 |
83 | def eval(self, test_data, device="cpu") -> tuple:
84 | self.irt_net = self.irt_net.to(device)
85 | self.irt_net.eval()
86 | y_pred = []
87 | y_true = []
88 | for batch_data in tqdm(test_data, "evaluating"):
89 | user_id, item_id, response = batch_data
90 | user_id: torch.Tensor = user_id.to(device)
91 | item_id: torch.Tensor = item_id.to(device)
92 | pred: torch.Tensor = self.irt_net(user_id, item_id)
93 | y_pred.extend(pred.tolist())
94 | y_true.extend(response.tolist())
95 |
96 | self.irt_net.train()
97 | return roc_auc_score(y_true, y_pred), accuracy_score(y_true, np.array(y_pred) >= 0.5)
98 |
99 | def save(self, filepath):
100 | torch.save(self.irt_net.state_dict(), filepath)
101 | logging.info("save parameters to %s" % filepath)
102 |
103 | def load(self, filepath):
104 | self.irt_net.load_state_dict(torch.load(filepath))
105 | logging.info("load parameters from %s" % filepath)
106 |
--------------------------------------------------------------------------------
/EduCDM/IRT/GD/__init__.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | # 2021/4/23 @ tongshiwei
3 |
4 | from .IRT import IRT
5 |
--------------------------------------------------------------------------------
/EduCDM/IRT/__init__.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | # 2021/4/23 @ tongshiwei
3 |
4 |
5 | from .GD import IRT as GDIRT
6 | from .EM import IRT as EMIRT
7 |
--------------------------------------------------------------------------------
/EduCDM/IRT/irt.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | # 2021/4/23 @ tongshiwei
3 |
4 | import numpy as np
5 |
6 | __all__ = ["irf", "irt3pl"]
7 |
8 |
9 | def irf(theta, a, b, c, D=1.702, *, F=np):
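# 3PL item response function: c + (1 - c) * sigmoid(D * a * (theta - b))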
10 | return c + (1 - c) / (1 + F.exp(-D * a * (theta - b)))
11 |
12 |
13 | irt3pl = irf
14 |
--------------------------------------------------------------------------------
/EduCDM/KaNCD/__init__.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | # 2021/4/1 @ WangFei
3 |
4 | from .KaNCD import KaNCD
5 |
--------------------------------------------------------------------------------
/EduCDM/MCD/MCD.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | # 2021/3/23 @ tongshiwei
3 |
4 | import logging
5 | import numpy as np
6 | import torch
7 | from tqdm import tqdm
8 | from torch import nn
9 | from EduCDM import CDM
10 | from sklearn.metrics import roc_auc_score, accuracy_score
11 |
12 |
13 | class MFNet(nn.Module):
14 | """Matrix Factorization Network"""
15 |
16 | def __init__(self, user_num, item_num, latent_dim):
17 | super(MFNet, self).__init__()
18 | self.user_num = user_num
19 | self.item_num = item_num
20 | self.latent_dim = latent_dim
21 | self.user_embedding = nn.Embedding(self.user_num, self.latent_dim)
22 | self.item_embedding = nn.Embedding(self.item_num, self.latent_dim)
23 | self.response = nn.Linear(2 * self.latent_dim, 1)
24 |
25 | def forward(self, user_id, item_id):
26 | user = self.user_embedding(user_id)
27 | item = self.item_embedding(item_id)
28 | return torch.squeeze(torch.sigmoid(self.response(torch.cat([user, item], dim=-1))), dim=-1)
29 |
30 |
31 | class MCD(CDM):
32 | """Matrix factorization based Cognitive Diagnosis Model"""
33 |
34 | def __init__(self, user_num, item_num, latent_dim):
35 | super(MCD, self).__init__()
36 | self.mf_net = MFNet(user_num, item_num, latent_dim)
37 |
38 | def train(self, train_data, test_data=None, *, epoch: int, device="cpu", lr=0.001) -> ...:
39 | self.mf_net = self.mf_net.to(device)
40 | loss_function = nn.BCELoss()
41 |
42 | trainer = torch.optim.Adam(self.mf_net.parameters(), lr)
43 |
44 | for e in range(epoch):
45 | losses = []
46 | for batch_data in tqdm(train_data, "Epoch %s" % e):
47 | user_id, item_id, response = batch_data
48 | user_id: torch.Tensor = user_id.to(device)
49 | item_id: torch.Tensor = item_id.to(device)
50 | predicted_response: torch.Tensor = self.mf_net(user_id, item_id)
51 | response: torch.Tensor = response.to(device)
52 | loss = loss_function(predicted_response, response)
53 |
54 | # back propagation
55 | trainer.zero_grad()
56 | loss.backward()
57 | trainer.step()
58 |
59 | losses.append(loss.mean().item())
60 | print("[Epoch %d] LogisticLoss: %.6f" % (e, float(np.mean(losses))))
61 |
62 | if test_data is not None:
63 | auc, accuracy = self.eval(test_data, device=device)
64 | print("[Epoch %d] auc: %.6f, accuracy: %.6f" % (e, auc, accuracy))
65 |
66 | def eval(self, test_data, device="cpu") -> tuple:
67 | self.mf_net = self.mf_net.to(device)
68 | self.mf_net.eval()
69 | y_pred = []
70 | y_true = []
71 | for batch_data in tqdm(test_data, "evaluating"):
72 | user_id, item_id, response = batch_data
73 | user_id: torch.Tensor = user_id.to(device)
74 | item_id: torch.Tensor = item_id.to(device)
75 | pred: torch.Tensor = self.mf_net(user_id, item_id)
76 | y_pred.extend(pred.tolist())
77 | y_true.extend(response.tolist())
78 |
79 | self.mf_net.train()
80 | return roc_auc_score(y_true, y_pred), accuracy_score(y_true, np.array(y_pred) >= 0.5)
81 |
82 | def save(self, filepath):
83 | torch.save(self.mf_net.state_dict(), filepath)
84 | logging.info("save parameters to %s" % filepath)
85 |
86 | def load(self, filepath):
87 | self.mf_net.load_state_dict(torch.load(filepath))
88 | logging.info("load parameters from %s" % filepath)
89 |
--------------------------------------------------------------------------------
/EduCDM/MCD/__init__.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | # 2021/3/23 @ tongshiwei
3 |
4 |
5 | from .MCD import MCD
6 |
--------------------------------------------------------------------------------
/EduCDM/MIRT/MIRT.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | # 2021/7/1 @ tongshiwei
3 |
4 |
5 | import logging
6 | import numpy as np
7 | import torch
8 | from EduCDM import CDM
9 | from torch import nn
10 | import torch.nn.functional as F
11 | from tqdm import tqdm
12 | from sklearn.metrics import roc_auc_score, accuracy_score
13 |
14 |
15 | def irt2pl(theta, a, b, *, F=np):
16 | """
17 |
18 | Parameters
19 | ----------
20 | theta
21 | a
22 | b
23 | F
24 |
25 | Returns
26 | -------
27 |
28 | Examples
29 | --------
30 | >>> theta = [1, 0.5, 0.3]
31 | >>> a = [-3, 1, 3]
32 | >>> b = 0.5
33 | >>> float(irt2pl(theta, a, b)) # doctest: +ELLIPSIS
34 | 0.109...
35 | >>> theta = [[1, 0.5, 0.3], [2, 1, 0]]
36 | >>> a = [[-3, 1, 3], [-3, 1, 3]]
37 | >>> b = [0.5, 0.5]
38 | >>> irt2pl(theta, a, b) # doctest: +ELLIPSIS
39 | array([0.109..., 0.004...])
40 | """
41 | return 1 / (1 + F.exp(- F.sum(F.multiply(a, theta), axis=-1) + b))
42 |
43 |
44 | class MIRTNet(nn.Module):
45 | def __init__(self, user_num, item_num, latent_dim, a_range, irf_kwargs=None):
46 | super(MIRTNet, self).__init__()
47 | self.user_num = user_num
48 | self.item_num = item_num
49 | self.irf_kwargs = irf_kwargs if irf_kwargs is not None else {}
50 | self.theta = nn.Embedding(self.user_num, latent_dim)
51 | self.a = nn.Embedding(self.item_num, latent_dim)
52 | self.b = nn.Embedding(self.item_num, 1)
53 | self.a_range = a_range
54 |
55 | def forward(self, user, item):
56 | theta = torch.squeeze(self.theta(user), dim=-1)
57 | a = torch.squeeze(self.a(item), dim=-1)
58 | if self.a_range is not None:
59 | a = self.a_range * torch.sigmoid(a)
60 | else:
61 | a = F.softplus(a)
62 | b = torch.squeeze(self.b(item), dim=-1)
63 | if torch.max(theta != theta) or torch.max(a != a) or torch.max(b != b): # pragma: no cover
64 | raise ValueError('theta, a and b may contain nan! The a_range may be too large.')
65 | return self.irf(theta, a, b, **self.irf_kwargs)
66 |
67 | @classmethod
68 | def irf(cls, theta, a, b, **kwargs):
69 | return irt2pl(theta, a, b, F=torch)
70 |
71 |
72 | class MIRT(CDM):
73 | def __init__(self, user_num, item_num, latent_dim, a_range=None):
74 | super(MIRT, self).__init__()
75 | self.irt_net = MIRTNet(user_num, item_num, latent_dim, a_range)
76 |
77 | def train(self, train_data, test_data=None, *, epoch: int, device="cpu", lr=0.001) -> ...:
78 | self.irt_net = self.irt_net.to(device)
79 | loss_function = nn.BCELoss()
80 |
81 | trainer = torch.optim.Adam(self.irt_net.parameters(), lr)
82 |
83 | for e in range(epoch):
84 | losses = []
85 | for batch_data in tqdm(train_data, "Epoch %s" % e):
86 | user_id, item_id, response = batch_data
87 | user_id: torch.Tensor = user_id.to(device)
88 | item_id: torch.Tensor = item_id.to(device)
89 | predicted_response: torch.Tensor = self.irt_net(user_id, item_id)
90 | response: torch.Tensor = response.to(device)
91 | loss = loss_function(predicted_response, response)
92 |
93 | # back propagation
94 | trainer.zero_grad()
95 | loss.backward()
96 | trainer.step()
97 |
98 | losses.append(loss.mean().item())
99 | print("[Epoch %d] LogisticLoss: %.6f" % (e, float(np.mean(losses))))
100 |
101 | if test_data is not None:
102 | auc, accuracy = self.eval(test_data, device=device)
103 | print("[Epoch %d] auc: %.6f, accuracy: %.6f" % (e, auc, accuracy))
104 |
105 | def eval(self, test_data, device="cpu") -> tuple:
106 | self.irt_net = self.irt_net.to(device)
107 | self.irt_net.eval()
108 | y_pred = []
109 | y_true = []
110 | for batch_data in tqdm(test_data, "evaluating"):
111 | user_id, item_id, response = batch_data
112 | user_id: torch.Tensor = user_id.to(device)
113 | item_id: torch.Tensor = item_id.to(device)
114 | pred: torch.Tensor = self.irt_net(user_id, item_id)
115 | y_pred.extend(pred.tolist())
116 | y_true.extend(response.tolist())
117 |
118 | self.irt_net.train()
119 | return roc_auc_score(y_true, y_pred), accuracy_score(y_true, np.array(y_pred) >= 0.5)
120 |
121 | def save(self, filepath):
122 | torch.save(self.irt_net.state_dict(), filepath)
123 | logging.info("save parameters to %s" % filepath)
124 |
125 | def load(self, filepath):
126 | self.irt_net.load_state_dict(torch.load(filepath))
127 | logging.info("load parameters from %s" % filepath)
128 |
--------------------------------------------------------------------------------
/EduCDM/MIRT/__init__.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | # 2021/7/1 @ tongshiwei
3 |
4 | from .MIRT import MIRT
5 |
--------------------------------------------------------------------------------
/EduCDM/NCDM/NCDM.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | # 2021/4/1 @ WangFei
3 |
4 | import logging
5 | import torch
6 | import torch.nn as nn
7 | import torch.optim as optim
8 | import torch.nn.functional as F
9 | import numpy as np
10 | from tqdm import tqdm
11 | from sklearn.metrics import roc_auc_score, accuracy_score
12 | from EduCDM import CDM
13 |
14 |
15 | class PosLinear(nn.Linear):
16 | def forward(self, input: torch.Tensor) -> torch.Tensor:
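# 2 * relu(-w) + w equals |w|: the layer always applies non-negative weights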
17 | weight = 2 * F.relu(1 * torch.neg(self.weight)) + self.weight
18 | return F.linear(input, weight, self.bias)
19 |
20 |
21 | class Net(nn.Module):
22 |
23 | def __init__(self, knowledge_n, exer_n, student_n):
24 | self.knowledge_dim = knowledge_n
25 | self.exer_n = exer_n
26 | self.emb_num = student_n
27 | self.stu_dim = self.knowledge_dim
28 | self.prednet_input_len = self.knowledge_dim
29 | self.prednet_len1, self.prednet_len2 = 512, 256 # changeable
30 |
31 | super(Net, self).__init__()
32 |
33 | # prediction sub-net
34 | self.student_emb = nn.Embedding(self.emb_num, self.stu_dim)
35 | self.k_difficulty = nn.Embedding(self.exer_n, self.knowledge_dim)
36 | self.e_difficulty = nn.Embedding(self.exer_n, 1)
37 | self.prednet_full1 = PosLinear(self.prednet_input_len, self.prednet_len1)
38 | self.drop_1 = nn.Dropout(p=0.5)
39 | self.prednet_full2 = PosLinear(self.prednet_len1, self.prednet_len2)
40 | self.drop_2 = nn.Dropout(p=0.5)
41 | self.prednet_full3 = PosLinear(self.prednet_len2, 1)
42 |
43 | # initialize
44 | for name, param in self.named_parameters():
45 | if 'weight' in name:
46 | nn.init.xavier_normal_(param)
47 |
48 | def forward(self, stu_id, input_exercise, input_knowledge_point):
49 | # before prednet
50 | stu_emb = self.student_emb(stu_id)
51 | stat_emb = torch.sigmoid(stu_emb)
52 | k_difficulty = torch.sigmoid(self.k_difficulty(input_exercise))
53 | e_difficulty = torch.sigmoid(self.e_difficulty(input_exercise)) # * 10
54 | # prednet
55 | input_x = e_difficulty * (stat_emb - k_difficulty) * input_knowledge_point
56 | input_x = self.drop_1(torch.sigmoid(self.prednet_full1(input_x)))
57 | input_x = self.drop_2(torch.sigmoid(self.prednet_full2(input_x)))
58 | output_1 = torch.sigmoid(self.prednet_full3(input_x))
59 |
60 | return output_1.view(-1)
61 |
62 |
63 | class NCDM(CDM):
64 | '''Neural Cognitive Diagnosis Model'''
65 |
66 | def __init__(self, knowledge_n, exer_n, student_n):
67 | super(NCDM, self).__init__()
68 | self.ncdm_net = Net(knowledge_n, exer_n, student_n)
69 |
70 | def train(self, train_data, test_data=None, epoch=10, device="cpu", lr=0.002, silence=False):
71 | self.ncdm_net = self.ncdm_net.to(device)
72 | self.ncdm_net.train()
73 | loss_function = nn.BCELoss()
74 | optimizer = optim.Adam(self.ncdm_net.parameters(), lr=lr)
75 | for epoch_i in range(epoch):
76 | epoch_losses = []
77 | batch_count = 0
78 | for batch_data in tqdm(train_data, "Epoch %s" % epoch_i):
79 | batch_count += 1
80 | user_id, item_id, knowledge_emb, y = batch_data
81 | user_id: torch.Tensor = user_id.to(device)
82 | item_id: torch.Tensor = item_id.to(device)
83 | knowledge_emb: torch.Tensor = knowledge_emb.to(device)
84 | y: torch.Tensor = y.to(device)
85 | pred: torch.Tensor = self.ncdm_net(user_id, item_id, knowledge_emb)
86 | loss = loss_function(pred, y)
87 |
88 | optimizer.zero_grad()
89 | loss.backward()
90 | optimizer.step()
91 |
92 | epoch_losses.append(loss.mean().item())
93 |
94 | print("[Epoch %d] average loss: %.6f" % (epoch_i, float(np.mean(epoch_losses))))
95 |
96 | if test_data is not None:
97 | auc, accuracy = self.eval(test_data, device=device)
98 | print("[Epoch %d] auc: %.6f, accuracy: %.6f" % (epoch_i, auc, accuracy))
99 |
100 | def eval(self, test_data, device="cpu"):
101 | self.ncdm_net = self.ncdm_net.to(device)
102 | self.ncdm_net.eval()
103 | y_true, y_pred = [], []
104 | for batch_data in tqdm(test_data, "Evaluating"):
105 | user_id, item_id, knowledge_emb, y = batch_data
106 | user_id: torch.Tensor = user_id.to(device)
107 | item_id: torch.Tensor = item_id.to(device)
108 | knowledge_emb: torch.Tensor = knowledge_emb.to(device)
109 | pred: torch.Tensor = self.ncdm_net(user_id, item_id, knowledge_emb)
110 | y_pred.extend(pred.detach().cpu().tolist())
111 | y_true.extend(y.tolist())
112 |
113 | return roc_auc_score(y_true, y_pred), accuracy_score(y_true, np.array(y_pred) >= 0.5)
114 |
115 | def save(self, filepath):
116 | torch.save(self.ncdm_net.state_dict(), filepath)
117 | logging.info("save parameters to %s" % filepath)
118 |
119 | def load(self, filepath):
120 | self.ncdm_net.load_state_dict(torch.load(filepath)) # , map_location=lambda s, loc: s
121 | logging.info("load parameters from %s" % filepath)
122 |
--------------------------------------------------------------------------------
/EduCDM/NCDM/__init__.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | # 2021/4/1 @ WangFei
3 |
4 | from .NCDM import NCDM
5 |
--------------------------------------------------------------------------------
/EduCDM/__init__.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | # 2021/3/17 @ tongshiwei
3 |
4 |
5 | from .meta import CDM
6 | from .MCD import MCD
7 | from .DINA import EMDINA, GDDINA
8 | from .FuzzyCDF import FuzzyCDF
9 | from .NCDM import NCDM
10 | from .IRT import EMIRT, GDIRT
11 | from .MIRT import MIRT
12 | from .KaNCD import KaNCD
13 |
--------------------------------------------------------------------------------
/EduCDM/meta.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | # 2021/3/17 @ tongshiwei
3 |
4 |
5 | def etl(*args, **kwargs) -> ...: # pragma: no cover
6 | """
7 | extract - transform - load
8 | """
9 | pass
10 |
11 |
12 | def train(*args, **kwargs) -> ...: # pragma: no cover
13 | pass
14 |
15 |
16 | def evaluate(*args, **kwargs) -> ...: # pragma: no cover
17 | pass
18 |
19 |
20 | class CDM(object):
21 | def __init__(self, *args, **kwargs) -> ...:
22 | pass
23 |
24 | def train(self, *args, **kwargs) -> ...:
25 | raise NotImplementedError
26 |
27 | def eval(self, *args, **kwargs) -> ...:
28 | raise NotImplementedError
29 |
30 | def save(self, *args, **kwargs) -> ...:
31 | raise NotImplementedError
32 |
33 | def load(self, *args, **kwargs) -> ...:
34 | raise NotImplementedError
35 |
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | VERSION=`ls dist/*.tar.gz | sed "s/dist\/CDM-\(.*\)\.tar\.gz/\1/g"`
2 |
3 | ifdef ENVPIP
4 | PIP = $(ENVPIP)
5 | else
6 | PIP = pip3
7 | endif
8 |
9 | ifdef ENVPYTHON
10 | PYTHON = $(ENVPYTHON)
11 | else
12 | PYTHON = python3
13 | endif
14 |
15 | ifdef ENVPYTEST
16 | PYTEST = $(ENVPYTEST)
17 | else
18 | PYTEST = pytest
19 | endif
20 |
21 | help:
22 |
23 | @echo "install install CDM"
24 | @echo "test run test"
25 | @echo "release publish to PyPI and release in github"
26 | @echo "release_test publish to TestPyPI"
27 | @echo "clean remove all build, test, coverage and Python artifacts"
28 | @echo "clean-build remove build artifacts"
29 | @echo "clean-pyc remove Python file artifacts"
30 | @echo "clean-test remove test and coverage artifacts"
31 |
32 | .PHONY: install test build release release_test version .test .build clean
33 |
34 | install:
35 | @echo "install CDM"
36 | $(PIP) install -e . --user
37 |
38 | test:
39 | @echo "run test"
40 | $(PYTEST)
41 |
42 | build: test clean
43 | $(PYTHON) setup.py bdist_wheel sdist
44 |
45 | .test:
46 | $(PYTEST) > /dev/null
47 |
48 | .build: clean
49 | $(PYTHON) setup.py bdist_wheel sdist > /dev/null
50 |
51 | version: .build
52 | @echo $(VERSION)
53 |
54 | release: test build
55 | @echo "publish to pypi and release in github"
56 | @echo "version $(VERSION)"
57 |
58 | -@twine upload dist/* && git tag "v$(VERSION)"
59 | git push && git push --tags
60 |
61 | release_test: test build
62 | @echo "publish to test pypi"
63 | @echo "version $(VERSION)"
64 |
65 | -@twine upload --repository test dist/*
66 |
67 | clean: clean-build clean-pyc clean-test
68 |
69 | clean-build:
70 | rm -rf build/*
71 | rm -rf dist/*
72 | rm -rf .eggs/*
73 | find . -name '*.egg-info' -exec rm -fr {} +
74 | find . -name '*.egg' -exec rm -f {} +
75 |
76 | clean-pyc:
77 | find . -name '*.pyc' -exec rm -f {} +
78 | find . -name '*.pyo' -exec rm -f {} +
79 | find . -name '*~' -exec rm -f {} +
80 | find . -name '__pycache__' -exec rm -rf {} +
81 |
82 | clean-test:
83 | rm -f .coverage
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 | # EduCDM
6 |
7 |
8 | [](https://pypi.python.org/pypi/EduCDM)
9 | [](https://github.com/bigdata-ustc/EduCDM/actions/workflows/python-test.yml)
10 | [](https://codecov.io/gh/bigdata-ustc/EduCDM)
11 | [](https://pypi.python.org/pypi/EduCDM)
12 | [](LICENSE)
13 | [](https://zenodo.org/badge/latestdoi/348569904)
14 |
15 | The Model Zoo of Cognitive Diagnosis Models, including the classic Item Response Theory (**IRT**), Multidimensional Item Response Theory (**MIRT**) and Deterministic Input, Noisy "And" gate model (**DINA**), as well as the advanced Fuzzy Cognitive Diagnosis Framework (**FuzzyCDF**), Neural Cognitive Diagnosis Model (**NCDM**), Item Response Ranking framework (**IRR**), Incremental Cognitive Diagnosis (**ICD**) and Knowledge-association based extension of NeuralCD (**KaNCD**).
16 |
17 | ## Brief introduction to CDM
18 |
19 | Cognitive diagnosis model (CDM) for intelligent educational systems is a type of model that infers students' knowledge states from their learning behaviors (especially exercise response logs).
20 |
21 |
22 |
23 | Typically, the input of a CDM consists of the students' response logs on items (i.e., exercises/questions) and the Q-matrix that denotes the correlation between items and knowledge concepts (skills). The output is the diagnosed student knowledge states, such as students' abilities and their proficiency on each knowledge concept.
24 |
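For illustration, the toy snippet below shows the input format used by most examples in this repository: a binary Q-matrix plus a list of response-log records (the exact loading code varies by model; see the `examples/` directory).

```python
import numpy as np

# Toy Q-matrix: 3 items x 2 knowledge concepts (1 = the item involves that concept)
q_m = np.array([[1, 0],
                [0, 1],
                [1, 1]])

# Toy response logs: each record is one student's score on one item
train_set = [
    {'user_id': 0, 'item_id': 0, 'score': 1.0},
    {'user_id': 0, 'item_id': 2, 'score': 0.0},
    {'user_id': 1, 'item_id': 1, 'score': 1.0},
]
```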
25 |
26 |
27 | Traditional CDMs include:
28 |
29 | - [IRT](https://link.springer.com/book/10.1007/978-0-387-89976-3): item response theory, a continuous unidimensional CDM with logistic-like item response function.
30 | - [MIRT](https://link.springer.com/book/10.1007/978-0-387-89976-3): Multidimensional item response theory, a continuous multidimensional CDM with logistic-like item response function. Mostly extended from unidimensional IRT.
31 | - [DINA](https://journals.sagepub.com/doi/10.3102/1076998607309474): deterministic input, noisy "and" model, a discrete multidimensional CDM. Q-matrix is used to model the effect of knowledge concepts in the cognitive process, as well as guessing and slipping factors.
32 |
33 | etc.
34 |
35 | More recent research on CDMs includes:
36 |
37 | - [FuzzyCDF](http://staff.ustc.edu.cn/~qiliuql/files/Publications/Qi-Liu-TIST2018.pdf): fuzzy cognitive diagnosis framework, a continuous multidimensional CDM for students' cognitive modeling with both objective and subjective items.
38 | - [NeuralCD](http://staff.ustc.edu.cn/~cheneh/paper_pdf/2020/Fei-Wang-AAAI.pdf): neural cognitive diagnosis framework, a neural-network-based general cognitive diagnosis framework. In this repository we provide the basic implementation, NCDM.
39 | - [IRR](http://home.ustc.edu.cn/~tongsw/files/IRR.pdf): item response ranking framework, a pairwise cognitive diagnosis framework. In this repository we provide IRR implementations for most of the CDMs.
40 | - [ICD]: Incremental Cognitive Diagnosis, a framework that tailors cognitive diagnosis to the online scenario of intelligent education. In this repository we provide ICD implementations for most of the CDMs.
41 | - [KaNCD](https://ieeexplore.ieee.org/abstract/document/9865139): extended from the NeuralCD framework. We use high-order latent traits of students, exercises and knowledge concepts to capture latent associations among knowledge concepts.
42 |
43 | ## List of models
44 |
45 | * [NCDM](EduCDM/NCDM) [[doc]](docs/NCDM.md) [[example]](examples/NCDM)
46 | * [FuzzyCDF](EduCDM/FuzzyCDF) [[doc]](docs/FuzzyCDF.md) [[example]](examples/FuzzyCDF)
47 | * [DINA](EduCDM/DINA) [[doc]](docs/DINA.md) [[example]](examples/DINA)
48 | * Expectation Maximization ([EMDINA](EduCDM/DINA/EM)) [[example]](examples/DINA/EM)
49 | * Gradient Descent ([GDDINA](EduCDM/DINA/GD)) [[example]](examples/DINA/GD)
50 | * [MIRT](EduCDM/MIRT) [[doc]](docs/MIRT.md) [[example]](examples/MIRT)
51 | * [IRT](EduCDM/IRT) [[doc]](docs/IRT.md) [[example]](examples/IRT)
52 | * Expectation Maximization ([EMIRT](EduCDM/IRT/EM)) [[example]](examples/IRT/EM)
53 | * Gradient Descent ([GDIRT](EduCDM/IRT/GD)) [[example]](examples/IRT/GD)
54 | * [MCD](EduCDM/MCD) [[doc]](docs/MCD.md) [[example]](examples/MCD)
55 | * [IRR](EduCDM/IRR) [[doc]](docs/IRR.md)[[example]](examples/IRR)
56 | * [IRR-NCDM](examples/IRR/NCDM.ipynb)
57 | * [IRR-MIRT](examples/IRR/MIRT.ipynb)
58 | * [IRR-DINA](examples/IRR/DINA.ipynb)
59 | * [IRR-IRT](examples/IRR/IRT.ipynb)
60 | * [ICD](EduCDM/ICD) [[doc]](docs/ICD.md)
61 | * [KaNCD](EduCDM/KaNCD) [[doc]](docs/KaNCD.md) [[example]](examples/KaNCD)
62 | ## Installation
63 |
64 | Clone the repository and install with `pip`:
65 |
66 | ```
67 | git clone https://github.com/bigdata-ustc/EduCDM.git
68 | cd EduCDM
69 | pip install .
70 | ```
71 |
72 | Or directly install from pypi:
73 |
74 | ```
75 | pip install EduCDM
76 | ```
77 |
78 |
79 | ## Contribute
80 |
81 | EduCDM is still under development. More algorithms and features are going to be added and we always welcome contributions to help make EduCDM better. If you would like to contribute, please follow this [guideline](CONTRIBUTE.md).
82 |
83 | ## Citation
84 |
85 | If this repository is helpful for you, please cite our work
86 |
87 | ```
88 | @misc{bigdata2021educdm,
89 | title={EduCDM},
90 | author={bigdata-ustc},
91 | publisher = {GitHub},
92 | journal = {GitHub repository},
93 | year = {2021},
94 | howpublished = {\url{https://github.com/bigdata-ustc/EduCDM}},
95 | }
96 | ```
97 |
98 | ## Reference
99 |
100 | [1] Liu Q, Wu R, Chen E, et al. Fuzzy cognitive diagnosis for modelling examinee performance[J]. ACM Transactions on Intelligent Systems and Technology (TIST), 2018, 9(4): 1-26.
101 |
102 | [2] Wang F, Liu Q, Chen E, et al. Neural cognitive diagnosis for intelligent education systems[C]//Proceedings of the AAAI Conference on Artificial Intelligence. 2020, 34(04): 6153-6161.
103 |
104 | [3] Tong S, Liu Q, Yu R, et al. Item response ranking for cognitive diagnosis[C]. IJCAI, 2021.
105 |
106 | [4] Wang F, Liu Q, Chen E, et al. NeuralCD: A General Framework for Cognitive Diagnosis. IEEE Transactions on Knowledge and Data Engineering (IEEE TKDE), accepted, 2022.
--------------------------------------------------------------------------------
/docs/DINA.md:
--------------------------------------------------------------------------------
1 | # Deterministic Inputs, Noisy “And” gate model
2 |
3 | If the reader wants to know the details of DINA, please refer to the Appendix of the paper: *[DINA model and parameter estimation: A didactic](https://journals.sagepub.com/doi/10.3102/1076998607309474)*.
4 | ```bibtex
5 | @article{de2009dina,
6 | title={DINA model and parameter estimation: A didactic},
7 | author={De La Torre, Jimmy},
8 | journal={Journal of educational and behavioral statistics},
9 | volume={34},
10 | number={1},
11 | pages={115--130},
12 | year={2009},
13 | publisher={Sage Publications Sage CA: Los Angeles, CA}
14 | }
15 | ```
16 |
17 | 
18 |
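For quick reference, here is a minimal NumPy sketch of the DINA item response function described in the paper (the parameter names `guess` and `slip` are illustrative, not this package's API):

```python
import numpy as np

def dina_prob(alpha, q, guess, slip):
    """P(right response) = (1 - slip)^eta * guess^(1 - eta), where eta = 1
    iff the student masters every concept the item requires (per the Q-matrix)."""
    eta = np.all(alpha >= q, axis=-1).astype(float)  # ideal response
    return (1 - slip) ** eta * guess ** (1 - eta)

# A student mastering only concept 0, on an item requiring concepts 0 and 1:
print(dina_prob(np.array([1, 0]), np.array([1, 1]), guess=0.2, slip=0.1))  # -> 0.2
```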
--------------------------------------------------------------------------------
/docs/FuzzyCDF.md:
--------------------------------------------------------------------------------
1 | # Fuzzy cognitive diagnosis framework
2 |
3 | If the reader wants to know the details of FuzzyCDF, please refer to the Chapter 4 of the paper: *[Fuzzy Cognitive Diagnosis for Modelling Examinee Performance](http://staff.ustc.edu.cn/~qiliuql/files/Publications/Qi-Liu-TIST2018.pdf)*.
4 | ```bibtex
5 | @article{liu2018fuzzy,
6 | title={Fuzzy cognitive diagnosis for modelling examinee performance},
7 | author={Liu, Qi and Wu, Runze and Chen, Enhong and Xu, Guandong and Su, Yu and Chen, Zhigang and Hu, Guoping},
8 | journal={ACM Transactions on Intelligent Systems and Technology (TIST)},
9 | volume={9},
10 | number={4},
11 | pages={1--26},
12 | year={2018},
13 | publisher={ACM New York, NY, USA}
14 | }
15 | ```
16 |
17 | 
18 |
--------------------------------------------------------------------------------
/docs/ICD.md:
--------------------------------------------------------------------------------
1 | # ICD: Incremental Cognitive Diagnosis for Intelligent Education
2 | This is our implementation for the paper:
3 |
4 | Shiwei Tong, Jiayu Liu, Yuting Hong, Zhenya Huang, Le Wu, Qi Liu, Wei Huang, Enhong Chen, Dan Zhang. Incremental Cognitive Diagnosis for Intelligent Education. The 28th ACM SIGKDD International Conference on Knowledge Discovery & Data Mining (KDD'2022).
5 |
6 | Please cite our KDD'2022 paper if you use our code. Thanks!
7 |
8 | Author: Shiwei Tong
9 |
10 | Email: tongsw@mail.ustc.edu.cn
11 |
12 |
13 |
14 | ## Example to run the codes.
15 | The following instructions take the a0910 dataset as an example.
16 |
17 | Go to the code directory:
18 | ```
19 | cd EduCDM/EduCDM/ICD/ICD
20 | ```
21 | Replace `path_prefix` with your `project_url` in `ICD/constant.py`.
22 |
23 |
24 | Run the incremental method:
25 | ```
26 | python examples/ICD/ICD.py
27 | ```
28 |
29 | ## Citation
30 | ```bibtex
31 | @inproceedings{tong2022incremental,
32 | title={Incremental Cognitive Diagnosis for Intelligent Education},
33 | author={Tong, Shiwei and Liu, Jiayu and Hong, Yuting and Huang, Zhenya and Wu, Le and Liu, Qi and Huang, Wei and Chen, Enhong and Zhang, Dan},
34 | booktitle={Proceedings of the 28th ACM SIGKDD Conference on Knowledge Discovery and Data Mining},
35 | pages={1760--1770},
36 | year={2022}
37 | }
38 | ```
--------------------------------------------------------------------------------
/docs/IRR.md:
--------------------------------------------------------------------------------
1 | # Item Response Ranking for Cognitive Diagnosis
2 | [[Paper]](https://www.ijcai.org/proceedings/2021/0241.pdf)
3 | [[Poster]](http://home.ustc.edu.cn/~tongsw/files/IRR_Poster.pdf)
4 | [[Slide]](http://home.ustc.edu.cn/~tongsw/files/IRR_Slide.pdf)
5 | [[Code]](../EduCDM/IRR)
6 |
7 |
8 | Cognitive diagnosis, a fundamental task in the education area, aims at providing an approach to reveal the proficiency level of students on knowledge concepts.
9 | Actually, **monotonicity is one of the basic conditions in cognitive diagnosis theory**, which assumes that **a student's proficiency is monotonic with the probability of giving the right response to a test item**.
10 | However, few previous methods consider monotonicity during optimization.
11 | To this end, we propose the Item Response Ranking framework (IRR), aiming at introducing pairwise learning into cognitive diagnosis to better model the monotonicity between item responses.
12 | Specifically, we first use an item-specific sampling method to sample item responses and construct response pairs based on their partial order, where we propose a two-branch sampling method to handle the unobserved responses (see Figure 2).
13 | After that, we use a pairwise objective function to exploit the monotonicity in the pair formulation.
14 | In fact, IRR is a general framework which can be applied to most contemporary cognitive diagnosis models.
15 |
16 | We provide some examples for better illustration:
17 |
18 | * [IRR-IRT](../examples/IRR/IRT.ipynb)
19 | * [IRR-MIRT](../examples/IRR/MIRT.ipynb)
20 | * [IRR-DINA](../examples/IRR/DINA.ipynb)
21 | * [IRR-NCDM](../examples/IRR/NCDM.ipynb)
22 |
23 | 
24 |
25 | In the following parts, we briefly introduce the basic lemma, `pairwise monotonicity`, and the training procedure.
26 |
27 | ## Pairwise Monotonicity
28 |
29 | In the literature, the monotonicity theory assumes that a student's proficiency is monotonic with the probability of giving the right response to a test item.
30 | We rewrite it from a pairwise perspective: a more skilled student should have a higher probability of giving the right response to a test item than an unskilled one. Formally, we have the following pairwise monotonicity:
31 |
32 | ### Pairwise Monotonicity
33 |
34 | _Given a specific test item, the students with right responses are more skilled than those with wrong responses._
35 |
36 | ## Learning Model with IRR
37 |
38 | We first design an item-specific pair sampling method to resolve the potential non-overlap problem, i.e., we sample responses from different students to the same item so that the related knowledge concepts stay the same.
39 | Then, to handle the unobserved responses along with the observed ones, we conduct a two-branch sampling method, i.e., positive sampling and negative sampling.
40 | After that, based on the sampled pairs, we introduce pairwise learning to model the partial order among response pairs, where we use a pairwise objective function to better optimize the monotonicity.
41 |
42 | The objective function of IRR is:
43 |
44 | $$
45 | \min_{\Theta} - \ln \mathit{IRR} + \lambda(\Theta),
46 | $$
47 | where $\lambda(\Theta)$ is the regularization term and $\lambda$ is a hyper-parameter. We can apply IRR to any fully differentiable CDM (e.g., MIRT) and train it with Stochastic Gradient Descent, as sketched below.
48 |
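To make the formulation concrete, here is a minimal, hypothetical PyTorch sketch of a pairwise objective of this kind; `p_pos` and `p_neg` stand for a CDM's predicted probabilities of a right response on the more-skilled and less-skilled side of each sampled pair (the names and the exact loss form are illustrative, not this package's API):

```python
import torch
import torch.nn.functional as F

def irr_style_loss(p_pos, p_neg, params, reg_weight=1e-4):
    # Pairwise term: encourage p_pos > p_neg via -ln(sigmoid(p_pos - p_neg))
    rank_loss = -F.logsigmoid(p_pos - p_neg).mean()
    # L2 regularization over the model parameters (the lambda(Theta) term)
    reg = sum((p ** 2).sum() for p in params)
    return rank_loss + reg_weight * reg
```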
49 | For more details, please refer to our paper.
50 |
51 | ## Citation
52 |
53 | ```
54 | @inproceedings{tong2021item,
55 | title={Item response ranking for cognitive diagnosis},
56 | author={Tong, Shiwei and Liu, Qi and Yu, Runlong and Huang, Wei and Huang, Zhenya and Pardos, Zachary and Jiang, Weijie},
57 | year={2021},
58 | organization={IJCAI}
59 | }
60 | ```
61 |
--------------------------------------------------------------------------------
/docs/IRT.md:
--------------------------------------------------------------------------------
1 | # Item response theory
2 |
3 | If the reader wants to know the details of EMIRT, please refer to the paper: *[Estimation of Item Response Models Using the EM Algorithm for Finite Mixtures](https://files.eric.ed.gov/fulltext/ED405356.pdf)*.
4 | ```bibtex
5 | @article{woodruff1996estimation,
6 | title={Estimation of Item Response Models Using the EM Algorithm for Finite Mixtures.},
7 | author={Woodruff, David J and Hanson, Bradley A},
8 | year={1996},
9 | publisher={ERIC}
10 | }
11 | ```
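
For background, here is a minimal NumPy sketch of the two-parameter logistic (2PL) item response function that IRT-style models build on (the standard textbook form; the exact parameterization used in this package may differ):

```python
import numpy as np

def irt_2pl(theta, a, b):
    """Probability of a right response given ability theta,
    discrimination a and difficulty b."""
    return 1.0 / (1.0 + np.exp(-a * (theta - b)))

print(irt_2pl(theta=0.5, a=1.2, b=0.0))  # ability above difficulty -> p > 0.5
```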
--------------------------------------------------------------------------------
/docs/KaNCD.md:
--------------------------------------------------------------------------------
1 | # KaNCD
2 |
3 | The implementation of the KaNCD model in paper: [NeuralCD: A General Framework for Cognitive Diagnosis](https://ieeexplore.ieee.org/abstract/document/9865139)
4 |
5 | KaNCD is a **K**nowledge-**a**ssociation based extension of the **N**eural**CD**M model (alias NCDM in this package). In KaNCD, students, exercises and knowledge concepts are each represented by high-order, low-dimensional latent traits.
6 |
7 | The knowledge difficulty vector of an exercise is calculated from the latent trait of the exercise and the latent trait of each knowledge concept.
8 |
9 | 
10 |
11 | Similarly, the knowledge proficiency vector of a student is calculated from the latent trait of the student and the latent trait of each knowledge concept.
12 |
13 | 
14 |
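As a rough illustration of these computations, the sketch below assumes an inner-product (matrix-factorization style) interaction between the latent traits; all names and dimensions are illustrative, not this package's API:

```python
import torch

emb_dim, know_n = 20, 123
exer_trait = torch.randn(emb_dim)           # latent trait of one exercise
stu_trait = torch.randn(emb_dim)            # latent trait of one student
know_traits = torch.randn(know_n, emb_dim)  # one latent trait per knowledge concept

# Each entry combines the exercise (or student) trait with one concept trait.
k_difficulty = torch.sigmoid(know_traits @ exer_trait)  # knowledge difficulty vector
proficiency = torch.sigmoid(know_traits @ stu_trait)    # knowledge proficiency vector
```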
15 | Please refer to the paper for more details.
--------------------------------------------------------------------------------
/docs/MCD.md:
--------------------------------------------------------------------------------
1 | # Matrix-factorization-based Cognitive Diagnosis model
2 |
3 | 
4 |
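The figure above summarizes the approach. As a rough sketch (assuming a plain matrix-factorization formulation, not this package's exact API), such a model predicts the probability of a right response from the inner product of student and item embeddings:

```python
import torch
from torch import nn

class MFCD(nn.Module):
    """Illustrative matrix-factorization cognitive diagnosis model."""
    def __init__(self, user_num, item_num, latent_dim=20):
        super().__init__()
        self.user_emb = nn.Embedding(user_num, latent_dim)
        self.item_emb = nn.Embedding(item_num, latent_dim)

    def forward(self, user_id, item_id):
        # Inner product of the latent vectors, squashed to a probability
        logit = (self.user_emb(user_id) * self.item_emb(item_id)).sum(dim=-1)
        return torch.sigmoid(logit)
```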
--------------------------------------------------------------------------------
/docs/MIRT.md:
--------------------------------------------------------------------------------
1 | # Multidimensional Item Response Theory
2 |
3 | If the reader wants to know the details of MIRT, please refer to the paper: *[Multidimensional item response theory models](http://ndl.ethernet.edu.et/bitstream/123456789/60415/1/116.pdf)*
4 |
5 | ```
6 | @incollection{reckase2009multidimensional,
7 | title={Multidimensional item response theory models},
8 | author={Reckase, Mark D},
9 | booktitle={Multidimensional item response theory},
10 | pages={79--112},
11 | year={2009},
12 | publisher={Springer}
13 | }
14 | ```
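
For background, here is a minimal NumPy sketch of the compensatory multidimensional logistic item response function (the standard form from the MIRT literature; the parameterization used in this package may differ):

```python
import numpy as np

def mirt_prob(theta, a, d):
    """P(right response) = sigmoid(a . theta + d) for multidimensional ability theta."""
    return 1.0 / (1.0 + np.exp(-(np.dot(a, theta) + d)))

print(mirt_prob(theta=np.array([0.3, -0.1]), a=np.array([1.0, 0.8]), d=0.2))
```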
--------------------------------------------------------------------------------
/docs/NCDM.md:
--------------------------------------------------------------------------------
1 | # Neural Cognitive Diagnosis Model
2 |
3 | The implementation of the NeuralCDM model in paper: *[Neural Cognitive Diagnosis for Intelligent Education Systems](http://staff.ustc.edu.cn/~qiliuql/files/Publications/Fei-Wang-AAAI2020.pdf)*.
4 |
5 | 
--------------------------------------------------------------------------------
/docs/_static/DINA.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bigdata-ustc/EduCDM/c610e6ae7f45cfe8f1106a1c342d4ef5a357472e/docs/_static/DINA.png
--------------------------------------------------------------------------------
/docs/_static/EduCDM.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bigdata-ustc/EduCDM/c610e6ae7f45cfe8f1106a1c342d4ef5a357472e/docs/_static/EduCDM.png
--------------------------------------------------------------------------------
/docs/_static/FuzzyCDF.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bigdata-ustc/EduCDM/c610e6ae7f45cfe8f1106a1c342d4ef5a357472e/docs/_static/FuzzyCDF.png
--------------------------------------------------------------------------------
/docs/_static/IRR.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bigdata-ustc/EduCDM/c610e6ae7f45cfe8f1106a1c342d4ef5a357472e/docs/_static/IRR.png
--------------------------------------------------------------------------------
/docs/_static/KDM_MF.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bigdata-ustc/EduCDM/c610e6ae7f45cfe8f1106a1c342d4ef5a357472e/docs/_static/KDM_MF.png
--------------------------------------------------------------------------------
/docs/_static/KPM_MF.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bigdata-ustc/EduCDM/c610e6ae7f45cfe8f1106a1c342d4ef5a357472e/docs/_static/KPM_MF.png
--------------------------------------------------------------------------------
/docs/_static/MCD.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bigdata-ustc/EduCDM/c610e6ae7f45cfe8f1106a1c342d4ef5a357472e/docs/_static/MCD.png
--------------------------------------------------------------------------------
/docs/_static/NeuralCDM.JPG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bigdata-ustc/EduCDM/c610e6ae7f45cfe8f1106a1c342d4ef5a357472e/docs/_static/NeuralCDM.JPG
--------------------------------------------------------------------------------
/examples/DINA/EM/DINA.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | # 2021/3/28 @ liujiayu
3 | import logging
4 | import numpy as np
5 | import json
6 | from EduCDM import EMDINA as DINA
7 |
8 | q_m = np.loadtxt("../../../data/math2015/Math1/q_m.csv", dtype=int, delimiter=',')
9 | prob_num, know_num = q_m.shape[0], q_m.shape[1]
10 |
11 | # training data
12 | with open("../../../data/math2015/Math1/train_data.json", encoding='utf-8') as file:
13 | train_set = json.load(file)
14 | stu_num = max([x['user_id'] for x in train_set]) + 1
15 | R = -1 * np.ones(shape=(stu_num, prob_num))
16 | for log in train_set:
17 | R[log['user_id'], log['item_id']] = log['score']
18 |
19 | # testing data
20 | with open("../../../data/math2015/Math1/test_data.json", encoding='utf-8') as file:
21 | test_set = json.load(file)
22 |
23 | logging.getLogger().setLevel(logging.INFO)
24 |
25 | cdm = DINA(R, q_m, stu_num, prob_num, know_num, skip_value=-1)
26 |
27 | cdm.train(epoch=2, epsilon=1e-3)
28 | cdm.save("dina.params")
29 |
30 | cdm.load("dina.params")
31 | rmse, mae = cdm.eval(test_set)
32 | print("RMSE: %.6f, MAE: %.6f" % (rmse, mae))
33 |
34 | # ---incremental training
35 | new_data = [{'user_id': 0, 'item_id': 0, 'score': 1.0}, {'user_id': 1, 'item_id': 2, 'score': 0.0}]
36 | cdm.inc_train(new_data, epoch=2, epsilon=1e-3)
37 |
38 | # ---evaluate user's state
39 | stu_rec = np.array([0, 1, -1, 0, -1, 0, 1, 1, 0, 1, 0, 1, 0, -1, -1, -1, -1, 0, 1, -1])
40 | dia_id, dia_state = cdm.transform(stu_rec)
41 | print("id of user's state is %d, state is " % dia_id + str(dia_state))
42 |
--------------------------------------------------------------------------------
/examples/DINA/EM/prepare_dataset.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "# Download the Cognitive Diagnosis Benchmark Datasets (CDBD)\n",
10 | "from EduData import get_data\n",
11 | "\n",
12 | "get_data(\"math2015\", \"../../../data\")"
13 | ]
14 | },
15 | {
16 | "cell_type": "code",
17 | "execution_count": 1,
18 | "metadata": {},
19 | "outputs": [
20 | {
21 | "name": "stdout",
22 | "output_type": "stream",
23 | "text": [
24 | "{'user_id': 0, 'item_id': 5, 'score': 1.0} {'user_id': 0, 'item_id': 8, 'score': 1.0}\n"
25 | ]
26 | }
27 | ],
28 | "source": [
29 | "# Data preprocessing, split train/valid/test data\n",
30 | "\n",
31 | "import numpy as np\n",
32 | "import random\n",
33 | "import json\n",
34 | "\n",
35 | "train_ratio = 0.8\n",
36 | "valid_ratio = 0\n",
37 | "# Q matrix\n",
38 | "np.savetxt(\"../../../data/math2015/Math1/q_m.csv\", np.loadtxt(\"../../../data/math2015/Math1/q.txt\", dtype=int), delimiter=',', fmt='%d')\n",
39 | "\n",
40 | "# response matrix, split dataset\n",
41 | "R = (np.loadtxt(\"../../../data/math2015/Math1/data.txt\") == 1).astype(float)\n",
42 | "stu_num, prob_num = R.shape[0], R.shape[1]\n",
43 | "train_logs, valid_logs, test_logs = [], [], []\n",
44 | "for stu in range(stu_num):\n",
45 | " stu_logs = []\n",
46 | " for prob in range(prob_num):\n",
47 | " log = {'user_id': int(stu), 'item_id': int(prob), 'score': R[stu][prob]}\n",
48 | " stu_logs.append(log)\n",
49 | " random.shuffle(stu_logs)\n",
50 | " train_logs += stu_logs[: int(train_ratio * prob_num)]\n",
51 | " valid_logs += stu_logs[int(train_ratio * prob_num): int(train_ratio * prob_num) + int(valid_ratio * prob_num)]\n",
52 | " test_logs += stu_logs[int(train_ratio * prob_num) + int(valid_ratio * prob_num):]\n",
53 | "\n",
54 | "with open(\"../../../data/math2015/Math1/train_data.json\", 'w', encoding='utf8') as file:\n",
55 | " json.dump(train_logs, file, indent=4, ensure_ascii=False)\n",
56 | "with open(\"../../../data/math2015/Math1/valid_data.json\", 'w', encoding='utf8') as file:\n",
57 | " json.dump(valid_logs, file, indent=4, ensure_ascii=False)\n",
58 | "with open(\"../../../data/math2015/Math1/test_data.json\", 'w', encoding='utf8') as file:\n",
59 | " json.dump(test_logs, file, indent=4, ensure_ascii=False)\n",
60 | "\n",
61 | "print(train_logs[0], test_logs[0])"
62 | ]
63 | },
64 | {
65 | "cell_type": "code",
66 | "execution_count": null,
67 | "metadata": {},
68 | "outputs": [],
69 | "source": []
70 | }
71 | ],
72 | "metadata": {
73 | "kernelspec": {
74 | "display_name": "Python 3",
75 | "language": "python",
76 | "name": "python3"
77 | },
78 | "language_info": {
79 | "codemirror_mode": {
80 | "name": "ipython",
81 | "version": 3
82 | },
83 | "file_extension": ".py",
84 | "mimetype": "text/x-python",
85 | "name": "python",
86 | "nbconvert_exporter": "python",
87 | "pygments_lexer": "ipython3",
88 | "version": "3.7.3"
89 | }
90 | },
91 | "nbformat": 4,
92 | "nbformat_minor": 1
93 | }
94 |
--------------------------------------------------------------------------------
/examples/DINA/GD/DINA.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | # 2021/3/23 @ tongshiwei
3 | import logging
4 | from EduCDM import GDDINA
5 | import torch
6 | from torch.utils.data import TensorDataset, DataLoader
7 | import pandas as pd
8 |
9 | train_data = pd.read_csv("../../../data/a0910/train.csv")
10 | valid_data = pd.read_csv("../../../data/a0910/valid.csv")
11 | test_data = pd.read_csv("../../../data/a0910/test.csv")
12 | item_data = pd.read_csv("../../../data/a0910/item.csv")
13 |
14 | knowledge_num = 123
15 |
16 |
17 | def code2vector(x):
18 | vector = [0] * knowledge_num
19 | for k in eval(x):
20 | vector[k - 1] = 1
21 | return vector
22 |
23 |
24 | item_data["knowledge"] = item_data["knowledge_code"].apply(code2vector)
25 | item_data.drop(columns=["knowledge_code"], inplace=True)
26 |
27 | train_data = pd.merge(train_data, item_data, on="item_id")
28 | valid_data = pd.merge(valid_data, item_data, on="item_id")
29 | test_data = pd.merge(test_data, item_data, on="item_id")
30 |
31 | batch_size = 32
32 |
33 |
34 | def transform(x, y, z, k, batch_size, **params):
35 | dataset = TensorDataset(
36 | torch.tensor(x, dtype=torch.int64),
37 | torch.tensor(y, dtype=torch.int64),
38 | torch.tensor(k, dtype=torch.float32),
39 | torch.tensor(z, dtype=torch.float32)
40 | )
41 | return DataLoader(dataset, batch_size=batch_size, **params)
42 |
43 |
44 | train, valid, test = [
45 | transform(data["user_id"], data["item_id"], data["score"], data["knowledge"], batch_size)
46 | for data in [train_data, valid_data, test_data]
47 | ]
48 |
49 | logging.getLogger().setLevel(logging.INFO)
50 |
51 | cdm = GDDINA(4164, 17747, knowledge_num)
52 |
53 | cdm.train(train, valid, epoch=2)
54 | cdm.save("dina.params")
55 |
56 | cdm.load("dina.params")
57 | auc, accuracy = cdm.eval(test)
58 | print("auc: %.6f, accuracy: %.6f" % (auc, accuracy))
59 |
--------------------------------------------------------------------------------
/examples/DINA/GD/prepare_dataset.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 2,
6 | "outputs": [
7 | {
8 | "name": "stderr",
9 | "output_type": "stream",
10 | "text": [
11 | "downloader, INFO http://base.ustc.edu.cn/data/cdbd/a0910/item.csv is saved as ..\\..\\..\\data\\a0910\\item.csv\n",
12 | "downloader, INFO http://base.ustc.edu.cn/data/cdbd/a0910/readme.txt is saved as ..\\..\\..\\data\\a0910\\readme.txt\n",
13 | "downloader, INFO http://base.ustc.edu.cn/data/cdbd/a0910/test.csv is saved as ..\\..\\..\\data\\a0910\\test.csv\n",
14 | "downloader, INFO http://base.ustc.edu.cn/data/cdbd/a0910/train.csv is saved as ..\\..\\..\\data\\a0910\\train.csv\n",
15 | "downloader, INFO http://base.ustc.edu.cn/data/cdbd/a0910/valid.csv is saved as ..\\..\\..\\data\\a0910\\valid.csv\n"
16 | ]
17 | },
18 | {
19 | "name": "stdout",
20 | "output_type": "stream",
21 | "text": [
22 | "Downloading ..\\..\\..\\data\\a0910\\item.csv 100.00%: 258118 | 258118\n",
23 | "Downloading ..\\..\\..\\data\\a0910\\readme.txt 100.00%: 86 | 86\n",
24 | "Downloading ..\\..\\..\\data\\a0910\\test.csv 100.00%: 810767 | 810767\n",
25 | "Downloading ..\\..\\..\\data\\a0910\\train.csv 100.00%: 2329161 | 2329161\n",
26 | "Downloading ..\\..\\..\\data\\a0910\\valid.csv 100.00%: 371493 | 371493\n"
27 | ]
28 | },
29 | {
30 | "data": {
31 | "text/plain": "'../../../data'"
32 | },
33 | "execution_count": 2,
34 | "metadata": {},
35 | "output_type": "execute_result"
36 | }
37 | ],
38 | "source": [
39 | "# Download the Cognitive Diagnosis Benchmark Datasets (CDBD)\n",
40 | "from EduData import get_data\n",
41 | "\n",
42 | "get_data(\"cdbd-a0910\", \"../../../data\")\n"
43 | ],
44 | "metadata": {
45 | "collapsed": false,
46 | "pycharm": {
47 | "name": "#%%\n"
48 | }
49 | }
50 | }
51 | ],
52 | "metadata": {
53 | "kernelspec": {
54 | "display_name": "Python 3",
55 | "language": "python",
56 | "name": "python3"
57 | },
58 | "language_info": {
59 | "codemirror_mode": {
60 | "name": "ipython",
61 | "version": 2
62 | },
63 | "file_extension": ".py",
64 | "mimetype": "text/x-python",
65 | "name": "python",
66 | "nbconvert_exporter": "python",
67 | "pygments_lexer": "ipython2",
68 | "version": "2.7.6"
69 | }
70 | },
71 | "nbformat": 4,
72 | "nbformat_minor": 0
73 | }
--------------------------------------------------------------------------------
/examples/FuzzyCDF/FuzzyCDF.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {
6 | "collapsed": true,
7 | "pycharm": {
8 | "name": "#%% md\n"
9 | }
10 | },
11 | "source": [
12 | "# Fuzzy cognitive diagnosis framework (FuzzyCDF)\n",
13 | "\n",
14 | "This notebook will show you how to train and use the FuzzyCDF.\n",
15 | "First, we will show how to get the data (here we use Math1 from math2015 as the dataset).\n",
16 | "Then we will show how to train a FuzzyCDF and perform the parameters persistence.\n",
17 | "At last, we will show how to load the parameters from the file and evaluate on the test dataset.\n",
18 | "\n",
19 | "The script version could be found in [FuzzyCDF.py](FuzzyCDF.ipynb)"
20 | ]
21 | },
22 | {
23 | "cell_type": "markdown",
24 | "metadata": {},
25 | "source": [
26 | "## Data Preparation\n",
27 | "\n",
28 | "Before we process the data, we need to first acquire the dataset which is shown in [prepare_dataset.ipynb](prepare_dataset.ipynb)"
29 | ]
30 | },
31 | {
32 | "cell_type": "code",
33 | "execution_count": 1,
34 | "metadata": {},
35 | "outputs": [],
36 | "source": [
37 | "# Load the data from files\n",
38 | "import numpy as np\n",
39 | "import json\n",
40 | "\n",
41 | "# type of problems\n",
42 | "obj_prob_index = np.loadtxt(\"../../data/math2015/Math1/obj_prob_index.csv\", delimiter=',', dtype=int)\n",
43 | "sub_prob_index = np.loadtxt(\"../../data/math2015/Math1/sub_prob_index.csv\", delimiter=',', dtype=int)\n",
44 | "# Q matrix\n",
45 | "q_m = np.loadtxt(\"../../data/math2015/Math1/q_m.csv\", dtype=int, delimiter=',')\n",
46 | "prob_num, know_num = q_m.shape[0], q_m.shape[1]\n",
47 | "\n",
48 | "# training data\n",
49 | "with open(\"../../data/math2015/Math1/train_data.json\", encoding='utf-8') as file:\n",
50 | " train_set = json.load(file)\n",
51 | "stu_num = max([x['user_id'] for x in train_set]) + 1\n",
52 | "R = -1 * np.ones(shape=(stu_num, prob_num))\n",
53 | "for log in train_set:\n",
54 | " R[log['user_id'], log['item_id']] = log['score']\n",
55 | "\n",
56 | "# testing data\n",
57 | "with open(\"../../data/math2015/Math1/test_data.json\", encoding='utf-8') as file:\n",
58 | " test_set = json.load(file)"
59 | ]
60 | },
61 | {
62 | "cell_type": "code",
63 | "execution_count": 2,
64 | "metadata": {},
65 | "outputs": [
66 | {
67 | "name": "stdout",
68 | "output_type": "stream",
69 | "text": [
70 | "{'user_id': 0, 'item_id': 7, 'score': 1.0} {'user_id': 0, 'item_id': 9, 'score': 1.0}\n"
71 | ]
72 | }
73 | ],
74 | "source": [
75 | "print(train_set[0], test_set[0])"
76 | ]
77 | },
78 | {
79 | "cell_type": "code",
80 | "execution_count": 3,
81 | "metadata": {
82 | "pycharm": {
83 | "name": "#%%\n"
84 | }
85 | },
86 | "outputs": [
87 | {
88 | "data": {
89 | "text/plain": [
90 | "(67344, 16836)"
91 | ]
92 | },
93 | "execution_count": 3,
94 | "metadata": {},
95 | "output_type": "execute_result"
96 | }
97 | ],
98 | "source": [
99 | "len(train_set), len(test_set)"
100 | ]
101 | },
102 | {
103 | "cell_type": "markdown",
104 | "metadata": {
105 | "pycharm": {
106 | "name": "#%% md\n"
107 | }
108 | },
109 | "source": [
110 | "## Training and Persistence"
111 | ]
112 | },
113 | {
114 | "cell_type": "code",
115 | "execution_count": 4,
116 | "metadata": {
117 | "pycharm": {
118 | "name": "#%%\n"
119 | }
120 | },
121 | "outputs": [],
122 | "source": [
123 | "import logging\n",
124 | "logging.getLogger().setLevel(logging.INFO)"
125 | ]
126 | },
127 | {
128 | "cell_type": "code",
129 | "execution_count": 6,
130 | "metadata": {
131 | "pycharm": {
132 | "name": "#%%\n"
133 | }
134 | },
135 | "outputs": [
136 | {
137 | "name": "stderr",
138 | "output_type": "stream",
139 | "text": [
140 | "INFO:root:save parameters to fuzzycdf.params\n"
141 | ]
142 | }
143 | ],
144 | "source": [
145 | "from EduCDM import FuzzyCDF\n",
146 | "\n",
147 | "cdm = FuzzyCDF(R, q_m, stu_num, prob_num, know_num, obj_prob_index, sub_prob_index, skip_value=-1)\n",
148 | "\n",
149 | "cdm.train(epoch=10, burnin=5)\n",
150 | "cdm.save(\"fuzzycdf.params\")"
151 | ]
152 | },
153 | {
154 | "cell_type": "markdown",
155 | "metadata": {
156 | "pycharm": {
157 | "name": "#%% md\n"
158 | }
159 | },
160 | "source": [
161 | "## Loading and Testing"
162 | ]
163 | },
164 | {
165 | "cell_type": "code",
166 | "execution_count": 7,
167 | "metadata": {
168 | "pycharm": {
169 | "name": "#%%\n"
170 | }
171 | },
172 | "outputs": [
173 | {
174 | "name": "stderr",
175 | "output_type": "stream",
176 | "text": [
177 | "INFO:root:load parameters from fuzzycdf.params\n",
178 | "evaluating: 100%|█████████████████████████████████████████████████████████████| 16836/16836 [00:00<00:00, 91552.55it/s]"
179 | ]
180 | },
181 | {
182 | "name": "stdout",
183 | "output_type": "stream",
184 | "text": [
185 | "RMSE: 0.447697, MAE: 0.405684\n"
186 | ]
187 | },
188 | {
189 | "name": "stderr",
190 | "output_type": "stream",
191 | "text": [
192 | "\n"
193 | ]
194 | }
195 | ],
196 | "source": [
197 | "cdm.load(\"fuzzycdf.params\")\n",
198 | "rmse, mae = cdm.eval(test_set)\n",
199 | "print(\"RMSE: %.6f, MAE: %.6f\" % (rmse, mae))"
200 | ]
201 | },
202 | {
203 | "cell_type": "markdown",
204 | "metadata": {},
205 | "source": [
206 | "## Incremental Training"
207 | ]
208 | },
209 | {
210 | "cell_type": "code",
211 | "execution_count": 8,
212 | "metadata": {},
213 | "outputs": [],
214 | "source": [
215 | "new_data = [{'user_id': 0, 'item_id': 2, 'score': 0.0}, {'user_id': 1, 'item_id': 1, 'score': 1.0}]\n",
216 | "cdm.inc_train(new_data, epoch=10, burnin=5)"
217 | ]
218 | },
219 | {
220 | "cell_type": "code",
221 | "execution_count": null,
222 | "metadata": {},
223 | "outputs": [],
224 | "source": []
225 | }
226 | ],
227 | "metadata": {
228 | "kernelspec": {
229 | "display_name": "Python 3",
230 | "language": "python",
231 | "name": "python3"
232 | },
233 | "language_info": {
234 | "codemirror_mode": {
235 | "name": "ipython",
236 | "version": 3
237 | },
238 | "file_extension": ".py",
239 | "mimetype": "text/x-python",
240 | "name": "python",
241 | "nbconvert_exporter": "python",
242 | "pygments_lexer": "ipython3",
243 | "version": "3.7.3"
244 | }
245 | },
246 | "nbformat": 4,
247 | "nbformat_minor": 1
248 | }
249 |
--------------------------------------------------------------------------------
/examples/FuzzyCDF/FuzzyCDF.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | # 2021/3/28 @ liujiayu
3 | import logging
4 | import numpy as np
5 | import json
6 | from EduCDM import FuzzyCDF
7 |
8 |
9 | # type of problems
10 | obj_prob_index = np.loadtxt("../../data/math2015/Math1/obj_prob_index.csv", delimiter=',', dtype=int)
11 | sub_prob_index = np.loadtxt("../../data/math2015/Math1/sub_prob_index.csv", delimiter=',', dtype=int)
12 | # Q matrix
13 | q_m = np.loadtxt("../../data/math2015/Math1/q_m.csv", dtype=int, delimiter=',')
14 | prob_num, know_num = q_m.shape[0], q_m.shape[1]
15 |
16 | # training data
17 | with open("../../data/math2015/Math1/train_data.json", encoding='utf-8') as file:
18 | train_set = json.load(file)
19 | stu_num = max([x['user_id'] for x in train_set]) + 1
20 | R = -1 * np.ones(shape=(stu_num, prob_num))
21 | for log in train_set:
22 | R[log['user_id'], log['item_id']] = log['score']
23 |
24 | # testing data
25 | with open("../../data/math2015/Math1/test_data.json", encoding='utf-8') as file:
26 | test_set = json.load(file)
27 |
28 | logging.getLogger().setLevel(logging.INFO)
29 |
30 | cdm = FuzzyCDF(R, q_m, stu_num, prob_num, know_num, obj_prob_index, sub_prob_index, skip_value=-1)
31 |
32 | cdm.train(epoch=10, burnin=5)
33 | cdm.save("fuzzycdf.params")
34 |
35 | cdm.load("fuzzycdf.params")
36 | rmse, mae = cdm.eval(test_set)
37 | print("RMSE, MAE are %.6f, %.6f" % (rmse, mae))
38 |
39 | # ---incremental training
40 | new_data = [{'user_id': 0, 'item_id': 2, 'score': 0.0}, {'user_id': 1, 'item_id': 1, 'score': 1.0}]
41 | cdm.inc_train(new_data, epoch=10, burnin=5)
42 |
--------------------------------------------------------------------------------
/examples/FuzzyCDF/prepare_dataset.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "# Download the Cognitive Diagnosis Benchmark Datasets (CDBD)\n",
10 | "from EduData import get_data\n",
11 | "\n",
12 | "get_data(\"math2015\", \"../../data\")"
13 | ]
14 | },
15 | {
16 | "cell_type": "code",
17 | "execution_count": 1,
18 | "metadata": {},
19 | "outputs": [
20 | {
21 | "name": "stdout",
22 | "output_type": "stream",
23 | "text": [
24 | "{'user_id': 0, 'item_id': 7, 'score': 1.0} {'user_id': 0, 'item_id': 9, 'score': 1.0}\n"
25 | ]
26 | }
27 | ],
28 | "source": [
29 | "# Data preprocessing, split train/valid/test data\n",
30 | "\n",
31 | "import json\n",
32 | "import numpy as np\n",
33 | "import random\n",
34 | "\n",
35 | "def read_probdesc(filepath):\n",
36 | " prob_type = np.loadtxt(filepath, dtype=int, delimiter='\\t',\n",
37 | " converters={0: lambda x: int(x) - 1, 1: lambda s: s == b'Obj'}, skiprows=1)\n",
38 | " obj_prob_index, sub_prob_index = prob_type[prob_type[:, 1] == 1][:, 0], prob_type[prob_type[:, 1] == 0][:, 0]\n",
39 | " return prob_type, obj_prob_index, sub_prob_index\n",
40 | "\n",
41 | "train_ratio = 0.8\n",
42 | "valid_ratio = 0\n",
43 | "\n",
44 | "# type of problems\n",
45 | "problems, obj_prob_index, sub_prob_index = read_probdesc(\"../../data/math2015/Math1/problemdesc.txt\")\n",
46 | "np.savetxt(\"../../data/math2015/Math1/obj_prob_index.csv\", obj_prob_index, delimiter=',', fmt='%d')\n",
47 | "np.savetxt(\"../../data/math2015/Math1/sub_prob_index.csv\", sub_prob_index, delimiter=',', fmt='%d')\n",
48 | "\n",
49 | "# Q matrix\n",
50 | "np.savetxt(\"../../data/math2015/Math1/q_m.csv\", np.loadtxt(\"../../data/math2015/Math1/q.txt\", dtype=int), delimiter=',', fmt='%d')\n",
51 | "\n",
52 | "# response matrix, split dataset\n",
53 | "R = np.loadtxt(\"../../data/math2015/Math1/data.txt\")\n",
54 | "stu_num, prob_num = R.shape[0], R.shape[1]\n",
55 | "train_logs, valid_logs, test_logs = [], [], []\n",
56 | "for stu in range(stu_num):\n",
57 | " stu_logs = []\n",
58 | " for prob in range(prob_num):\n",
59 | " log = {'user_id': int(stu), 'item_id': int(prob), 'score': R[stu][prob]}\n",
60 | " stu_logs.append(log)\n",
61 | " random.shuffle(stu_logs)\n",
62 | " train_logs += stu_logs[: int(train_ratio * prob_num)]\n",
63 | " valid_logs += stu_logs[int(train_ratio * prob_num): int(train_ratio * prob_num) + int(valid_ratio * prob_num)]\n",
64 | " test_logs += stu_logs[int(train_ratio * prob_num) + int(valid_ratio * prob_num):]\n",
65 | "\n",
66 | "with open(\"../../data/math2015/Math1/train_data.json\", 'w', encoding='utf8') as file:\n",
67 | " json.dump(train_logs, file, indent=4, ensure_ascii=False)\n",
68 | "with open(\"../../data/math2015/Math1/valid_data.json\", 'w', encoding='utf8') as file:\n",
69 | " json.dump(valid_logs, file, indent=4, ensure_ascii=False)\n",
70 | "with open(\"../../data/math2015/Math1/test_data.json\", 'w', encoding='utf8') as file:\n",
71 | " json.dump(test_logs, file, indent=4, ensure_ascii=False)\n",
72 | "\n",
73 | "print(train_logs[0], test_logs[0])"
74 | ]
75 | },
76 | {
77 | "cell_type": "code",
78 | "execution_count": null,
79 | "metadata": {},
80 | "outputs": [],
81 | "source": []
82 | }
83 | ],
84 | "metadata": {
85 | "kernelspec": {
86 | "display_name": "Python 3",
87 | "language": "python",
88 | "name": "python3"
89 | },
90 | "language_info": {
91 | "codemirror_mode": {
92 | "name": "ipython",
93 | "version": 3
94 | },
95 | "file_extension": ".py",
96 | "mimetype": "text/x-python",
97 | "name": "python",
98 | "nbconvert_exporter": "python",
99 | "pygments_lexer": "ipython3",
100 | "version": "3.7.3"
101 | }
102 | },
103 | "nbformat": 4,
104 | "nbformat_minor": 1
105 | }
106 |
--------------------------------------------------------------------------------
/examples/ICD/ICD.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 |
3 | import logging
4 | from baize import config_logging
5 | import os
6 | from EduCDM.ICD.etl import extract, inc_stream
7 | from longling import build_dir
8 | from longling.lib.stream import to_io_group, close_io
9 | from EduCDM.ICD.ICD import ICD
10 |
11 | path_prefix = os.path.abspath('.')
12 |
13 |
14 | def run(cdm,
15 | user_n,
16 | item_n,
17 | know_n,
18 | dataset,
19 | max_u2i=None,
20 | max_i2u=None,
21 | stream_num=50,
22 | alpha=0.999,
23 | beta=0.95,
24 | tolerance=1e-3,
25 | inner_metrics=True,
26 | hyper_tag=False,
27 | epoch=1,
28 | wfs=None,
29 | logger=logging,
30 | log_file="log",
31 | warmup_ratio=0.1,
32 | epsilon=1e-2,
33 | weight_decay=0,
34 | vector_numbers=None,
35 | vector_path_format=None,
36 | ctx="cpu",
37 | *args,
38 | **kwargs):
39 | dataset_dir = "%s/data/%s/" % (path_prefix, dataset)
40 | data_dir = dataset_dir
41 | item2know = "%sitem.csv" % dataset_dir
42 | path_format = "%s{}.csv" % data_dir
43 |
44 | inc_train_data_path = path_format.format(log_file)
45 | inc_train_df, _, _, i2k = extract(inc_train_data_path, item2know)
46 | inc_train_df_list = list(
47 | inc_stream(inc_train_df,
48 | stream_size=int(len(inc_train_df) // stream_num)))
49 | ICDNet = ICD(cdm, user_n, item_n, know_n, epoch, weight_decay,
50 | inner_metrics, logger, alpha, ctx)
51 | ICDNet.train(inc_train_df_list, i2k, beta, warmup_ratio, tolerance,
52 | max_u2i, max_i2u, hyper_tag, vector_numbers,
53 | vector_path_format, wfs)
54 |
55 |
56 | def main(dataset="a0910",
57 | ctx="cpu",
58 | cdm="mirt",
59 | alpha=0.2,
60 | beta=0.9,
61 | tolerance=2e-1,
62 | epoch=1,
63 | pretrained=False,
64 | savename=None,
65 | inc_epoch=None,
66 | inner_metrics=True,
67 | log_file="log",
68 | warmup_ratio=0.1,
69 | epsilon=1e-2,
70 | stream_num=None,
71 | vector_numbers=None):
72 | if savename:
73 | dataset_dir = "%s/data/%s/" % (path_prefix, dataset)
74 | data_dir = dataset_dir
75 | model_dir = data_dir + "model/%s/%s/" % (cdm, savename)
76 | keys = [
77 | "metrics", "before_metrics", "ind_inc_user", "ind_inc_item",
78 | "inc_user", "inc_item", "new_user", "new_item", "new_both",
79 | "trait", "inc_trait", "tp"
80 | ]
81 | path_format = model_dir + "{}.json"
82 | wfs = dict(
83 | zip(
84 | keys,
85 | to_io_group(*[path_format.format(key) for key in keys],
86 | mode="w"))) if savename else None
87 | logger = config_logging(model_dir + "log.txt",
88 | logger="ICD",
89 | console_log_level="info")
90 | logger.info("logs to %s" % model_dir + "log.txt")
91 | vector_path_format = model_dir + "{}_{}.pt"
92 | build_dir(vector_path_format)
93 | else:
94 | wfs = None
95 | logger = config_logging(logger="ICD", console_log_level="info")
96 | vector_path_format = None
97 |
98 | config = dict(
99 | dataset=dataset,
100 | cdm=cdm,
101 | alpha=alpha,
102 | beta=beta,
103 | tolerance=tolerance,
104 | ctx=ctx,
105 | epoch=epoch,
106 | inc_epoch=inc_epoch,
107 | inner_metrics=inner_metrics,
108 | log_file=log_file,
109 | warmup_ratio=warmup_ratio,
110 | epsilon=epsilon,
111 | vector_numbers=vector_numbers,
112 | vector_path_format=vector_path_format,
113 | )
114 | logger.info(config)
115 |
116 | dataset_config = {
117 | "a0910":
118 | dict(
119 | user_n=4129,
120 | item_n=17747,
121 | know_n=123,
122 | stream_num=50 if stream_num is None else stream_num,
123 | max_u2i=128,
124 | max_i2u=64,
125 | ),
126 | "math":
127 | dict(
128 | user_n=10269,
129 | item_n=17747,
130 | know_n=1488,
131 | stream_num=200 if stream_num is None else stream_num,
132 | # max_u2i=128,
133 | # max_i2u=64,
134 | ),
135 | "xunfei":
136 | dict(
137 | # user_n=10269+1,
138 | # item_n=2507+1,
139 | user_n=6820 + 1,
140 | item_n=1196 + 1,
141 | know_n=497,
142 | stream_num=50 if stream_num is None else stream_num,
143 | max_u2i=128,
144 | max_i2u=64,
145 | ),
146 | }
147 | cdm_config = {
148 | "irt": {},
149 | "dina": {},
150 | "ncd": {},
151 | "mirt": {
152 | "weight_decay": 1e-4
153 | }
154 | }
155 | run(
156 | # cdm="mirt",
157 | pretrained=pretrained,
158 | wfs=wfs,
159 | logger=logger,
160 | **cdm_config[cdm],
161 | **config,
162 | **dataset_config[dataset.split("_")[0]])
163 | if wfs is not None:
164 | close_io(list(wfs.values()))
165 |
166 |
167 | if __name__ == '__main__':
168 | import fire
169 |
170 | fire.Fire(main)
171 |
--------------------------------------------------------------------------------
/examples/ICD/prepare_dataset.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {},
7 | "outputs": [
8 | {
9 | "name": "stderr",
10 | "output_type": "stream",
11 | "text": [
12 | "downloader, INFO http://base.ustc.edu.cn/data/cdbd/a0910/item.csv is saved as ../../data/a0910/item.csv\n",
13 | "downloader, INFO file existed, skipped\n"
14 | ]
15 | },
16 | {
17 | "data": {
18 | "text/plain": [
19 | "'../../data'"
20 | ]
21 | },
22 | "execution_count": 1,
23 | "metadata": {},
24 | "output_type": "execute_result"
25 | }
26 | ],
27 | "source": [
28 | "from EduData import get_data\n",
29 | "\n",
30 | "get_data(\"cdbd-a0910\", \"../../data\")\n"
31 | ]
32 | },
33 | {
34 | "cell_type": "code",
35 | "execution_count": 2,
36 | "metadata": {},
37 | "outputs": [],
38 | "source": [
39 | "import pandas as pd\n",
40 | "path_prefix = '../../data/a0910/'\n",
41 | "train_data = pd.read_csv(f'{path_prefix}train.csv',\n",
42 | " encoding=\"utf-8\", low_memory=False)\n",
43 | "valid_data = pd.read_csv(f'{path_prefix}valid.csv',\n",
44 | " encoding=\"utf-8\", low_memory=False)\n",
45 | "test_data = pd.read_csv(f'{path_prefix}test.csv',\n",
46 | " encoding=\"utf-8\", low_memory=False)\n",
47 | "data = train_data.append([valid_data, test_data])\n",
48 | "# 保留作答题目大于15的学生\n",
49 | "group = data.groupby(['item_id'], as_index=False)\n",
50 | "df = pd.DataFrame(columns=['user_id', 'item_id', 'score'])\n",
51 | "for i in group:\n",
52 | " if len(i[1]) >= 15:\n",
53 | " df = df.append([i[1]])\n",
54 | "df.to_csv(f'{path_prefix}log.csv', index=None)"
55 | ]
56 | }
57 | ],
58 | "metadata": {
59 | "kernelspec": {
60 | "display_name": "Python 3.9.7 ('base')",
61 | "language": "python",
62 | "name": "python3"
63 | },
64 | "language_info": {
65 | "codemirror_mode": {
66 | "name": "ipython",
67 | "version": 3
68 | },
69 | "file_extension": ".py",
70 | "mimetype": "text/x-python",
71 | "name": "python",
72 | "nbconvert_exporter": "python",
73 | "pygments_lexer": "ipython3",
74 | "version": "3.9.7"
75 | },
76 | "orig_nbformat": 4,
77 | "vscode": {
78 | "interpreter": {
79 | "hash": "30cf1c0bf0a8c24a67f341fa01023997b228873b5bd061707cfd99d0cfb90c8a"
80 | }
81 | }
82 | },
83 | "nbformat": 4,
84 | "nbformat_minor": 2
85 | }
86 |
--------------------------------------------------------------------------------
/examples/IRR/DINA.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | # 2021/6/19 @ tongshiwei
3 |
4 |
5 | from EduCDM.IRR import DINA
6 | import logging
7 | from longling.lib.structure import AttrDict
8 | from longling import set_logging_info
9 | from EduCDM.IRR import pair_etl as etl, point_etl as vt_etl, extract_item
10 |
11 | set_logging_info()
12 |
13 | params = AttrDict(
14 | batch_size=256,
15 | n_neg=10,
16 | n_imp=10,
17 | logger=logging.getLogger(),
18 | hyper_params={"user_num": 4164, "knowledge_num": 123}
19 | )
20 | item_knowledge = extract_item("../../data/a0910/item.csv", params["hyper_params"]["knowledge_num"], params)
21 | train_data, train_df = etl("../../data/a0910/train.csv", item_knowledge, params)
22 | valid_data, _ = vt_etl("../../data/a0910/valid.csv", item_knowledge, params)
23 | test_data, _ = vt_etl("../../data/a0910/test.csv", item_knowledge, params)
24 |
25 | cdm = DINA(
26 | 4163 + 1,
27 | 17746 + 1,
28 | 123,
29 | ste=True
30 | )
31 | cdm.train(
32 | train_data,
33 | valid_data,
34 | epoch=2,
35 | )
36 | cdm.save("IRR-DINA.params")
37 |
38 | cdm.load("IRR-DINA.params")
39 | print(cdm.eval(test_data))
40 |
--------------------------------------------------------------------------------
/examples/IRR/IRT.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | # 2021/6/19 @ tongshiwei
3 |
4 |
5 | from EduCDM.IRR import IRT
6 | import logging
7 | from longling.lib.structure import AttrDict
8 | from longling import set_logging_info
9 | from EduCDM.IRR import pair_etl as etl, point_etl as vt_etl, extract_item
10 |
11 | set_logging_info()
12 |
13 | params = AttrDict(
14 | batch_size=256,
15 | n_neg=10,
16 | n_imp=10,
17 | logger=logging.getLogger(),
18 | hyper_params={"user_num": 4164}
19 | )
20 | item_knowledge = extract_item("../../data/a0910/item.csv", 123, params)
21 | train_data, train_df = etl("../../data/a0910/train.csv", item_knowledge, params)
22 | valid_data, _ = vt_etl("../../data/a0910/valid.csv", item_knowledge, params)
23 | test_data, _ = vt_etl("../../data/a0910/test.csv", item_knowledge, params)
24 |
25 | cdm = IRT(
26 | 4163 + 1,
27 | 17746 + 1,
28 | 123
29 | )
30 | cdm.train(
31 | train_data,
32 | valid_data,
33 | epoch=2,
34 | )
35 | cdm.save("IRR-IRT.params")
36 |
37 | cdm.load("IRR-IRT.params")
38 | print(cdm.eval(test_data))
39 |
--------------------------------------------------------------------------------
/examples/IRR/MIRT.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | # 2021/6/19 @ tongshiwei
3 |
4 |
5 | from EduCDM.IRR import MIRT
6 | import logging
7 | from longling.lib.structure import AttrDict
8 | from longling import set_logging_info
9 | from EduCDM.IRR import pair_etl as etl, point_etl as vt_etl, extract_item
10 |
11 | set_logging_info()
12 |
13 | params = AttrDict(
14 | batch_size=256,
15 | n_neg=10,
16 | n_imp=10,
17 | logger=logging.getLogger(),
18 | hyper_params={"user_num": 4164}
19 | )
20 | item_knowledge = extract_item("../../data/a0910/item.csv", 123, params)
21 | train_data, train_df = etl("../../data/a0910/train.csv", item_knowledge, params)
22 | valid_data, _ = vt_etl("../../data/a0910/valid.csv", item_knowledge, params)
23 | test_data, _ = vt_etl("../../data/a0910/test.csv", item_knowledge, params)
24 |
25 | cdm = MIRT(
26 | 4163 + 1,
27 | 17746 + 1,
28 | 123
29 | )
30 | cdm.train(
31 | train_data,
32 | valid_data,
33 | epoch=2,
34 | )
35 | cdm.save("IRR-MIRT.params")
36 |
37 | cdm.load("IRR-MIRT.params")
38 | print(cdm.eval(test_data))
39 |
--------------------------------------------------------------------------------
/examples/IRR/NCDM.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | # 2021/6/19 @ tongshiwei
3 |
4 |
5 | from EduCDM.IRR import NCDM
6 | import logging
7 | from longling.lib.structure import AttrDict
8 | from longling import set_logging_info
9 | from EduCDM.IRR import pair_etl as etl, point_etl as vt_etl, extract_item
10 |
11 | set_logging_info()
12 |
13 | params = AttrDict(
14 | batch_size=256,
15 | n_neg=10,
16 | n_imp=10,
17 | logger=logging.getLogger(),
18 | hyper_params={"user_num": 4164, "knowledge_num": 123}
19 | )
20 | item_knowledge = extract_item("../../data/a0910/item.csv", params["hyper_params"]["knowledge_num"], params)
21 | train_data, train_df = etl("../../data/a0910/train.csv", item_knowledge, params)
22 | valid_data, _ = vt_etl("../../data/a0910/valid.csv", item_knowledge, params)
23 | test_data, _ = vt_etl("../../data/a0910/test.csv", item_knowledge, params)
24 |
25 | cdm = NCDM(
26 | 4163 + 1,
27 | 17746 + 1,
28 | 123,
29 | )
30 | cdm.train(
31 | train_data,
32 | valid_data,
33 | epoch=2,
34 | )
35 | cdm.save("IRR-NCDM.params")
36 |
37 | cdm.load("IRR-NCDM.params")
38 | print(cdm.eval(test_data))
39 |
--------------------------------------------------------------------------------
/examples/IRR/README.md:
--------------------------------------------------------------------------------
1 | # Item Response Ranking for Cognitive Diagnosis
2 |
3 | * [IRR-IRT](IRT.ipynb)
4 | * [IRR-DINA](DINA.ipynb)
5 | * [IRR-NCD](NCDM.ipynb)
6 | * [IRR-MIRT](MIRT.ipynb)
7 |
--------------------------------------------------------------------------------
/examples/IRR/prepare_dataset.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 5,
6 | "outputs": [
7 | {
8 | "name": "stderr",
9 | "output_type": "stream",
10 | "text": [
11 | "downloader, INFO http://base.ustc.edu.cn/data/cdbd/a0910/item.csv is saved as ..\\..\\data\\a0910\\item.csv\n",
12 | "downloader, INFO http://base.ustc.edu.cn/data/cdbd/a0910/test.csv is saved as ..\\..\\data\\a0910\\test.csv\n",
13 | "downloader, INFO http://base.ustc.edu.cn/data/cdbd/a0910/train.csv is saved as ..\\..\\data\\a0910\\train.csv\n",
14 | "downloader, INFO http://base.ustc.edu.cn/data/cdbd/a0910/valid.csv is saved as ..\\..\\data\\a0910\\valid.csv\n"
15 | ]
16 | },
17 | {
18 | "name": "stdout",
19 | "output_type": "stream",
20 | "text": [
21 | "Downloading ..\\..\\data\\a0910\\item.csv 100.00%: 258118 | 258118\n",
22 | "Downloading ..\\..\\data\\a0910\\test.csv 100.00%: 810767 | 810767\n",
23 | "Downloading ..\\..\\data\\a0910\\train.csv 100.00%: 2329161 | 2329161\n",
24 | "Downloading ..\\..\\data\\a0910\\valid.csv 100.00%: 371493 | 371493\n"
25 | ]
26 | },
27 | {
28 | "data": {
29 | "text/plain": "'../../data'"
30 | },
31 | "execution_count": 5,
32 | "metadata": {},
33 | "output_type": "execute_result"
34 | }
35 | ],
36 | "source": [
37 | "# Download the Cognitive Diagnosis Benchmark Datasets (CDBD)\n",
38 | "from EduData import get_data\n",
39 | "\n",
40 | "get_data(\"cdbd-a0910\", \"../../data\")\n"
41 | ],
42 | "metadata": {
43 | "collapsed": false,
44 | "pycharm": {
45 | "name": "#%%\n"
46 | }
47 | }
48 | }
49 | ],
50 | "metadata": {
51 | "kernelspec": {
52 | "display_name": "Python 3",
53 | "language": "python",
54 | "name": "python3"
55 | },
56 | "language_info": {
57 | "codemirror_mode": {
58 | "name": "ipython",
59 | "version": 2
60 | },
61 | "file_extension": ".py",
62 | "mimetype": "text/x-python",
63 | "name": "python",
64 | "nbconvert_exporter": "python",
65 | "pygments_lexer": "ipython2",
66 | "version": "2.7.6"
67 | }
68 | },
69 | "nbformat": 4,
70 | "nbformat_minor": 0
71 | }
--------------------------------------------------------------------------------
/examples/IRT/EM/IRT.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | # 2021/5/2 @ liujiayu
3 | import logging
4 | import numpy as np
5 | import pandas as pd
6 | from EduCDM import EMIRT
7 |
8 | train_data = pd.read_csv("../../../data/a0910/train.csv")
9 | valid_data = pd.read_csv("../../../data/a0910/valid.csv")
10 | test_data = pd.read_csv("../../../data/a0910/test.csv")
11 |
12 | stu_num = max(max(train_data['user_id']), max(test_data['user_id']))
13 | prob_num = max(max(train_data['item_id']), max(test_data['item_id']))
14 |
15 | R = -1 * np.ones(shape=(stu_num, prob_num))
16 | R[train_data['user_id']-1, train_data['item_id']-1] = train_data['score']
17 |
18 | test_set = []
19 | for i in range(len(test_data)):
20 | row = test_data.iloc[i]
21 | test_set.append({'user_id':int(row['user_id'])-1, 'item_id':int(row['item_id'])-1, 'score':row['score']})
22 |
23 | logging.getLogger().setLevel(logging.INFO)
24 |
25 | cdm = EMIRT(R, stu_num, prob_num, dim=1, skip_value=-1) # IRT, dim > 1 is MIRT
26 |
27 | cdm.train(lr=1e-3, epoch=2)
28 | cdm.save("irt.params")
29 |
30 | cdm.load("irt.params")
31 | rmse, mae = cdm.eval(test_set)
32 | print("RMSE, MAE are %.6f, %.6f" % (rmse, mae))
33 |
34 | # ---incremental training
35 | new_data = [{'user_id': 0, 'item_id': 2, 'score': 0.0}, {'user_id': 1, 'item_id': 1, 'score': 1.0}]
36 | cdm.inc_train(new_data, lr=1e-3, epoch=2)
37 |
38 | # ---evaluate user's state
39 | stu_rec = np.random.randint(-1, 2, size=prob_num)  # -1 marks unanswered items (the skip_value), 0/1 wrong/right
40 | dia_state = cdm.transform(stu_rec)
41 | print("user's state is " + str(dia_state))
42 |
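43 | # ---evaluate on the validation split (read above but otherwise unused)
44 | # A minimal sketch, assuming eval() accepts any list of 0-based record dicts
45 | # shaped like test_set above:
46 | valid_set = [{'user_id': int(r['user_id']) - 1, 'item_id': int(r['item_id']) - 1, 'score': r['score']}
47 |              for _, r in valid_data.iterrows()]
48 | v_rmse, v_mae = cdm.eval(valid_set)
49 | print("valid RMSE, MAE are %.6f, %.6f" % (v_rmse, v_mae))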
--------------------------------------------------------------------------------
/examples/IRT/EM/prepare_dataset.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "outputs": [
7 | {
8 | "name": "stderr",
9 | "output_type": "stream",
10 | "text": [
11 | "downloader, INFO ..\\..\\..\\data\\a0910\\item.csv already exists. Send resume request after 258118 bytes\n",
12 | "downloader, INFO http://base.ustc.edu.cn/data/cdbd/a0910/item.csv is saved as ..\\..\\..\\data\\a0910\\item.csv\n",
13 | "downloader, WARNING Range not support. Redownloading...\n",
14 | "downloader, INFO ..\\..\\..\\data\\a0910\\test.csv already exists. Send resume request after 810767 bytes\n",
15 | "downloader, INFO http://base.ustc.edu.cn/data/cdbd/a0910/test.csv is saved as ..\\..\\..\\data\\a0910\\test.csv\n",
16 | "downloader, WARNING Range not support. Redownloading...\n",
17 | "downloader, INFO ..\\..\\..\\data\\a0910\\train.csv already exists. Send resume request after 2329161 bytes\n",
18 | "downloader, INFO http://base.ustc.edu.cn/data/cdbd/a0910/train.csv is saved as ..\\..\\..\\data\\a0910\\train.csv\n",
19 | "downloader, WARNING Range not support. Redownloading...\n",
20 | "downloader, INFO ..\\..\\..\\data\\a0910\\valid.csv already exists. Send resume request after 371493 bytes\n",
21 | "downloader, INFO http://base.ustc.edu.cn/data/cdbd/a0910/valid.csv is saved as ..\\..\\..\\data\\a0910\\valid.csv\n",
22 | "downloader, WARNING Range not support. Redownloading...\n"
23 | ]
24 | },
25 | {
26 | "name": "stdout",
27 | "output_type": "stream",
28 | "text": [
29 | "Downloading 100.00% : 376832 | 37149361"
30 | ]
31 | },
32 | {
33 | "data": {
34 | "text/plain": "'../../../data'"
35 | },
36 | "execution_count": 1,
37 | "metadata": {},
38 | "output_type": "execute_result"
39 | }
40 | ],
41 | "source": [
42 | "# Download the Cognitive Diagnosis Benchmark Datasets (CDBD)\n",
43 | "from EduData import get_data\n",
44 | "\n",
45 | "get_data(\"cdbd-a0910\", \"../../../data\")\n"
46 | ],
47 | "metadata": {
48 | "collapsed": false,
49 | "pycharm": {
50 | "name": "#%%\n"
51 | }
52 | }
53 | }
54 | ],
55 | "metadata": {
56 | "kernelspec": {
57 | "display_name": "Python 3",
58 | "language": "python",
59 | "name": "python3"
60 | },
61 | "language_info": {
62 | "codemirror_mode": {
63 | "name": "ipython",
64 | "version": 2
65 | },
66 | "file_extension": ".py",
67 | "mimetype": "text/x-python",
68 | "name": "python",
69 | "nbconvert_exporter": "python",
70 | "pygments_lexer": "ipython2",
71 | "version": "2.7.6"
72 | }
73 | },
74 | "nbformat": 4,
75 | "nbformat_minor": 0
76 | }
--------------------------------------------------------------------------------
/examples/IRT/GD/IRT.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | # 2021/3/23 @ tongshiwei
3 | import logging
4 | from EduCDM import GDIRT
5 | import torch
6 | from torch.utils.data import TensorDataset, DataLoader
7 | import pandas as pd
8 |
9 | train_data = pd.read_csv("../../../data/a0910/train.csv")
10 | valid_data = pd.read_csv("../../../data/a0910/valid.csv")
11 | test_data = pd.read_csv("../../../data/a0910/test.csv")
12 |
13 | batch_size = 256
14 |
15 |
16 | def transform(x, y, z, batch_size, **params):
17 | dataset = TensorDataset(
18 | torch.tensor(x, dtype=torch.int64),
19 | torch.tensor(y, dtype=torch.int64),
20 | torch.tensor(z, dtype=torch.float32)
21 | )
22 | return DataLoader(dataset, batch_size=batch_size, **params)
23 |
24 |
25 | train, valid, test = [
26 | transform(data["user_id"], data["item_id"], data["score"], batch_size)
27 | for data in [train_data, valid_data, test_data]
28 | ]
29 |
30 | logging.getLogger().setLevel(logging.INFO)
31 |
32 | cdm = GDIRT(4164, 17747)
33 |
34 | cdm.train(train, valid, epoch=2)
35 | cdm.save("irt.params")
36 |
37 | cdm.load("irt.params")
38 | auc, accuracy = cdm.eval(test)
39 | print("auc: %.6f, accuracy: %.6f" % (auc, accuracy))
40 |
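41 | # The hard-coded 4164 x 17747 above are the a0910 user/item counts. A sketch
42 | # that derives them from the loaded CSVs instead (a0910 ids are 1-based),
43 | # following the pattern used in the KaNCD/NCDM examples:
44 | # user_n = int(max(d["user_id"].max() for d in [train_data, valid_data, test_data]))
45 | # item_n = int(max(d["item_id"].max() for d in [train_data, valid_data, test_data]))
46 | # cdm = GDIRT(user_n, item_n)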
--------------------------------------------------------------------------------
/examples/IRT/GD/prepare_dataset.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "outputs": [
7 | {
8 | "name": "stderr",
9 | "output_type": "stream",
10 | "text": [
11 | "downloader, INFO ..\\..\\..\\data\\a0910\\item.csv already exists. Send resume request after 258118 bytes\n",
12 | "downloader, INFO http://base.ustc.edu.cn/data/cdbd/a0910/item.csv is saved as ..\\..\\..\\data\\a0910\\item.csv\n",
13 | "downloader, WARNING Range not support. Redownloading...\n",
14 | "downloader, INFO ..\\..\\..\\data\\a0910\\test.csv already exists. Send resume request after 810767 bytes\n",
15 | "downloader, INFO http://base.ustc.edu.cn/data/cdbd/a0910/test.csv is saved as ..\\..\\..\\data\\a0910\\test.csv\n",
16 | "downloader, WARNING Range not support. Redownloading...\n",
17 | "downloader, INFO ..\\..\\..\\data\\a0910\\train.csv already exists. Send resume request after 2329161 bytes\n",
18 | "downloader, INFO http://base.ustc.edu.cn/data/cdbd/a0910/train.csv is saved as ..\\..\\..\\data\\a0910\\train.csv\n",
19 | "downloader, WARNING Range not support. Redownloading...\n",
20 | "downloader, INFO ..\\..\\..\\data\\a0910\\valid.csv already exists. Send resume request after 371493 bytes\n",
21 | "downloader, INFO http://base.ustc.edu.cn/data/cdbd/a0910/valid.csv is saved as ..\\..\\..\\data\\a0910\\valid.csv\n",
22 | "downloader, WARNING Range not support. Redownloading...\n"
23 | ]
24 | },
25 | {
26 | "name": "stdout",
27 | "output_type": "stream",
28 | "text": [
29 | "Downloading 100.00% : 376832 | 37149361"
30 | ]
31 | },
32 | {
33 | "data": {
34 | "text/plain": "'../../../data'"
35 | },
36 | "execution_count": 1,
37 | "metadata": {},
38 | "output_type": "execute_result"
39 | }
40 | ],
41 | "source": [
42 | "# Download the Cognitive Diagnosis Benchmark Datasets (CDBD)\n",
43 | "from EduData import get_data\n",
44 | "\n",
45 | "get_data(\"cdbd-a0910\", \"../../../data\")\n"
46 | ],
47 | "metadata": {
48 | "collapsed": false,
49 | "pycharm": {
50 | "name": "#%%\n"
51 | }
52 | }
53 | }
54 | ],
55 | "metadata": {
56 | "kernelspec": {
57 | "display_name": "Python 3",
58 | "language": "python",
59 | "name": "python3"
60 | },
61 | "language_info": {
62 | "codemirror_mode": {
63 | "name": "ipython",
64 | "version": 2
65 | },
66 | "file_extension": ".py",
67 | "mimetype": "text/x-python",
68 | "name": "python",
69 | "nbconvert_exporter": "python",
70 | "pygments_lexer": "ipython2",
71 | "version": "2.7.6"
72 | }
73 | },
74 | "nbformat": 4,
75 | "nbformat_minor": 0
76 | }
--------------------------------------------------------------------------------
/examples/KaNCD/KaNCD.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | # 2023/3/7 @ WangFei
3 | import logging
4 | from EduCDM import KaNCD
5 | import torch
6 | from torch.utils.data import TensorDataset, DataLoader
7 | import pandas as pd
8 | import numpy as np
9 | import ast
10 |
11 | train_data = pd.read_csv("../../data/a0910/train.csv")
12 | valid_data = pd.read_csv("../../data/a0910/valid.csv")
13 | test_data = pd.read_csv("../../data/a0910/test.csv")
14 | df_item = pd.read_csv("../../data/a0910/item.csv")
15 | item2knowledge = {}
16 | knowledge_set = set()
17 | for i, s in df_item.iterrows():
18 |     item_id, knowledge_codes = s['item_id'], list(set(ast.literal_eval(s['knowledge_code'])))
19 | item2knowledge[item_id] = knowledge_codes
20 | knowledge_set.update(knowledge_codes)
21 |
22 | batch_size = 32
23 | user_n = np.max(train_data['user_id'])
24 | item_n = np.max([np.max(train_data['item_id']), np.max(valid_data['item_id']), np.max(test_data['item_id'])])
25 | knowledge_n = np.max(list(knowledge_set))
26 |
27 |
28 | def transform(user, item, item2knowledge, score, batch_size):
29 | knowledge_emb = torch.zeros((len(item), knowledge_n))
30 | for idx in range(len(item)):
31 | knowledge_emb[idx][np.array(item2knowledge[item[idx]]) - 1] = 1.0
32 |
33 | data_set = TensorDataset(
34 | torch.tensor(user, dtype=torch.int64) - 1, # (1, user_n) to (0, user_n-1)
35 | torch.tensor(item, dtype=torch.int64) - 1, # (1, item_n) to (0, item_n-1)
36 | knowledge_emb,
37 | torch.tensor(score, dtype=torch.float32)
38 | )
39 | return DataLoader(data_set, batch_size=batch_size, shuffle=True)
40 |
41 |
42 | train_set, valid_set, test_set = [
43 | transform(data["user_id"], data["item_id"], item2knowledge, data["score"], batch_size)
44 | for data in [train_data, valid_data, test_data]
45 | ]
46 |
47 | logging.getLogger().setLevel(logging.INFO)
48 | cdm = KaNCD(exer_n=item_n, student_n=user_n, knowledge_n=knowledge_n, mf_type='gmf', dim=20)
49 | cdm.train(train_set, valid_set, epoch_n=3, device="cuda", lr=0.002)
50 | cdm.save("kancd.snapshot")
51 |
52 | cdm.load("kancd.snapshot")
53 | auc, accuracy = cdm.eval(test_set, device="cuda")
54 | print("auc: %.6f, accuracy: %.6f" % (auc, accuracy))
55 |
56 |
57 |
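58 | # mf_type selects the matrix-factorization variant used internally; besides
59 | # 'gmf', the values 'mf', 'ncf1' and 'ncf2' are exercised in
60 | # tests/kancd/test_kancd.py, e.g.:
61 | # cdm = KaNCD(exer_n=item_n, student_n=user_n, knowledge_n=knowledge_n, mf_type='ncf2', dim=20)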
--------------------------------------------------------------------------------
/examples/KaNCD/prepare_dataset.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "from EduData import get_data\n",
10 | "\n",
11 | "get_data(\"cdbd-a0910\", \"../../data\")"
12 | ]
13 | }
14 | ],
15 | "metadata": {
16 | "kernelspec": {
17 | "display_name": "Python 3",
18 | "language": "python",
19 | "name": "python3"
20 | },
21 | "language_info": {
22 | "codemirror_mode": {
23 | "name": "ipython",
24 | "version": 3
25 | },
26 | "file_extension": ".py",
27 | "mimetype": "text/x-python",
28 | "name": "python",
29 | "nbconvert_exporter": "python",
30 | "pygments_lexer": "ipython3",
31 | "version": "3.8.3"
32 | }
33 | },
34 | "nbformat": 4,
35 | "nbformat_minor": 4
36 | }
37 |
--------------------------------------------------------------------------------
/examples/MCD/MCD.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | # 2021/3/23 @ tongshiwei
3 | import logging
4 | from EduCDM import MCD
5 | import torch
6 | from torch.utils.data import TensorDataset, DataLoader
7 | import pandas as pd
8 |
9 | train_data = pd.read_csv("../../data/a0910/train.csv")
10 | valid_data = pd.read_csv("../../data/a0910/valid.csv")
11 | test_data = pd.read_csv("../../data/a0910/test.csv")
12 |
13 | batch_size = 256
14 |
15 |
16 | def transform(x, y, z, batch_size, **params):
17 | dataset = TensorDataset(
18 | torch.tensor(x, dtype=torch.int64),
19 | torch.tensor(y, dtype=torch.int64),
20 | torch.tensor(z, dtype=torch.float32)
21 | )
22 | return DataLoader(dataset, batch_size=batch_size, **params)
23 |
24 |
25 | train, valid, test = [
26 | transform(data["user_id"], data["item_id"], data["score"], batch_size)
27 | for data in [train_data, valid_data, test_data]
28 | ]
29 |
30 | logging.getLogger().setLevel(logging.INFO)
31 |
32 | cdm = MCD(4164, 17747, 100)
33 |
34 | cdm.train(train, valid, epoch=2)
35 | cdm.save("mcd.params")
36 |
37 | cdm.load("mcd.params")
38 | auc, accuracy = cdm.eval(test)
39 | print("auc: %.6f, accuracy: %.6f" % (auc, accuracy))
40 |
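41 | # MCD's third positional argument (100 here, 10 in tests/mcd/test_mcd.py) is
42 | # presumably the latent-factor dimension; a sketch of trying a smaller one:
43 | # cdm_small = MCD(4164, 17747, 10)
44 | # cdm_small.train(train, valid, epoch=2)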
--------------------------------------------------------------------------------
/examples/MCD/prepare_dataset.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 2,
6 | "outputs": [
7 | {
8 | "name": "stderr",
9 | "output_type": "stream",
10 | "text": [
11 | "downloader, INFO http://base.ustc.edu.cn/data/cdbd/a0910/item.csv is saved as ..\\..\\data\\a0910\\item.csv\n",
12 | "downloader, INFO http://base.ustc.edu.cn/data/cdbd/a0910/test.csv is saved as ..\\..\\data\\a0910\\test.csv\n",
13 | "downloader, INFO http://base.ustc.edu.cn/data/cdbd/a0910/train.csv is saved as ..\\..\\data\\a0910\\train.csv\n",
14 | "downloader, INFO http://base.ustc.edu.cn/data/cdbd/a0910/valid.csv is saved as ..\\..\\data\\a0910\\valid.csv\n"
15 | ]
16 | },
17 | {
18 | "name": "stdout",
19 | "output_type": "stream",
20 | "text": [
21 | "Downloading ..\\..\\data\\a0910\\item.csv 100.00%: 258118 | 258118\n",
22 | "Downloading ..\\..\\data\\a0910\\test.csv 100.00%: 810767 | 810767\n",
23 | "Downloading ..\\..\\data\\a0910\\train.csv 100.00%: 2329161 | 2329161\n",
24 | "Downloading ..\\..\\data\\a0910\\valid.csv 100.00%: 371493 | 371493\n"
25 | ]
26 | },
27 | {
28 | "data": {
29 | "text/plain": "'../../data'"
30 | },
31 | "execution_count": 2,
32 | "metadata": {},
33 | "output_type": "execute_result"
34 | }
35 | ],
36 | "source": [
37 | "# Download the Cognitive Diagnosis Benchmark Datasets (CDBD)\n",
38 | "from EduData import get_data\n",
39 | "\n",
40 | "get_data(\"cdbd-a0910\", \"../../data\")\n"
41 | ],
42 | "metadata": {
43 | "collapsed": false,
44 | "pycharm": {
45 | "name": "#%%\n"
46 | }
47 | }
48 | }
49 | ],
50 | "metadata": {
51 | "kernelspec": {
52 | "display_name": "Python 3",
53 | "language": "python",
54 | "name": "python3"
55 | },
56 | "language_info": {
57 | "codemirror_mode": {
58 | "name": "ipython",
59 | "version": 2
60 | },
61 | "file_extension": ".py",
62 | "mimetype": "text/x-python",
63 | "name": "python",
64 | "nbconvert_exporter": "python",
65 | "pygments_lexer": "ipython2",
66 | "version": "2.7.6"
67 | }
68 | },
69 | "nbformat": 4,
70 | "nbformat_minor": 0
71 | }
--------------------------------------------------------------------------------
/examples/MIRT/MIRT.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | # 2021/3/23 @ tongshiwei
3 | import logging
4 | from EduCDM import MIRT
5 | import torch
6 | from torch.utils.data import TensorDataset, DataLoader
7 | import pandas as pd
8 |
9 | train_data = pd.read_csv("../../data/a0910/train.csv")
10 | valid_data = pd.read_csv("../../data/a0910/valid.csv")
11 | test_data = pd.read_csv("../../data/a0910/test.csv")
12 |
13 | batch_size = 256
14 |
15 |
16 | def transform(x, y, z, batch_size, **params):
17 | dataset = TensorDataset(
18 | torch.tensor(x, dtype=torch.int64),
19 | torch.tensor(y, dtype=torch.int64),
20 | torch.tensor(z, dtype=torch.float32)
21 | )
22 | return DataLoader(dataset, batch_size=batch_size, **params)
23 |
24 |
25 | train, valid, test = [
26 | transform(data["user_id"], data["item_id"], data["score"], batch_size)
27 | for data in [train_data, valid_data, test_data]
28 | ]
29 |
30 | logging.getLogger().setLevel(logging.INFO)
31 |
32 | cdm = MIRT(4164, 17747, 123)
33 |
34 | cdm.train(train, valid, epoch=2)
35 | cdm.save("mirt.params")
36 |
37 | cdm.load("mirt.params")
38 | auc, accuracy = cdm.eval(test)
39 | print("auc: %.6f, accuracy: %.6f" % (auc, accuracy))
40 |
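41 | # MIRT's third positional argument (123 here, 10 in tests/mirt/test_mirt.py)
42 | # sets the latent-trait dimension; the tests also pass an a_range keyword that
43 | # bounds the discrimination parameters (an overly large value is used there to
44 | # provoke a ValueError), e.g.:
45 | # cdm_bounded = MIRT(4164, 17747, 123, a_range=1)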
--------------------------------------------------------------------------------
/examples/MIRT/prepare_dataset.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 2,
6 | "outputs": [
7 | {
8 | "name": "stderr",
9 | "output_type": "stream",
10 | "text": [
11 | "downloader, INFO http://base.ustc.edu.cn/data/cdbd/a0910/item.csv is saved as ..\\..\\data\\a0910\\item.csv\n",
12 | "downloader, INFO http://base.ustc.edu.cn/data/cdbd/a0910/readme.txt is saved as ..\\..\\data\\a0910\\readme.txt\n",
13 | "downloader, INFO http://base.ustc.edu.cn/data/cdbd/a0910/test.csv is saved as ..\\..\\data\\a0910\\test.csv\n",
14 | "downloader, INFO http://base.ustc.edu.cn/data/cdbd/a0910/train.csv is saved as ..\\..\\data\\a0910\\train.csv\n",
15 | "downloader, INFO http://base.ustc.edu.cn/data/cdbd/a0910/valid.csv is saved as ..\\..\\data\\a0910\\valid.csv\n"
16 | ]
17 | },
18 | {
19 | "name": "stdout",
20 | "output_type": "stream",
21 | "text": [
22 | "Downloading ..\\..\\data\\a0910\\item.csv 100.00%: 258118 | 258118\n",
23 | "Downloading ..\\..\\data\\a0910\\readme.txt 100.00%: 86 | 86\n",
24 | "Downloading ..\\..\\data\\a0910\\test.csv 100.00%: 810767 | 810767\n",
25 | "Downloading ..\\..\\data\\a0910\\train.csv 100.00%: 2329161 | 2329161\n",
26 | "Downloading ..\\..\\data\\a0910\\valid.csv 100.00%: 371493 | 371493\n"
27 | ]
28 | },
29 | {
30 | "data": {
31 | "text/plain": "'../../data'"
32 | },
33 | "execution_count": 2,
34 | "metadata": {},
35 | "output_type": "execute_result"
36 | }
37 | ],
38 | "source": [
39 | "# Download the Cognitive Diagnosis Benchmark Datasets (CDBD)\n",
40 | "from EduData import get_data\n",
41 | "\n",
42 | "get_data(\"cdbd-a0910\", \"../../data\")\n"
43 | ],
44 | "metadata": {
45 | "collapsed": false,
46 | "pycharm": {
47 | "name": "#%%\n"
48 | }
49 | }
50 | }
51 | ],
52 | "metadata": {
53 | "kernelspec": {
54 | "display_name": "Python 3",
55 | "language": "python",
56 | "name": "python3"
57 | },
58 | "language_info": {
59 | "codemirror_mode": {
60 | "name": "ipython",
61 | "version": 2
62 | },
63 | "file_extension": ".py",
64 | "mimetype": "text/x-python",
65 | "name": "python",
66 | "nbconvert_exporter": "python",
67 | "pygments_lexer": "ipython2",
68 | "version": "2.7.6"
69 | }
70 | },
71 | "nbformat": 4,
72 | "nbformat_minor": 0
73 | }
--------------------------------------------------------------------------------
/examples/NCDM/NCDM.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | # 2021/4/1 @ WangFei
3 | import logging
4 | from EduCDM import NCDM
5 | import torch
6 | from torch.utils.data import TensorDataset, DataLoader
7 | import pandas as pd
8 | import numpy as np
9 | import ast
10 |
11 | train_data = pd.read_csv("../../data/a0910/train.csv")
12 | valid_data = pd.read_csv("../../data/a0910/valid.csv")
13 | test_data = pd.read_csv("../../data/a0910/test.csv")
14 | df_item = pd.read_csv("../../data/a0910/item.csv")
15 | item2knowledge = {}
16 | knowledge_set = set()
17 | for i, s in df_item.iterrows():
18 |     item_id, knowledge_codes = s['item_id'], list(set(ast.literal_eval(s['knowledge_code'])))
19 | item2knowledge[item_id] = knowledge_codes
20 | knowledge_set.update(knowledge_codes)
21 |
22 | batch_size = 32
23 | user_n = np.max(train_data['user_id'])
24 | item_n = np.max([np.max(train_data['item_id']), np.max(valid_data['item_id']), np.max(test_data['item_id'])])
25 | knowledge_n = np.max(list(knowledge_set))
26 |
27 |
28 | def transform(user, item, item2knowledge, score, batch_size):
29 | knowledge_emb = torch.zeros((len(item), knowledge_n))
30 | for idx in range(len(item)):
31 | knowledge_emb[idx][np.array(item2knowledge[item[idx]]) - 1] = 1.0
32 |
33 | data_set = TensorDataset(
34 | torch.tensor(user, dtype=torch.int64) - 1, # (1, user_n) to (0, user_n-1)
35 | torch.tensor(item, dtype=torch.int64) - 1, # (1, item_n) to (0, item_n-1)
36 | knowledge_emb,
37 | torch.tensor(score, dtype=torch.float32)
38 | )
39 | return DataLoader(data_set, batch_size=batch_size, shuffle=True)
40 |
41 |
42 | train_set, valid_set, test_set = [
43 | transform(data["user_id"], data["item_id"], item2knowledge, data["score"], batch_size)
44 | for data in [train_data, valid_data, test_data]
45 | ]
46 |
47 | logging.getLogger().setLevel(logging.INFO)
48 | cdm = NCDM(knowledge_n, item_n, user_n)
49 | cdm.train(train_set, valid_set, epoch=3, device="cuda")
50 | cdm.save("ncdm.snapshot")
51 |
52 | cdm.load("ncdm.snapshot")
53 | auc, accuracy = cdm.eval(test_set)
54 | print("auc: %.6f, accuracy: %.6f" % (auc, accuracy))
55 |
56 |
57 |
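58 | # eval() above runs on its default device after training on "cuda"; to keep
59 | # scoring on the GPU as well (as the KaNCD example does), pass it explicitly,
60 | # assuming eval() takes the same device keyword as train():
61 | # auc, accuracy = cdm.eval(test_set, device="cuda")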
--------------------------------------------------------------------------------
/examples/NCDM/prepare_dataset.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "from EduData import get_data\n",
10 | "\n",
11 | "get_data(\"cdbd-a0910\", \"../../data\")"
12 | ]
13 | }
14 | ],
15 | "metadata": {
16 | "kernelspec": {
17 | "display_name": "Python 3",
18 | "language": "python",
19 | "name": "python3"
20 | },
21 | "language_info": {
22 | "codemirror_mode": {
23 | "name": "ipython",
24 | "version": 3
25 | },
26 | "file_extension": ".py",
27 | "mimetype": "text/x-python",
28 | "name": "python",
29 | "nbconvert_exporter": "python",
30 | "pygments_lexer": "ipython3",
31 | "version": "3.8.3"
32 | }
33 | },
34 | "nbformat": 4,
35 | "nbformat_minor": 4
36 | }
37 |
--------------------------------------------------------------------------------
/pytest.ini:
--------------------------------------------------------------------------------
1 | [pytest]
2 | # For pytest usage, refer to https://hb4dsai.readthedocs.io/zh/latest/Architecture/Test.html
3 | norecursedirs = docs *build* trash dev examples
4 |
5 | # Deal with marker warnings
6 | markers =
7 | flake8: flake8
8 |
9 | # Enable line length testing with maximum line length of 120
10 | flake8-max-line-length = 120
11 |
12 | # Ignore module level import not at top of file (E402)
13 | # Others can be found in https://flake8.pycqa.org/en/latest/user/error-codes.html
14 | flake8-ignore = E402 F401 F403 E126 W504 W503
15 |
16 | # --doctest-modules runs doctests as part of the unit tests
17 | addopts = --doctest-modules --cov --cov-report=term-missing --flake8
18 |
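19 | # With the addopts above, a bare `pytest` run already collects doctests and
20 | # reports coverage and flake8 results; pass a path (e.g. `pytest tests/mirt`)
21 | # to restrict the run to one model's tests.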
--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [coverage:run]
2 | source=EduCDM
3 | [coverage:report]
4 | exclude_lines =
5 | pragma: no cover
6 | pass
7 | raise NotImplementedError
8 | if __name__ == '__main__':
9 | if __name__ == "__main__":
10 | def __str__
11 | def __repr__
12 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import setup, find_packages
2 |
3 | test_deps = [
4 | 'pytest>=4',
5 | 'pytest-cov>=2.6.0',
6 | # 'pytest-flake8==4.0.1',
7 | 'pytest-flake8<1.1.2',
8 | 'flake8<5.0.0'
9 | ]
10 |
11 | setup(
12 | name='EduCDM',
13 | version='1.0.1',
14 | extras_require={
15 | 'test': test_deps,
16 | },
17 | packages=find_packages(),
18 | install_requires=[
19 | "torch", "tqdm", "numpy>=1.16.5", "scikit-learn", "pandas",
20 | "longling>=1.3.33", "longling<=1.3.36", 'PyBaize>=0.0.7', 'fire'
21 |     ],  # and any other required runtime dependencies
22 | entry_points={},
23 | )
24 |
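25 | # The test dependencies above are exposed as an extra, so a development
26 | # install that includes them is: pip install -e ".[test]"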
--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | # 2021/3/17 @ tongshiwei
3 |
4 | import random
5 |
6 | random.seed(10)
7 |
--------------------------------------------------------------------------------
/tests/dina/__init__.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | # 2021/3/28 @ liujiayu
3 |
--------------------------------------------------------------------------------
/tests/dina/em/__init__.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | # 2021/6/21 @ tongshiwei
3 |
--------------------------------------------------------------------------------
/tests/dina/em/conftest.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | # 2021/3/28 @ liujiayu
3 |
4 | import random
5 | import numpy as np
6 | import pytest
7 |
8 |
9 | @pytest.fixture(scope="package")
10 | def conf():
11 | user_num = 5
12 | item_num = 2
13 | know_num = 3
14 | return user_num, item_num, know_num
15 |
16 |
17 | @pytest.fixture(scope="package")
18 | def data(conf):
19 | user_num, item_num, know_num = conf
20 | q_m = np.zeros(shape=(item_num, know_num))
21 | for i in range(item_num):
22 | for j in range(know_num):
23 | q_m[i, j] = random.randint(0, 1)
24 |
25 | R = -1 * np.ones(shape=(user_num, item_num))
26 | for i in range(user_num):
27 | for j in range(item_num):
28 | R[i, j] = random.randint(-1, 1)
29 |
30 | new_data = [{'user_id': 1, 'item_id': 1, 'score': 1.0}]
31 |
32 | stu_rec = np.ones(item_num)
33 | for i in range(item_num):
34 | stu_rec[i] = random.randint(-1, 1)
35 |
36 | return user_num, item_num, know_num, R, q_m, new_data, stu_rec
37 |
--------------------------------------------------------------------------------
/tests/dina/em/test_dina.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | # 2021/3/28 @ liujiayu
3 | from EduCDM import EMDINA as DINA
4 |
5 |
6 | def test_train(data, tmp_path):
7 | stu_num, prob_num, know_num, R, q_m, new_data, stu_rec = data
8 | cdm = DINA(R, q_m, stu_num, prob_num, know_num, skip_value=-1)
9 | cdm.train(epoch=30, epsilon=1e-3)
10 | rmse, mae = cdm.eval([{'user_id': 0, 'item_id': 0, 'score': 1.0}])
11 | filepath = tmp_path / "dina.params"
12 | cdm.save(filepath)
13 | cdm.load(filepath)
14 | cdm.inc_train(new_data, epoch=30, epsilon=1e-3)
15 | dia_id, dia_state = cdm.transform(stu_rec)
16 |
--------------------------------------------------------------------------------
/tests/dina/gd/__init__.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | # 2021/6/21 @ tongshiwei
3 |
--------------------------------------------------------------------------------
/tests/dina/gd/conftest.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | # 2021/3/23 @ tongshiwei
3 |
4 | import random
5 | import pytest
6 | import torch
7 | from torch.utils.data import TensorDataset, DataLoader
8 |
9 |
10 | @pytest.fixture(scope="package")
11 | def conf():
12 | user_num = 5
13 | item_num = 2
14 | knowledge_num = 3
15 | return user_num, item_num, knowledge_num
16 |
17 |
18 | @pytest.fixture(scope="package")
19 | def data(conf):
20 | user_num, item_num, knowledge_num = conf
21 | log = []
22 | for i in range(user_num):
23 | for j in range(item_num):
24 | k = [0] * knowledge_num
25 | k[random.randint(0, knowledge_num - 1)] = 1
26 | score = random.randint(0, 1)
27 | log.append((i, j, k, score))
28 |
29 | user_id, item_id, knowledge, score = zip(*log)
30 | batch_size = 4
31 |
32 | dataset = TensorDataset(
33 | torch.tensor(user_id, dtype=torch.int64),
34 | torch.tensor(item_id, dtype=torch.int64),
35 | torch.tensor(knowledge, dtype=torch.float),
36 | torch.tensor(score, dtype=torch.float)
37 | )
38 | return DataLoader(dataset, batch_size=batch_size)
39 |
--------------------------------------------------------------------------------
/tests/dina/gd/test_gddina.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | # 2021/4/23 @ tongshiwei
3 |
4 | import pytest
5 | from EduCDM import GDDINA
6 |
7 |
8 | @pytest.mark.parametrize("ste", [True, False])
9 | def test_train(data, conf, tmp_path, ste):
10 | user_num, item_num, knowledge_num = conf
11 | cdm = GDDINA(user_num, item_num, knowledge_num, ste=ste)
12 | cdm.train(data, test_data=data, epoch=2)
13 | filepath = tmp_path / "dina.params"
14 | cdm.save(filepath)
15 | cdm.load(filepath)
16 |
--------------------------------------------------------------------------------
/tests/fuzzycdf/__init__.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | # 2021/3/28 @ liujiayu
3 |
--------------------------------------------------------------------------------
/tests/fuzzycdf/conftest.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | # 2021/3/28 @ liujiayu
3 |
4 | import random
5 | import numpy as np
6 | import pytest
7 |
8 |
9 | @pytest.fixture(scope="package")
10 | def conf():
11 | user_num = 5
12 | item_num = 2
13 | know_num = 3
14 | return user_num, item_num, know_num
15 |
16 |
17 | @pytest.fixture(scope="package")
18 | def data(conf):
19 | user_num, item_num, know_num = conf
20 | q_m = np.zeros(shape=(item_num, know_num))
21 | for i in range(item_num):
22 | for j in range(know_num):
23 | q_m[i, j] = random.randint(0, 1)
24 |
25 | R = -1 * np.ones(shape=(user_num, item_num))
26 | for i in range(user_num):
27 | for j in range(item_num):
28 | R[i, j] = random.randint(-1, 1)
29 |
30 | index = random.randint(1, item_num - 1)
31 | obj_prob_index = np.arange(0, index)
32 | sub_prob_index = np.arange(index - 1, item_num)
33 |
34 | new_data = [{'user_id': 1, 'item_id': 1, 'score': 1.0}]
35 |
36 | return user_num, item_num, know_num, R, q_m, obj_prob_index, sub_prob_index, new_data
37 |
--------------------------------------------------------------------------------
/tests/fuzzycdf/test_fuzzycdf.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | # 2021/3/28 @ liujiayu
3 | from EduCDM import FuzzyCDF
4 |
5 |
6 | def test_train(data, tmp_path):
7 | stu_num, prob_num, know_num, R, q_m, obj_prob_index, sub_prob_index, new_data = data
8 | cdm = FuzzyCDF(R, q_m, stu_num, prob_num, know_num, obj_prob_index, sub_prob_index, skip_value=-1)
9 | cdm.train(epoch=10, burnin=5)
10 | rmse, mae = cdm.eval([{'user_id': 0, 'item_id': 0, 'score': 1.0}])
11 | filepath = tmp_path / "fuzzycdf.params"
12 | cdm.save(filepath)
13 | cdm.load(filepath)
14 | cdm.inc_train(new_data, epoch=10, burnin=5)
15 |
--------------------------------------------------------------------------------
/tests/icd/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bigdata-ustc/EduCDM/c610e6ae7f45cfe8f1106a1c342d4ef5a357472e/tests/icd/__init__.py
--------------------------------------------------------------------------------
/tests/icd/conftest.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | # 2021/4/6 @ WangFei
3 |
4 | import random
5 | import pytest
6 | import pandas as pd
7 | from EduCDM.ICD.etl import inc_stream
8 |
9 |
10 | @pytest.fixture(scope="package")
11 | def conf():
12 | user_num = 50
13 | item_num = 20
14 | knowledge_num = 4
15 | return user_num, item_num, knowledge_num
16 |
17 |
18 | @pytest.fixture(scope="package")
19 | def data(conf):
20 | user_num, item_num, knowledge_num = conf
21 | i2k = {}
22 | for i in range(item_num):
23 | i2k[i] = [random.randint(0, knowledge_num - 1)]
24 | log = []
25 | for i in range(user_num):
26 | for j in range(item_num):
27 | score = random.randint(0, 1)
28 | log.append([i, j, score])
29 | random.shuffle(log)
30 | df = pd.DataFrame(log, columns=['user_id', 'item_id', 'score'])
31 | inc_train_df_list = list(inc_stream(df, stream_size=int(len(df) // 50)))
32 |
33 | return inc_train_df_list, i2k
34 |
--------------------------------------------------------------------------------
/tests/icd/test_mirt.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | # 2021/4/23 @ tongshiwei
3 |
4 | from EduCDM.ICD.ICD import ICD
5 | # from EduCDM import ICD
6 |
7 |
8 | def test_train(data, conf, tmp_path):
9 | user_n, item_n, know_n = conf
10 | cdm = ICD('mirt', user_n, item_n, know_n)
11 | log, i2k = data
12 | cdm.train(log, i2k)
13 | cdm.save()
14 | cdm.load()
15 |
16 |
17 | def test_exception(data, conf, tmp_path):
18 | try:
19 | user_n, item_n, know_n = conf
20 | cdm = ICD('mirt', user_n, item_n, know_n)
21 | log, i2k = data
22 | cdm.train(log, i2k)
23 | cdm.save()
24 | cdm.load()
25 |     except ValueError as e:
26 |         print(e)
27 |
--------------------------------------------------------------------------------
/tests/icd/test_ncd.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | # 2021/4/23 @ tongshiwei
3 |
4 | from EduCDM.ICD.ICD import ICD
5 | # from EduCDM import ICD
6 |
7 |
8 | def test_train(data, conf, tmp_path):
9 | user_n, item_n, know_n = conf
10 | cdm = ICD('ncd', user_n, item_n, know_n)
11 | log, i2k = data
12 | cdm.train(log, i2k)
13 | cdm.save()
14 | cdm.load()
15 |
16 |
17 | def test_exception(data, conf, tmp_path):
18 | try:
19 | user_n, item_n, know_n = conf
20 | cdm = ICD('ncd', user_n, item_n, know_n)
21 | log, i2k = data
22 | cdm.train(log, i2k)
23 | cdm.save()
24 | cdm.load()
25 |     except ValueError as e:
26 |         print(e)
27 |
--------------------------------------------------------------------------------
/tests/irr/__init__.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | # 2021/6/19 @ tongshiwei
3 |
--------------------------------------------------------------------------------
/tests/irr/conftest.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | # 2021/3/23 @ tongshiwei
3 |
4 | import pandas as pd
5 | import random
6 | import pytest
7 | from longling.lib.structure import AttrDict
8 | from EduCDM.IRR import pair_etl, point_etl, extract_item
9 | import logging
10 | from copy import deepcopy
11 |
12 |
13 | @pytest.fixture(scope="package")
14 | def conf():
15 | user_num = 5
16 | item_num = 2
17 | knowledge_num = 3
18 | return user_num, item_num, knowledge_num
19 |
20 |
21 | @pytest.fixture(scope="package")
22 | def params(conf):
23 | user_num, item_num, knowledge_num = conf
24 | return AttrDict(
25 | logger=logging,
26 | user_num=user_num,
27 | item_num=item_num,
28 | knowledge_num=knowledge_num,
29 | n_neg=1,
30 | n_imp=1,
31 | hyper_params={"user_num": user_num},
32 | batch_size=4
33 | )
34 |
35 |
36 | @pytest.fixture(scope="package")
37 | def source(tmpdir_factory, conf):
38 | user_num, item_num, knowledge_num = conf
39 |
40 | d = tmpdir_factory.mktemp("irr")
41 | log_path = d / "log.csv"
42 | item_path = d / "item.csv"
43 |
44 | knowledge = []
45 | for j in range(item_num):
46 | knowledge.append([j, [random.randint(1, knowledge_num)]])
47 |
48 | pd.DataFrame(knowledge, columns=["item_id", "knowledge_code"]).to_csv(item_path)
49 |
50 | log = []
51 | for i in range(user_num):
52 | for j in range(item_num):
53 | score = random.randint(0, 1)
54 | log.append((i, j, score))
55 |
56 | pd.DataFrame(log, columns=["user_id", "item_id", "score"]).to_csv(log_path)
57 |
58 | return log_path, item_path
59 |
60 |
61 | @pytest.fixture(scope="package")
62 | def knowledge(source, params):
63 | _, item_path = source
64 | return extract_item(item_path, params.knowledge_num, params)
65 |
66 |
67 | @pytest.fixture(scope="package")
68 | def train_data(source, knowledge, params):
69 | log_path, _ = source
70 | data, _ = pair_etl(log_path, knowledge, params)
71 | return data
72 |
73 |
74 | @pytest.fixture(scope="package")
75 | def zero_train_data(source, knowledge, params):
76 | log_path, _ = source
77 | params_0 = dict(params.items())
78 | params_0["n_neg"] = 0
79 | params_0["n_imp"] = 0
80 | params_0 = AttrDict(**params_0)
81 | data, _ = pair_etl(log_path, knowledge, params_0)
82 | return data
83 |
84 |
85 | @pytest.fixture(scope="package")
86 | def test_data(source, knowledge, params):
87 | log_path, _ = source
88 | data, _ = point_etl(log_path, knowledge, params)
89 | return data
90 |
--------------------------------------------------------------------------------
/tests/irr/test_dina.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | # 2021/6/19 @ tongshiwei
3 |
4 | from EduCDM.IRR import DINA
5 |
6 |
7 | def test_irr_dina(train_data, test_data, params, tmp_path):
8 | cdm = DINA(params.user_num, params.item_num, params.knowledge_num)
9 | cdm.train(train_data, test_data=test_data, epoch=2)
10 | filepath = tmp_path / "irr.params"
11 | cdm.save(filepath)
12 | cdm.load(filepath)
13 |
14 |
15 | def test_irr_dina_zeta_zero(zero_train_data, test_data, params, tmp_path):
16 | cdm = DINA(params.user_num, params.item_num, params.knowledge_num, zeta=0)
17 | cdm.train(zero_train_data, test_data=test_data, epoch=2)
18 | filepath = tmp_path / "irr.params"
19 | cdm.save(filepath)
20 | cdm.load(filepath)
21 |
--------------------------------------------------------------------------------
/tests/irr/test_irt.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | # 2021/6/19 @ tongshiwei
3 |
4 | from EduCDM.IRR import IRT
5 |
6 |
7 | def test_irr_irt(train_data, test_data, params, tmp_path):
8 | cdm = IRT(params.user_num, params.item_num, params.knowledge_num)
9 | cdm.train(train_data, test_data=test_data, epoch=2)
10 | filepath = tmp_path / "irr.params"
11 | cdm.save(filepath)
12 | cdm.load(filepath)
13 |
14 |
15 | def test_irr_irt_zeta_zero(zero_train_data, test_data, params, tmp_path):
16 | cdm = IRT(params.user_num, params.item_num, params.knowledge_num, zeta=0)
17 | cdm.train(zero_train_data, test_data=test_data, epoch=2)
18 | filepath = tmp_path / "irr.params"
19 | cdm.save(filepath)
20 | cdm.load(filepath)
21 |
--------------------------------------------------------------------------------
/tests/irr/test_mirt.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | # 2021/6/19 @ tongshiwei
3 |
4 | from EduCDM.IRR import MIRT
5 |
6 |
7 | def test_irr_mirt(train_data, test_data, params, tmp_path):
8 | cdm = MIRT(params.user_num, params.item_num, params.knowledge_num)
9 | cdm.train(train_data, test_data=test_data, epoch=2)
10 | filepath = tmp_path / "irr.params"
11 | cdm.save(filepath)
12 | cdm.load(filepath)
13 |
14 |
15 | def test_irr_mirt_zeta_zero(zero_train_data, test_data, params, tmp_path):
16 | cdm = MIRT(params.user_num, params.item_num, params.knowledge_num, zeta=0)
17 | cdm.train(zero_train_data, test_data=test_data, epoch=2)
18 | filepath = tmp_path / "irr.params"
19 | cdm.save(filepath)
20 | cdm.load(filepath)
21 |
--------------------------------------------------------------------------------
/tests/irr/test_ncdm.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | # 2021/7/1 @ tongshiwei
3 |
4 | from EduCDM.IRR import NCDM
5 |
6 |
7 | def test_irr_ncdm(train_data, test_data, params, tmp_path):
8 | cdm = NCDM(params.user_num, params.item_num, params.knowledge_num)
9 | cdm.train(train_data, test_data=test_data, epoch=2)
10 | filepath = tmp_path / "irr.params"
11 | cdm.save(filepath)
12 | cdm.load(filepath)
13 |
14 |
15 | def test_irr_ncdm_zeta_zero(zero_train_data, test_data, params, tmp_path):
16 | cdm = NCDM(params.user_num, params.item_num, params.knowledge_num, zeta=0)
17 | cdm.train(zero_train_data, test_data=test_data, epoch=2)
18 | filepath = tmp_path / "irr.params"
19 | cdm.save(filepath)
20 | cdm.load(filepath)
21 |
--------------------------------------------------------------------------------
/tests/irt/__init__.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | # 2021/6/21 @ tongshiwei
3 |
--------------------------------------------------------------------------------
/tests/irt/em/__init__.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | # 2021/4/23 @ tongshiwei
3 |
--------------------------------------------------------------------------------
/tests/irt/em/conftest.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | # 2021/5/2 @ liujiayu
3 |
4 | import random
5 | import numpy as np
6 | import pytest
7 |
8 |
9 | @pytest.fixture(scope="package")
10 | def conf():
11 | user_num = 5
12 | item_num = 2
13 | return user_num, item_num
14 |
15 |
16 | @pytest.fixture(scope="package")
17 | def data(conf):
18 | user_num, item_num = conf
19 |
20 | R = -1 * np.ones(shape=(user_num, item_num))
21 | for i in range(user_num):
22 | for j in range(item_num):
23 | R[i, j] = random.randint(-1, 1)
24 |
25 | new_data = [{'user_id': 1, 'item_id': 1, 'score': 1.0}]
26 |
27 | stu_rec = np.ones(item_num)
28 | for i in range(item_num):
29 | stu_rec[i] = random.randint(-1, 1)
30 |
31 | return user_num, item_num, R, new_data, stu_rec
32 |
--------------------------------------------------------------------------------
/tests/irt/em/test_emirt.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | # 2021/5/2 @ liujiayu
3 |
4 | from EduCDM import EMIRT
5 |
6 |
7 | def test_train(data, conf, tmp_path):
8 | stu_num, prob_num, R, new_data, stu_rec = data
9 | cdm = EMIRT(R, stu_num, prob_num, dim=1, skip_value=-1)
10 | cdm.train(lr=1e-3, epoch=30, epsilon=1e-1)
11 | rmse, mae = cdm.eval([{'user_id': 0, 'item_id': 0, 'score': 1.0}])
12 | filepath = tmp_path / "irt.params"
13 | cdm.save(filepath)
14 | cdm.load(filepath)
15 | cdm.inc_train(new_data, lr=1e-3, epoch=10)
16 | cdm.transform(stu_rec)
17 |
--------------------------------------------------------------------------------
/tests/irt/gd/__init__.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | # 2021/4/23 @ tongshiwei
3 |
--------------------------------------------------------------------------------
/tests/irt/gd/conftest.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | # 2021/3/23 @ tongshiwei
3 |
4 | import random
5 | import pytest
6 | import torch
7 | from torch.utils.data import TensorDataset, DataLoader
8 |
9 |
10 | @pytest.fixture(scope="package")
11 | def conf():
12 | user_num = 5
13 | item_num = 2
14 | return user_num, item_num
15 |
16 |
17 | @pytest.fixture(scope="package")
18 | def data(conf):
19 | user_num, item_num = conf
20 | log = []
21 | for i in range(user_num):
22 | for j in range(item_num):
23 | score = random.randint(0, 1)
24 | log.append((i, j, score))
25 |
26 | user_id, item_id, score = zip(*log)
27 | batch_size = 4
28 |
29 | dataset = TensorDataset(
30 | torch.tensor(user_id, dtype=torch.int64),
31 | torch.tensor(item_id, dtype=torch.int64),
32 | torch.tensor(score, dtype=torch.float)
33 | )
34 | return DataLoader(dataset, batch_size=batch_size)
35 |
--------------------------------------------------------------------------------
/tests/irt/gd/test_gdirt.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | # 2021/4/23 @ tongshiwei
3 |
4 | from EduCDM import GDIRT
5 | import pytest
6 |
7 |
8 | def test_train(data, conf, tmp_path):
9 | user_num, item_num = conf
10 | cdm = GDIRT(user_num, item_num)
11 | cdm.train(data, test_data=data, epoch=2)
12 |     filepath = tmp_path / "irt.params"
13 | cdm.save(filepath)
14 | cdm.load(filepath)
15 |
16 |
17 | def test_exception(data, conf, tmp_path):
18 | try:
19 | user_num, item_num = conf
20 | cdm = GDIRT(user_num, item_num, value_range=10, a_range=100)
21 | cdm.train(data, test_data=data, epoch=2)
22 |         filepath = tmp_path / "irt.params"
23 |         cdm.save(filepath)
24 |         cdm.load(filepath)
25 |     except ValueError as e:
26 |         print(e)
27 |
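28 | # A stricter variant of test_exception using pytest.raises, which fails when
29 | # no ValueError is raised (a sketch; the test above merely tolerates one):
30 | # def test_exception_strict(data, conf, tmp_path):
31 | #     user_num, item_num = conf
32 | #     with pytest.raises(ValueError):
33 | #         cdm = GDIRT(user_num, item_num, value_range=10, a_range=100)
34 | #         cdm.train(data, test_data=data, epoch=2)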
--------------------------------------------------------------------------------
/tests/kancd/__init__.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | # 2023/3/8 @ WangFei
3 |
--------------------------------------------------------------------------------
/tests/kancd/conftest.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | # 2023/3/8 @ WangFei
3 |
4 | import random
5 | import pytest
6 | import torch
7 | import numpy as np
8 | from torch.utils.data import TensorDataset, DataLoader
9 |
10 |
11 | @pytest.fixture(scope="package")
12 | def conf():
13 | user_num = 5
14 | item_num = 2
15 | knowledge_num = 4
16 | return user_num, item_num, knowledge_num
17 |
18 |
19 | @pytest.fixture(scope="package")
20 | def data(conf):
21 | user_num, item_num, knowledge_num = conf
22 | knowledge_embs = np.zeros((item_num, knowledge_num))
23 | for i in range(item_num):
24 | for j in range(knowledge_num):
25 | knowledge_embs[i][j] = random.randint(0, 1)
26 | log = []
27 | for i in range(user_num):
28 | for j in range(item_num):
29 | score = random.randint(0, 1)
30 | log.append((i, j, knowledge_embs[j], score))
31 |
32 | user_id, item_id, knowledge_emb, score = zip(*log)
33 | batch_size = 4
34 |
35 | dataset = TensorDataset(
36 | torch.tensor(user_id, dtype=torch.int64),
37 | torch.tensor(item_id, dtype=torch.int64),
38 | torch.tensor(knowledge_emb, dtype=torch.int64),
39 | torch.tensor(score, dtype=torch.float)
40 | )
41 | return DataLoader(dataset, batch_size=batch_size)
42 |
--------------------------------------------------------------------------------
/tests/kancd/test_kancd.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | # 2023/3/8 @ WangFei
3 | from EduCDM import KaNCD
4 |
5 |
6 | def test_train(data, conf, tmp_path):
7 | user_num, item_num, knowledge_num = conf
8 | cdm = KaNCD(exer_n=item_num, student_n=user_num, knowledge_n=knowledge_num, mf_type='mf', dim=2)
9 | cdm.train(data, data, epoch_n=2)
10 | cdm = KaNCD(exer_n=item_num, student_n=user_num, knowledge_n=knowledge_num, mf_type='gmf', dim=2)
11 | cdm.train(data, data, epoch_n=2)
12 | cdm = KaNCD(exer_n=item_num, student_n=user_num, knowledge_n=knowledge_num, mf_type='ncf1', dim=2)
13 | cdm.train(data, data, epoch_n=2)
14 | cdm = KaNCD(exer_n=item_num, student_n=user_num, knowledge_n=knowledge_num, mf_type='ncf2', dim=2)
15 | cdm.train(data, data, epoch_n=2)
16 | filepath = tmp_path / "kancd.params"
17 | cdm.save(filepath)
18 | cdm.load(filepath)
19 |
--------------------------------------------------------------------------------
/tests/mcd/__init__.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | # 2021/3/23 @ tongshiwei
3 |
--------------------------------------------------------------------------------
/tests/mcd/conftest.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | # 2021/3/23 @ tongshiwei
3 |
4 | import random
5 | import pytest
6 | import torch
7 | from torch.utils.data import TensorDataset, DataLoader
8 |
9 |
10 | @pytest.fixture(scope="package")
11 | def conf():
12 | user_num = 5
13 | item_num = 2
14 | return user_num, item_num
15 |
16 |
17 | @pytest.fixture(scope="package")
18 | def data(conf):
19 | user_num, item_num = conf
20 | log = []
21 | for i in range(user_num):
22 | for j in range(item_num):
23 | score = random.randint(0, 1)
24 | log.append((i, j, score))
25 |
26 | user_id, item_id, score = zip(*log)
27 | batch_size = 4
28 |
29 | dataset = TensorDataset(
30 | torch.tensor(user_id, dtype=torch.int64),
31 | torch.tensor(item_id, dtype=torch.int64),
32 | torch.tensor(score, dtype=torch.float)
33 | )
34 | return DataLoader(dataset, batch_size=batch_size)
35 |
--------------------------------------------------------------------------------
/tests/mcd/test_mcd.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | # 2021/3/23 @ tongshiwei
3 | from EduCDM import MCD
4 |
5 |
6 | def test_train(data, conf, tmp_path):
7 | user_num, item_num = conf
8 | cdm = MCD(user_num, item_num, 10)
9 | cdm.train(data, test_data=data, epoch=2)
10 | filepath = tmp_path / "mcd.params"
11 | cdm.save(filepath)
12 | cdm.load(filepath)
13 |
--------------------------------------------------------------------------------
/tests/mirt/__init__.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | # 2021/7/1 @ tongshiwei
3 |
--------------------------------------------------------------------------------
/tests/mirt/conftest.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | # 2021/3/23 @ tongshiwei
3 |
4 | import random
5 | import pytest
6 | import torch
7 | from torch.utils.data import TensorDataset, DataLoader
8 |
9 |
10 | @pytest.fixture(scope="package")
11 | def conf():
12 | user_num = 5
13 | item_num = 2
14 | return user_num, item_num
15 |
16 |
17 | @pytest.fixture(scope="package")
18 | def data(conf):
19 | user_num, item_num = conf
20 | log = []
21 | for i in range(user_num):
22 | for j in range(item_num):
23 | score = random.randint(0, 1)
24 | log.append((i, j, score))
25 |
26 | user_id, item_id, score = zip(*log)
27 | batch_size = 4
28 |
29 | dataset = TensorDataset(
30 | torch.tensor(user_id, dtype=torch.int64),
31 | torch.tensor(item_id, dtype=torch.int64),
32 | torch.tensor(score, dtype=torch.float)
33 | )
34 | return DataLoader(dataset, batch_size=batch_size)
35 |
--------------------------------------------------------------------------------
/tests/mirt/test_mirt.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | # 2021/4/23 @ tongshiwei
3 |
4 | from EduCDM import MIRT
5 | import pytest
6 |
7 |
8 | def test_train(data, conf, tmp_path):
9 | user_num, item_num = conf
10 | cdm = MIRT(user_num, item_num, 10)
11 | cdm.train(data, test_data=data, epoch=2)
12 |     filepath = tmp_path / "mirt.params"
13 | cdm.save(filepath)
14 | cdm.load(filepath)
15 |
16 |
17 | def test_exception(data, conf, tmp_path):
18 | try:
19 | user_num, item_num = conf
20 | cdm = MIRT(user_num, item_num, 10, a_range=100)
21 | cdm.train(data, test_data=data, epoch=2)
22 |         filepath = tmp_path / "mirt.params"
23 |         cdm.save(filepath)
24 |         cdm.load(filepath)
25 |     except ValueError as e:
26 |         print(e)
27 |
--------------------------------------------------------------------------------
/tests/ncdm/__init__.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | # 2021/4/6 @ WangFei
3 |
--------------------------------------------------------------------------------
/tests/ncdm/conftest.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | # 2021/4/6 @ WangFei
3 |
4 | import random
5 | import pytest
6 | import torch
7 | import numpy as np
8 | from torch.utils.data import TensorDataset, DataLoader
9 |
10 |
11 | @pytest.fixture(scope="package")
12 | def conf():
13 | user_num = 5
14 | item_num = 2
15 | knowledge_num = 4
16 | return user_num, item_num, knowledge_num
17 |
18 |
19 | @pytest.fixture(scope="package")
20 | def data(conf):
21 | user_num, item_num, knowledge_num = conf
22 | knowledge_embs = np.zeros((item_num, knowledge_num))
23 | for i in range(item_num):
24 | for j in range(knowledge_num):
25 | knowledge_embs[i][j] = random.randint(0, 1)
26 | log = []
27 | for i in range(user_num):
28 | for j in range(item_num):
29 | score = random.randint(0, 1)
30 | log.append((i, j, knowledge_embs[j], score))
31 |
32 | user_id, item_id, knowledge_emb, score = zip(*log)
33 | batch_size = 4
34 |
35 | dataset = TensorDataset(
36 | torch.tensor(user_id, dtype=torch.int64),
37 | torch.tensor(item_id, dtype=torch.int64),
38 | torch.tensor(knowledge_emb, dtype=torch.int64),
39 | torch.tensor(score, dtype=torch.float)
40 | )
41 | return DataLoader(dataset, batch_size=batch_size)
42 |
--------------------------------------------------------------------------------
/tests/ncdm/test_ncdm.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | # 2021/4/6 @ WangFei
3 | from EduCDM import NCDM
4 |
5 |
6 | def test_train(data, conf, tmp_path):
7 | user_num, item_num, knowledge_num = conf
8 | cdm = NCDM(knowledge_num, item_num, user_num)
9 | cdm.train(data, test_data=data, epoch=2)
10 |     filepath = tmp_path / "ncdm.params"
11 | cdm.save(filepath)
12 | cdm.load(filepath)
13 |
--------------------------------------------------------------------------------