├── .github ├── ISSUE_TEMPLATE │ ├── bug_report.md │ ├── documentation.md │ └── feature_request.md ├── PULL_REQUEST_TEMPLATE.md └── workflows │ ├── python-publish.yml │ └── python-test.yml ├── .gitignore ├── .travis.yml ├── AUTHORS.md ├── CHANGE.txt ├── CONTRIBUTE.md ├── CONTRIBUTE_CH.md ├── EduCDM ├── DINA │ ├── EM │ │ ├── DINA.py │ │ └── __init__.py │ ├── GD │ │ ├── DINA.py │ │ └── __init__.py │ └── __init__.py ├── FuzzyCDF │ ├── FuzzyCDF.py │ ├── __init__.py │ └── modules.py ├── ICD │ ├── ICD.py │ ├── __init__.py │ ├── etl │ │ ├── __init__.py │ │ ├── etl.py │ │ └── utils.py │ ├── metrics │ │ ├── __init__.py │ │ └── metrics.py │ ├── sym │ │ ├── __init__.py │ │ ├── fit_eval.py │ │ ├── net │ │ │ ├── __init__.py │ │ │ ├── dtn.py │ │ │ ├── mirt.py │ │ │ ├── ncd.py │ │ │ └── net.py │ │ └── pos_linear.py │ └── utils.py ├── IRR │ ├── DINA.py │ ├── IRT.py │ ├── MIRT.py │ ├── NCDM.py │ ├── __init__.py │ ├── etl │ │ ├── __init__.py │ │ ├── pair_etl.py │ │ ├── point_etl.py │ │ └── utils.py │ └── loss.py ├── IRT │ ├── EM │ │ ├── IRT.py │ │ └── __init__.py │ ├── GD │ │ ├── IRT.py │ │ └── __init__.py │ ├── __init__.py │ └── irt.py ├── KaNCD │ ├── KaNCD.py │ └── __init__.py ├── MCD │ ├── MCD.py │ └── __init__.py ├── MIRT │ ├── MIRT.py │ └── __init__.py ├── NCDM │ ├── NCDM.py │ └── __init__.py ├── __init__.py └── meta.py ├── LICENSE ├── Makefile ├── README.md ├── docs ├── DINA.md ├── FuzzyCDF.md ├── ICD.md ├── IRR.md ├── IRT.md ├── KaNCD.md ├── MCD.md ├── MIRT.md ├── NCDM.md └── _static │ ├── DINA.png │ ├── EduCDM.png │ ├── FuzzyCDF.png │ ├── IRR.png │ ├── KDM_MF.png │ ├── KPM_MF.png │ ├── MCD.png │ └── NeuralCDM.JPG ├── examples ├── DINA │ ├── EM │ │ ├── DINA.ipynb │ │ ├── DINA.py │ │ └── prepare_dataset.ipynb │ └── GD │ │ ├── DINA.ipynb │ │ ├── DINA.py │ │ └── prepare_dataset.ipynb ├── FuzzyCDF │ ├── FuzzyCDF.ipynb │ ├── FuzzyCDF.py │ └── prepare_dataset.ipynb ├── ICD │ ├── ICD.py │ └── prepare_dataset.ipynb ├── IRR │ ├── DINA.ipynb │ ├── DINA.py │ ├── IRT.ipynb │ ├── IRT.py │ ├── MIRT.ipynb │ ├── MIRT.py │ ├── NCDM.ipynb │ ├── NCDM.py │ ├── README.md │ └── prepare_dataset.ipynb ├── IRT │ ├── EM │ │ ├── IRT.ipynb │ │ ├── IRT.py │ │ └── prepare_dataset.ipynb │ └── GD │ │ ├── IRT.ipynb │ │ ├── IRT.py │ │ └── prepare_dataset.ipynb ├── KaNCD │ ├── KaNCD.ipynb │ ├── KaNCD.py │ └── prepare_dataset.ipynb ├── MCD │ ├── MCD.ipynb │ ├── MCD.py │ └── prepare_dataset.ipynb ├── MIRT │ ├── MIRT.ipynb │ ├── MIRT.py │ └── prepare_dataset.ipynb └── NCDM │ ├── NCDM.ipynb │ ├── NCDM.py │ └── prepare_dataset.ipynb ├── pytest.ini ├── setup.cfg ├── setup.py └── tests ├── __init__.py ├── dina ├── __init__.py ├── em │ ├── __init__.py │ ├── conftest.py │ └── test_dina.py └── gd │ ├── __init__.py │ ├── conftest.py │ └── test_gddina.py ├── fuzzycdf ├── __init__.py ├── conftest.py └── test_fuzzycdf.py ├── icd ├── __init__.py ├── conftest.py ├── test_mirt.py └── test_ncd.py ├── irr ├── __init__.py ├── conftest.py ├── test_dina.py ├── test_irt.py ├── test_mirt.py └── test_ncdm.py ├── irt ├── __init__.py ├── em │ ├── __init__.py │ ├── conftest.py │ └── test_emirt.py └── gd │ ├── __init__.py │ ├── conftest.py │ └── test_gdirt.py ├── kancd ├── __init__.py ├── conftest.py └── test_kancd.py ├── mcd ├── __init__.py ├── conftest.py └── test_mcd.py ├── mirt ├── __init__.py ├── conftest.py └── test_mirt.py └── ncdm ├── __init__.py ├── conftest.py └── test_ncdm.py /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to 
help us improve 4 | title: '' 5 | labels: 'Bug, needs triage' 6 | 7 | --- 8 | ## 🐛 Description 9 | (A clear and concise description of what the bug is.) 10 | 11 | ### Error Message 12 | (Paste the complete error message. Please also include the stack trace by setting the environment variable `DMLC_LOG_STACK_TRACE_DEPTH=100` before running your script.) 13 | 14 | ## To Reproduce 15 | (If you developed your own code, please provide a short script that reproduces the error. For existing examples, please provide a link.) 16 | 17 | ### Steps to reproduce 18 | (Paste the commands you ran that produced the error.) 19 | 20 | 1. 21 | 2. 22 | 23 | ## What have you tried to solve it? 24 | 25 | 1. 26 | 2. 27 | 28 | ## Environment 29 | 30 |
31 | Environment Information 32 | 33 | **Operating System:** ... 34 | 35 | **Python Version:** (e.g., python3.6, anaconda/python3.7, venv/python3.8) 36 | 37 |

 38 | 39 | ## Additional context 40 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/documentation.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: 📚 Documentation 3 | about: Update API documentation or add data analysis 4 | --- 5 | 6 | ## 📚 Documentation -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: '' 5 | labels: 'Feature request' 6 | 7 | --- 8 | 9 | ## Description 10 | (A clear and concise description of what the feature is.) 11 | - If the proposal is about an algorithm or a model, provide mock examples if possible. In addition, you may need to carefully follow the [guidance](https://github.com/bigdata-ustc/EduCDM/blob/main/CONTRIBUTE.md) 12 | 13 | ## References 14 | - List references and related literature 15 | - List known implementations 16 | -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | Thanks for sending a pull request! 2 | Please make sure you click the link above to view the [contribution guidelines](../blob/master/CONTRIBUTE.md), 3 | then fill out the blanks below. 4 | 5 | ## Description ## 6 | (A brief description of what this PR is about) 7 | 8 | ### What does this implement/fix? Explain your changes. 9 | ... 10 | 11 | #### Pull request type 12 | - [ ] [DATASET] Add a new dataset 13 | - [ ] [BUGFIX] Bugfix 14 | - [ ] [FEATURE] New feature (non-breaking change which adds functionality) 15 | - [ ] [BREAKING] Breaking change (fix or feature that would cause existing functionality to not work as expected) 16 | - [ ] [STYLE] Code style update (formatting, renaming) 17 | - [ ] [REFACTOR] Refactoring (no functional changes, no API changes) 18 | - [ ] [BUILD] Build-related changes 19 | - [ ] [DOC] Documentation content changes 20 | - [ ] [SYNC] Synchronization with a repository 21 | - [ ] [OTHER] Other (please describe): 22 | 23 | 24 | #### Changes 25 | - Feature1, tests, (and when applicable, API doc) 26 | - Feature2, tests, (and when applicable, API doc) 27 | 28 | or 29 | 30 | - Fix1, tests 31 | - Fix2, tests 32 | 33 | ### Does this close any currently open issues? 34 | ... 35 | 36 | ### Any relevant logs, error output, etc.? 37 | ... 38 | 39 | ## Checklist ## 40 | Before you submit a pull request, please make sure you have the following: 41 | 42 | ### Essentials ### 43 | - [ ] PR's title starts with a category (e.g. [BUGFIX], [FEATURE], [BREAKING], [DOC], etc.) 44 | - [ ] Changes are complete (i.e. I finished coding on this PR) 45 | - [ ] All changes have test coverage and all tests pass 46 | - [ ] Code is well-documented (extended the README / documentation, if necessary) 47 | - [ ] If this PR is your first one, add your name and GitHub account to [AUTHORS.md](../blob/master/AUTHORS.md) 48 | 49 | ## Comments ## 50 | - If this change is backward incompatible, explain why it must be made. 
51 | - Interesting edge cases to note here 52 | -------------------------------------------------------------------------------- /.github/workflows/python-publish.yml: -------------------------------------------------------------------------------- 1 | # This workflow will upload a Python Package using Twine when a release is created 2 | # For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries 3 | 4 | name: Upload Python Package 5 | 6 | on: 7 | release: 8 | types: [created] 9 | 10 | jobs: 11 | deploy: 12 | 13 | runs-on: ubuntu-latest 14 | 15 | steps: 16 | - uses: actions/checkout@v2 17 | - name: Set up Python 18 | uses: actions/setup-python@v2 19 | with: 20 | python-version: '3.x' 21 | - name: Install dependencies 22 | run: | 23 | python -m pip install --upgrade pip 24 | pip install setuptools wheel twine 25 | - name: Build and publish 26 | env: 27 | TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }} 28 | TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} 29 | run: | 30 | python setup.py sdist bdist_wheel 31 | twine upload dist/* 32 | -------------------------------------------------------------------------------- /.github/workflows/python-test.yml: -------------------------------------------------------------------------------- 1 | 2 | name: test 3 | 4 | on: [push, pull_request] 5 | 6 | jobs: 7 | build: 8 | 9 | runs-on: ubuntu-latest 10 | strategy: 11 | matrix: 12 | python-version: [3.7, 3.8, 3.9, '3.10'] 13 | 14 | steps: 15 | - uses: actions/checkout@v2 16 | - name: Set up Python ${{ matrix.python-version }} 17 | uses: actions/setup-python@v2 18 | with: 19 | python-version: ${{ matrix.python-version }} 20 | - name: Install dependencies 21 | run: | 22 | pip install -e .[test] 23 | pip install codecov 24 | - name: Test with pytest 25 | run: | 26 | pytest 27 | codecov 28 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib64/ 18 | parts/ 19 | sdist/ 20 | var/ 21 | wheels/ 22 | *.egg-info/ 23 | .installed.cfg 24 | *.egg 25 | MANIFEST 26 | 27 | # PyInstaller 28 | # Usually these files are written by a python script from a template 29 | # before PyInstaller builds the exe, so as to inject date/other info into it. 
30 | *.manifest 31 | *.spec 32 | 33 | # Installer logs 34 | pip-log.txt 35 | pip-delete-this-directory.txt 36 | 37 | # Unit test / coverage reports 38 | htmlcov/ 39 | .tox/ 40 | .coverage 41 | .coverage.* 42 | .cache 43 | nosetests.xml 44 | coverage.xml 45 | *.cover 46 | .hypothesis/ 47 | .pytest_cache/ 48 | 49 | # Translations 50 | *.mo 51 | *.pot 52 | 53 | # Django stuff: 54 | *.log 55 | local_settings.py 56 | db.sqlite3 57 | 58 | # Flask stuff: 59 | instance/ 60 | .webassets-cache 61 | 62 | # Scrapy stuff: 63 | .scrapy 64 | 65 | # Sphinx documentation 66 | docs/_build/ 67 | **/_build/ 68 | **/_build/* 69 | 70 | # PyBuilder 71 | target/ 72 | 73 | # Jupyter Notebook 74 | .ipynb_checkpoints 75 | 76 | # pyenv 77 | .python-version 78 | 79 | # celery beat schedule file 80 | celerybeat-schedule 81 | 82 | # SageMath parsed files 83 | *.sage.py 84 | 85 | # Environments 86 | .env 87 | .venv 88 | env/ 89 | venv/ 90 | ENV/ 91 | env.bak/ 92 | venv.bak/ 93 | 94 | # Spyder project settings 95 | .spyderproject 96 | .spyproject 97 | 98 | # Rope project settings 99 | .ropeproject 100 | 101 | # mkdocs documentation 102 | /site 103 | 104 | # IDE 105 | .idea/ 106 | .vscode/ 107 | .DS_Store 108 | 109 | # Pyre type checker 110 | .pyre/ 111 | 112 | # User Definition 113 | data/ -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | 3 | matrix: 4 | include: 5 | - python: 3.6 6 | - python: 3.7 7 | - python: 3.8 8 | - python: 3.9 9 | dist: xenial 10 | sudo: true 11 | 12 | install: 13 | - pip install -e .[test] 14 | - pip install codecov 15 | 16 | script: 17 | - pytest 18 | - codecov 19 | -------------------------------------------------------------------------------- /AUTHORS.md: -------------------------------------------------------------------------------- 1 | # AUTHORS 2 | 3 | [Shiwei Tong](https://github.com/tswsxk) 4 | 5 | [Wei Huang](https://github.com/RandolphVI) 6 | 7 | [Jiayu Liu](https://github.com/Ljyustc) 8 | 9 | [Fei Wang](https://github.com/LegionKing) 10 | 11 | [Fangzhou Yao](https://github.com/fannazya) 12 | 13 | [Yuting Hong](https://github.com/ViviHong200709) 14 | 15 | -------------------------------------------------------------------------------- /CHANGE.txt: -------------------------------------------------------------------------------- 1 | v1.0.1: 2 | * update version requirements of longling and pytest-flake8 3 | 4 | v1.0.0: 5 | * add KaNCD and ICD 6 | 7 | v0.0.13: 8 | * Bugfix: update dependency version of `longling` 9 | * use PosLinear to replace clipper operation 10 | 11 | v0.0.12: 12 | * limit the range of parameters in IRT and MIRT 13 | 14 | v0.0.11: 15 | * fix error in `irf` 16 | 17 | v0.0.10: 18 | * add STE operator in DINA 19 | * add Multidimensional Item Response Theory (MIRT) 20 | * add IRR-DINA, IRR-MIRT, IRR-NCDM 21 | 22 | v0.0.9: 23 | * add Item Response Ranking for Cognitive Diagnosis (IRR) 24 | * IRT 25 | 26 | v0.0.8: 27 | * add DINA model with Gradient Descent Optimization (GDDINA) and rename the previous DINA to EMDINA 28 | 29 | v0.0.7: 30 | * fix potential ModuleNotFoundError 31 | 32 | v0.0.6: 33 | * add Item Response Theory with Expectation Maximization Optimization (EMIRT) 34 | 35 | v0.0.5: 36 | * add Item Response Theory with Gradient Descent Optimization (GDIRT) 37 | 38 | v0.0.4: 39 | * add NeuralCDM (NCDM) 40 | 41 | v0.0.3: 42 | * add DINA 43 | * add FuzzyCDF 44 | 45 | v0.0.2: 46 | * add MCD 47 | 48 | v0.0.1: 49 | * Add 
meta class 50 | -------------------------------------------------------------------------------- /CONTRIBUTE.md: -------------------------------------------------------------------------------- 1 | # CONTRIBUTE 2 | 3 | [中文版本](CONTRIBUTE_CH.md) 4 | 5 | ## Guidance 6 | Thank you for your interest in contributing to EduCDM! 7 | Before you begin writing code, it is important that you share your intention to contribute with the team, 8 | based on the type of contribution: 9 | 10 | 1. You want to propose a new feature and implement it. 11 | * Post about your intended feature in an issue, 12 | and we shall discuss the design and implementation. 13 | Once we agree that the plan looks good, go ahead and implement it. 14 | 2. You want to implement a feature or bug-fix for an outstanding issue. 15 | * Search for your issue in the [EduCDM issue list](https://github.com/bigdata-ustc/EduCDM/issues). 16 | * Pick an issue and comment that you'd like to work on the feature or bug-fix. 17 | * If you need more context on a particular issue, please ask and we shall provide it. 18 | 19 | Once you have implemented and tested your feature or bug-fix, 20 | please submit a Pull Request to [EduCDM](https://github.com/bigdata-ustc/EduCDM): 21 | 22 | 1. Fork this repository to your own account. 23 | 2. Modify the code. Note that we strongly recommend that you comply with our [commit format specifications](CONTRIBUTE.md#About-Commit). 24 | 3. Pass the code tests and make the test coverage reach 100%. [An example](tests/mcd). 25 | 4. Submit a Pull Request to [EduCDM](https://github.com/bigdata-ustc/EduCDM). Note that we provide a standard Pull Request template [here](https://github.com/bigdata-ustc/EduCDM/pull/7). Please fill in the information carefully. 26 | 27 | The following are some helpful guidelines for different types of contributions: 28 | 29 | ### Add new dataset 30 | 31 | If you want to add data analysis or a new dataset, please submit a Pull Request to [EduData](https://github.com/bigdata-ustc/EduData). 32 | 33 | ### Add new CDM model 34 | 35 | A newly implemented CDM model requires: 36 | 37 | 1. Dataset processing. 38 | 2. Inheriting from `class CDM` in `EduCDM/meta.py` and implementing its four methods. 39 | 3. Writing the corresponding example code for the model (this refers to a demo that can be tested by others). It should include at least a [notebook](examples/MCD/MCD.ipynb) and a [script](examples/MCD/MCD.py). [An example](examples/MCD). 40 | 4. Writing the corresponding test code for the model and making sure that the test coverage is 100%. [An example](tests/mcd). 41 | 42 | #### Dataset Processing 43 | 44 | As for the dataset preprocessing, we suggest one of the following: 45 | 46 | 1. Write a script that: 47 | - processes and converts the raw dataset; 48 | - partitions it into training/validation/test datasets. 49 | 2. Provide or use a [CDBD](https://github.com/bigdata-ustc/EduData) dataset (which is already divided into training/validation/test datasets). 50 | 51 | 52 | #### Module 53 | 54 | All models inherit from `class CDM`, which raises `NotImplementedError` if its methods are not implemented. 55 | 56 | Note that we do not constrain your neural network or algorithms (for example, the network construction, optimizer, loss function definitions, etc.). 57 | 
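For orientation, here is a minimal sketch of such a subclass implementing the four methods described below; `MyCDM` and its internals are hypothetical placeholders, not a file in this repository:

```python3
# A hypothetical, minimal CDM subclass -- illustrative only.
import pickle
from EduCDM import CDM


class MyCDM(CDM):
    def __init__(self):
        self.params = {}  # trainable parameters, e.g. student/item traits

    def train(self, train_data, epoch=10) -> ...:
        for _ in range(epoch):
            for user_id, item_id, score in train_data:
                pass  # update self.params from each response log

    def eval(self, test_data) -> tuple:
        return 0.0, 0.0  # e.g. (rmse, mae) or (auc, accuracy)

    def save(self, filepath):
        with open(filepath, 'wb') as file:
            pickle.dump(self.params, file)

    def load(self, filepath):
        with open(filepath, 'rb') as file:
            self.params = pickle.load(file)
```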
58 | - **Train** module 59 | 60 | This module is a training module, which is used to train the model. 61 | 62 | ```python3 63 | def train(self, *args, **kwargs) -> ...: 64 | raise NotImplementedError 65 | ``` 66 | 67 | - **Eval** module 68 | 69 | This module is an evaluation module, which is used to validate and test the model. 70 | 71 | ```python3 72 | def eval(self, *args, **kwargs) -> ...: 73 | raise NotImplementedError 74 | ``` 75 | 76 | - **Save** module 77 | 78 | This module is a model-saving module, which is used to save the trained model. 79 | 80 | ```python3 81 | def save(self, *args, **kwargs) -> ...: 82 | raise NotImplementedError 83 | ``` 84 | 85 | - **Load** module 86 | 87 | This module is a model-loading module, which is used to load the saved model. 88 | 89 | ```python3 90 | def load(self, *args, **kwargs) -> ...: 91 | raise NotImplementedError 92 | ``` 93 | 94 | #### Demo 95 | 96 | Make sure you provide a demo for your model. [An example](examples/MCD). 97 | 98 | #### Docs Format 99 | 100 | The NumPy docstring format is used: 101 | 102 | ``` 103 | function 104 | 105 | Parameters 106 | ---------- 107 | Variable 1: type, optional or not 108 | description 109 | Variable 2: type, optional or not 110 | description 111 | ... 112 | 113 | Returns 114 | ------- 115 | Variable: type 116 | description 117 | 118 | See Also (Optional) 119 | -------- 120 | Similar to function(): 121 | 122 | Examples (Optional) 123 | -------- 124 | >>> For example: 125 | ... 126 | ``` 127 | 128 | ### About Commit 129 | 130 | #### commit format 131 | 132 | ``` 133 | [<type>](<scope>) <subject> 134 | ``` 135 | 136 | #### type 137 | - `feat`: New feature. 138 | - `fix/to`: Fix bugs, either found in Q&A or found in your own use. 139 | - `fix`: Generates a diff and fixes the problem automatically. **Suitable when a single commit fixes the problem directly**. 140 | - `to`: Generates only a **diff** but does not automatically fix the problem. **Suitable for multiple submissions**. Use `fix` when the final fix is committed. 141 | - `docs`: Documentation. 142 | - `style`: Formatting (does not affect code execution). 143 | - `refactor`: Refactoring (neither a new feature nor a bug fix). 144 | - `perf`: Optimization, such as improving code performance or user experience. 145 | - `test`: Add test units. 146 | - `chore`: Changes to the build process or auxiliary tools. 147 | - `revert`: Roll back to the previous version. 148 | - `merge`: Code merge. 149 | - `sync`: Synchronize bug fixes from main or a branch. 150 | - `arch`: Changes to engineering files or tools. 151 | 152 | ##### scope (optional) 153 | 154 | Scope is used to describe the impact of the commit, such as **the data layer**, **the control layer**, **the view layer**, and so on, depending on the project. 155 | 156 | For example, in Angular, it can be location, browser, compile, rootScope, ngHref, ngClick, ngView, and so on. If your changes affect more than one scope, you can use `*` instead. 157 | 158 | ##### subject (mandatory) 159 | 160 | A subject is a short description of the purpose of the commit, no more than 50 characters. 161 | 162 | Do not end it with a period or other punctuation. 163 | 164 | #### Example 165 | 166 | - **[docs] update the README.md** 167 | 168 | ```sh 169 | git commit -m "[docs] update the README.md" 170 | ``` 171 | 172 | ## FAQ 173 | 174 | Q: I have carefully tested the code on my local system (all tests passed), but it still fails in the online CI? 175 | 176 | A: There are two possible reasons: 177 | 1. the online CI system is different from your local system; 178 | 2. there is a network error causing the download test to fail, which you can find in the CI log. 
179 | 180 | For the second reason, all you need to do is to retry the test. 181 | 182 | -------------------------------------------------------------------------------- /CONTRIBUTE_CH.md: -------------------------------------------------------------------------------- 1 | # 贡献规范 2 | 3 | [English version](CONTRIBUTE.md) 4 | 5 | ## 导引 6 | 7 | 首先感谢您关注 EduCDM 并致力于让其变得更好! 8 | 在您开始贡献自己的一份力之前,需要注意以下几点: 9 | 1. 如果您希望我们实现新的功能。 10 | - 可以在通过 issue 来告诉我们您想要的功能,我们将及时展开讨论设计和实现。 11 | - 一旦我们一致地认为这个计划不错,那么您可以期待新的功能很快就可以与您见面。 12 | 2. 如果您想要对于某个未解决问题的 issue 提供解决性意见或 bug 修复。 13 | - 可以先在 [EduCDM issue list](https://github.com/bigdata-ustc/CDM/issues) 中搜索您的问题。 14 | - 之后,选择一个具体问题和评论,来提供您的解决性意见或者 bug 修复。 15 | - 如果对于具体的 issue,您需要更多的细节,请向我们咨询。 16 | 17 | 一旦您实现并已经测试过了你的想法或者是对于 bug 的修复,请通过 Pull Request 提及到到 [EduCDM](https://github.com/bigdata-ustc/CDM) : 18 | 1. 首先fork此仓库到你的分支下 19 | 2. 对代码进行修改。注意:我们强烈建议你遵守我们的 [commit格式规范](CONTRIBUTE_CH.md#关于Commit的格式) 20 | 3. 通过代码测试,测试覆盖度达到100%,例子可见[此处](tests/mcd) 21 | 4. 通过Pull Request 提及到到 [EduCDM](https://github.com/bigdata-ustc/CDM) 。注意:我们提供了一个标准的PR请求模板,你需要认真完成其中的信息,一个标准且规范的PR可参考[此处](https://github.com/bigdata-ustc/EduCDM/pull/7) 22 | 23 | 以下是对于不同贡献内容的有用建议: 24 | 25 | ### 添加新的数据集或者数据分析 26 | 27 | 有关新数据集或数据分析,请移步至 [EduData](https://github.com/bigdata-ustc/EduData) 。 28 | 29 | ### 添加新的 CDM 模型 30 | 31 | 新实现的 CDM 模型需要: 32 | 1. 数据集的预处理。 33 | 2. 继承 `EduCDM/meta.py` 中的的 `class CDM`,并实现中间的四个方法。 34 | 3. 编写模型对应的 example 代码(这里指的是可供其他人运行测试使用的 demo),例子可见[此处](examples/MCD):至少应当包括:[notebook](examples/MCD/MCD.ipynb) 和 [script](examples/MCD/MCD.py) 35 | 4. 编写模型对应的测试代码,保证测试覆盖度为100%,例子可见[此处](tests/mcd) 36 | 37 | #### 数据预处理 38 | 39 | 关于数据集的预处理,我们提供如下两种建议: 40 | 41 | 1. 编写一个 script,完成: 42 | - 对原始数据集中进行处理,转换。 43 | - 训练/验证/测试集划分。 44 | 2. 提交或使用 [CDBD](https://github.com/bigdata-ustc/EduData) 数据集(已划分好训练/验证/测试集)。 45 | 46 | #### 模块编写 47 | 48 | 编写的新 CDM 模型,其中几个重要模块需要继承 `EduCDM/meta.py` 中的 `class CDM`。 49 | 需要注意的是,我们并不对您的神经网络、算法(例如,网络构造、优化器、损失函数定义等)进行约束。 50 | 51 | - 训练模块 52 | 53 | 该模块为训练模块,用于对模型、算法进行训练。 54 | 55 | ```python3 56 | def train(self, *args, **kwargs) -> ...: 57 | raise NotImplementedError 58 | ``` 59 | 60 | - 测试模块 61 | 62 | 该模块为测试模块,用于对模型、算法进行验证、测试。 63 | 64 | ```python3 65 | def eval(self, *args, **kwargs) -> ...: 66 | raise NotImplementedError 67 | ``` 68 | 69 | - 模型存储模块 70 | 71 | 该模块为存储模块,用于保存训练好了的模型、算法。 72 | 73 | ```python3 74 | def save(self, *args, **kwargs) -> ...: 75 | raise NotImplementedError 76 | ``` 77 | 78 | - 模型读取模块 79 | 80 | 该模块为模型读取模块,用于读取保存好了的模型、算法。 81 | 82 | ```python3 83 | def load(self, *args, **kwargs) -> ...: 84 | raise NotImplementedError 85 | ``` 86 | 87 | #### 编写 Demo 88 | 89 | 编写模型对应的 Example 代码,例子可见[]() : 90 | 91 | #### 代码注释风格 92 | 93 | 请使用 Numpy 代码注释风格: 94 | 95 | ``` 96 | function 的功能 97 | 98 | Parameters 99 | ---------- 100 | 变量名 1: 类型, 是否 optional 101 | 描述 102 | 变量名 2: 类型, 是否 optional 103 | 描述 104 | ... 
105 | 106 | Returns 107 | ------- 108 | 变量名: 类型 109 | 描述 110 | 111 | See Also (可选) 112 | -------- 113 | 类似 function: 类似 function 的功能 114 | 115 | Examples (可选) 116 | -------- 117 | >>> 举例怎么用 118 | ``` 119 | 120 | ### 关于Commit的格式 121 | 122 | #### commit format 123 | 124 | ``` 125 | []() 126 | ``` 127 | 128 | #### type 129 | - `feat`:新功能(feature)。 130 | - `fix/to`:修复 bug,可以是 Q&A 发现的 bug,也可以是自己在使用时发现的 bug。 131 | - `fix`:产生 diff 并自动修复此问题。**适合于一次提交直接修复问题**。 132 | - `to`:只产生 diff 不自动修复此问题。**适合于多次提交**。最终修复问题提交时使用 `fix`。 133 | - `docs`:文档(documentation)。 134 | - `style`:格式(不影响代码运行的变动)。 135 | - `refactor`:重构(即非新增功能,也不是修改 bug 的代码变动)。 136 | - `perf`:优化相关,比如提升性能、体验。 137 | - `test`:增加测试。 138 | - `chore`:构建过程或辅助工具的变动。 139 | - `revert`:回滚到上一个版本。 140 | - `merge`:代码合并。 141 | - `sync`:同步主线或分支的 bug。 142 | - `arch`: 工程文件或工具的改动。 143 | 144 | #### scope (可选) 145 | 146 | scope 是用于说明 commit 影响的范围,比如数据层控制层视图层等等,视项目不同而不同。 147 | 148 | 例如在 Angular,可以是 location,browser,compile,compile,rootScope, ngHref,ngClick,ngView等。如果你的修改影响了不止一个scope,你可以使用`*`代替。 149 | 150 | #### subject (必须) 151 | 152 | subject 是 commit 目的的简短描述,不超过50个字符。 153 | 154 | 结尾不加句号或其他标点符号。 155 | 156 | #### Example 157 | 158 | - **[docs] update the README.md** 159 | 160 | ```sh 161 | git commit -m "[docs] update the README.md" 162 | ``` 163 | 164 | ## FAQ 165 | 166 | 问题: 我已经在本地仔细地测试了代码,并通过了代码检查,但是在 CI 步骤时却报错? 167 | 回答: 这个问题可能是两个原因造成: 168 | 1. 在线的 CI 系统与您自己本地系统有差别; 169 | 2. 可能是网络原因造成的,如果是可以通过 CI 的日志文件查看。 170 | -------------------------------------------------------------------------------- /EduCDM/DINA/EM/DINA.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # 2021/3/28 @ liujiayu 3 | 4 | import logging 5 | import numpy as np 6 | from tqdm import tqdm 7 | import pickle 8 | from EduCDM import CDM 9 | 10 | 11 | def initial_all_knowledge_state(know_num): 12 | state_num = 2 ** know_num 13 | all_states = np.zeros((state_num, know_num)) 14 | for i in range(state_num): 15 | k, quotient, residue = 1, i // 2, i % 2 16 | while True: 17 | all_states[i, know_num - k] = residue 18 | if quotient <= 0: 19 | break 20 | quotient, residue = quotient // 2, quotient % 2 21 | k += 1 22 | return all_states 23 | 24 | 25 | def init_parameters(stu_num, prob_num): 26 | slip = np.zeros(shape=prob_num) + 0.2 27 | guess = np.zeros(shape=prob_num) + 0.2 28 | theta = np.zeros(shape=stu_num) # index of state 29 | return theta, slip, guess 30 | 31 | 32 | class DINA(CDM): 33 | """ 34 | DINA model, training (EM) and testing methods 35 | :param R (array): response matrix, shape = (stu_num, prob_num) 36 | :param q_m (array): Q matrix, shape = (prob_num, know_num) 37 | :param stu_num (int): number of students 38 | :param prob_num (int): number of problems 39 | :param know_num (int): number of knowledge 40 | :param skip_value (int): skip value in response matrix 41 | """ 42 | 43 | def __init__(self, R, q_m, stu_num, prob_num, know_num, skip_value=-1): 44 | self.R, self.q_m, self.state_num, self.skip_value = R, q_m, 2 ** know_num, skip_value 45 | self.stu_num, self.prob_num, self.know_num = stu_num, prob_num, know_num 46 | self.theta, self.slip, self.guess = init_parameters(stu_num, prob_num) 47 | self.all_states = initial_all_knowledge_state(know_num) # shape = (state_num, know_num) 48 | state_prob = np.transpose(np.sum(q_m, axis=1, keepdims=True) - np.dot(q_m, np.transpose(self.all_states))) 49 | self.eta = 1 - (state_prob > 0) # state covers knowledge of problem (1: yes), shape = (state_num, prob_num) 50 | 51 | def train(self, epoch, 
epsilon) -> ...: 52 | like = np.zeros(shape=(self.stu_num, self.state_num)) # likelihood 53 | post = np.zeros(shape=(self.stu_num, self.state_num)) # posterior 54 | theta, slip, guess, tmp_R = np.copy(self.theta), np.copy(self.slip), np.copy(self.guess), np.copy(self.R) 55 | tmp_R[np.where(self.R == self.skip_value)[0], np.where(self.R == self.skip_value)[1]] = 0 56 | for iteration in range(epoch): 57 | post_tmp, slip_tmp, guess_tmp = np.copy(post), np.copy(slip), np.copy(guess) 58 | answer_right = (1 - slip) * self.eta + guess * (1 - self.eta) 59 | for s in range(self.state_num): 60 | log_like = np.log(answer_right[s, :] + 1e-9) * self.R + np.log(1 - answer_right[s, :] + 1e-9) * ( 61 | 1 - self.R) 62 | log_like[np.where(self.R == self.skip_value)[0], np.where(self.R == self.skip_value)[1]] = 0 63 | like[:, s] = np.exp(np.sum(log_like, axis=1)) 64 | post = like / np.sum(like, axis=1, keepdims=True) # E-step: posterior over knowledge states 65 | i_l = np.expand_dims(np.sum(post, axis=0), axis=1) # shape = (state_num, 1) 66 | r_jl = np.dot(np.transpose(post), tmp_R) # shape = (state_num, prob_num) 67 | r_jl_0, r_jl_1 = np.sum(r_jl * (1 - self.eta), axis=0), np.sum(r_jl * self.eta, axis=0) 68 | i_jl_0, i_jl_1 = np.sum(i_l * (1 - self.eta), axis=0), np.sum(i_l * self.eta, axis=0) 69 | guess, slip = r_jl_0 / i_jl_0, (i_jl_1 - r_jl_1) / i_jl_1 # M-step: update guess and slip 70 | 71 | change = max(np.max(np.abs(post - post_tmp)), np.max(np.abs(slip - slip_tmp)), 72 | np.max(np.abs(guess - guess_tmp))) 73 | theta = np.argmax(post, axis=1) 74 | if iteration > 20 and change < epsilon: 75 | break 76 | self.theta, self.slip, self.guess = theta, slip, guess 77 | 78 | def eval(self, test_data) -> tuple: 79 | pred_score = (1 - self.slip) * self.eta + self.guess * (1 - self.eta) 80 | test_rmse, test_mae = [], [] 81 | for i in tqdm(test_data, "evaluating"): 82 | stu, test_id, true_score = i['user_id'], i['item_id'], i['score'] 83 | test_rmse.append((pred_score[self.theta[stu], test_id] - true_score) ** 2) 84 | test_mae.append(abs(pred_score[self.theta[stu], test_id] - true_score)) 85 | return np.sqrt(np.average(test_rmse)), np.average(test_mae) 86 | 87 | def save(self, filepath): 88 | with open(filepath, 'wb') as file: 89 | pickle.dump({"theta": self.theta, "slip": self.slip, "guess": self.guess}, file) 90 | logging.info("save parameters to %s" % filepath) 91 | 92 | def load(self, filepath): 93 | with open(filepath, 'rb') as file: 94 | self.theta, self.slip, self.guess = pickle.load(file).values() 95 | logging.info("load parameters from %s" % filepath) 96 | 97 | def inc_train(self, inc_train_data, epoch, epsilon): # incremental training 98 | for i in inc_train_data: 99 | stu, test_id, true_score = i['user_id'], i['item_id'], i['score'] 100 | self.R[stu, test_id] = true_score 101 | self.train(epoch, epsilon) 102 | 103 | def transform(self, records): # MLE for evaluating student's state 104 | # max_like_id: diagnose which state among all_states the student belongs to 105 | # dia_state: binary vector of length know_num; 0/1 indicates whether each knowledge concept is mastered 106 | answer_right = (1 - self.slip) * self.eta + self.guess * (1 - self.eta) 107 | log_like = records * np.log(answer_right + 1e-9) + (1 - records) * np.log(1 - answer_right + 1e-9) 108 | log_like[:, np.where(records == self.skip_value)[0]] = 0 109 | max_like_id = np.argmax(np.exp(np.sum(log_like, axis=1))) 110 | dia_state = self.all_states[max_like_id] 111 | return max_like_id, dia_state 112 | 
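A brief usage sketch of the EM DINA model above, with toy, hypothetical data (the repository's runnable demos live in examples/DINA/EM/):

```python3
import numpy as np
from EduCDM.DINA import EMDINA

# toy data: 10 students, 5 problems, 3 knowledge concepts (values are made up)
R = np.random.randint(0, 2, size=(10, 5))  # response matrix
q_m = np.array([[1, 0, 0], [0, 1, 0], [0, 0, 1], [1, 1, 0], [0, 1, 1]])  # Q matrix
cdm = EMDINA(R, q_m, stu_num=10, prob_num=5, know_num=3)
cdm.train(epoch=50, epsilon=1e-3)
rmse, mae = cdm.eval([{'user_id': 0, 'item_id': 1, 'score': R[0, 1]}])
state_id, state = cdm.transform(R[0])  # diagnose student 0's knowledge state
```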
-------------------------------------------------------------------------------- /EduCDM/DINA/EM/__init__.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # 2021/6/21 @ tongshiwei 3 | 4 | from .DINA import DINA 5 | -------------------------------------------------------------------------------- /EduCDM/DINA/GD/DINA.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # 2021/6/21 @ tongshiwei 3 | 4 | import logging 5 | import numpy as np 6 | import torch 7 | from EduCDM import CDM 8 | from torch import nn 9 | from tqdm import tqdm 10 | from sklearn.metrics import roc_auc_score, accuracy_score 11 | import torch.autograd as autograd 12 | import torch.nn.functional as F 13 | 14 | 15 | class DINANet(nn.Module): 16 | def __init__(self, user_num, item_num, hidden_dim, max_slip=0.4, max_guess=0.4, *args, **kwargs): 17 | super(DINANet, self).__init__() 18 | self._user_num = user_num 19 | self._item_num = item_num 20 | self.step = 0 21 | self.max_step = 1000 22 | self.max_slip = max_slip 23 | self.max_guess = max_guess 24 | 25 | self.guess = nn.Embedding(self._item_num, 1) 26 | self.slip = nn.Embedding(self._item_num, 1) 27 | self.theta = nn.Embedding(self._user_num, hidden_dim) 28 | 29 | def forward(self, user, item, knowledge, *args): 30 | theta = self.theta(user) 31 | slip = torch.squeeze(torch.sigmoid(self.slip(item)) * self.max_slip) 32 | guess = torch.squeeze(torch.sigmoid(self.guess(item)) * self.max_guess) 33 | if self.training: 34 | n = torch.sum(knowledge * (torch.sigmoid(theta) - 0.5), dim=1) 35 | t, self.step = max((np.sin(2 * np.pi * self.step / self.max_step) + 1) / 2 * 100, 36 | 1e-6), self.step + 1 if self.step < self.max_step else 0 # cyclic temperature annealing for the soft mastery indicator 37 | return torch.sum( 38 | torch.stack([1 - slip, guess]).T * torch.softmax(torch.stack([n, torch.zeros_like(n)]).T / t, dim=-1), 39 | dim=1 40 | ) 41 | else: 42 | n = torch.prod(knowledge * (theta >= 0) + (1 - knowledge), dim=1) # hard AND over the required knowledge concepts 43 | return (1 - slip) ** n * guess ** (1 - n) 44 | 45 | 46 | class STEFunction(autograd.Function): 47 | @staticmethod 48 | def forward(ctx, input): 49 | return (input > 0).float() 50 | 51 | @staticmethod 52 | def backward(ctx, grad_output): 53 | return F.hardtanh(grad_output) # straight-through gradient, clamped to [-1, 1] 54 | 55 | 56 | class StraightThroughEstimator(nn.Module): 57 | def __init__(self): 58 | super(StraightThroughEstimator, self).__init__() 59 | 60 | def forward(self, x): 61 | x = STEFunction.apply(x) 62 | return x 63 | 64 | 65 | class STEDINANet(DINANet): 66 | def __init__(self, user_num, item_num, hidden_dim, max_slip=0.4, max_guess=0.4, *args, **kwargs): 67 | super(STEDINANet, self).__init__(user_num, item_num, hidden_dim, max_slip, max_guess, *args, **kwargs) 68 | self.sign = StraightThroughEstimator() 69 | 70 | def forward(self, user, item, knowledge, *args): 71 | theta = self.sign(self.theta(user)) 72 | slip = torch.squeeze(torch.sigmoid(self.slip(item)) * self.max_slip) 73 | guess = torch.squeeze(torch.sigmoid(self.guess(item)) * self.max_guess) 74 | mask_theta = (knowledge == 0) + (knowledge == 1) * theta 75 | n = torch.prod((mask_theta + 1) / 2, dim=-1) 76 | return torch.pow(1 - slip, n) * torch.pow(guess, 1 - n) 77 | 78 | 79 | class DINA(CDM): 80 | def __init__(self, user_num, item_num, hidden_dim, ste=False): 81 | super(DINA, self).__init__() 82 | if ste: 83 | self.dina_net = STEDINANet(user_num, item_num, hidden_dim) 84 | else: 85 | self.dina_net = DINANet(user_num, item_num, hidden_dim) 86 | 87 | def train(self, train_data, test_data=None, *, epoch: int, device="cpu", lr=0.001) -> ...: 88 | self.dina_net = self.dina_net.to(device) 89 | 
loss_function = nn.BCELoss() 90 | 91 | trainer = torch.optim.Adam(self.dina_net.parameters(), lr) 92 | 93 | for e in range(epoch): 94 | losses = [] 95 | for batch_data in tqdm(train_data, "Epoch %s" % e): 96 | user_id, item_id, knowledge, response = batch_data 97 | user_id: torch.Tensor = user_id.to(device) 98 | item_id: torch.Tensor = item_id.to(device) 99 | knowledge: torch.Tensor = knowledge.to(device) 100 | predicted_response: torch.Tensor = self.dina_net(user_id, item_id, knowledge) 101 | response: torch.Tensor = response.to(device) 102 | loss = loss_function(predicted_response, response) 103 | 104 | # back propagation 105 | trainer.zero_grad() 106 | loss.backward() 107 | trainer.step() 108 | 109 | losses.append(loss.mean().item()) 110 | print("[Epoch %d] LogisticLoss: %.6f" % (e, float(np.mean(losses)))) 111 | 112 | if test_data is not None: 113 | auc, accuracy = self.eval(test_data, device=device) 114 | print("[Epoch %d] auc: %.6f, accuracy: %.6f" % (e, auc, accuracy)) 115 | 116 | def eval(self, test_data, device="cpu") -> tuple: 117 | self.dina_net = self.dina_net.to(device) 118 | self.dina_net.eval() 119 | y_pred = [] 120 | y_true = [] 121 | for batch_data in tqdm(test_data, "evaluating"): 122 | user_id, item_id, knowledge, response = batch_data 123 | user_id: torch.Tensor = user_id.to(device) 124 | item_id: torch.Tensor = item_id.to(device) 125 | knowledge: torch.Tensor = knowledge.to(device) 126 | pred: torch.Tensor = self.dina_net(user_id, item_id, knowledge) 127 | y_pred.extend(pred.tolist()) 128 | y_true.extend(response.tolist()) 129 | 130 | self.dina_net.train() 131 | return roc_auc_score(y_true, y_pred), accuracy_score(y_true, np.array(y_pred) >= 0.5) 132 | 133 | def save(self, filepath): 134 | torch.save(self.dina_net.state_dict(), filepath) 135 | logging.info("save parameters to %s" % filepath) 136 | 137 | def load(self, filepath): 138 | self.dina_net.load_state_dict(torch.load(filepath)) 139 | logging.info("load parameters from %s" % filepath) 140 | -------------------------------------------------------------------------------- /EduCDM/DINA/GD/__init__.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # 2021/6/21 @ tongshiwei 3 | 4 | from .DINA import DINA 5 | -------------------------------------------------------------------------------- /EduCDM/DINA/__init__.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # 2021/3/28 @ liujiayu 3 | 4 | 5 | from .GD import DINA as GDDINA 6 | from .EM import DINA as EMDINA 7 | -------------------------------------------------------------------------------- /EduCDM/FuzzyCDF/FuzzyCDF.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # 2021/3/28 @ liujiayu 3 | 4 | import logging 5 | import numpy as np 6 | import pickle 7 | from scipy import stats 8 | from tqdm import tqdm 9 | from collections import namedtuple 10 | from EduCDM import CDM 11 | from .modules import get_LogLikelihood, cal_alpha_mastery, update_A_B, update_theta, update_slip_guess, update_variance 12 | 13 | hyper_para = namedtuple("hyperparameters", 14 | ["sig_a", "mu_a", "sig_b", "mu_b", "max_s", "min_s", "max_g", "min_g", "mu_theta", "sig_theta"]) 15 | default_hyper = hyper_para(1, 0, 1, 0, 0.6, 0, 0.6, 0, 0, 1) 16 | 17 | 18 | def init_parameters(stu_num, prob_num, know_num, args): # initialize FuzzyCDF parameters 19 | a = stats.lognorm.rvs(s=args.sig_a, loc=0, scale=np.exp(args.mu_a), 
size=(stu_num, know_num)) 20 | b = stats.norm.rvs(loc=args.mu_b, scale=args.sig_b, size=(stu_num, know_num)) 21 | slip = stats.beta.rvs(a=1, b=2, size=prob_num) * (args.max_s - args.min_s) + args.min_s 22 | guess = stats.beta.rvs(a=1, b=2, size=prob_num) * (args.max_g - args.min_g) + args.min_g 23 | theta = stats.norm.rvs(loc=args.mu_theta, scale=args.sig_theta, size=stu_num) 24 | variance = 1 / stats.gamma.rvs(a=4, scale=1 / 6, size=1) 25 | return a, b, slip, guess, theta, variance 26 | 27 | 28 | class FuzzyCDF(CDM): 29 | """ 30 | FuzzyCDF model, training (MCMC) and testing methods 31 | :param R (array): response matrix, shape = (stu_num, prob_num) 32 | :param q_m (array): Q matrix, shape = (prob_num, know_num) 33 | :param stu_num (int): number of students 34 | :param prob_num (int): number of problems 35 | :param know_num (int): number of knowledge 36 | :param obj_prob_index (array): index of all objective problems, shape = (number, ) 37 | :param sub_prob_index (array): index of all subjective problems, shape = (number, ) 38 | :param skip_value (int): skip value in response matrix 39 | :param args: all hyper-parameters 40 | """ 41 | 42 | def __init__(self, R, q_m, stu_num, prob_num, know_num, obj_prob_index, sub_prob_index, skip_value=-1, 43 | args=default_hyper): 44 | self.args = args 45 | self.R, self.q_m, self.stu_num, self.prob_num, self.know_num = R, q_m, stu_num, prob_num, know_num 46 | self.a, self.b, self.slip, self.guess, self.theta, self.variance = init_parameters(stu_num, prob_num, know_num, 47 | self.args) 48 | self.obj_prob_index, self.sub_prob_index, self.skip_value = obj_prob_index, sub_prob_index, skip_value 49 | 50 | def train(self, epoch, burnin) -> ...: 51 | A, B, slip, guess = np.copy(self.a), np.copy(self.b), np.copy(self.slip), np.copy(self.guess) 52 | theta, variance = np.copy(self.theta), np.copy(self.variance) 53 | estimate_A, estimate_B, estimate_slip, estimate_guess, estimate_theta, estimate_variance = 0, 0, 0, 0, 0, 0 54 | for iteration in range(epoch): 55 | update_A_B(A, B, theta, slip, guess, variance, self.R, self.q_m, self.obj_prob_index, self.sub_prob_index, 56 | self.skip_value, self.args) 57 | update_theta(A, B, theta, slip, guess, variance, self.R, self.q_m, self.obj_prob_index, self.sub_prob_index, 58 | self.skip_value, self.args) 59 | update_slip_guess(A, B, theta, slip, guess, variance, self.R, self.q_m, self.obj_prob_index, 60 | self.sub_prob_index, 61 | self.skip_value, self.args) 62 | variance = update_variance(A, B, theta, slip, guess, variance, self.R, self.q_m, self.obj_prob_index, 63 | self.sub_prob_index, 64 | self.skip_value) 65 | if iteration >= burnin: 66 | estimate_A += A 67 | estimate_B += B 68 | estimate_slip += slip 69 | estimate_guess += guess 70 | estimate_theta += theta 71 | estimate_variance += variance 72 | self.a, self.b, self.slip, self.guess, self.theta, self.variance = estimate_A / (epoch - burnin), estimate_B / ( 73 | epoch - burnin), estimate_slip / (epoch - burnin), estimate_guess / (epoch - burnin), estimate_theta \ 74 | / (epoch - burnin), estimate_variance / (epoch - burnin) 75 | 76 | def eval(self, test_data) -> tuple: 77 | _, pred_mastery = cal_alpha_mastery(self.a, self.b, self.theta, self.q_m, self.obj_prob_index, 78 | self.sub_prob_index) 79 | pred_score = (1 - self.slip) * pred_mastery + self.guess * (1 - pred_mastery) 80 | test_rmse, test_mae = [], [] 81 | for i in tqdm(test_data, "evaluating"): 82 | stu, test_id, true_score = i['user_id'], i['item_id'], i['score'] 83 | test_rmse.append((pred_score[stu, test_id] 
- true_score) ** 2) 84 | test_mae.append(abs(pred_score[stu, test_id] - true_score)) 85 | return np.sqrt(np.average(test_rmse)), np.average(test_mae) 86 | 87 | def save(self, filepath): 88 | with open(filepath, 'wb') as file: 89 | pickle.dump({"a": self.a, "b": self.b, "theta": self.theta, "slip": self.slip, "guess": self.guess}, file) 90 | logging.info("save parameters to %s" % filepath) 91 | 92 | def load(self, filepath): 93 | with open(filepath, 'rb') as file: 94 | self.a, self.b, self.theta, self.slip, self.guess = pickle.load(file).values() 95 | logging.info("load parameters from %s" % filepath) 96 | 97 | def inc_train(self, inc_train_data, epoch, burnin): # incremental training 98 | for i in inc_train_data: 99 | stu, test_id, true_score = i['user_id'], i['item_id'], i['score'] 100 | self.R[stu, test_id] = true_score 101 | self.train(epoch, burnin) 102 | -------------------------------------------------------------------------------- /EduCDM/FuzzyCDF/__init__.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # 2021/3/28 @ liujiayu 3 | 4 | 5 | from .FuzzyCDF import FuzzyCDF 6 | -------------------------------------------------------------------------------- /EduCDM/FuzzyCDF/modules.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # 2021/3/28 @ liujiayu 3 | # Modules in FuzzyCDF 4 | 5 | import numpy as np 6 | from scipy import stats 7 | 8 | 9 | def cal_alpha_mastery(A, B, theta, q_m, obj_prob_index, sub_prob_index): # calculate proficiency on knows and probs 10 | stu_num, prob_num = len(theta), q_m.shape[0] 11 | alpha = 1 / (1 + np.exp(-1.7 * A * (theta.reshape([-1, 1]) - B))) 12 | mastery = np.zeros((stu_num, prob_num)) 13 | for i in range(stu_num): 14 | stu_i = alpha[i] * q_m # shape = (prob_num, know_num) 15 | if len(obj_prob_index) > 0: 16 | mastery[i][obj_prob_index] = np.min((stu_i + 2 * (1 - q_m))[obj_prob_index], axis=1) 17 | if len(sub_prob_index) > 0: 18 | mastery[i][sub_prob_index] = np.max(stu_i[sub_prob_index], axis=1) 19 | return alpha, mastery 20 | 21 | 22 | def get_LogLikelihood(A, B, theta, R, q_m, slip, guess, variance, obj_prob_index, sub_prob_index, skip_value=-1): 23 | # calculate log-likelihood for each response log 24 | _, mastery = cal_alpha_mastery(A, B, theta, q_m, obj_prob_index, sub_prob_index) 25 | stu_num, prob_num = R.shape[0], R.shape[1] 26 | x = (1 - slip) * mastery + guess * (1 - mastery) 27 | result = np.zeros((stu_num, prob_num)) 28 | if len(obj_prob_index) > 0: 29 | result[:, obj_prob_index] = (np.log(x + 1e-9) * R + np.log(1 - x + 1e-9) * (1 - R))[:, obj_prob_index] 30 | if len(sub_prob_index) > 0: 31 | result[:, sub_prob_index] = np.log(stats.norm.pdf(R, loc=x, scale=variance))[:, sub_prob_index] 32 | 33 | result[np.where(R == skip_value)[0], np.where(R == skip_value)[1]] = 0 # skip logs 34 | return result # shape = (stu_num, prob_num) 35 | 36 | 37 | # ---below are updating processes in MCMC for FuzzyCDF--- 38 | def update_A_B(A, B, theta, slip, guess, variance, R, q_m, obj_prob_index, sub_prob_index, skip_value, args): 39 | know_num = A.shape[1] 40 | new_A = A + 0.3 * stats.norm.rvs(size=A.shape) 41 | new_B = B + 0.3 * stats.norm.rvs(size=B.shape) 42 | for know in range(know_num): 43 | tempA = np.copy(A) 44 | tempB = np.copy(B) 45 | tempA[:, know] = np.copy(new_A[:, know]) 46 | tempB[:, know] = np.copy(new_B[:, know]) 47 | 48 | l_0 = get_LogLikelihood(A, B, theta, R, q_m, slip, guess, variance, obj_prob_index, sub_prob_index, 
skip_value) 49 | l_1 = get_LogLikelihood(tempA, tempB, theta, R, q_m, slip, guess, variance, obj_prob_index, sub_prob_index, 50 | skip_value) 51 | 52 | log_p0 = np.sum(l_0, axis=1) + np.log(stats.norm.pdf(x=B[:, know], loc=args.mu_b, scale=args.sig_b) + 1e-9) + \ 53 | np.log(stats.lognorm.pdf(x=A[:, know], loc=0, scale=np.exp(args.mu_a), s=args.sig_a) + 1e-9) 54 | log_p1 = np.sum(l_1, axis=1) + np.log(stats.norm.pdf(x=tempB[:, know], loc=args.mu_b, scale=args.sig_b) + 1e-9)\ 55 | + np.log(stats.lognorm.pdf(x=tempA[:, know], loc=0, scale=np.exp(args.mu_a), s=args.sig_a) + 1e-9) 56 | accept_prob = np.exp(np.minimum(log_p1 - log_p0, 0)) # avoid overflow in exp 57 | mask = accept_prob >= np.random.random(1) 58 | A[mask, know] = new_A[mask, know] 59 | B[mask, know] = new_B[mask, know] 60 | 61 | 62 | def update_theta(A, B, theta, slip, guess, variance, R, q_m, obj_prob_index, sub_prob_index, skip_value, args): 63 | new_theta = theta + 0.1 * stats.norm.rvs(size=theta.shape) 64 | 65 | l_0 = get_LogLikelihood(A, B, theta, R, q_m, slip, guess, variance, obj_prob_index, sub_prob_index, skip_value) 66 | l_1 = get_LogLikelihood(A, B, new_theta, R, q_m, slip, guess, variance, obj_prob_index, sub_prob_index, skip_value) 67 | 68 | log_p0 = np.sum(l_0, axis=1) + np.log(stats.norm.pdf(x=theta, loc=args.mu_theta, scale=args.sig_theta) + 1e-9) 69 | log_p1 = np.sum(l_1, axis=1) + np.log(stats.norm.pdf(x=new_theta, loc=args.mu_theta, scale=args.sig_theta) + 1e-9) 70 | accept_prob = np.exp(np.minimum(log_p1 - log_p0, 0)) # avoid overflow in exp 71 | mask = accept_prob >= np.random.random(1) 72 | theta[mask] = new_theta[mask] 73 | 74 | 75 | def update_slip_guess(A, B, theta, slip, guess, variance, R, q_m, obj_prob_index, sub_prob_index, skip_value, args): 76 | new_slip = np.abs(slip + 0.2 * stats.norm.rvs(size=slip.shape) - 0.1) 77 | new_guess = np.abs(guess + 0.2 * stats.norm.rvs(size=guess.shape) - 0.1) 78 | 79 | l_0 = get_LogLikelihood(A, B, theta, R, q_m, slip, guess, variance, obj_prob_index, sub_prob_index, skip_value) 80 | l_1 = get_LogLikelihood(A, B, theta, R, q_m, new_slip, new_guess, variance, obj_prob_index, sub_prob_index, 81 | skip_value) 82 | 83 | log_p0 = np.sum(l_0, axis=0) + np.log(stats.beta.pdf(x=slip / (args.max_s - args.min_s), a=1, b=2) + 1e-9) + np.log( 84 | stats.beta.pdf(x=guess / (args.max_g - args.min_g), a=1, b=2) + 1e-9) 85 | log_p1 = np.sum(l_1, axis=0) + np.log(stats.beta.pdf(x=new_slip / (args.max_s - args.min_s), a=1, b=2) + 1e-9) + \ 86 | np.log(stats.beta.pdf(x=new_guess / (args.max_g - args.min_g), a=1, b=2) + 1e-9) 87 | accept_prob = np.exp(np.minimum(log_p1 - log_p0, 0)) # avoid overflow in exp 88 | mask = accept_prob >= np.random.random(1) 89 | slip[mask] = new_slip[mask] 90 | guess[mask] = new_guess[mask] 91 | 92 | 93 | def update_variance(A, B, theta, slip, guess, variance, R, q_m, obj_prob_index, sub_prob_index, skip_value): 94 | new_var = np.maximum(variance - 0.01 + 0.02 * stats.norm.rvs(size=variance.shape), 0) 95 | 96 | l_0 = get_LogLikelihood(A, B, theta, R, q_m, slip, guess, variance, obj_prob_index, sub_prob_index, skip_value) 97 | l_1 = get_LogLikelihood(A, B, theta, R, q_m, slip, guess, new_var, obj_prob_index, sub_prob_index, skip_value) 98 | 99 | l_0[:, obj_prob_index] = 0 100 | l_1[:, obj_prob_index] = 0 101 | 102 | log_p0 = np.sum(l_0) + np.log(stats.gamma.pdf(x=1 / (variance + 1e-9), a=4, scale=1 / 6) + 1e-9) 103 | log_p1 = np.sum(l_1) + np.log(stats.gamma.pdf(x=1 / (new_var + 1e-9), a=4, scale=1 / 6) + 1e-9) 104 | accept_prob = np.exp(np.minimum(log_p1 - 
log_p0, 0)) # avoid overflow in exp 105 | if accept_prob >= np.random.random(1): 106 | variance = new_var 107 | return variance 108 | -------------------------------------------------------------------------------- /EduCDM/ICD/__init__.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | -------------------------------------------------------------------------------- /EduCDM/ICD/etl/__init__.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | from .etl import * 4 | -------------------------------------------------------------------------------- /EduCDM/ICD/etl/utils.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | import torch 4 | from baize.utils import pad_sequence 5 | from torch import Tensor, LongTensor 6 | 7 | 8 | def multi_hot(ks, kn): 9 | array = [0] * kn 10 | for k in ks: 11 | array[k] = 1 12 | return array 13 | 14 | 15 | def pack_batch(batch): 16 | user_id, user_items, item_id, item_users, item_knows, response = zip(*batch) 17 | user_items_length = [len(d) for d in user_items] 18 | padded_user_items = pad_sequence(user_items) 19 | item_users_length = [len(d) for d in item_users] 20 | padded_item_users = pad_sequence(item_users) 21 | return ( 22 | LongTensor(user_id), LongTensor(padded_user_items), LongTensor(user_items_length), 23 | LongTensor(item_id), LongTensor(padded_item_users), LongTensor(item_users_length), Tensor(item_knows), 24 | Tensor(response) 25 | ) 26 | -------------------------------------------------------------------------------- /EduCDM/ICD/metrics/__init__.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | from .metrics import doa_report, stableness_report 4 | -------------------------------------------------------------------------------- /EduCDM/ICD/metrics/metrics.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # 2022/2/1 @ tongshiwei 3 | import pandas as pd 4 | from longling.ML.metrics import POrderedDict 5 | import numpy as np 6 | from tqdm import tqdm 7 | 8 | 9 | def doa_report(user, item, know, score, theta): 10 | df = pd.DataFrame({ 11 | "user_id": user, 12 | "item_id": item, 13 | "score": score, 14 | "theta": theta, 15 | "knowledge": know 16 | }) 17 | ground_truth = [] 18 | 19 | for _, group_df in tqdm(df.groupby("item_id"), "formatting item df"): 20 | ground_truth.append(group_df["score"].values) 21 | ground_truth.append(1 - group_df["score"].values) 22 | 23 | knowledges = [] 24 | knowledge_item = [] 25 | knowledge_user = [] 26 | knowledge_truth = [] 27 | knowledge_theta = [] 28 | for user, item, score, theta, knowledge in tqdm( 29 | df[["user_id", "item_id", "score", "theta", "knowledge"]].values, 30 | "formatting knowledge df"): 31 | if isinstance(theta, list): 32 | for i, (theta_i, knowledge_i) in enumerate(zip(theta, knowledge)): 33 | if knowledge_i == 1: 34 | knowledges.append(i) 35 | knowledge_item.append(item) 36 | knowledge_user.append(user) 37 | knowledge_truth.append(score) 38 | knowledge_theta.append(theta_i) 39 | else: # pragma: no cover 40 | for i, knowledge_i in enumerate(knowledge): 41 | if knowledge_i == 1: 42 | knowledges.append(i) 43 | knowledge_item.append(item) 44 | knowledge_user.append(user) 45 | knowledge_truth.append(score) 46 | knowledge_theta.append(theta) 47 | 48 | knowledge_df = pd.DataFrame({ 49 | "knowledge": knowledges, 50 
| "user_id": knowledge_user, 51 | "item_id": knowledge_item, 52 | "score": knowledge_truth, 53 | "theta": knowledge_theta 54 | }) 55 | knowledge_ground_truth = [] 56 | knowledge_prediction = [] 57 | for _, group_df in knowledge_df.groupby("knowledge"): 58 | _knowledge_ground_truth = [] 59 | _knowledge_prediction = [] 60 | for _, item_group_df in group_df.groupby("item_id"): 61 | _knowledge_ground_truth.append(item_group_df["score"].values) 62 | _knowledge_prediction.append(item_group_df["theta"].values) 63 | knowledge_ground_truth.append(_knowledge_ground_truth) 64 | knowledge_prediction.append(_knowledge_prediction) 65 | 66 | return POrderedDict(doa_eval(knowledge_ground_truth, knowledge_prediction)) 67 | 68 | 69 | def doa_eval(y_true, y_pred): 70 | """ 71 | >>> import numpy as np 72 | >>> y_true = [ 73 | ... [np.array([1, 0, 1])], 74 | ... [np.array([0, 1, 1])] 75 | ... ] 76 | >>> y_pred = [ 77 | ... [np.array([.5, .4, .6])], 78 | ... [np.array([.2, .3, .5])] 79 | ... ] 80 | >>> float(doa_eval(y_true, y_pred)['doa']) 81 | 1.0 82 | >>> y_pred = [ 83 | ... [np.array([.4, .5, .6])], 84 | ... [np.array([.3, .2, .5])] 85 | ... ] 86 | >>> float(doa_eval(y_true, y_pred)['doa']) 87 | 0.5 88 | """ 89 | doa = [] 90 | doa_support = 0 91 | z_support = 0 92 | for knowledge_label, knowledge_pred in tqdm(zip(y_true, y_pred), 93 | "doa metrics"): 94 | _doa = 0 95 | _z = 0 96 | for label, pred in zip(knowledge_label, knowledge_pred): 97 | if sum(label) == len(label) or sum(label) == 0: 98 | continue 99 | pos_idx = [] 100 | neg_idx = [] 101 | for i, _label in enumerate(label): 102 | if _label == 1: 103 | pos_idx.append(i) 104 | else: 105 | neg_idx.append(i) 106 | pos_pred = pred[pos_idx] 107 | neg_pred = pred[neg_idx] 108 | invalid = 0 109 | for _pos_pred in pos_pred: 110 | _doa += len(neg_pred[neg_pred < _pos_pred]) 111 | invalid += len(neg_pred[neg_pred == _pos_pred]) 112 | _z += (len(pos_pred) * len(neg_pred)) - invalid 113 | if _z > 0: 114 | doa.append(_doa / _z) 115 | z_support += _z 116 | doa_support += 1 117 | return { 118 | "doa": np.mean(doa), 119 | "doa_know_support": doa_support, 120 | "doa_z_support": z_support, 121 | } 122 | 123 | 124 | def stableness_report(traits: list, new_traits: list, keys: list): 125 | ret = {} 126 | a_dim = None 127 | b_dim = None 128 | for trait, new_trait, key in zip(traits, new_traits, keys): 129 | if key == "b" and b_dim is None: 130 | b_dim = trait.size()[-1] if len(trait.size()) > 1 else 1 131 | if key == "a" and a_dim is None: 132 | a_dim = trait.size()[-1] 133 | 134 | ret[key] = {} 135 | delta = (trait - new_trait).abs() 136 | ret[key]['delta'] = delta.sum().item() 137 | ret[key]['delta_ave'] = delta.mean().item() 138 | ret[key]['support'] = len(trait) 139 | 140 | ret["user"] = ret["theta"] 141 | ret["item"] = { 142 | "delta": 143 | ret["a"]["delta"] + ret["b"]["delta"], 144 | "delta_ave": 145 | (ret["a"]["delta_ave"] * a_dim + ret["b"]["delta_ave"] * b_dim) / 146 | (a_dim + b_dim), 147 | "support": 148 | ret["a"]["support"], 149 | } 150 | macro = ret["user"]["delta_ave"] + ret["item"]["delta_ave"] 151 | micro = ret["user"]["support"] * ret["user"]["delta_ave"] + ret["item"][ 152 | "support"] * ret["item"]["delta_ave"] 153 | ret["macro_ave"] = macro / 2 154 | ret["micro_ave"] = micro / (ret["user"]["support"] + 155 | ret["item"]["support"]) 156 | return POrderedDict(ret) 157 | -------------------------------------------------------------------------------- /EduCDM/ICD/sym/__init__.py: 
-------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # 2022/1/29 @ tongshiwei 3 | 4 | from .net import get_net, get_loss, ICD, DualICD, get_dual_loss 5 | from .fit_eval import eval_f, dual_fit_f, stableness_eval, turning_point 6 | from .pos_linear import PosLinear 7 | -------------------------------------------------------------------------------- /EduCDM/ICD/sym/net/__init__.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | from .net import get_net, get_loss, ICD, DualICD, get_dual_loss, EmbICD 4 | -------------------------------------------------------------------------------- /EduCDM/ICD/sym/net/dtn.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import torch 3 | from torch import nn 4 | from baize.torch.functional import mask_sequence 5 | 6 | 7 | class DTN(nn.Module): 8 | def __init__(self, input_dim, know_dim): 9 | self.know_dim = know_dim 10 | self.input_dim = input_dim 11 | self.fea_dim = 64 12 | 13 | super(DTN, self).__init__() 14 | self.emb = nn.Sequential(nn.Embedding(self.input_dim, self.fea_dim), 15 | # nn.Dropout(p=0.5), 16 | ) 17 | # self.feature_net = nn.Sequential( 18 | # # nn.Dropout(p=0.2), 19 | # nn.Linear(self.know_dim, self.know_dim), 20 | # # nn.Dropout(p=0.5), 21 | # # nn.Linear(self.prednet_len2, self.know_dim), 22 | # ) 23 | # self.atn = nn.MultiheadAttention(self.fea_dim, 4) 24 | self.feature_net = nn.Sequential( 25 | # nn.ReLU(), 26 | # nn.Dropout(p=0.5), 27 | nn.Linear(self.fea_dim, self.know_dim)) 28 | 29 | def avg_pool(self, data, mask: torch.Tensor): 30 | # batch_num * emb_dim * max_len => batch_num * emb_dim * 1 31 | # print(data,mask) 32 | mask_data = mask_sequence(data, mask) 33 | rs = torch.sum(mask_data.permute(0, 2, 1), dim=-1) 34 | len_mask = mask.reshape((-1, 1)) 35 | len_mask = len_mask.expand(len_mask.size()[0], self.know_dim) 36 | # print(rs.size(),mask.size()) 37 | rs = torch.div(rs, len_mask) 38 | return rs 39 | 40 | def forward(self, log, mask): 41 | # emb = mask_sequence(self.emb(log), mask) 42 | # att_emb = emb.permute(1, 0, 2) 43 | # att_emb, _ = self.atn(att_emb, att_emb, att_emb) 44 | # fea = self.feature_net(att_emb) 45 | # fea = fea.permute(1, 0, 2) 46 | 47 | emb = self.emb(log) 48 | fea = self.feature_net(emb) 49 | 50 | trait = self.avg_pool(fea, mask) 51 | return trait 52 | -------------------------------------------------------------------------------- /EduCDM/ICD/sym/net/mirt.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | import torch 4 | from torch import nn 5 | import torch.nn.functional as F 6 | from .dtn import DTN 7 | from EduCDM.MIRT.MIRT import irt2pl 8 | 9 | 10 | class MIRTNet(nn.Module): 11 | def __init__(self, trait_dim, a_range=0.1, irf_kwargs=None): 12 | super(MIRTNet, self).__init__() 13 | self.irf_kwargs = irf_kwargs if irf_kwargs is not None else {} 14 | self.l_dtn_theta = nn.Linear(trait_dim, trait_dim) 15 | self.i_dtn_a = nn.Linear(trait_dim, trait_dim) 16 | self.i_dtn_b = nn.Linear(trait_dim, 1) 17 | self.a_range = a_range 18 | 19 | def forward(self, u_trait, v_trait, *args): 20 | theta = self.u_theta(u_trait) 21 | b = self.i_difficulty(v_trait) 22 | a = self.i_discrimination(v_trait) 23 | 24 | if torch.max(theta != theta) or torch.max(a != a) or torch.max(b != b): # pragma: no cover 25 | raise ValueError('ValueError: theta, a, b may contain nan! 
The a_range is too large.') 26 | 27 | return self.irf(theta, a, b, **self.irf_kwargs), theta, a, b 28 | 29 | @classmethod 30 | def int_f(cls, theta, a, b, *args, **kwargs): 31 | return irt2pl(theta, a, b, F=torch) 32 | 33 | @classmethod 34 | def irf(cls, theta, a, b, **kwargs): 35 | return irt2pl(theta, a, b, F=torch) 36 | 37 | def u_theta(self, u_trait): 38 | return (torch.sigmoid(torch.squeeze(self.l_dtn_theta(u_trait), dim=-1)) - 0.5) * 6 39 | 40 | def i_difficulty(self, v_trait): 41 | return (torch.sigmoid(torch.squeeze(self.i_dtn_b(v_trait), dim=-1)) - 0.5) * 6 42 | 43 | def i_discrimination(self, v_trait): 44 | a = torch.squeeze(self.i_dtn_a(v_trait), dim=-1) 45 | if self.a_range is not None: 46 | a = self.a_range * torch.sigmoid(a) 47 | else: # pragma: no cover 48 | a = F.softplus(a) 49 | return a 50 | -------------------------------------------------------------------------------- /EduCDM/ICD/sym/net/ncd.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import torch 3 | from torch import nn 4 | from ..pos_linear import PosLinear 5 | 6 | 7 | class NCDMNet(nn.Module): 8 | def __init__(self, trait_dim, know_dim): 9 | super(NCDMNet, self).__init__() 10 | 11 | self.knowledge_dim = know_dim 12 | self.prednet_input_len = self.knowledge_dim 13 | self.prednet_len1, self.prednet_len2 = 512, 256 # changeable 14 | self.l_dtn_theta_fc = nn.Linear(trait_dim, self.prednet_input_len) 15 | self.i_dtn_kd_fc = nn.Linear(trait_dim, self.prednet_input_len) 16 | self.i_dtn_ed_fc = nn.Linear(trait_dim, self.prednet_input_len) 17 | self.int_fc = nn.Sequential( 18 | PosLinear(self.prednet_input_len, self.prednet_len1), nn.Sigmoid(), 19 | nn.Dropout(p=0.5), PosLinear(self.prednet_len1, self.prednet_len2), 20 | nn.Sigmoid(), nn.Dropout(p=0.5), PosLinear(self.prednet_len2, 1), 21 | nn.Sigmoid()) 22 | 23 | def u_theta(self, u_trait): 24 | return torch.sigmoid(self.l_dtn_theta_fc(u_trait)) 25 | 26 | def i_difficulty(self, v_trait): 27 | return torch.sigmoid(self.i_dtn_kd_fc(v_trait)) 28 | 29 | def i_discrimination(self, v_trait): 30 | return torch.sigmoid(self.i_dtn_ed_fc(v_trait)) 31 | 32 | def forward(self, u_trait, v_trait, v_know): 33 | theta = self.u_theta(u_trait) 34 | 35 | difficulty = self.i_difficulty(v_trait) 36 | discrimination = self.i_discrimination(v_trait) 37 | 38 | # prednet 39 | input_x = discrimination * (theta - difficulty) * v_know 40 | output_1 = self.int_fc(input_x) 41 | 42 | return output_1.view(-1), theta, discrimination, difficulty 43 | 44 | def int_f(self, theta, a, b, know): 45 | return self.int_fc(a * (theta - b) * know).view(-1) 46 | -------------------------------------------------------------------------------- /EduCDM/ICD/sym/net/net.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | from tqdm import tqdm 4 | import torch 5 | from torch import nn 6 | from baize.torch import loss_dict2tmt_torch_loss 7 | from longling.ML.PytorchHelper import set_device 8 | from longling.ML.PytorchHelper.toolkit.trainer import collect_params 9 | 10 | from .ncd import NCDMNet 11 | from .mirt import MIRTNet 12 | from .dtn import DTN 13 | 14 | 15 | class ICD(nn.Module): 16 | def __init__(self, user_n, item_n, know_n, cdm="ncd"): 17 | super(ICD, self).__init__() 18 | self.l_dtn = DTN(2 * item_n + 1, know_n) 19 | self.i_dtn = DTN(2 * user_n + 1, know_n) 20 | self.cdm_name = cdm 21 | if cdm == "ncd": 22 | self.cdm = NCDMNet(know_n, know_n) 23 | elif cdm == "mirt": 24 | self.cdm = 
MIRTNet(know_n) 25 | else: # pragma: no cover 26 | raise ValueError() 27 | 28 | for name, param in self.named_parameters(): 29 | if 'weight' in name: 30 | nn.init.xavier_normal_(param) 31 | 32 | def forward(self, u2i, u_mask, i2u, i_mask, i2k): 33 | u_trait = self.l_dtn(u2i, u_mask) 34 | v_trait = self.i_dtn(i2u, i_mask) 35 | return self.cdm(u_trait, v_trait, i2k) 36 | 37 | def get_user_profiles(self, batches): 38 | device = next(self.parameters()).device 39 | ids = [] 40 | traits = [] 41 | for _id, records, r_mask in tqdm(batches, "getting user profiles"): 42 | ids.append(_id.to("cpu")) 43 | traits.append( 44 | self.cdm.u_theta( 45 | self.l_dtn(records.to(device), 46 | r_mask.to(device))).to("cpu")) 47 | 48 | obj = {"uid": torch.cat(ids), "u_trait": torch.cat(traits)} 49 | return obj 50 | 51 | def get_item_profiles(self, batches): 52 | device = next(self.parameters()).device 53 | ids = [] 54 | a = [] 55 | b = [] 56 | for _id, records, r_mask in tqdm(batches, "getting item profiles"): 57 | v_trait = self.i_dtn(records.to(device), r_mask.to(device)) 58 | ids.append(_id.cpu()) 59 | a.append(self.cdm.i_discrimination(v_trait).to("cpu")) 60 | b.append(self.cdm.i_difficulty(v_trait).to("cpu")) 61 | obj = {"iid": torch.cat(ids), "ia": torch.cat(a), "ib": torch.cat(b)} 62 | return obj 63 | 64 | 65 | class DualICD(nn.Module): 66 | def __init__(self, stat_net: ICD, net: ICD, alpha=0.999): 67 | super(DualICD, self).__init__() 68 | self.stat_net = stat_net 69 | self.net = net 70 | self.alpha = alpha 71 | 72 | def momentum_weight_update(self, pre_net, train_select=None): 73 | """ 74 | Momentum update of ICD 75 | """ 76 | pre_net_params = collect_params(pre_net, train_select) 77 | net_params = collect_params(self.net, train_select) 78 | for param_pre, param_now in zip(pre_net_params, net_params): 79 | param_now.data = param_pre.data * self.alpha + param_now.data * ( 80 | 1. 
- self.alpha) 81 | 82 | def forward(self, u2i, u_mask, i2u, i_mask, i2k): 83 | output, theta, a, b = self.net(u2i, u_mask, i2u, i_mask, i2k) 84 | _, stat_theta, stat_a, stat_b = self.stat_net(u2i, u_mask, i2u, i_mask, 85 | i2k) 86 | return output, theta, a, b, stat_theta, stat_a, stat_b 87 | 88 | 89 | class EmbICD(nn.Module): 90 | def __init__(self, int_fc, weights): 91 | super(EmbICD, self).__init__() 92 | self.theta_emb = nn.Embedding(*weights[0].size(), _weight=weights[0]) 93 | self.a_emb = nn.Embedding(*weights[1].size(), _weight=weights[1]) 94 | if len(weights[2].size()) == 1: 95 | self.b_emb = nn.Embedding(weights[2].size()[0], 96 | 1, 97 | _weight=torch.unsqueeze(weights[2], 1)) 98 | else: 99 | self.b_emb = nn.Embedding(*weights[2].size(), _weight=weights[2]) 100 | self.int_fc = int_fc 101 | self._user_id2idx = {} 102 | self._item_id2idx = {} 103 | 104 | def build_user_id2idx(self, users): 105 | idx = 0 106 | for user_id in users: 107 | if user_id not in self._user_id2idx: 108 | self._user_id2idx[user_id] = idx 109 | idx += 1 110 | 111 | def build_item_id2idx(self, items): 112 | idx = 0 113 | for item_id in items: 114 | if item_id not in self._item_id2idx: 115 | self._item_id2idx[item_id] = idx 116 | idx += 1 117 | 118 | def user_id2idx(self, users): 119 | users_idx = [] 120 | for user in users: 121 | users_idx.append(self._user_id2idx[user]) 122 | return users_idx 123 | 124 | def item_id2idx(self, items): 125 | items_idx = [] 126 | for item in items: 127 | items_idx.append(self._item_id2idx[item]) 128 | return items_idx 129 | 130 | def forward(self, user_idx, item_idx, know): 131 | theta = self.theta_emb(user_idx).detach() 132 | a = self.a_emb(item_idx).detach() 133 | b = self.b_emb(item_idx).detach() 134 | 135 | theta.requires_grad_(True) 136 | a.requires_grad_(True) 137 | b.requires_grad_(True) 138 | 139 | return self.int_fc(theta, a, torch.squeeze(b), 140 | know).view(-1), theta, a, b 141 | 142 | 143 | class DeltaTraitLoss(nn.Module): 144 | def __init__(self): 145 | super(DeltaTraitLoss, self).__init__() 146 | self.mse_loss = nn.MSELoss() 147 | 148 | def forward(self, theta, a, b, stat_theta, stat_a, stat_b): 149 | return self.mse_loss(theta, stat_theta) + self.mse_loss( 150 | a, stat_a) + self.mse_loss(b, stat_b) 151 | 152 | 153 | class DualLoss(nn.Module): 154 | def __init__(self, beta=0.95, *args, **kwargs): 155 | super(DualLoss, self).__init__() 156 | self.beta = beta 157 | self.bce = nn.BCELoss(*args, **kwargs) 158 | self.delta_trait = DeltaTraitLoss() 159 | 160 | def forward(self, pred, truth, theta, a, b, stat_theta, stat_a, stat_b): 161 | return self.beta * self.bce( 162 | pred, truth) + (1. 
- self.beta) * self.delta_trait( 163 | theta, a, b, stat_theta, stat_a, stat_b) 164 | 165 | 166 | def get_dual_loss(ctx, beta=0.95, *args, **kwargs): 167 | return loss_dict2tmt_torch_loss({ 168 | "Loss": 169 | set_device(DualLoss(beta, *args, **kwargs), ctx), 170 | "BCE": 171 | set_device(torch.nn.BCELoss(*args, **kwargs), ctx), 172 | "DTL": 173 | set_device(DeltaTraitLoss(), ctx), 174 | }) 175 | 176 | 177 | def get_loss(ctx, *args, **kwargs): # pragma: no cover 178 | return loss_dict2tmt_torch_loss( 179 | {"cross entropy": set_device(torch.nn.BCELoss(*args, **kwargs), ctx)}) 180 | 181 | 182 | def get_net(ctx=None, *args, **kwargs): 183 | if ctx is None: # pragma: no cover 184 | return ICD(*args, **kwargs) 185 | return set_device(ICD(*args, **kwargs), ctx) 186 | -------------------------------------------------------------------------------- /EduCDM/ICD/sym/pos_linear.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | import torch 4 | import torch.nn.functional as F 5 | from torch import nn 6 | 7 | 8 | class PosLinear(nn.Linear): 9 | def forward(self, input: torch.Tensor) -> torch.Tensor: 10 | weight = 2 * F.relu(1 * torch.neg(self.weight)) + self.weight 11 | return F.linear(input, weight, self.bias) 12 | -------------------------------------------------------------------------------- /EduCDM/ICD/utils.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import json 3 | import logging 4 | 5 | 6 | def output_metrics(_id, obj, wfs=None, header=None, logger=logging): 7 | logger.info("-------- %s: %s ----------" % (header, _id)) 8 | logger.info("\n%s" % obj) 9 | if wfs is not None: # pragma: no cover 10 | print(json.dumps({ 11 | "id": _id, 12 | "metrics": obj 13 | }), 14 | file=wfs[header], 15 | flush=True) 16 | -------------------------------------------------------------------------------- /EduCDM/IRR/DINA.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # 2021/7/1 @ tongshiwei 3 | 4 | import pandas as pd 5 | import numpy as np 6 | import torch 7 | from torch import nn 8 | from EduCDM import GDDINA 9 | from .loss import PairSCELoss, HarmonicLoss, loss_mask 10 | from tqdm import tqdm 11 | from longling.ML.metrics import ranking_report 12 | 13 | 14 | class DINA(GDDINA): 15 | def __init__(self, user_num, item_num, knowledge_num, ste=False, zeta=0.5): 16 | super(DINA, self).__init__(user_num, item_num, knowledge_num, ste) 17 | self.zeta = zeta 18 | 19 | def train(self, train_data, test_data=None, *, epoch: int, device="cpu", lr=0.001) -> ...: 20 | self.dina_net = self.dina_net.to(device) 21 | point_loss_function = nn.BCELoss() 22 | pair_loss_function = PairSCELoss() 23 | loss_function = HarmonicLoss(self.zeta) 24 | 25 | trainer = torch.optim.Adam(self.dina_net.parameters(), lr, weight_decay=1e-4) 26 | 27 | for e in range(epoch): 28 | point_losses = [] 29 | pair_losses = [] 30 | losses = [] 31 | for batch_data in tqdm(train_data, "Epoch %s" % e): 32 | user_id, item_id, knowledge, score, n_samples, *neg_users = batch_data 33 | user_id: torch.Tensor = user_id.to(device) 34 | item_id: torch.Tensor = item_id.to(device) 35 | knowledge: torch.Tensor = knowledge.to(device) 36 | n_samples: torch.Tensor = n_samples.to(device) 37 | predicted_pos_score: torch.Tensor = self.dina_net(user_id, item_id, knowledge) 38 | score: torch.Tensor = score.to(device) 39 | neg_score = 1 - score 40 | 41 | point_loss = 
point_loss_function(predicted_pos_score, score) 42 | predicted_neg_scores = [] 43 | if neg_users: 44 | for neg_user in neg_users: 45 | neg_user: torch.Tensor = neg_user.to(device) 46 | predicted_neg_score = self.dina_net(neg_user, item_id, knowledge) 47 | predicted_neg_scores.append(predicted_neg_score) 48 | 49 | # prediction loss 50 | pair_pred_loss_list = [] 51 | for i, predicted_neg_score in enumerate(predicted_neg_scores): 52 | pair_pred_loss_list.append( 53 | pair_loss_function( 54 | predicted_pos_score, 55 | predicted_neg_score, 56 | score - neg_score 57 | ) 58 | ) 59 | 60 | pair_loss = sum(loss_mask(pair_pred_loss_list, n_samples)) 61 | else: 62 | pair_loss = 0 63 | 64 | loss = loss_function(point_loss, pair_loss) 65 | 66 | # back propagation 67 | trainer.zero_grad() 68 | loss.backward() 69 | trainer.step() 70 | 71 | point_losses.append(point_loss.mean().item()) 72 | pair_losses.append(pair_loss.mean().item() if not isinstance(pair_loss, int) else pair_loss) 73 | losses.append(loss.item()) 74 | print( 75 | "[Epoch %d] Loss: %.6f, PointLoss: %.6f, PairLoss: %.6f" % ( 76 | e, float(np.mean(losses)), float(np.mean(point_losses)), float(np.mean(pair_losses)) 77 | ) 78 | ) 79 | 80 | if test_data is not None: 81 | eval_data = self.eval(test_data, device=device) 82 | print("[Epoch %d]\n%s" % (e, eval_data)) 83 | 84 | def eval(self, test_data, device="cpu"): 85 | self.dina_net = self.dina_net.to(device) 86 | self.dina_net.eval() 87 | y_pred = [] 88 | y_true = [] 89 | items = [] 90 | for batch_data in tqdm(test_data, "evaluating"): 91 | user_id, item_id, knowledge, response = batch_data 92 | user_id: torch.Tensor = user_id.to(device) 93 | item_id: torch.Tensor = item_id.to(device) 94 | knowledge: torch.Tensor = knowledge.to(device) 95 | pred: torch.Tensor = self.dina_net(user_id, item_id, knowledge) 96 | y_pred.extend(pred.tolist()) 97 | y_true.extend(response.tolist()) 98 | items.extend(item_id.tolist()) 99 | 100 | df = pd.DataFrame({ 101 | "item_id": items, 102 | "score": y_true, 103 | "pred": y_pred, 104 | }) 105 | 106 | ground_truth = [] 107 | prediction = [] 108 | 109 | for _, group_df in tqdm(df.groupby("item_id"), "formatting item df"): 110 | ground_truth.append(group_df["score"].values) 111 | prediction.append(group_df["pred"].values) 112 | 113 | self.dina_net.train() 114 | 115 | return ranking_report( 116 | ground_truth, 117 | y_pred=prediction, 118 | coerce="padding" 119 | ) 120 | -------------------------------------------------------------------------------- /EduCDM/IRR/IRT.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # 2021/6/19 @ tongshiwei 3 | 4 | import torch 5 | from torch import nn 6 | from tqdm import tqdm 7 | from EduCDM.IRT.GD import IRT as PointIRT 8 | import numpy as np 9 | import pandas as pd 10 | from .loss import PairSCELoss, HarmonicLoss, loss_mask 11 | from longling.ML.metrics import ranking_report 12 | 13 | __all__ = ["IRT"] 14 | 15 | 16 | class IRT(PointIRT): 17 | def __init__(self, user_num, item_num, knowledge_num, zeta=0.5): 18 | super(IRT, self).__init__(user_num, item_num) 19 | self.knowledge_num = knowledge_num 20 | self.zeta = zeta 21 | 22 | def train(self, train_data, test_data=None, *, epoch: int, device="cpu", lr=0.001) -> ...: 23 | self.irt_net = self.irt_net.to(device) 24 | point_loss_function = nn.BCELoss() 25 | pair_loss_function = PairSCELoss() 26 | loss_function = HarmonicLoss(self.zeta) 27 | 28 | trainer = torch.optim.Adam(self.irt_net.parameters(), lr, weight_decay=1e-4) 29 | 30 | 
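        # Note on the loop below: each batch from `train_data` is produced by
        # IRR's pair_etl and unpacks as (user_id, item_id, knowledge, score,
        # n_samples, *neg_users); plain IRT ignores the knowledge slot. The
        # point-wise BCE term fits the observed responses, while PairSCELoss
        # ranks each observed responder against the sampled users with the
        # opposite response on the same item (sign = score - neg_score);
        # loss_mask zeroes the padded samples beyond n_samples, and
        # HarmonicLoss blends the terms as (1 - zeta) * point + zeta * pair.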
for e in range(epoch): 31 | point_losses = [] 32 | pair_losses = [] 33 | losses = [] 34 | for batch_data in tqdm(train_data, "Epoch %s" % e): 35 | user_id, item_id, _, score, n_samples, *neg_users = batch_data 36 | user_id: torch.Tensor = user_id.to(device) 37 | item_id: torch.Tensor = item_id.to(device) 38 | n_samples: torch.Tensor = n_samples.to(device) 39 | predicted_pos_score: torch.Tensor = self.irt_net(user_id, item_id) 40 | score: torch.Tensor = score.to(device) 41 | neg_score = 1 - score 42 | 43 | point_loss = point_loss_function(predicted_pos_score, score) 44 | predicted_neg_scores = [] 45 | if neg_users: 46 | for neg_user in neg_users: 47 | neg_user: torch.Tensor = neg_user.to(device) 48 | predicted_neg_score = self.irt_net(neg_user, item_id) 49 | predicted_neg_scores.append(predicted_neg_score) 50 | 51 | # prediction loss 52 | pair_pred_loss_list = [] 53 | for i, predicted_neg_score in enumerate(predicted_neg_scores): 54 | pair_pred_loss_list.append( 55 | pair_loss_function( 56 | predicted_pos_score, 57 | predicted_neg_score, 58 | score - neg_score 59 | ) 60 | ) 61 | 62 | pair_loss = sum(loss_mask(pair_pred_loss_list, n_samples)) 63 | else: 64 | pair_loss = 0 65 | 66 | loss = loss_function(point_loss, pair_loss) 67 | 68 | # back propagation 69 | trainer.zero_grad() 70 | loss.backward() 71 | trainer.step() 72 | 73 | point_losses.append(point_loss.mean().item()) 74 | pair_losses.append(pair_loss.mean().item() if not isinstance(pair_loss, int) else pair_loss) 75 | losses.append(loss.item()) 76 | print( 77 | "[Epoch %d] Loss: %.6f, PointLoss: %.6f, PairLoss: %.6f" % ( 78 | e, float(np.mean(losses)), float(np.mean(point_losses)), float(np.mean(pair_losses)) 79 | ) 80 | ) 81 | 82 | if test_data is not None: 83 | eval_data = self.eval(test_data, device=device) 84 | print("[Epoch %d]\n%s" % (e, eval_data)) 85 | 86 | def eval(self, test_data, device="cpu"): 87 | self.irt_net = self.irt_net.to(device) 88 | self.irt_net.eval() 89 | y_pred = [] 90 | y_true = [] 91 | items = [] 92 | for batch_data in tqdm(test_data, "evaluating"): 93 | user_id, item_id, _, response = batch_data 94 | user_id: torch.Tensor = user_id.to(device) 95 | item_id: torch.Tensor = item_id.to(device) 96 | pred: torch.Tensor = self.irt_net(user_id, item_id) 97 | y_pred.extend(pred.tolist()) 98 | y_true.extend(response.tolist()) 99 | items.extend(item_id.tolist()) 100 | 101 | df = pd.DataFrame({ 102 | "item_id": items, 103 | "score": y_true, 104 | "pred": y_pred, 105 | }) 106 | 107 | ground_truth = [] 108 | prediction = [] 109 | 110 | for _, group_df in tqdm(df.groupby("item_id"), "formatting item df"): 111 | ground_truth.append(group_df["score"].values) 112 | prediction.append(group_df["pred"].values) 113 | 114 | self.irt_net.train() 115 | 116 | return ranking_report( 117 | ground_truth, 118 | y_pred=prediction, 119 | coerce="padding" 120 | ) 121 | -------------------------------------------------------------------------------- /EduCDM/IRR/MIRT.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # 2021/7/1 @ tongshiwei 3 | 4 | 5 | import torch 6 | from torch import nn 7 | from tqdm import tqdm 8 | from EduCDM import MIRT as PointMIRT 9 | import numpy as np 10 | import pandas as pd 11 | from .loss import PairSCELoss, HarmonicLoss, loss_mask 12 | from longling.ML.metrics import ranking_report 13 | 14 | __all__ = ["MIRT"] 15 | 16 | 17 | class MIRT(PointMIRT): 18 | def __init__(self, user_num, item_num, knowledge_num, latent_dim=None, zeta=0.5): 19 | latent_dim = 
knowledge_num if latent_dim is None else latent_dim 20 | super(MIRT, self).__init__(user_num, item_num, latent_dim) 21 | self.knowledge_num = knowledge_num 22 | self.zeta = zeta 23 | 24 | def train(self, train_data, test_data=None, *, epoch: int, device="cpu", lr=0.001) -> ...: 25 | self.irt_net = self.irt_net.to(device) 26 | point_loss_function = nn.BCELoss() 27 | pair_loss_function = PairSCELoss() 28 | loss_function = HarmonicLoss(self.zeta) 29 | 30 | trainer = torch.optim.Adam(self.irt_net.parameters(), lr, weight_decay=1e-4) 31 | 32 | for e in range(epoch): 33 | point_losses = [] 34 | pair_losses = [] 35 | losses = [] 36 | for batch_data in tqdm(train_data, "Epoch %s" % e): 37 | user_id, item_id, _, score, n_samples, *neg_users = batch_data 38 | user_id: torch.Tensor = user_id.to(device) 39 | item_id: torch.Tensor = item_id.to(device) 40 | n_samples: torch.Tensor = n_samples.to(device) 41 | predicted_pos_score: torch.Tensor = self.irt_net(user_id, item_id) 42 | score: torch.Tensor = score.to(device) 43 | neg_score = 1 - score 44 | 45 | point_loss = point_loss_function(predicted_pos_score, score) 46 | predicted_neg_scores = [] 47 | if neg_users: 48 | for neg_user in neg_users: 49 | neg_user: torch.Tensor = neg_user.to(device) 50 | predicted_neg_score = self.irt_net(neg_user, item_id) 51 | predicted_neg_scores.append(predicted_neg_score) 52 | 53 | # prediction loss 54 | pair_pred_loss_list = [] 55 | for i, predicted_neg_score in enumerate(predicted_neg_scores): 56 | pair_pred_loss_list.append( 57 | pair_loss_function( 58 | predicted_pos_score, 59 | predicted_neg_score, 60 | score - neg_score 61 | ) 62 | ) 63 | 64 | pair_loss = sum(loss_mask(pair_pred_loss_list, n_samples)) 65 | else: 66 | pair_loss = 0 67 | 68 | loss = loss_function(point_loss, pair_loss) 69 | 70 | # back propagation 71 | trainer.zero_grad() 72 | loss.backward() 73 | trainer.step() 74 | 75 | point_losses.append(point_loss.mean().item()) 76 | pair_losses.append(pair_loss.mean().item() if not isinstance(pair_loss, int) else pair_loss) 77 | losses.append(loss.item()) 78 | print( 79 | "[Epoch %d] Loss: %.6f, PointLoss: %.6f, PairLoss: %.6f" % ( 80 | e, float(np.mean(losses)), float(np.mean(point_losses)), float(np.mean(pair_losses)) 81 | ) 82 | ) 83 | 84 | if test_data is not None: 85 | eval_data = self.eval(test_data, device=device) 86 | print("[Epoch %d]\n%s" % (e, eval_data)) 87 | 88 | def eval(self, test_data, device="cpu"): 89 | self.irt_net = self.irt_net.to(device) 90 | self.irt_net.eval() 91 | y_pred = [] 92 | y_true = [] 93 | items = [] 94 | for batch_data in tqdm(test_data, "evaluating"): 95 | user_id, item_id, _, response = batch_data 96 | user_id: torch.Tensor = user_id.to(device) 97 | item_id: torch.Tensor = item_id.to(device) 98 | pred: torch.Tensor = self.irt_net(user_id, item_id) 99 | y_pred.extend(pred.tolist()) 100 | y_true.extend(response.tolist()) 101 | items.extend(item_id.tolist()) 102 | 103 | df = pd.DataFrame({ 104 | "item_id": items, 105 | "score": y_true, 106 | "pred": y_pred, 107 | }) 108 | 109 | ground_truth = [] 110 | prediction = [] 111 | 112 | for _, group_df in tqdm(df.groupby("item_id"), "formatting item df"): 113 | ground_truth.append(group_df["score"].values) 114 | prediction.append(group_df["pred"].values) 115 | 116 | self.irt_net.train() 117 | 118 | return ranking_report( 119 | ground_truth, 120 | y_pred=prediction, 121 | coerce="padding" 122 | ) 123 | -------------------------------------------------------------------------------- /EduCDM/IRR/NCDM.py: 
-------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # 2021/7/1 @ tongshiwei 3 | 4 | import pandas as pd 5 | import numpy as np 6 | import torch 7 | from torch import nn 8 | from EduCDM import NCDM as PointNCDM 9 | from .loss import PairSCELoss, HarmonicLoss, loss_mask 10 | from tqdm import tqdm 11 | from longling.ML.metrics import ranking_report 12 | 13 | 14 | class NCDM(PointNCDM): 15 | def __init__(self, user_num, item_num, knowledge_num, zeta=0.5): 16 | super(NCDM, self).__init__(knowledge_num, item_num, user_num) 17 | self.zeta = zeta 18 | 19 | def train(self, train_data, test_data=None, epoch=10, device="cpu", lr=0.002, silence=False) -> ...: 20 | self.ncdm_net = self.ncdm_net.to(device) 21 | point_loss_function = nn.BCELoss() 22 | pair_loss_function = PairSCELoss() 23 | loss_function = HarmonicLoss(self.zeta) 24 | 25 | trainer = torch.optim.Adam(self.ncdm_net.parameters(), lr, weight_decay=1e-4) 26 | 27 | for e in range(epoch): 28 | point_losses = [] 29 | pair_losses = [] 30 | losses = [] 31 | for batch_data in tqdm(train_data, "Epoch %s" % e): 32 | user_id, item_id, knowledge, score, n_samples, *neg_users = batch_data 33 | user_id: torch.Tensor = user_id.to(device) 34 | item_id: torch.Tensor = item_id.to(device) 35 | knowledge: torch.Tensor = knowledge.to(device) 36 | n_samples: torch.Tensor = n_samples.to(device) 37 | predicted_pos_score: torch.Tensor = self.ncdm_net(user_id, item_id, knowledge) 38 | score: torch.Tensor = score.to(device) 39 | neg_score = 1 - score 40 | 41 | point_loss = point_loss_function(predicted_pos_score, score) 42 | predicted_neg_scores = [] 43 | if neg_users: 44 | for neg_user in neg_users: 45 | neg_user: torch.Tensor = neg_user.to(device) 46 | predicted_neg_score = self.ncdm_net(neg_user, item_id, knowledge) 47 | predicted_neg_scores.append(predicted_neg_score) 48 | 49 | # prediction loss 50 | pair_pred_loss_list = [] 51 | for i, predicted_neg_score in enumerate(predicted_neg_scores): 52 | pair_pred_loss_list.append( 53 | pair_loss_function( 54 | predicted_pos_score, 55 | predicted_neg_score, 56 | score - neg_score 57 | ) 58 | ) 59 | 60 | pair_loss = sum(loss_mask(pair_pred_loss_list, n_samples)) 61 | else: 62 | pair_loss = 0 63 | 64 | loss = loss_function(point_loss, pair_loss) 65 | 66 | # back propagation 67 | trainer.zero_grad() 68 | loss.backward() 69 | trainer.step() 70 | 71 | point_losses.append(point_loss.mean().item()) 72 | pair_losses.append(pair_loss.mean().item() if not isinstance(pair_loss, int) else pair_loss) 73 | losses.append(loss.item()) 74 | print( 75 | "[Epoch %d] Loss: %.6f, PointLoss: %.6f, PairLoss: %.6f" % ( 76 | e, float(np.mean(losses)), float(np.mean(point_losses)), float(np.mean(pair_losses)) 77 | ) 78 | ) 79 | 80 | if test_data is not None: 81 | eval_data = self.eval(test_data, device=device) 82 | print("[Epoch %d]\n%s" % (e, eval_data)) 83 | 84 | def eval(self, test_data, device="cpu"): 85 | self.ncdm_net = self.ncdm_net.to(device) 86 | self.ncdm_net.eval() 87 | y_pred = [] 88 | y_true = [] 89 | items = [] 90 | for batch_data in tqdm(test_data, "evaluating"): 91 | user_id, item_id, knowledge, response = batch_data 92 | user_id: torch.Tensor = user_id.to(device) 93 | item_id: torch.Tensor = item_id.to(device) 94 | knowledge: torch.Tensor = knowledge.to(device) 95 | pred: torch.Tensor = self.ncdm_net(user_id, item_id, knowledge) 96 | y_pred.extend(pred.tolist()) 97 | y_true.extend(response.tolist()) 98 | items.extend(item_id.tolist()) 99 | 100 | df = pd.DataFrame({ 101 | 
"item_id": items, 102 | "score": y_true, 103 | "pred": y_pred, 104 | }) 105 | 106 | ground_truth = [] 107 | prediction = [] 108 | 109 | for _, group_df in tqdm(df.groupby("item_id"), "formatting item df"): 110 | ground_truth.append(group_df["score"].values) 111 | prediction.append(group_df["pred"].values) 112 | 113 | self.ncdm_net.train() 114 | 115 | return ranking_report( 116 | ground_truth, 117 | y_pred=prediction, 118 | coerce="padding" 119 | ) 120 | -------------------------------------------------------------------------------- /EduCDM/IRR/__init__.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # 2021/6/19 @ tongshiwei 3 | 4 | from .IRT import IRT 5 | from .DINA import DINA 6 | from .MIRT import MIRT 7 | from .NCDM import NCDM 8 | from .etl import point_etl, pair_etl, extract_item 9 | -------------------------------------------------------------------------------- /EduCDM/IRR/etl/__init__.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # 2021/6/19 @ tongshiwei 3 | 4 | from .utils import extract_item 5 | from .point_etl import etl as point_etl 6 | from .pair_etl import etl as pair_etl 7 | -------------------------------------------------------------------------------- /EduCDM/IRR/etl/pair_etl.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # 2021/6/19 @ tongshiwei 3 | import torch 4 | import os 5 | from longling import print_time, iterwrap 6 | import pandas as pd 7 | import numpy as np 8 | 9 | from longling.ML.toolkit.dataset import ItemSpecificSampler 10 | 11 | __all__ = ["etl"] 12 | 13 | 14 | def extract(data_src, params): 15 | with print_time("loading data from %s" % os.path.abspath(data_src), params.logger): 16 | df = pd.read_csv(data_src, dtype={"user_id": "int64", "item_id": "int64", "score": "float32"}) 17 | sampler = ItemSpecificSampler( 18 | ItemSpecificSampler.rating2triplet( 19 | df, query_field="item_id", key_field="user_id", value_field="score" 20 | ), 21 | query_field="item_id", user_id_range=[1, params.hyper_params["user_num"]], 22 | ) 23 | return df, sampler 24 | 25 | 26 | @iterwrap() 27 | def transform(raw_data, knowledge, params): 28 | # 定义数据转换接口 29 | # raw_data --> batch_data 30 | 31 | batch_size = params.batch_size 32 | n_neg = params.n_neg 33 | n_imp = params.n_imp 34 | df: pd.DataFrame = raw_data[0] 35 | sampler: ItemSpecificSampler = raw_data[1] 36 | 37 | for start in range(0, len(df), batch_size): 38 | _df = df.iloc[start: start + batch_size] 39 | n_sample, sample = sampler( 40 | _df["item_id"], n_neg, neg=_df["score"] != 0.0, return_column=True, padding=True, 41 | split_sample_to_column=True, verbose=False, padding_implicit=False, 42 | fast_implicit=True, with_n_implicit=n_imp 43 | ) if (n_neg + n_imp) > 0 else ([0] * _df.shape[0], []) 44 | _knowledge = np.stack([knowledge[int(item)] for item in _df["item_id"]]).astype("float32") 45 | yield [ 46 | torch.tensor(array if not isinstance(array, pd.Series) else array.values) for array in 47 | [_df["user_id"], _df["item_id"], _knowledge, _df["score"], 48 | n_sample, *sample] 49 | ] 50 | 51 | 52 | @iterwrap() 53 | def load(transformed_data, params): 54 | return transformed_data 55 | 56 | 57 | def etl(filepath, knowledge, params): 58 | raw_data = extract(filepath, params) 59 | transformed_data = transform(raw_data, knowledge, params) 60 | return load(transformed_data, params), raw_data[0] 61 | 
-------------------------------------------------------------------------------- /EduCDM/IRR/etl/point_etl.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # 2021/6/19 @ tongshiwei 3 | 4 | import os 5 | import numpy as np 6 | import pandas as pd 7 | from longling import print_time 8 | 9 | import torch 10 | from torch.utils.data import TensorDataset, DataLoader 11 | 12 | 13 | def extract(data_src, params): 14 | with print_time("loading data from %s" % os.path.abspath(data_src), params.logger): 15 | df = pd.read_csv(data_src, dtype={"user_id": "int64", "item_id": "int64", "score": "float32"}) 16 | return df 17 | 18 | 19 | def transform(df, knowledge, *args): 20 | # 定义数据转换接口 21 | # raw_data --> batch_data 22 | dataset = TensorDataset( 23 | torch.tensor(df["user_id"]), 24 | torch.tensor(df["item_id"]), 25 | torch.tensor(np.stack([knowledge[int(item)] for item in df["item_id"]])), 26 | torch.tensor(df["score"], dtype=torch.float) 27 | ) 28 | return dataset 29 | 30 | 31 | def load(transformed_data, params): 32 | batch_size = params.batch_size 33 | 34 | return DataLoader(transformed_data, batch_size=batch_size) 35 | 36 | 37 | def etl(filepath, knowledge, params): 38 | raw_data = extract(filepath, params) 39 | transformed_data = transform(raw_data, knowledge, params) 40 | return load(transformed_data, params), raw_data 41 | -------------------------------------------------------------------------------- /EduCDM/IRR/etl/utils.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # 2021/6/19 @ tongshiwei 3 | 4 | from tqdm import tqdm 5 | import os 6 | import pandas as pd 7 | from longling import print_time 8 | import numpy as np 9 | 10 | 11 | def extract_item(data_src, knowledge_num, params): 12 | with print_time("loading data from %s" % os.path.abspath(data_src), params.logger): 13 | knowledge = {} 14 | for record in tqdm(pd.read_csv(data_src).to_dict("records"), "reading records from %s" % data_src): 15 | knowledge_code_vector = [0] * knowledge_num 16 | for code in eval(record["knowledge_code"]): 17 | assert code >= 1 18 | knowledge_code_vector[code - 1] = 1 19 | knowledge[record["item_id"]] = np.asarray(knowledge_code_vector) 20 | return knowledge 21 | -------------------------------------------------------------------------------- /EduCDM/IRR/loss.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # 2021/6/19 @ tongshiwei 3 | 4 | import torch 5 | from torch import nn 6 | 7 | 8 | def loss_mask(loss_list, n_samples): 9 | return [(i <= n_samples) * loss for i, loss in enumerate(loss_list)] 10 | 11 | 12 | class PairSCELoss(nn.Module): 13 | def __init__(self): 14 | super(PairSCELoss, self).__init__() 15 | self._loss = nn.CrossEntropyLoss() 16 | 17 | def forward(self, pred1, pred2, sign=1, *args): 18 | """ 19 | sign is either 1 or -1 20 | could be seen as predicting the sign based on the pred1 and pred2 21 | 1: pred1 should be greater than pred2 22 | -1: otherwise 23 | """ 24 | pred = torch.stack([pred1, pred2], dim=1) 25 | return self._loss(pred, ((torch.ones(pred1.shape[0], device=pred.device) - sign) / 2).long()) 26 | 27 | 28 | class HarmonicLoss(object): 29 | def __init__(self, zeta: (int, float) = 0.): 30 | self.zeta = zeta 31 | 32 | def __call__(self, point_wise_loss, pair_pred_loss, *args, **kwargs): 33 | return ((1 - self.zeta) * point_wise_loss + self.zeta * pair_pred_loss).mean() 34 | 
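# A runnable sketch of how the pieces above compose, mirroring the IRR
# training loops in IRR/IRT.py and friends (all values are illustrative only):
if __name__ == '__main__':
    pos = torch.tensor([0.8, 0.6])   # predicted scores of observed responders
    neg = torch.tensor([0.3, 0.7])   # predicted scores of sampled contrast users
    # sign = +1: the observed responders should rank above the sampled users
    pair_losses = [PairSCELoss()(pos, neg, torch.tensor([1.0, 1.0]))]
    pair_loss = sum(loss_mask(pair_losses, torch.tensor([1, 1])))
    point_loss = nn.BCELoss()(pos, torch.tensor([1.0, 1.0]))
    # total = (1 - zeta) * point_loss + zeta * pair_loss, averaged
    print(float(HarmonicLoss(0.5)(point_loss, pair_loss)))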
-------------------------------------------------------------------------------- /EduCDM/IRT/EM/__init__.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # 2021/5/2 @ liujiayu 3 | 4 | from .IRT import IRT 5 | -------------------------------------------------------------------------------- /EduCDM/IRT/GD/IRT.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # 2021/4/23 @ tongshiwei 3 | 4 | import logging 5 | import numpy as np 6 | import torch 7 | from EduCDM import CDM 8 | from torch import nn 9 | import torch.nn.functional as F 10 | from tqdm import tqdm 11 | from ..irt import irt3pl 12 | from sklearn.metrics import roc_auc_score, accuracy_score 13 | 14 | 15 | class IRTNet(nn.Module): 16 | def __init__(self, user_num, item_num, value_range, a_range, irf_kwargs=None): 17 | super(IRTNet, self).__init__() 18 | self.user_num = user_num 19 | self.item_num = item_num 20 | self.irf_kwargs = irf_kwargs if irf_kwargs is not None else {} 21 | self.theta = nn.Embedding(self.user_num, 1) 22 | self.a = nn.Embedding(self.item_num, 1) 23 | self.b = nn.Embedding(self.item_num, 1) 24 | self.c = nn.Embedding(self.item_num, 1) 25 | self.value_range = value_range 26 | self.a_range = a_range 27 | 28 | def forward(self, user, item): 29 | theta = torch.squeeze(self.theta(user), dim=-1) 30 | a = torch.squeeze(self.a(item), dim=-1) 31 | b = torch.squeeze(self.b(item), dim=-1) 32 | c = torch.squeeze(self.c(item), dim=-1) 33 | c = torch.sigmoid(c) 34 | if self.value_range is not None: 35 | theta = self.value_range * (torch.sigmoid(theta) - 0.5) 36 | b = self.value_range * (torch.sigmoid(b) - 0.5) 37 | if self.a_range is not None: 38 | a = self.a_range * torch.sigmoid(a) 39 | else: 40 | a = F.softplus(a) 41 | if torch.max(theta != theta) or torch.max(a != a) or torch.max(b != b): # pragma: no cover 42 | raise ValueError('ValueError:theta,a,b may contains nan! 
The value_range or a_range is too large.') 43 | return self.irf(theta, a, b, c, **self.irf_kwargs) 44 | 45 | @classmethod 46 | def irf(cls, theta, a, b, c, **kwargs): 47 | return irt3pl(theta, a, b, c, F=torch, **kwargs) 48 | 49 | 50 | class IRT(CDM): 51 | def __init__(self, user_num, item_num, value_range=None, a_range=None): 52 | super(IRT, self).__init__() 53 | self.irt_net = IRTNet(user_num, item_num, value_range, a_range) 54 | 55 | def train(self, train_data, test_data=None, *, epoch: int, device="cpu", lr=0.001) -> ...: 56 | self.irt_net = self.irt_net.to(device) 57 | loss_function = nn.BCELoss() 58 | 59 | trainer = torch.optim.Adam(self.irt_net.parameters(), lr) 60 | 61 | for e in range(epoch): 62 | losses = [] 63 | for batch_data in tqdm(train_data, "Epoch %s" % e): 64 | user_id, item_id, response = batch_data 65 | user_id: torch.Tensor = user_id.to(device) 66 | item_id: torch.Tensor = item_id.to(device) 67 | predicted_response: torch.Tensor = self.irt_net(user_id, item_id) 68 | response: torch.Tensor = response.to(device) 69 | loss = loss_function(predicted_response, response) 70 | 71 | # back propagation 72 | trainer.zero_grad() 73 | loss.backward() 74 | trainer.step() 75 | 76 | losses.append(loss.mean().item()) 77 | print("[Epoch %d] LogisticLoss: %.6f" % (e, float(np.mean(losses)))) 78 | 79 | if test_data is not None: 80 | auc, accuracy = self.eval(test_data, device=device) 81 | print("[Epoch %d] auc: %.6f, accuracy: %.6f" % (e, auc, accuracy)) 82 | 83 | def eval(self, test_data, device="cpu") -> tuple: 84 | self.irt_net = self.irt_net.to(device) 85 | self.irt_net.eval() 86 | y_pred = [] 87 | y_true = [] 88 | for batch_data in tqdm(test_data, "evaluating"): 89 | user_id, item_id, response = batch_data 90 | user_id: torch.Tensor = user_id.to(device) 91 | item_id: torch.Tensor = item_id.to(device) 92 | pred: torch.Tensor = self.irt_net(user_id, item_id) 93 | y_pred.extend(pred.tolist()) 94 | y_true.extend(response.tolist()) 95 | 96 | self.irt_net.train() 97 | return roc_auc_score(y_true, y_pred), accuracy_score(y_true, np.array(y_pred) >= 0.5) 98 | 99 | def save(self, filepath): 100 | torch.save(self.irt_net.state_dict(), filepath) 101 | logging.info("save parameters to %s" % filepath) 102 | 103 | def load(self, filepath): 104 | self.irt_net.load_state_dict(torch.load(filepath)) 105 | logging.info("load parameters from %s" % filepath) 106 | -------------------------------------------------------------------------------- /EduCDM/IRT/GD/__init__.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # 2021/4/23 @ tongshiwei 3 | 4 | from .IRT import IRT 5 | -------------------------------------------------------------------------------- /EduCDM/IRT/__init__.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # 2021/4/23 @ tongshiwei 3 | 4 | 5 | from .GD import IRT as GDIRT 6 | from .EM import IRT as EMIRT 7 | -------------------------------------------------------------------------------- /EduCDM/IRT/irt.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # 2021/4/23 @ tongshiwei 3 | 4 | import numpy as np 5 | 6 | __all__ = ["irf", "irt3pl"] 7 | 8 | 9 | def irf(theta, a, b, c, D=1.702, *, F=np): 10 | return c + (1 - c) / (1 + F.exp(-D * a * (theta - b))) 11 | 12 | 13 | irt3pl = irf 14 | -------------------------------------------------------------------------------- /EduCDM/KaNCD/__init__.py: 
-------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # 2021/4/1 @ WangFei 3 | 4 | from .KaNCD import KaNCD 5 | -------------------------------------------------------------------------------- /EduCDM/MCD/MCD.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # 2021/3/23 @ tongshiwei 3 | 4 | import logging 5 | import numpy as np 6 | import torch 7 | from tqdm import tqdm 8 | from torch import nn 9 | from EduCDM import CDM 10 | from sklearn.metrics import roc_auc_score, accuracy_score 11 | 12 | 13 | class MFNet(nn.Module): 14 | """Matrix Factorization Network""" 15 | 16 | def __init__(self, user_num, item_num, latent_dim): 17 | super(MFNet, self).__init__() 18 | self.user_num = user_num 19 | self.item_num = item_num 20 | self.latent_dim = latent_dim 21 | self.user_embedding = nn.Embedding(self.user_num, self.latent_dim) 22 | self.item_embedding = nn.Embedding(self.item_num, self.latent_dim) 23 | self.response = nn.Linear(2 * self.latent_dim, 1) 24 | 25 | def forward(self, user_id, item_id): 26 | user = self.user_embedding(user_id) 27 | item = self.item_embedding(item_id) 28 | return torch.squeeze(torch.sigmoid(self.response(torch.cat([user, item], dim=-1))), dim=-1) 29 | 30 | 31 | class MCD(CDM): 32 | """Matrix factorization based Cognitive Diagnosis Model""" 33 | 34 | def __init__(self, user_num, item_num, latent_dim): 35 | super(MCD, self).__init__() 36 | self.mf_net = MFNet(user_num, item_num, latent_dim) 37 | 38 | def train(self, train_data, test_data=None, *, epoch: int, device="cpu", lr=0.001) -> ...: 39 | self.mf_net = self.mf_net.to(device) 40 | loss_function = nn.BCELoss() 41 | 42 | trainer = torch.optim.Adam(self.mf_net.parameters(), lr) 43 | 44 | for e in range(epoch): 45 | losses = [] 46 | for batch_data in tqdm(train_data, "Epoch %s" % e): 47 | user_id, item_id, response = batch_data 48 | user_id: torch.Tensor = user_id.to(device) 49 | item_id: torch.Tensor = item_id.to(device) 50 | predicted_response: torch.Tensor = self.mf_net(user_id, item_id) 51 | response: torch.Tensor = response.to(device) 52 | loss = loss_function(predicted_response, response) 53 | 54 | # back propagation 55 | trainer.zero_grad() 56 | loss.backward() 57 | trainer.step() 58 | 59 | losses.append(loss.mean().item()) 60 | print("[Epoch %d] LogisticLoss: %.6f" % (e, float(np.mean(losses)))) 61 | 62 | if test_data is not None: 63 | auc, accuracy = self.eval(test_data, device=device) 64 | print("[Epoch %d] auc: %.6f, accuracy: %.6f" % (e, auc, accuracy)) 65 | 66 | def eval(self, test_data, device="cpu") -> tuple: 67 | self.mf_net = self.mf_net.to(device) 68 | self.mf_net.eval() 69 | y_pred = [] 70 | y_true = [] 71 | for batch_data in tqdm(test_data, "evaluating"): 72 | user_id, item_id, response = batch_data 73 | user_id: torch.Tensor = user_id.to(device) 74 | item_id: torch.Tensor = item_id.to(device) 75 | pred: torch.Tensor = self.mf_net(user_id, item_id) 76 | y_pred.extend(pred.tolist()) 77 | y_true.extend(response.tolist()) 78 | 79 | self.mf_net.train() 80 | return roc_auc_score(y_true, y_pred), accuracy_score(y_true, np.array(y_pred) >= 0.5) 81 | 82 | def save(self, filepath): 83 | torch.save(self.mf_net.state_dict(), filepath) 84 | logging.info("save parameters to %s" % filepath) 85 | 86 | def load(self, filepath): 87 | self.mf_net.load_state_dict(torch.load(filepath)) 88 | logging.info("load parameters from %s" % filepath) 89 | 
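# A minimal usage sketch of MCD on synthetic response logs (sizes and data are
# made up; see examples/MCD for real data preparation):
if __name__ == '__main__':
    from torch.utils.data import TensorDataset, DataLoader
    user_num, item_num, latent_dim, n = 50, 100, 16, 512
    data = DataLoader(
        TensorDataset(
            torch.randint(0, user_num, (n,)),    # user_id
            torch.randint(0, item_num, (n,)),    # item_id
            torch.randint(0, 2, (n,)).float(),   # response (0/1)
        ),
        batch_size=64,
    )
    cdm = MCD(user_num, item_num, latent_dim)
    cdm.train(data, epoch=2)
    auc, accuracy = cdm.eval(data)
    print("auc: %.6f, accuracy: %.6f" % (auc, accuracy))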
-------------------------------------------------------------------------------- /EduCDM/MCD/__init__.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # 2021/3/23 @ tongshiwei 3 | 4 | 5 | from .MCD import MCD 6 | -------------------------------------------------------------------------------- /EduCDM/MIRT/MIRT.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # 2021/7/1 @ tongshiwei 3 | 4 | 5 | import logging 6 | import numpy as np 7 | import torch 8 | from EduCDM import CDM 9 | from torch import nn 10 | import torch.nn.functional as F 11 | from tqdm import tqdm 12 | from sklearn.metrics import roc_auc_score, accuracy_score 13 | 14 | 15 | def irt2pl(theta, a, b, *, F=np): 16 | """ 17 | 18 | Parameters 19 | ---------- 20 | theta 21 | a 22 | b 23 | F 24 | 25 | Returns 26 | ------- 27 | 28 | Examples 29 | -------- 30 | >>> theta = [1, 0.5, 0.3] 31 | >>> a = [-3, 1, 3] 32 | >>> b = 0.5 33 | >>> float(irt2pl(theta, a, b)) # doctest: +ELLIPSIS 34 | 0.109... 35 | >>> theta = [[1, 0.5, 0.3], [2, 1, 0]] 36 | >>> a = [[-3, 1, 3], [-3, 1, 3]] 37 | >>> b = [0.5, 0.5] 38 | >>> irt2pl(theta, a, b) # doctest: +ELLIPSIS 39 | array([0.109..., 0.004...]) 40 | """ 41 | return 1 / (1 + F.exp(- F.sum(F.multiply(a, theta), axis=-1) + b)) 42 | 43 | 44 | class MIRTNet(nn.Module): 45 | def __init__(self, user_num, item_num, latent_dim, a_range, irf_kwargs=None): 46 | super(MIRTNet, self).__init__() 47 | self.user_num = user_num 48 | self.item_num = item_num 49 | self.irf_kwargs = irf_kwargs if irf_kwargs is not None else {} 50 | self.theta = nn.Embedding(self.user_num, latent_dim) 51 | self.a = nn.Embedding(self.item_num, latent_dim) 52 | self.b = nn.Embedding(self.item_num, 1) 53 | self.a_range = a_range 54 | 55 | def forward(self, user, item): 56 | theta = torch.squeeze(self.theta(user), dim=-1) 57 | a = torch.squeeze(self.a(item), dim=-1) 58 | if self.a_range is not None: 59 | a = self.a_range * torch.sigmoid(a) 60 | else: 61 | a = F.softplus(a) 62 | b = torch.squeeze(self.b(item), dim=-1) 63 | if torch.max(theta != theta) or torch.max(a != a) or torch.max(b != b): # pragma: no cover 64 | raise ValueError('ValueError:theta,a,b may contains nan! 
The a_range is too large.') 65 | return self.irf(theta, a, b, **self.irf_kwargs) 66 | 67 | @classmethod 68 | def irf(cls, theta, a, b, **kwargs): 69 | return irt2pl(theta, a, b, F=torch) 70 | 71 | 72 | class MIRT(CDM): 73 | def __init__(self, user_num, item_num, latent_dim, a_range=None): 74 | super(MIRT, self).__init__() 75 | self.irt_net = MIRTNet(user_num, item_num, latent_dim, a_range) 76 | 77 | def train(self, train_data, test_data=None, *, epoch: int, device="cpu", lr=0.001) -> ...: 78 | self.irt_net = self.irt_net.to(device) 79 | loss_function = nn.BCELoss() 80 | 81 | trainer = torch.optim.Adam(self.irt_net.parameters(), lr) 82 | 83 | for e in range(epoch): 84 | losses = [] 85 | for batch_data in tqdm(train_data, "Epoch %s" % e): 86 | user_id, item_id, response = batch_data 87 | user_id: torch.Tensor = user_id.to(device) 88 | item_id: torch.Tensor = item_id.to(device) 89 | predicted_response: torch.Tensor = self.irt_net(user_id, item_id) 90 | response: torch.Tensor = response.to(device) 91 | loss = loss_function(predicted_response, response) 92 | 93 | # back propagation 94 | trainer.zero_grad() 95 | loss.backward() 96 | trainer.step() 97 | 98 | losses.append(loss.mean().item()) 99 | print("[Epoch %d] LogisticLoss: %.6f" % (e, float(np.mean(losses)))) 100 | 101 | if test_data is not None: 102 | auc, accuracy = self.eval(test_data, device=device) 103 | print("[Epoch %d] auc: %.6f, accuracy: %.6f" % (e, auc, accuracy)) 104 | 105 | def eval(self, test_data, device="cpu") -> tuple: 106 | self.irt_net = self.irt_net.to(device) 107 | self.irt_net.eval() 108 | y_pred = [] 109 | y_true = [] 110 | for batch_data in tqdm(test_data, "evaluating"): 111 | user_id, item_id, response = batch_data 112 | user_id: torch.Tensor = user_id.to(device) 113 | item_id: torch.Tensor = item_id.to(device) 114 | pred: torch.Tensor = self.irt_net(user_id, item_id) 115 | y_pred.extend(pred.tolist()) 116 | y_true.extend(response.tolist()) 117 | 118 | self.irt_net.train() 119 | return roc_auc_score(y_true, y_pred), accuracy_score(y_true, np.array(y_pred) >= 0.5) 120 | 121 | def save(self, filepath): 122 | torch.save(self.irt_net.state_dict(), filepath) 123 | logging.info("save parameters to %s" % filepath) 124 | 125 | def load(self, filepath): 126 | self.irt_net.load_state_dict(torch.load(filepath)) 127 | logging.info("load parameters from %s" % filepath) 128 | -------------------------------------------------------------------------------- /EduCDM/MIRT/__init__.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # 2021/7/1 @ tongshiwei 3 | 4 | from .MIRT import MIRT 5 | -------------------------------------------------------------------------------- /EduCDM/NCDM/NCDM.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # 2021/4/1 @ WangFei 3 | 4 | import logging 5 | import torch 6 | import torch.nn as nn 7 | import torch.optim as optim 8 | import torch.nn.functional as F 9 | import numpy as np 10 | from tqdm import tqdm 11 | from sklearn.metrics import roc_auc_score, accuracy_score 12 | from EduCDM import CDM 13 | 14 | 15 | class PosLinear(nn.Linear): 16 | def forward(self, input: torch.Tensor) -> torch.Tensor: 17 | weight = 2 * F.relu(1 * torch.neg(self.weight)) + self.weight 18 | return F.linear(input, weight, self.bias) 19 | 20 | 21 | class Net(nn.Module): 22 | 23 | def __init__(self, knowledge_n, exer_n, student_n): 24 | self.knowledge_dim = knowledge_n 25 | self.exer_n = exer_n 26 | self.emb_num = 
student_n 27 | self.stu_dim = self.knowledge_dim 28 | self.prednet_input_len = self.knowledge_dim 29 | self.prednet_len1, self.prednet_len2 = 512, 256 # changeable 30 | 31 | super(Net, self).__init__() 32 | 33 | # prediction sub-net 34 | self.student_emb = nn.Embedding(self.emb_num, self.stu_dim) 35 | self.k_difficulty = nn.Embedding(self.exer_n, self.knowledge_dim) 36 | self.e_difficulty = nn.Embedding(self.exer_n, 1) 37 | self.prednet_full1 = PosLinear(self.prednet_input_len, self.prednet_len1) 38 | self.drop_1 = nn.Dropout(p=0.5) 39 | self.prednet_full2 = PosLinear(self.prednet_len1, self.prednet_len2) 40 | self.drop_2 = nn.Dropout(p=0.5) 41 | self.prednet_full3 = PosLinear(self.prednet_len2, 1) 42 | 43 | # initialize 44 | for name, param in self.named_parameters(): 45 | if 'weight' in name: 46 | nn.init.xavier_normal_(param) 47 | 48 | def forward(self, stu_id, input_exercise, input_knowledge_point): 49 | # before prednet 50 | stu_emb = self.student_emb(stu_id) 51 | stat_emb = torch.sigmoid(stu_emb) 52 | k_difficulty = torch.sigmoid(self.k_difficulty(input_exercise)) 53 | e_difficulty = torch.sigmoid(self.e_difficulty(input_exercise)) # * 10 54 | # prednet 55 | input_x = e_difficulty * (stat_emb - k_difficulty) * input_knowledge_point 56 | input_x = self.drop_1(torch.sigmoid(self.prednet_full1(input_x))) 57 | input_x = self.drop_2(torch.sigmoid(self.prednet_full2(input_x))) 58 | output_1 = torch.sigmoid(self.prednet_full3(input_x)) 59 | 60 | return output_1.view(-1) 61 | 62 | 63 | class NCDM(CDM): 64 | '''Neural Cognitive Diagnosis Model''' 65 | 66 | def __init__(self, knowledge_n, exer_n, student_n): 67 | super(NCDM, self).__init__() 68 | self.ncdm_net = Net(knowledge_n, exer_n, student_n) 69 | 70 | def train(self, train_data, test_data=None, epoch=10, device="cpu", lr=0.002, silence=False): 71 | self.ncdm_net = self.ncdm_net.to(device) 72 | self.ncdm_net.train() 73 | loss_function = nn.BCELoss() 74 | optimizer = optim.Adam(self.ncdm_net.parameters(), lr=lr) 75 | for epoch_i in range(epoch): 76 | epoch_losses = [] 77 | batch_count = 0 78 | for batch_data in tqdm(train_data, "Epoch %s" % epoch_i): 79 | batch_count += 1 80 | user_id, item_id, knowledge_emb, y = batch_data 81 | user_id: torch.Tensor = user_id.to(device) 82 | item_id: torch.Tensor = item_id.to(device) 83 | knowledge_emb: torch.Tensor = knowledge_emb.to(device) 84 | y: torch.Tensor = y.to(device) 85 | pred: torch.Tensor = self.ncdm_net(user_id, item_id, knowledge_emb) 86 | loss = loss_function(pred, y) 87 | 88 | optimizer.zero_grad() 89 | loss.backward() 90 | optimizer.step() 91 | 92 | epoch_losses.append(loss.mean().item()) 93 | 94 | print("[Epoch %d] average loss: %.6f" % (epoch_i, float(np.mean(epoch_losses)))) 95 | 96 | if test_data is not None: 97 | auc, accuracy = self.eval(test_data, device=device) 98 | print("[Epoch %d] auc: %.6f, accuracy: %.6f" % (epoch_i, auc, accuracy)) 99 | 100 | def eval(self, test_data, device="cpu"): 101 | self.ncdm_net = self.ncdm_net.to(device) 102 | self.ncdm_net.eval() 103 | y_true, y_pred = [], [] 104 | for batch_data in tqdm(test_data, "Evaluating"): 105 | user_id, item_id, knowledge_emb, y = batch_data 106 | user_id: torch.Tensor = user_id.to(device) 107 | item_id: torch.Tensor = item_id.to(device) 108 | knowledge_emb: torch.Tensor = knowledge_emb.to(device) 109 | pred: torch.Tensor = self.ncdm_net(user_id, item_id, knowledge_emb) 110 | y_pred.extend(pred.detach().cpu().tolist()) 111 | y_true.extend(y.tolist()) 112 | 113 | return roc_auc_score(y_true, y_pred), accuracy_score(y_true, 
np.array(y_pred) >= 0.5) 114 | 115 | def save(self, filepath): 116 | torch.save(self.ncdm_net.state_dict(), filepath) 117 | logging.info("save parameters to %s" % filepath) 118 | 119 | def load(self, filepath): 120 | self.ncdm_net.load_state_dict(torch.load(filepath)) # , map_location=lambda s, loc: s 121 | logging.info("load parameters from %s" % filepath) 122 | -------------------------------------------------------------------------------- /EduCDM/NCDM/__init__.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # 2021/4/1 @ WangFei 3 | 4 | from .NCDM import NCDM 5 | -------------------------------------------------------------------------------- /EduCDM/__init__.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # 2021/3/17 @ tongshiwei 3 | 4 | 5 | from .meta import CDM 6 | from .MCD import MCD 7 | from .DINA import EMDINA, GDDINA 8 | from .FuzzyCDF import FuzzyCDF 9 | from .NCDM import NCDM 10 | from .IRT import EMIRT, GDIRT 11 | from .MIRT import MIRT 12 | from .KaNCD import KaNCD 13 | -------------------------------------------------------------------------------- /EduCDM/meta.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # 2021/3/17 @ tongshiwei 3 | 4 | 5 | def etl(*args, **kwargs) -> ...: # pragma: no cover 6 | """ 7 | extract - transform - load 8 | """ 9 | pass 10 | 11 | 12 | def train(*args, **kwargs) -> ...: # pragma: no cover 13 | pass 14 | 15 | 16 | def evaluate(*args, **kwargs) -> ...: # pragma: no cover 17 | pass 18 | 19 | 20 | class CDM(object): 21 | def __init__(self, *args, **kwargs) -> ...: 22 | pass 23 | 24 | def train(self, *args, **kwargs) -> ...: 25 | raise NotImplementedError 26 | 27 | def eval(self, *args, **kwargs) -> ...: 28 | raise NotImplementedError 29 | 30 | def save(self, *args, **kwargs) -> ...: 31 | raise NotImplementedError 32 | 33 | def load(self, *args, **kwargs) -> ...: 34 | raise NotImplementedError 35 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | VERSION=`ls dist/*.tar.gz | sed "s/dist\/CDM-\(.*\)\.tar\.gz/\1/g"` 2 | 3 | ifdef ENVPIP 4 | PIP = $(ENVPIP) 5 | else 6 | PIP = pip3 7 | endif 8 | 9 | ifdef ENVPYTHON 10 | PYTHON = $(ENVPYTHON) 11 | else 12 | PYTHON = python3 13 | endif 14 | 15 | ifdef ENVPYTEST 16 | PYTEST = $(ENVPYTEST) 17 | else 18 | PYTEST = pytest 19 | endif 20 | 21 | help: 22 | 23 | @echo "install install CDM" 24 | @echo "test run test" 25 | @echo "release publish to PyPI and release in github" 26 | @echo "release_test publish to TestPyPI" 27 | @echo "clean remove all build, test, coverage and Python artifacts" 28 | @echo "clean-build remove build artifacts" 29 | @echo "clean-pyc remove Python file artifacts" 30 | @echo "clean-test remove test and coverage artifacts" 31 | 32 | .PHONY: install, test, build, release, release_test, version, .test, .build, clean 33 | 34 | install: 35 | @echo "install CDM" 36 | $(PIP) install -e . 
--user 37 | 38 | test: 39 | 	@echo "run test" 40 | 	$(PYTEST) 41 | 42 | build: test clean 43 | 	$(PYTHON) setup.py bdist_wheel sdist 44 | 45 | .test: 46 | 	$(PYTEST) > /dev/null 47 | 48 | .build: clean 49 | 	$(PYTHON) setup.py bdist_wheel sdist > /dev/null 50 | 51 | version: .build 52 | 	@echo $(VERSION) 53 | 54 | release: test build 55 | 	@echo "publish to pypi and release in github" 56 | 	@echo "version $(VERSION)" 57 | 58 | 	-@twine upload dist/* && git tag "v$(VERSION)" 59 | 	git push && git push --tags 60 | 61 | release_test: test build 62 | 	@echo "publish to test pypi" 63 | 	@echo "version $(VERSION)" 64 | 65 | 	-@twine upload --repository test dist/* 66 | 67 | clean: clean-build clean-pyc clean-test 68 | 69 | clean-build: 70 | 	rm -rf build/* 71 | 	rm -rf dist/* 72 | 	rm -rf .eggs/* 73 | 	find . -name '*.egg-info' -exec rm -fr {} + 74 | 	find . -name '*.egg' -exec rm -f {} + 75 | 76 | clean-pyc: 77 | 	find . -name '*.pyc' -exec rm -f {} + 78 | 	find . -name '*.pyo' -exec rm -f {} + 79 | 	find . -name '*~' -exec rm -f {} + 80 | 	find . -name '__pycache__' -exec rm -rf {} + 81 | 82 | clean-test: 83 | 	rm -f .coverage -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 

2 | <!-- logo: docs/_static/EduCDM.png --> 3 | 

4 | 5 | # EduCDM 6 | 7 | 8 | [![PyPI](https://img.shields.io/pypi/v/EduCDM.svg)](https://pypi.python.org/pypi/EduCDM) 9 | [![test](https://github.com/bigdata-ustc/EduCDM/actions/workflows/python-test.yml/badge.svg?branch=main)](https://github.com/bigdata-ustc/EduCDM/actions/workflows/python-test.yml) 10 | [![codecov](https://codecov.io/gh/bigdata-ustc/EduCDM/branch/main/graph/badge.svg?token=B7gscOGQLD)](https://codecov.io/gh/bigdata-ustc/EduCDM) 11 | [![Download](https://img.shields.io/pypi/dm/EduCDM.svg?style=flat)](https://pypi.python.org/pypi/EduCDM) 12 | [![License](https://img.shields.io/github/license/bigdata-ustc/EduCDM)](LICENSE) 13 | [![DOI](https://zenodo.org/badge/348569904.svg)](https://zenodo.org/badge/latestdoi/348569904) 14 | 15 | The Model Zoo of Cognitive Diagnosis Models, including classic Item Response Theory (**IRT**), Multidimensional Item Response Theory (**MIRT**) and the Deterministic Input, Noisy "And" model (**DINA**), as well as the advanced Fuzzy Cognitive Diagnosis Framework (**FuzzyCDF**), Neural Cognitive Diagnosis Model (**NCDM**), Item Response Ranking framework (**IRR**), Incremental Cognitive Diagnosis (**ICD**) and the Knowledge-association based extension of NeuralCD (**KaNCD**). 16 | 17 | ## Brief introduction to CDM 18 | 19 | A cognitive diagnosis model (CDM) for intelligent educational systems infers students' knowledge states from their learning behaviors (especially exercise response logs). 20 | 21 | 22 | 23 | Typically, the inputs of a CDM are the students' response logs on items (i.e., exercises/questions) and the Q-matrix that denotes the correlation between items and knowledge concepts (skills). The output is the diagnosed student knowledge state, such as the students' abilities and their proficiency on each knowledge concept. 24 | 25 | 26 | 27 | Traditional CDMs include: 28 | 29 | - [IRT](https://link.springer.com/book/10.1007/978-0-387-89976-3): item response theory, a continuous unidimensional CDM with a logistic-like item response function. 30 | - [MIRT](https://link.springer.com/book/10.1007/978-0-387-89976-3): multidimensional item response theory, a continuous multidimensional CDM with a logistic-like item response function, mostly extended from unidimensional IRT. 31 | - [DINA](https://journals.sagepub.com/doi/10.3102/1076998607309474): deterministic input, noisy "and" model, a discrete multidimensional CDM. The Q-matrix is used to model the effect of knowledge concepts in the cognitive process, together with guessing and slipping factors. 32 | 33 | etc. 34 | 35 | More recent research on CDMs includes: 36 | 37 | - [FuzzyCDF](http://staff.ustc.edu.cn/~qiliuql/files/Publications/Qi-Liu-TIST2018.pdf): fuzzy cognitive diagnosis framework, a continuous multidimensional CDM for students' cognitive modeling with both objective and subjective items. 38 | - [NeuralCD](http://staff.ustc.edu.cn/~cheneh/paper_pdf/2020/Fei-Wang-AAAI.pdf): neural cognitive diagnosis framework, a neural-network-based general cognitive diagnosis framework. In this repository we provide its basic implementation, NCDM. 39 | - [IRR](http://home.ustc.edu.cn/~tongsw/files/IRR.pdf): item response ranking framework, a pairwise cognitive diagnosis framework. In this repository we provide IRR implementations for most of the CDMs. 40 | - [ICD](docs/ICD.md): Incremental Cognitive Diagnosis, a framework that tailors cognitive diagnosis to the online scenario of intelligent education. In this repository we provide incremental implementations for most of the CDMs. 
41 | - [KaNCD](https://ieeexplore.ieee.org/abstract/document/9865139): extended from the NeuralCD framework. We use high-order latent traits of students, exercises and knowledge concepts to capture latent associations among knowledge concepts. 42 | 43 | ## List of models 44 | 45 | * [NCDM](EduCDM/NCDM) [[doc]](docs/NCDM.md) [[example]](examples/NCDM) 46 | * [FuzzyCDF](EduCDM/FuzzyCDF) [[doc]](docs/FuzzyCDF.md) [[example]](examples/FuzzyCDF) 47 | * [DINA](EduCDM/DINA) [[doc]](docs/DINA.md) [[example]](examples/DINA) 48 |   * Expectation Maximization ([EMDINA](EduCDM/DINA/EM)) [[example]](examples/DINA/EM) 49 |   * Gradient Descent ([GDDINA](EduCDM/DINA/GD)) [[example]](examples/DINA/GD) 50 | * [MIRT](EduCDM/MIRT) [[doc]](docs/MIRT.md) [[example]](examples/MIRT) 51 | * [IRT](EduCDM/IRT) [[doc]](docs/IRT.md) [[example]](examples/IRT) 52 |   * Expectation Maximization ([EMIRT](EduCDM/IRT/EM)) [[example]](examples/IRT/EM) 53 |   * Gradient Descent ([GDIRT](EduCDM/IRT/GD)) [[example]](examples/IRT/GD) 54 | * [MCD](EduCDM/MCD) [[doc]](docs/MCD.md) [[example]](examples/MCD) 55 | * [IRR](EduCDM/IRR) [[doc]](docs/IRR.md) [[example]](examples/IRR) 56 |   * [IRR-NCDM](examples/IRR/NCDM.ipynb) 57 |   * [IRR-MIRT](examples/IRR/MIRT.ipynb) 58 |   * [IRR-DINA](examples/IRR/DINA.ipynb) 59 |   * [IRR-IRT](examples/IRR/IRT.ipynb) 60 | * [ICD](EduCDM/ICD) [[doc]](docs/ICD.md) 61 | * [KaNCD](EduCDM/KaNCD) [[doc]](docs/KaNCD.md) [[example]](examples/KaNCD) 62 | ## Installation 63 | 64 | Clone the repository and install it with `pip`: 65 | 66 | ``` 67 | git clone https://github.com/bigdata-ustc/EduCDM.git 68 | cd path/to/code 69 | pip install . 70 | ``` 71 | 72 | Or install directly from PyPI: 73 | 74 | ``` 75 | pip install EduCDM 76 | ``` 77 | 78 | 79 | ## Contribute 80 | 81 | EduCDM is still under development. More algorithms and features are going to be added and we always welcome contributions to help make EduCDM better. If you would like to contribute, please follow this [guideline](CONTRIBUTE.md). 82 | 83 | ## Citation 84 | 85 | If this repository is helpful for you, please cite our work: 86 | 87 | ``` 88 | @misc{bigdata2021educdm, 89 |   title={EduCDM}, 90 |   author={bigdata-ustc}, 91 |   publisher = {GitHub}, 92 |   journal = {GitHub repository}, 93 |   year = {2021}, 94 |   howpublished = {\url{https://github.com/bigdata-ustc/EduCDM}}, 95 | } 96 | ``` 97 | 98 | ## Reference 99 | 100 | [1] Liu Q, Wu R, Chen E, et al. Fuzzy cognitive diagnosis for modelling examinee performance[J]. ACM Transactions on Intelligent Systems and Technology (TIST), 2018, 9(4): 1-26. 101 | 102 | [2] Wang F, Liu Q, Chen E, et al. Neural cognitive diagnosis for intelligent education systems[C]//Proceedings of the AAAI Conference on Artificial Intelligence. 2020, 34(04): 6153-6161. 103 | 104 | [3] Tong S, Liu Q, Yu R, et al. Item response ranking for cognitive diagnosis[C]. IJCAI, 2021. 105 | 106 | [4] Wang F, Liu Q, Chen E, et al. NeuralCD: A General Framework for Cognitive Diagnosis. IEEE Transactions on Knowledge and Data Engineering (IEEE TKDE), accepted, 2022. -------------------------------------------------------------------------------- /docs/DINA.md: -------------------------------------------------------------------------------- 1 | # Deterministic Inputs, Noisy “And” gate model 2 | 3 | If the reader wants to know the details of DINA, please refer to the Appendix of the paper: *[DINA model and parameter estimation: A didactic](https://journals.sagepub.com/doi/10.3102/1076998607309474)*. 
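In brief (this is the standard DINA formulation from the paper above; notation follows common usage rather than this repository's code), the probability that student $i$ answers item $j$ correctly is

$$
P(X_{ij}=1 \mid \boldsymbol{\alpha}_i) = (1 - s_j)^{\eta_{ij}} \, g_j^{1 - \eta_{ij}}, \qquad \eta_{ij} = \prod_{k=1}^{K} \alpha_{ik}^{q_{jk}},
$$

where $\alpha_{ik} \in \{0, 1\}$ indicates whether student $i$ masters skill $k$, $q_{jk}$ is the Q-matrix entry of item $j$, and $s_j$, $g_j$ are the item's slip and guess probabilities.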
4 | ```bibtex 5 | @article{de2009dina, 6 |   title={DINA model and parameter estimation: A didactic}, 7 |   author={De La Torre, Jimmy}, 8 |   journal={Journal of educational and behavioral statistics}, 9 |   volume={34}, 10 |   number={1}, 11 |   pages={115--130}, 12 |   year={2009}, 13 |   publisher={Sage Publications Sage CA: Los Angeles, CA} 14 | } 15 | ``` 16 | 17 | ![model](_static/DINA.png) 18 | -------------------------------------------------------------------------------- /docs/FuzzyCDF.md: -------------------------------------------------------------------------------- 1 | # Fuzzy cognitive diagnosis framework 2 | 3 | If the reader wants to know the details of FuzzyCDF, please refer to Chapter 4 of the paper: *[Fuzzy Cognitive Diagnosis for Modelling Examinee Performance](http://staff.ustc.edu.cn/~qiliuql/files/Publications/Qi-Liu-TIST2018.pdf)*. 4 | ```bibtex 5 | @article{liu2018fuzzy, 6 |   title={Fuzzy cognitive diagnosis for modelling examinee performance}, 7 |   author={Liu, Qi and Wu, Runze and Chen, Enhong and Xu, Guandong and Su, Yu and Chen, Zhigang and Hu, Guoping}, 8 |   journal={ACM Transactions on Intelligent Systems and Technology (TIST)}, 9 |   volume={9}, 10 |   number={4}, 11 |   pages={1--26}, 12 |   year={2018}, 13 |   publisher={ACM New York, NY, USA} 14 | } 15 | ``` 16 | 17 | ![model](_static/FuzzyCDF.png) 18 | -------------------------------------------------------------------------------- /docs/ICD.md: -------------------------------------------------------------------------------- 1 | # ICD: Incremental Cognitive Diagnosis for Intelligent Education 2 | This is our implementation for the paper: 3 | 4 | Shiwei Tong, Jiayu Liu, Yuting Hong, Zhenya Huang, Le Wu, Qi Liu, Wei Huang, Enhong Chen, Dan Zhang. Incremental Cognitive Diagnosis for Intelligent Education. The 28th ACM SIGKDD International Conference on Knowledge Discovery & Data Mining (KDD'2022) 5 | 6 | Please cite our KDD'2022 paper if you use our codes. Thanks! 7 | 8 | Author: Shiwei Tong 9 | 10 | Email: tongsw@mail.ustc.edu.cn 11 | 12 | 13 | 14 | ## Example to run the codes 15 | The following instructions take the a0910 dataset as an example. 16 | 17 | Go to the code directory: 18 | ``` 19 | cd EduCDM/EduCDM/ICD/ICD 20 | ``` 21 | Replace `path_prefix` with your project URL in ICD/constant.py. 22 | 23 | 24 | Run the incremental method: 25 | ``` 26 | python examples/ICD/ICD.py 27 | ``` 28 | 29 | ## Citation 30 | ```bibtex 31 | @inproceedings{tong2022incremental, 32 |   title={Incremental Cognitive Diagnosis for Intelligent Education}, 33 |   author={Tong, Shiwei and Liu, Jiayu and Hong, Yuting and Huang, Zhenya and Wu, Le and Liu, Qi and Huang, Wei and Chen, Enhong and Zhang, Dan}, 34 |   booktitle={Proceedings of the 28th ACM SIGKDD Conference on Knowledge Discovery and Data Mining}, 35 |   pages={1760--1770}, 36 |   year={2022} 37 | } 38 | ``` -------------------------------------------------------------------------------- /docs/IRR.md: -------------------------------------------------------------------------------- 1 | # Item Response Ranking for Cognitive Diagnosis 2 | [[Paper]](https://www.ijcai.org/proceedings/2021/0241.pdf) 3 | [[Poster]](http://home.ustc.edu.cn/~tongsw/files/IRR_Poster.pdf) 4 | [[Slide]](http://home.ustc.edu.cn/~tongsw/files/IRR_Slide.pdf) 5 | [[Code]](../EduCDM/IRR) 6 | 7 | 8 | Cognitive diagnosis, a fundamental task in the education area, aims at providing an approach to reveal the proficiency level of students on knowledge concepts. 
9 | Actually, **monotonicity is one of the basic conditions in cognitive diagnosis theory**, which assumes that **a student's proficiency is monotonic with the probability of giving the right response to a test item**. 10 | However, few previous methods consider this monotonicity during optimization. 11 | To this end, we propose the Item Response Ranking framework (IRR), which introduces pairwise learning into cognitive diagnosis to better model the monotonicity between item responses. 12 | Specifically, we first use an item-specific sampling method to sample item responses and construct response pairs based on their partial order, where we propose a two-branch sampling method to handle the unobserved responses (see Figure 2). 13 | After that, we use a pairwise objective function to exploit the monotonicity in the pair formulation. 14 | In fact, IRR is a general framework which can be applied to most contemporary cognitive diagnosis models. 15 | 16 | We provide some examples for better illustration: 17 | 18 | * [IRR-IRT](../examples/IRR/IRT.ipynb) 19 | * [IRR-MIRT](../examples/IRR/MIRT.ipynb) 20 | * [IRR-DINA](../examples/IRR/DINA.ipynb) 21 | * [IRR-NCDM](../examples/IRR/NCDM.ipynb) 22 | 23 | ![Sampling](_static/IRR.png) 24 | 25 | In the following parts, we briefly introduce the basic lemma `pairwise monotonicity` and the training procedure. 26 | 27 | ## Pairwise Monotonicity 28 | 29 | In the literature, the monotonicity theory assumes that a student's proficiency is monotonic with the probability of giving the right response to a test item. 30 | We rewrite it from a pairwise perspective: a more skilled student should have a higher probability of giving the right response to a test item than an unskilled one. Formally, we have the following pairwise monotonicity: 31 | 32 | ### Pairwise Monotonicity 33 | 34 | _Given a specific test item, the students with right responses are more skilled than those with wrong responses._ 35 | 36 | ## Learning Model with IRR 37 | 38 | We first design an item-specific pair sampling method to resolve the potential non-overlapped problem, i.e., we sample responses from different students to the same item so that the related knowledge concepts stay the same. 39 | Then, to handle the unobserved responses along with the observed responses, we conduct a two-branch sampling method, i.e., positive sampling and negative sampling. 40 | After that, based on the sampled pairs, we introduce pairwise learning to model the partial order among response pairs, where we use a pairwise objective function to better optimize the monotonicity. 41 | 42 | The objective function of IRR is: 43 | 44 | $$ 45 | \min_{\Theta} - \ln IRR + \lambda(\Theta), 46 | $$ 47 | where $\lambda(\Theta)$ is the regularization term weighted by the hyper-parameter $\lambda$. We can apply IRR to any fully differentiable CDM (e.g., MIRT) and train it with Stochastic Gradient Descent. 48 | 49 | For more details, please refer to our paper. A minimal sketch of the pair construction is given below. 
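To make the two-branch sampling concrete, here is a minimal, illustrative sketch. The helper name and signature are hypothetical; the actual logic lives in `EduCDM/IRR/etl/pair_etl.py` and is more involved (e.g., it is batched and weighted), and `n_imp` here mirrors the `n_imp` hyper-parameter used in the example scripts.

```python
import random


def sample_pairs_for_item(right_users, wrong_users, unobserved_users, n_imp=2):
    """Return (more_skilled, less_skilled) user pairs for a single item.

    Observed branch: every right responder outranks every wrong responder.
    Two-branch handling of unobserved responses (sketch):
      - positive branch: right responders outrank sampled unobserved users;
      - negative branch: sampled unobserved users outrank wrong responders.
    """
    n = min(n_imp, len(unobserved_users))
    # observed partial order: right > wrong
    pairs = [(u, v) for u in right_users for v in wrong_users]
    # positive branch: right > unobserved (sampled)
    for u in right_users:
        pairs += [(u, v) for v in random.sample(unobserved_users, n)]
    # negative branch: unobserved (sampled) > wrong
    for u in random.sample(unobserved_users, n):
        pairs += [(u, v) for v in wrong_users]
    return pairs


# toy usage: users 1 and 2 answered right, user 3 wrong, users 4-5 unobserved
print(sample_pairs_for_item([1, 2], [3], [4, 5]))
```

The sampled pairs are then scored by the underlying CDM and fed into the pairwise objective above.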
50 | 51 | ## Citation 52 | 53 | ```bibtex 54 | @inproceedings{tong2021item, 55 |   title={Item response ranking for cognitive diagnosis}, 56 |   author={Tong, Shiwei and Liu, Qi and Yu, Runlong and Huang, Wei and Huang, Zhenya and Pardos, Zachary and Jiang, Weijie}, 57 |   year={2021}, 58 |   organization={IJCAI} 59 | } 60 | ``` 61 | -------------------------------------------------------------------------------- /docs/IRT.md: -------------------------------------------------------------------------------- 1 | # Item response theory 2 | 3 | If the reader wants to know the details of EMIRT, please refer to the paper: *[Estimation of Item Response Models Using the EM Algorithm for Finite Mixtures](https://files.eric.ed.gov/fulltext/ED405356.pdf)*. 4 | ```bibtex 5 | @article{woodruff1996estimation, 6 |   title={Estimation of Item Response Models Using the EM Algorithm for Finite Mixtures.}, 7 |   author={Woodruff, David J and Hanson, Bradley A}, 8 |   year={1996}, 9 |   publisher={ERIC} 10 | } 11 | ``` -------------------------------------------------------------------------------- /docs/KaNCD.md: -------------------------------------------------------------------------------- 1 | # KaNCD 2 | 3 | The implementation of the KaNCD model in paper: [NeuralCD: A General Framework for Cognitive Diagnosis](https://ieeexplore.ieee.org/abstract/document/9865139) 4 | 5 | KaNCD is a **K**nowledge-**a**ssociation based extension of the **N**eural**CD**M (alias NCDM in this package) model. In KaNCD, high-order, low-dimensional latent traits are used for students, exercises and knowledge concepts, respectively. 6 | 7 | The knowledge difficulty vector of an exercise is calculated from the latent trait of the exercise and the latent trait of each knowledge concept. 8 | 9 | ![KDM_MF](_static/KDM_MF.png) 10 | 11 | Similarly, the knowledge proficiency vector of a student is calculated from the latent trait of the student and the latent trait of each knowledge concept. 12 | 13 | ![KPM_MF](_static/KPM_MF.png) 14 | 15 | Please refer to the paper for more details. -------------------------------------------------------------------------------- /docs/MCD.md: -------------------------------------------------------------------------------- 1 | # Matrix-factorization-based Cognitive Diagnosis model 2 | 3 | ![model](_static/MCD.png) 4 | -------------------------------------------------------------------------------- /docs/MIRT.md: -------------------------------------------------------------------------------- 1 | # Multidimensional Item Response Theory 2 | 3 | If the reader wants to know the details of MIRT, please refer to the paper: *[Multidimensional item response theory models](http://ndl.ethernet.edu.et/bitstream/123456789/60415/1/116.pdf)* 4 | 5 | ```bibtex 6 | @incollection{reckase2009multidimensional, 7 |   title={Multidimensional item response theory models}, 8 |   author={Reckase, Mark D}, 9 |   booktitle={Multidimensional item response theory}, 10 |   pages={79--112}, 11 |   year={2009}, 12 |   publisher={Springer} 13 | } 14 | ``` -------------------------------------------------------------------------------- /docs/NCDM.md: -------------------------------------------------------------------------------- 1 | # Neural Cognitive Diagnosis Model 2 | 3 | The implementation of the NeuralCDM model in paper: *[Neural Cognitive Diagnosis for Intelligent Education Systems](http://staff.ustc.edu.cn/~qiliuql/files/Publications/Fei-Wang-AAAI2020.pdf)*. 
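As a quick sketch of the model (following the paper's formulation; notation simplified here), the diagnostic input layer masks the student-exercise interaction with the exercise's Q-matrix row:

$$
\boldsymbol{x} = Q_e \circ (\boldsymbol{h}^s - \boldsymbol{h}^{diff}) \times h^{disc},
$$

where $\boldsymbol{h}^s$ is the student's knowledge proficiency vector, $\boldsymbol{h}^{diff}$ and $h^{disc}$ are the exercise's knowledge difficulty and discrimination, and $Q_e$ is the exercise's row of the Q-matrix. The subsequent fully-connected layers are restricted to non-negative weights, so the predicted probability of a correct response is monotonically non-decreasing in each proficiency dimension.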
4 | 5 | ![](_static/NeuralCDM.JPG) -------------------------------------------------------------------------------- /docs/_static/DINA.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigdata-ustc/EduCDM/c610e6ae7f45cfe8f1106a1c342d4ef5a357472e/docs/_static/DINA.png -------------------------------------------------------------------------------- /docs/_static/EduCDM.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigdata-ustc/EduCDM/c610e6ae7f45cfe8f1106a1c342d4ef5a357472e/docs/_static/EduCDM.png -------------------------------------------------------------------------------- /docs/_static/FuzzyCDF.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigdata-ustc/EduCDM/c610e6ae7f45cfe8f1106a1c342d4ef5a357472e/docs/_static/FuzzyCDF.png -------------------------------------------------------------------------------- /docs/_static/IRR.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigdata-ustc/EduCDM/c610e6ae7f45cfe8f1106a1c342d4ef5a357472e/docs/_static/IRR.png -------------------------------------------------------------------------------- /docs/_static/KDM_MF.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigdata-ustc/EduCDM/c610e6ae7f45cfe8f1106a1c342d4ef5a357472e/docs/_static/KDM_MF.png -------------------------------------------------------------------------------- /docs/_static/KPM_MF.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigdata-ustc/EduCDM/c610e6ae7f45cfe8f1106a1c342d4ef5a357472e/docs/_static/KPM_MF.png -------------------------------------------------------------------------------- /docs/_static/MCD.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigdata-ustc/EduCDM/c610e6ae7f45cfe8f1106a1c342d4ef5a357472e/docs/_static/MCD.png -------------------------------------------------------------------------------- /docs/_static/NeuralCDM.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigdata-ustc/EduCDM/c610e6ae7f45cfe8f1106a1c342d4ef5a357472e/docs/_static/NeuralCDM.JPG -------------------------------------------------------------------------------- /examples/DINA/EM/DINA.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # 2021/3/28 @ liujiayu 3 | import logging 4 | import numpy as np 5 | import json 6 | from EduCDM import EMDINA as DINA 7 | 8 | q_m = np.loadtxt("../../../data/math2015/Math1/q_m.csv", dtype=int, delimiter=',') 9 | prob_num, know_num = q_m.shape[0], q_m.shape[1] 10 | 11 | # training data 12 | with open("../../../data/math2015/Math1/train_data.json", encoding='utf-8') as file: 13 | train_set = json.load(file) 14 | stu_num = max([x['user_id'] for x in train_set]) + 1 15 | R = -1 * np.ones(shape=(stu_num, prob_num)) 16 | for log in train_set: 17 | R[log['user_id'], log['item_id']] = log['score'] 18 | 19 | # testing data 20 | with open("../../../data/math2015/Math1/test_data.json", encoding='utf-8') as file: 21 | test_set = json.load(file) 22 | 23 | logging.getLogger().setLevel(logging.INFO) 24 | 25 | cdm = DINA(R, q_m, stu_num, prob_num, know_num, 
skip_value=-1) 26 | 27 | cdm.train(epoch=2, epsilon=1e-3) 28 | cdm.save("dina.params") 29 | 30 | cdm.load("dina.params") 31 | rmse, mae = cdm.eval(test_set) 32 | print("RMSE: %.6f, MAE: %.6f" % (rmse, mae)) 33 | 34 | # ---incremental training 35 | new_data = [{'user_id': 0, 'item_id': 0, 'score': 1.0}, {'user_id': 1, 'item_id': 2, 'score': 0.0}] 36 | cdm.inc_train(new_data, epoch=2, epsilon=1e-3) 37 | 38 | # ---evaluate user's state 39 | stu_rec = np.array([0, 1, -1, 0, -1, 0, 1, 1, 0, 1, 0, 1, 0, -1, -1, -1, -1, 0, 1, -1]) 40 | dia_id, dia_state = cdm.transform(stu_rec) 41 | print("id of user's state is %d, state is " % dia_id + str(dia_state)) 42 | -------------------------------------------------------------------------------- /examples/DINA/EM/prepare_dataset.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "# Download the Cognitive Diagnosis Benchmark Datasets (CDBD)\n", 10 | "from EduData import get_data\n", 11 | "\n", 12 | "get_data(\"math2015\", \"../../../data\")" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": 1, 18 | "metadata": {}, 19 | "outputs": [ 20 | { 21 | "name": "stdout", 22 | "output_type": "stream", 23 | "text": [ 24 | "{'user_id': 0, 'item_id': 5, 'score': 1.0} {'user_id': 0, 'item_id': 8, 'score': 1.0}\n" 25 | ] 26 | } 27 | ], 28 | "source": [ 29 | "# Data preprocessing, split train/valid/test data\n", 30 | "\n", 31 | "import numpy as np\n", 32 | "import random\n", 33 | "import json\n", 34 | "\n", 35 | "train_ratio = 0.8\n", 36 | "valid_ratio = 0\n", 37 | "# Q matrix\n", 38 | "np.savetxt(\"../../../data/math2015/Math1/q_m.csv\", np.loadtxt(\"../../../data/math2015/Math1/q.txt\", dtype=int), delimiter=',', fmt='%d')\n", 39 | "\n", 40 | "# response matrix, split dataset\n", 41 | "R = (np.loadtxt(\"../../../data/math2015/Math1/data.txt\") == 1).astype(float)\n", 42 | "stu_num, prob_num = R.shape[0], R.shape[1]\n", 43 | "train_logs, valid_logs, test_logs = [], [], []\n", 44 | "for stu in range(stu_num):\n", 45 | " stu_logs = []\n", 46 | " for prob in range(prob_num):\n", 47 | " log = {'user_id': int(stu), 'item_id': int(prob), 'score': R[stu][prob]}\n", 48 | " stu_logs.append(log)\n", 49 | " random.shuffle(stu_logs)\n", 50 | " train_logs += stu_logs[: int(train_ratio * prob_num)]\n", 51 | " valid_logs += stu_logs[int(train_ratio * prob_num): int(train_ratio * prob_num) + int(valid_ratio * prob_num)]\n", 52 | " test_logs += stu_logs[int(train_ratio * prob_num) + int(valid_ratio * prob_num):]\n", 53 | "\n", 54 | "with open(\"../../../data/math2015/Math1/train_data.json\", 'w', encoding='utf8') as file:\n", 55 | " json.dump(train_logs, file, indent=4, ensure_ascii=False)\n", 56 | "with open(\"../../../data/math2015/Math1/valid_data.json\", 'w', encoding='utf8') as file:\n", 57 | " json.dump(valid_logs, file, indent=4, ensure_ascii=False)\n", 58 | "with open(\"../../../data/math2015/Math1/test_data.json\", 'w', encoding='utf8') as file:\n", 59 | " json.dump(test_logs, file, indent=4, ensure_ascii=False)\n", 60 | "\n", 61 | "print(train_logs[0], test_logs[0])" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": null, 67 | "metadata": {}, 68 | "outputs": [], 69 | "source": [] 70 | } 71 | ], 72 | "metadata": { 73 | "kernelspec": { 74 | "display_name": "Python 3", 75 | "language": "python", 76 | "name": "python3" 77 | }, 78 | "language_info": { 79 | 
"codemirror_mode": { 80 | "name": "ipython", 81 | "version": 3 82 | }, 83 | "file_extension": ".py", 84 | "mimetype": "text/x-python", 85 | "name": "python", 86 | "nbconvert_exporter": "python", 87 | "pygments_lexer": "ipython3", 88 | "version": "3.7.3" 89 | } 90 | }, 91 | "nbformat": 4, 92 | "nbformat_minor": 1 93 | } 94 | -------------------------------------------------------------------------------- /examples/DINA/GD/DINA.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # 2021/3/23 @ tongshiwei 3 | import logging 4 | from EduCDM import GDDINA 5 | import torch 6 | from torch.utils.data import TensorDataset, DataLoader 7 | import pandas as pd 8 | 9 | train_data = pd.read_csv("../../../data/a0910/train.csv") 10 | valid_data = pd.read_csv("../../../data/a0910/valid.csv") 11 | test_data = pd.read_csv("../../../data/a0910/test.csv") 12 | item_data = pd.read_csv("../../../data/a0910/item.csv") 13 | 14 | knowledge_num = 123 15 | 16 | 17 | def code2vector(x): 18 | vector = [0] * knowledge_num 19 | for k in eval(x): 20 | vector[k - 1] = 1 21 | return vector 22 | 23 | 24 | item_data["knowledge"] = item_data["knowledge_code"].apply(code2vector) 25 | item_data.drop(columns=["knowledge_code"], inplace=True) 26 | 27 | train_data = pd.merge(train_data, item_data, on="item_id") 28 | valid_data = pd.merge(valid_data, item_data, on="item_id") 29 | test_data = pd.merge(test_data, item_data, on="item_id") 30 | 31 | batch_size = 32 32 | 33 | 34 | def transform(x, y, z, k, batch_size, **params): 35 | dataset = TensorDataset( 36 | torch.tensor(x, dtype=torch.int64), 37 | torch.tensor(y, dtype=torch.int64), 38 | torch.tensor(k, dtype=torch.float32), 39 | torch.tensor(z, dtype=torch.float32) 40 | ) 41 | return DataLoader(dataset, batch_size=batch_size, **params) 42 | 43 | 44 | train, valid, test = [ 45 | transform(data["user_id"], data["item_id"], data["score"], data["knowledge"], batch_size) 46 | for data in [train_data, valid_data, test_data] 47 | ] 48 | 49 | logging.getLogger().setLevel(logging.INFO) 50 | 51 | cdm = GDDINA(4164, 17747, knowledge_num) 52 | 53 | cdm.train(train, valid, epoch=2) 54 | cdm.save("dina.params") 55 | 56 | cdm.load("dina.params") 57 | auc, accuracy = cdm.eval(test) 58 | print("auc: %.6f, accuracy: %.6f" % (auc, accuracy)) 59 | -------------------------------------------------------------------------------- /examples/DINA/GD/prepare_dataset.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 2, 6 | "outputs": [ 7 | { 8 | "name": "stderr", 9 | "output_type": "stream", 10 | "text": [ 11 | "downloader, INFO http://base.ustc.edu.cn/data/cdbd/a0910/item.csv is saved as ..\\..\\..\\data\\a0910\\item.csv\n", 12 | "downloader, INFO http://base.ustc.edu.cn/data/cdbd/a0910/readme.txt is saved as ..\\..\\..\\data\\a0910\\readme.txt\n", 13 | "downloader, INFO http://base.ustc.edu.cn/data/cdbd/a0910/test.csv is saved as ..\\..\\..\\data\\a0910\\test.csv\n", 14 | "downloader, INFO http://base.ustc.edu.cn/data/cdbd/a0910/train.csv is saved as ..\\..\\..\\data\\a0910\\train.csv\n", 15 | "downloader, INFO http://base.ustc.edu.cn/data/cdbd/a0910/valid.csv is saved as ..\\..\\..\\data\\a0910\\valid.csv\n" 16 | ] 17 | }, 18 | { 19 | "name": "stdout", 20 | "output_type": "stream", 21 | "text": [ 22 | "Downloading ..\\..\\..\\data\\a0910\\item.csv 100.00%: 258118 | 258118\n", 23 | "Downloading ..\\..\\..\\data\\a0910\\readme.txt 100.00%: 
86 | 86\n", 24 | "Downloading ..\\..\\..\\data\\a0910\\test.csv 100.00%: 810767 | 810767\n", 25 | "Downloading ..\\..\\..\\data\\a0910\\train.csv 100.00%: 2329161 | 2329161\n", 26 | "Downloading ..\\..\\..\\data\\a0910\\valid.csv 100.00%: 371493 | 371493\n" 27 | ] 28 | }, 29 | { 30 | "data": { 31 | "text/plain": "'../../../data'" 32 | }, 33 | "execution_count": 2, 34 | "metadata": {}, 35 | "output_type": "execute_result" 36 | } 37 | ], 38 | "source": [ 39 | "# Download the Cognitive Diagnosis Benchmark Datasets (CDBD)\n", 40 | "from EduData import get_data\n", 41 | "\n", 42 | "get_data(\"cdbd-a0910\", \"../../../data\")\n" 43 | ], 44 | "metadata": { 45 | "collapsed": false, 46 | "pycharm": { 47 | "name": "#%%\n" 48 | } 49 | } 50 | } 51 | ], 52 | "metadata": { 53 | "kernelspec": { 54 | "display_name": "Python 3", 55 | "language": "python", 56 | "name": "python3" 57 | }, 58 | "language_info": { 59 | "codemirror_mode": { 60 | "name": "ipython", 61 | "version": 2 62 | }, 63 | "file_extension": ".py", 64 | "mimetype": "text/x-python", 65 | "name": "python", 66 | "nbconvert_exporter": "python", 67 | "pygments_lexer": "ipython2", 68 | "version": "2.7.6" 69 | } 70 | }, 71 | "nbformat": 4, 72 | "nbformat_minor": 0 73 | } -------------------------------------------------------------------------------- /examples/FuzzyCDF/FuzzyCDF.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "collapsed": true, 7 | "pycharm": { 8 | "name": "#%% md\n" 9 | } 10 | }, 11 | "source": [ 12 | "# Fuzzy cognitive diagnosis framework (FuzzyCDF)\n", 13 | "\n", 14 | "This notebook will show you how to train and use the FuzzyCDF.\n", 15 | "First, we will show how to get the data (here we use Math1 from math2015 as the dataset).\n", 16 | "Then we will show how to train a FuzzyCDF and perform the parameters persistence.\n", 17 | "At last, we will show how to load the parameters from the file and evaluate on the test dataset.\n", 18 | "\n", 19 | "The script version could be found in [FuzzyCDF.py](FuzzyCDF.ipynb)" 20 | ] 21 | }, 22 | { 23 | "cell_type": "markdown", 24 | "metadata": {}, 25 | "source": [ 26 | "## Data Preparation\n", 27 | "\n", 28 | "Before we process the data, we need to first acquire the dataset which is shown in [prepare_dataset.ipynb](prepare_dataset.ipynb)" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": 1, 34 | "metadata": {}, 35 | "outputs": [], 36 | "source": [ 37 | "# Load the data from files\n", 38 | "import numpy as np\n", 39 | "import json\n", 40 | "\n", 41 | "# type of problems\n", 42 | "obj_prob_index = np.loadtxt(\"../../data/math2015/Math1/obj_prob_index.csv\", delimiter=',', dtype=int)\n", 43 | "sub_prob_index = np.loadtxt(\"../../data/math2015/Math1/sub_prob_index.csv\", delimiter=',', dtype=int)\n", 44 | "# Q matrix\n", 45 | "q_m = np.loadtxt(\"../../data/math2015/Math1/q_m.csv\", dtype=int, delimiter=',')\n", 46 | "prob_num, know_num = q_m.shape[0], q_m.shape[1]\n", 47 | "\n", 48 | "# training data\n", 49 | "with open(\"../../data/math2015/Math1/train_data.json\", encoding='utf-8') as file:\n", 50 | " train_set = json.load(file)\n", 51 | "stu_num = max([x['user_id'] for x in train_set]) + 1\n", 52 | "R = -1 * np.ones(shape=(stu_num, prob_num))\n", 53 | "for log in train_set:\n", 54 | " R[log['user_id'], log['item_id']] = log['score']\n", 55 | "\n", 56 | "# testing data\n", 57 | "with open(\"../../data/math2015/Math1/test_data.json\", encoding='utf-8') as 
file:\n", 58 | " test_set = json.load(file)" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": 2, 64 | "metadata": {}, 65 | "outputs": [ 66 | { 67 | "name": "stdout", 68 | "output_type": "stream", 69 | "text": [ 70 | "{'user_id': 0, 'item_id': 7, 'score': 1.0} {'user_id': 0, 'item_id': 9, 'score': 1.0}\n" 71 | ] 72 | } 73 | ], 74 | "source": [ 75 | "print(train_set[0], test_set[0])" 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": 3, 81 | "metadata": { 82 | "pycharm": { 83 | "name": "#%%\n" 84 | } 85 | }, 86 | "outputs": [ 87 | { 88 | "data": { 89 | "text/plain": [ 90 | "(67344, 16836)" 91 | ] 92 | }, 93 | "execution_count": 3, 94 | "metadata": {}, 95 | "output_type": "execute_result" 96 | } 97 | ], 98 | "source": [ 99 | "len(train_set), len(test_set)" 100 | ] 101 | }, 102 | { 103 | "cell_type": "markdown", 104 | "metadata": { 105 | "pycharm": { 106 | "name": "#%% md\n" 107 | } 108 | }, 109 | "source": [ 110 | "## Training and Persistence" 111 | ] 112 | }, 113 | { 114 | "cell_type": "code", 115 | "execution_count": 4, 116 | "metadata": { 117 | "pycharm": { 118 | "name": "#%%\n" 119 | } 120 | }, 121 | "outputs": [], 122 | "source": [ 123 | "import logging\n", 124 | "logging.getLogger().setLevel(logging.INFO)" 125 | ] 126 | }, 127 | { 128 | "cell_type": "code", 129 | "execution_count": 6, 130 | "metadata": { 131 | "pycharm": { 132 | "name": "#%%\n" 133 | } 134 | }, 135 | "outputs": [ 136 | { 137 | "name": "stderr", 138 | "output_type": "stream", 139 | "text": [ 140 | "INFO:root:save parameters to fuzzycdf.params\n" 141 | ] 142 | } 143 | ], 144 | "source": [ 145 | "from EduCDM import FuzzyCDF\n", 146 | "\n", 147 | "cdm = FuzzyCDF(R, q_m, stu_num, prob_num, know_num, obj_prob_index, sub_prob_index, skip_value=-1)\n", 148 | "\n", 149 | "cdm.train(epoch=10, burnin=5)\n", 150 | "cdm.save(\"fuzzycdf.params\")" 151 | ] 152 | }, 153 | { 154 | "cell_type": "markdown", 155 | "metadata": { 156 | "pycharm": { 157 | "name": "#%% md\n" 158 | } 159 | }, 160 | "source": [ 161 | "## Loading and Testing" 162 | ] 163 | }, 164 | { 165 | "cell_type": "code", 166 | "execution_count": 7, 167 | "metadata": { 168 | "pycharm": { 169 | "name": "#%%\n" 170 | } 171 | }, 172 | "outputs": [ 173 | { 174 | "name": "stderr", 175 | "output_type": "stream", 176 | "text": [ 177 | "INFO:root:load parameters from fuzzycdf.params\n", 178 | "evaluating: 100%|█████████████████████████████████████████████████████████████| 16836/16836 [00:00<00:00, 91552.55it/s]" 179 | ] 180 | }, 181 | { 182 | "name": "stdout", 183 | "output_type": "stream", 184 | "text": [ 185 | "RMSE: 0.447697, MAE: 0.405684\n" 186 | ] 187 | }, 188 | { 189 | "name": "stderr", 190 | "output_type": "stream", 191 | "text": [ 192 | "\n" 193 | ] 194 | } 195 | ], 196 | "source": [ 197 | "cdm.load(\"fuzzycdf.params\")\n", 198 | "rmse, mae = cdm.eval(test_set)\n", 199 | "print(\"RMSE: %.6f, MAE: %.6f\" % (rmse, mae))" 200 | ] 201 | }, 202 | { 203 | "cell_type": "markdown", 204 | "metadata": {}, 205 | "source": [ 206 | "## Incremental Training" 207 | ] 208 | }, 209 | { 210 | "cell_type": "code", 211 | "execution_count": 8, 212 | "metadata": {}, 213 | "outputs": [], 214 | "source": [ 215 | "new_data = [{'user_id': 0, 'item_id': 2, 'score': 0.0}, {'user_id': 1, 'item_id': 1, 'score': 1.0}]\n", 216 | "cdm.inc_train(new_data, epoch=10, burnin=5)" 217 | ] 218 | }, 219 | { 220 | "cell_type": "code", 221 | "execution_count": null, 222 | "metadata": {}, 223 | "outputs": [], 224 | "source": [] 225 | } 226 | ], 227 | "metadata": { 228 | 
"kernelspec": { 229 | "display_name": "Python 3", 230 | "language": "python", 231 | "name": "python3" 232 | }, 233 | "language_info": { 234 | "codemirror_mode": { 235 | "name": "ipython", 236 | "version": 3 237 | }, 238 | "file_extension": ".py", 239 | "mimetype": "text/x-python", 240 | "name": "python", 241 | "nbconvert_exporter": "python", 242 | "pygments_lexer": "ipython3", 243 | "version": "3.7.3" 244 | } 245 | }, 246 | "nbformat": 4, 247 | "nbformat_minor": 1 248 | } 249 | -------------------------------------------------------------------------------- /examples/FuzzyCDF/FuzzyCDF.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # 2021/3/28 @ liujiayu 3 | import logging 4 | import numpy as np 5 | import json 6 | from EduCDM import FuzzyCDF 7 | 8 | 9 | # type of problems 10 | obj_prob_index = np.loadtxt("../../data/math2015/Math1/obj_prob_index.csv", delimiter=',', dtype=int) 11 | sub_prob_index = np.loadtxt("../../data/math2015/Math1/sub_prob_index.csv", delimiter=',', dtype=int) 12 | # Q matrix 13 | q_m = np.loadtxt("../../data/math2015/Math1/q_m.csv", dtype=int, delimiter=',') 14 | prob_num, know_num = q_m.shape[0], q_m.shape[1] 15 | 16 | # training data 17 | with open("../../data/math2015/Math1/train_data.json", encoding='utf-8') as file: 18 | train_set = json.load(file) 19 | stu_num = max([x['user_id'] for x in train_set]) + 1 20 | R = -1 * np.ones(shape=(stu_num, prob_num)) 21 | for log in train_set: 22 | R[log['user_id'], log['item_id']] = log['score'] 23 | 24 | # testing data 25 | with open("../../data/math2015/Math1/test_data.json", encoding='utf-8') as file: 26 | test_set = json.load(file) 27 | 28 | logging.getLogger().setLevel(logging.INFO) 29 | 30 | cdm = FuzzyCDF(R, q_m, stu_num, prob_num, know_num, obj_prob_index, sub_prob_index, skip_value=-1) 31 | 32 | cdm.train(epoch=10, burnin=5) 33 | cdm.save("fuzzycdf.params") 34 | 35 | cdm.load("fuzzycdf.params") 36 | rmse, mae = cdm.eval(test_set) 37 | print("RMSE, MAE are %.6f, %.6f" % (rmse, mae)) 38 | 39 | # ---incremental training 40 | new_data = [{'user_id': 0, 'item_id': 2, 'score': 0.0}, {'user_id': 1, 'item_id': 1, 'score': 1.0}] 41 | cdm.inc_train(new_data, epoch=10, burnin=5) 42 | -------------------------------------------------------------------------------- /examples/FuzzyCDF/prepare_dataset.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "# Download the Cognitive Diagnosis Benchmark Datasets (CDBD)\n", 10 | "from EduData import get_data\n", 11 | "\n", 12 | "get_data(\"math2015\", \"../../data\")" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": 1, 18 | "metadata": {}, 19 | "outputs": [ 20 | { 21 | "name": "stdout", 22 | "output_type": "stream", 23 | "text": [ 24 | "{'user_id': 0, 'item_id': 7, 'score': 1.0} {'user_id': 0, 'item_id': 9, 'score': 1.0}\n" 25 | ] 26 | } 27 | ], 28 | "source": [ 29 | "# Data preprocessing, split train/valid/test data\n", 30 | "\n", 31 | "import json\n", 32 | "import numpy as np\n", 33 | "import random\n", 34 | "\n", 35 | "def read_probdesc(filepath):\n", 36 | " prob_type = np.loadtxt(filepath, dtype=int, delimiter='\\t',\n", 37 | " converters={0: lambda x: int(x) - 1, 1: lambda s: s == b'Obj'}, skiprows=1)\n", 38 | " obj_prob_index, sub_prob_index = prob_type[prob_type[:, 1] == 1][:, 0], prob_type[prob_type[:, 1] == 0][:, 0]\n", 39 
| " return prob_type, obj_prob_index, sub_prob_index\n", 40 | "\n", 41 | "train_ratio = 0.8\n", 42 | "valid_ratio = 0\n", 43 | "\n", 44 | "# type of problems\n", 45 | "problems, obj_prob_index, sub_prob_index = read_probdesc(\"../../data/math2015/Math1/problemdesc.txt\")\n", 46 | "np.savetxt(\"../../data/math2015/Math1/obj_prob_index.csv\", obj_prob_index, delimiter=',', fmt='%d')\n", 47 | "np.savetxt(\"../../data/math2015/Math1/sub_prob_index.csv\", sub_prob_index, delimiter=',', fmt='%d')\n", 48 | "\n", 49 | "# Q matrix\n", 50 | "np.savetxt(\"../../data/math2015/Math1/q_m.csv\", np.loadtxt(\"../../data/math2015/Math1/q.txt\", dtype=int), delimiter=',', fmt='%d')\n", 51 | "\n", 52 | "# response matrix, split dataset\n", 53 | "R = np.loadtxt(\"../../data/math2015/Math1/data.txt\")\n", 54 | "stu_num, prob_num = R.shape[0], R.shape[1]\n", 55 | "train_logs, valid_logs, test_logs = [], [], []\n", 56 | "for stu in range(stu_num):\n", 57 | " stu_logs = []\n", 58 | " for prob in range(prob_num):\n", 59 | " log = {'user_id': int(stu), 'item_id': int(prob), 'score': R[stu][prob]}\n", 60 | " stu_logs.append(log)\n", 61 | " random.shuffle(stu_logs)\n", 62 | " train_logs += stu_logs[: int(train_ratio * prob_num)]\n", 63 | " valid_logs += stu_logs[int(train_ratio * prob_num): int(train_ratio * prob_num) + int(valid_ratio * prob_num)]\n", 64 | " test_logs += stu_logs[int(train_ratio * prob_num) + int(valid_ratio * prob_num):]\n", 65 | "\n", 66 | "with open(\"../../data/math2015/Math1/train_data.json\", 'w', encoding='utf8') as file:\n", 67 | " json.dump(train_logs, file, indent=4, ensure_ascii=False)\n", 68 | "with open(\"../../data/math2015/Math1/valid_data.json\", 'w', encoding='utf8') as file:\n", 69 | " json.dump(valid_logs, file, indent=4, ensure_ascii=False)\n", 70 | "with open(\"../../data/math2015/Math1/test_data.json\", 'w', encoding='utf8') as file:\n", 71 | " json.dump(test_logs, file, indent=4, ensure_ascii=False)\n", 72 | "\n", 73 | "print(train_logs[0], test_logs[0])" 74 | ] 75 | }, 76 | { 77 | "cell_type": "code", 78 | "execution_count": null, 79 | "metadata": {}, 80 | "outputs": [], 81 | "source": [] 82 | } 83 | ], 84 | "metadata": { 85 | "kernelspec": { 86 | "display_name": "Python 3", 87 | "language": "python", 88 | "name": "python3" 89 | }, 90 | "language_info": { 91 | "codemirror_mode": { 92 | "name": "ipython", 93 | "version": 3 94 | }, 95 | "file_extension": ".py", 96 | "mimetype": "text/x-python", 97 | "name": "python", 98 | "nbconvert_exporter": "python", 99 | "pygments_lexer": "ipython3", 100 | "version": "3.7.3" 101 | } 102 | }, 103 | "nbformat": 4, 104 | "nbformat_minor": 1 105 | } 106 | -------------------------------------------------------------------------------- /examples/ICD/ICD.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | import logging 4 | from baize import config_logging 5 | import os 6 | from EduCDM.ICD.etl import extract, inc_stream 7 | from longling import build_dir 8 | from longling.lib.stream import to_io_group, close_io 9 | from EduCDM.ICD.ICD import ICD 10 | 11 | path_prefix = os.path.abspath('.') 12 | 13 | 14 | def run(cdm, 15 | user_n, 16 | item_n, 17 | know_n, 18 | dataset, 19 | max_u2i=None, 20 | max_i2u=None, 21 | stream_num=50, 22 | alpha=0.999, 23 | beta=0.95, 24 | tolerance=1e-3, 25 | inner_metrics=True, 26 | hyper_tag=False, 27 | epoch=1, 28 | wfs=None, 29 | logger=logging, 30 | log_file="log", 31 | warmup_ratio=0.1, 32 | epsilon=1e-2, 33 | weight_decay=0, 34 | vector_numbers=None, 35 
| vector_path_format=None, 36 | ctx="cpu", 37 | *args, 38 | **kwargs): 39 | dataset_dir = "%s/data/%s/" % (path_prefix, dataset) 40 | data_dir = dataset_dir 41 | item2know = "%sitem.csv" % dataset_dir 42 | path_format = "%s{}.csv" % data_dir 43 | 44 | inc_train_data_path = path_format.format(log_file) 45 | inc_train_df, _, _, i2k = extract(inc_train_data_path, item2know) 46 | inc_train_df_list = list( 47 | inc_stream(inc_train_df, 48 | stream_size=int(len(inc_train_df) // stream_num))) 49 | ICDNet = ICD(cdm, user_n, item_n, know_n, epoch, weight_decay, 50 | inner_metrics, logger, alpha, ctx) 51 | ICDNet.train(inc_train_df_list, i2k, beta, warmup_ratio, tolerance, 52 | max_u2i, max_i2u, hyper_tag, vector_numbers, 53 | vector_path_format, wfs) 54 | 55 | 56 | def main(dataset="a0910", 57 | ctx="cpu", 58 | cdm="mirt", 59 | alpha=0.2, 60 | beta=0.9, 61 | tolerance=2e-1, 62 | epoch=1, 63 | pretrained=False, 64 | savename=None, 65 | inc_epoch=None, 66 | inner_metrics=True, 67 | log_file="log", 68 | warmup_ratio=0.1, 69 | epsilon=1e-2, 70 | stream_num=None, 71 | vector_numbers=None): 72 | if savename: 73 | dataset_dir = "%s/data/%s/" % (path_prefix, dataset) 74 | data_dir = dataset_dir 75 | model_dir = data_dir + "model/%s/%s/" % (cdm, savename) 76 | keys = [ 77 | "metrics", "before_metrics", "ind_inc_user", "ind_inc_item", 78 | "inc_user", "inc_item", "new_user", "new_item", "new_both", 79 | "trait", "inc_trait", "tp" 80 | ] 81 | path_format = model_dir + "{}.json" 82 | wfs = dict( 83 | zip( 84 | keys, 85 | to_io_group(*[path_format.format(key) for key in keys], 86 | mode="w"))) if savename else None 87 | logger = config_logging(model_dir + "log.txt", 88 | logger="ICD", 89 | console_log_level="info") 90 | logger.info("logs to %s" % model_dir + "log.txt") 91 | vector_path_format = model_dir + "{}_{}.pt" 92 | build_dir(vector_path_format) 93 | else: 94 | wfs = None 95 | logger = config_logging(logger="ICD", console_log_level="info") 96 | vector_path_format = None 97 | 98 | config = dict( 99 | dataset=dataset, 100 | cdm=cdm, 101 | alpha=alpha, 102 | beta=beta, 103 | tolerance=tolerance, 104 | ctx=ctx, 105 | epoch=epoch, 106 | inc_epoch=inc_epoch, 107 | inner_metrics=inner_metrics, 108 | log_file=log_file, 109 | warmup_ratio=warmup_ratio, 110 | epsilon=epsilon, 111 | vector_numbers=vector_numbers, 112 | vector_path_format=vector_path_format, 113 | ) 114 | logger.info(config) 115 | 116 | dataset_config = { 117 | "a0910": 118 | dict( 119 | user_n=4129, 120 | item_n=17747, 121 | know_n=123, 122 | stream_num=50 if stream_num is None else stream_num, 123 | max_u2i=128, 124 | max_i2u=64, 125 | ), 126 | "math": 127 | dict( 128 | user_n=10269, 129 | item_n=17747, 130 | know_n=1488, 131 | stream_num=200 if stream_num is None else stream_num, 132 | # max_u2i=128, 133 | # max_i2u=64, 134 | ), 135 | "xunfei": 136 | dict( 137 | # user_n=10269+1, 138 | # item_n=2507+1, 139 | user_n=6820 + 1, 140 | item_n=1196 + 1, 141 | know_n=497, 142 | stream_num=50 if stream_num is None else stream_num, 143 | max_u2i=128, 144 | max_i2u=64, 145 | ), 146 | } 147 | cdm_config = { 148 | "irt": {}, 149 | "dina": {}, 150 | "ncd": {}, 151 | "mirt": { 152 | "weight_decay": 1e-4 153 | } 154 | } 155 | run( 156 | # cdm="mirt", 157 | pretrained=pretrained, 158 | wfs=wfs, 159 | logger=logger, 160 | **cdm_config[cdm], 161 | **config, 162 | **dataset_config[dataset.split("_")[0]]) 163 | if wfs is not None: 164 | close_io(list(wfs.values())) 165 | 166 | 167 | if __name__ == '__main__': 168 | import fire 169 | 170 | fire.Fire(main) 171 | 
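# Usage sketch (added for illustration): python-fire exposes the keyword
# arguments of `main` above as command-line flags, e.g.
#   python ICD.py --dataset=a0910 --cdm=mirt --savename=demo
# where `demo` is a hypothetical run name; see docs/ICD.md for details.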
-------------------------------------------------------------------------------- /examples/ICD/prepare_dataset.ipynb: -------------------------------------------------------------------------------- 1 | { 2 |  "cells": [ 3 |   { 4 |    "cell_type": "code", 5 |    "execution_count": 1, 6 |    "metadata": {}, 7 |    "outputs": [ 8 |     { 9 |      "name": "stderr", 10 |      "output_type": "stream", 11 |      "text": [ 12 |       "downloader, INFO http://base.ustc.edu.cn/data/cdbd/a0910/item.csv is saved as ../../data/a0910/item.csv\n", 13 |       "downloader, INFO file existed, skipped\n" 14 |      ] 15 |     }, 16 |     { 17 |      "data": { 18 |       "text/plain": [ 19 |        "'../../data'" 20 |       ] 21 |      }, 22 |      "execution_count": 1, 23 |      "metadata": {}, 24 |      "output_type": "execute_result" 25 |     } 26 |    ], 27 |    "source": [ 28 |     "from EduData import get_data\n", 29 |     "\n", 30 |     "get_data(\"cdbd-a0910\", \"../../data\")\n" 31 |    ] 32 |   }, 33 |   { 34 |    "cell_type": "code", 35 |    "execution_count": 2, 36 |    "metadata": {}, 37 |    "outputs": [], 38 |    "source": [ 39 |     "import pandas as pd\n", 40 |     "path_prefix = '../../data/a0910/'\n", 41 |     "train_data = pd.read_csv(f'{path_prefix}train.csv',\n", 42 |     "                         encoding=\"utf-8\", low_memory=False)\n", 43 |     "valid_data = pd.read_csv(f'{path_prefix}valid.csv',\n", 44 |     "                         encoding=\"utf-8\", low_memory=False)\n", 45 |     "test_data = pd.read_csv(f'{path_prefix}test.csv',\n", 46 |     "                        encoding=\"utf-8\", low_memory=False)\n", 47 |     "data = train_data.append([valid_data, test_data])\n", 48 |     "# Keep only the items that have at least 15 response logs\n", 49 |     "group = data.groupby(['item_id'], as_index=False)\n", 50 |     "df = pd.DataFrame(columns=['user_id', 'item_id', 'score'])\n", 51 |     "for i in group:\n", 52 |     "    if len(i[1]) >= 15:\n", 53 |     "        df = df.append([i[1]])\n", 54 |     "df.to_csv(f'{path_prefix}log.csv', index=None)" 55 |    ] 56 |   } 57 |  ], 58 |  "metadata": { 59 |   "kernelspec": { 60 |    "display_name": "Python 3.9.7 ('base')", 61 |    "language": "python", 62 |    "name": "python3" 63 |   }, 64 |   "language_info": { 65 |    "codemirror_mode": { 66 |     "name": "ipython", 67 |     "version": 3 68 |    }, 69 |    "file_extension": ".py", 70 |    "mimetype": "text/x-python", 71 |    "name": "python", 72 |    "nbconvert_exporter": "python", 73 |    "pygments_lexer": "ipython3", 74 |    "version": "3.9.7" 75 |   }, 76 |   "orig_nbformat": 4, 77 |   "vscode": { 78 |    "interpreter": { 79 |     "hash": "30cf1c0bf0a8c24a67f341fa01023997b228873b5bd061707cfd99d0cfb90c8a" 80 |    } 81 |   } 82 |  }, 83 |  "nbformat": 4, 84 |  "nbformat_minor": 2 85 | } 86 | -------------------------------------------------------------------------------- /examples/IRR/DINA.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # 2021/6/19 @ tongshiwei 3 | 4 | 5 | from EduCDM.IRR import DINA 6 | import logging 7 | from longling.lib.structure import AttrDict 8 | from longling import set_logging_info 9 | from EduCDM.IRR import pair_etl as etl, point_etl as vt_etl, extract_item 10 | 11 | set_logging_info() 12 | 13 | params = AttrDict( 14 |     batch_size=256, 15 |     n_neg=10, 16 |     n_imp=10, 17 |     logger=logging.getLogger(), 18 |     hyper_params={"user_num": 4164, "knowledge_num": 123} 19 | ) 20 | item_knowledge = extract_item("../../data/a0910/item.csv", params["hyper_params"]["knowledge_num"], params) 21 | train_data, train_df = etl("../../data/a0910/train.csv", item_knowledge, params) 22 | valid_data, _ = vt_etl("../../data/a0910/valid.csv", item_knowledge, params) 23 | test_data, _ = vt_etl("../../data/a0910/test.csv", item_knowledge, params) 24 | 25 | cdm = DINA( 26 |     4163 + 1, 27 |     17746 + 1, 28 |     123, 29 |     ste=True 30 | ) 31 
| cdm.train( 32 | train_data, 33 | valid_data, 34 | epoch=2, 35 | ) 36 | cdm.save("IRR-DINA.params") 37 | 38 | cdm.load("IRR-DINA.params") 39 | print(cdm.eval(test_data)) 40 | -------------------------------------------------------------------------------- /examples/IRR/IRT.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # 2021/6/19 @ tongshiwei 3 | 4 | 5 | from EduCDM.IRR import IRT 6 | import logging 7 | from longling.lib.structure import AttrDict 8 | from longling import set_logging_info 9 | from EduCDM.IRR import pair_etl as etl, point_etl as vt_etl, extract_item 10 | 11 | set_logging_info() 12 | 13 | params = AttrDict( 14 | batch_size=256, 15 | n_neg=10, 16 | n_imp=10, 17 | logger=logging.getLogger(), 18 | hyper_params={"user_num": 4164} 19 | ) 20 | item_knowledge = extract_item("../../data/a0910/item.csv", 123, params) 21 | train_data, train_df = etl("../../data/a0910/train.csv", item_knowledge, params) 22 | valid_data, _ = vt_etl("../../data/a0910/valid.csv", item_knowledge, params) 23 | test_data, _ = vt_etl("../../data/a0910/test.csv", item_knowledge, params) 24 | 25 | cdm = IRT( 26 | 4163 + 1, 27 | 17746 + 1, 28 | 123 29 | ) 30 | cdm.train( 31 | train_data, 32 | valid_data, 33 | epoch=2, 34 | ) 35 | cdm.save("IRR-IRT.params") 36 | 37 | cdm.load("IRR-IRT.params") 38 | print(cdm.eval(test_data)) 39 | -------------------------------------------------------------------------------- /examples/IRR/MIRT.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # 2021/6/19 @ tongshiwei 3 | 4 | 5 | from EduCDM.IRR import MIRT 6 | import logging 7 | from longling.lib.structure import AttrDict 8 | from longling import set_logging_info 9 | from EduCDM.IRR import pair_etl as etl, point_etl as vt_etl, extract_item 10 | 11 | set_logging_info() 12 | 13 | params = AttrDict( 14 | batch_size=256, 15 | n_neg=10, 16 | n_imp=10, 17 | logger=logging.getLogger(), 18 | hyper_params={"user_num": 4164} 19 | ) 20 | item_knowledge = extract_item("../../data/a0910/item.csv", 123, params) 21 | train_data, train_df = etl("../../data/a0910/train.csv", item_knowledge, params) 22 | valid_data, _ = vt_etl("../../data/a0910/valid.csv", item_knowledge, params) 23 | test_data, _ = vt_etl("../../data/a0910/test.csv", item_knowledge, params) 24 | 25 | cdm = MIRT( 26 | 4163 + 1, 27 | 17746 + 1, 28 | 123 29 | ) 30 | cdm.train( 31 | train_data, 32 | valid_data, 33 | epoch=2, 34 | ) 35 | cdm.save("IRR-MIRT.params") 36 | 37 | cdm.load("IRR-MIRT.params") 38 | print(cdm.eval(test_data)) 39 | -------------------------------------------------------------------------------- /examples/IRR/NCDM.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # 2021/6/19 @ tongshiwei 3 | 4 | 5 | from EduCDM.IRR import NCDM 6 | import logging 7 | from longling.lib.structure import AttrDict 8 | from longling import set_logging_info 9 | from EduCDM.IRR import pair_etl as etl, point_etl as vt_etl, extract_item 10 | 11 | set_logging_info() 12 | 13 | params = AttrDict( 14 | batch_size=256, 15 | n_neg=10, 16 | n_imp=10, 17 | logger=logging.getLogger(), 18 | hyper_params={"user_num": 4164, "knowledge_num": 123} 19 | ) 20 | item_knowledge = extract_item("../../data/a0910/item.csv", params["hyper_params"]["knowledge_num"], params) 21 | train_data, train_df = etl("../../data/a0910/train.csv", item_knowledge, params) 22 | valid_data, _ = vt_etl("../../data/a0910/valid.csv", item_knowledge, 
params) 23 | test_data, _ = vt_etl("../../data/a0910/test.csv", item_knowledge, params) 24 | 25 | cdm = NCDM( 26 |     4163 + 1, 27 |     17746 + 1, 28 |     123, 29 | ) 30 | cdm.train( 31 |     train_data, 32 |     valid_data, 33 |     epoch=2, 34 | ) 35 | cdm.save("IRR-NCDM.params") 36 | 37 | cdm.load("IRR-NCDM.params") 38 | print(cdm.eval(test_data)) 39 | -------------------------------------------------------------------------------- /examples/IRR/README.md: -------------------------------------------------------------------------------- 1 | # Item Response Ranking for Cognitive Diagnosis 2 | 3 | * [IRR-IRT](IRT.ipynb) 4 | * [IRR-DINA](DINA.ipynb) 5 | * [IRR-NCD](NCDM.ipynb) 6 | * [IRR-MIRT](MIRT.ipynb) 7 | 
 -------------------------------------------------------------------------------- /examples/IRR/prepare_dataset.ipynb: -------------------------------------------------------------------------------- 1 | { 2 |  "cells": [ 3 |   { 4 |    "cell_type": "code", 5 |    "execution_count": 5, 6 |    "outputs": [ 7 |     { 8 |      "name": "stderr", 9 |      "output_type": "stream", 10 |      "text": [ 11 |       "downloader, INFO http://base.ustc.edu.cn/data/cdbd/a0910/item.csv is saved as ..\\..\\data\\a0910\\item.csv\n", 12 |       "downloader, INFO http://base.ustc.edu.cn/data/cdbd/a0910/test.csv is saved as ..\\..\\data\\a0910\\test.csv\n", 13 |       "downloader, INFO http://base.ustc.edu.cn/data/cdbd/a0910/train.csv is saved as ..\\..\\data\\a0910\\train.csv\n", 14 |       "downloader, INFO http://base.ustc.edu.cn/data/cdbd/a0910/valid.csv is saved as ..\\..\\data\\a0910\\valid.csv\n" 15 |      ] 16 |     }, 17 |     { 18 |      "name": "stdout", 19 |      "output_type": "stream", 20 |      "text": [ 21 |       "Downloading ..\\..\\data\\a0910\\item.csv 100.00%: 258118 | 258118\n", 22 |       "Downloading ..\\..\\data\\a0910\\test.csv 100.00%: 810767 | 810767\n", 23 |       "Downloading ..\\..\\data\\a0910\\train.csv 100.00%: 2329161 | 2329161\n", 24 |       "Downloading ..\\..\\data\\a0910\\valid.csv 100.00%: 371493 | 371493\n" 25 |      ] 26 |     }, 27 |     { 28 |      "data": { 29 |       "text/plain": "'../../data'" 30 |      }, 31 |      "execution_count": 5, 32 |      "metadata": {}, 33 |      "output_type": "execute_result" 34 |     } 35 |    ], 36 |    "source": [ 37 |     "# Download the Cognitive Diagnosis Benchmark Datasets (CDBD)\n", 38 |     "from EduData import get_data\n", 39 |     "\n", 40 |     "get_data(\"cdbd-a0910\", \"../../data\")\n" 41 |    ], 42 |    "metadata": { 43 |     "collapsed": false, 44 |     "pycharm": { 45 |      "name": "#%%\n" 46 |     } 47 |    } 48 |   } 49 |  ], 50 |  "metadata": { 51 |   "kernelspec": { 52 |    "display_name": "Python 3", 53 |    "language": "python", 54 |    "name": "python3" 55 |   }, 56 |   "language_info": { 57 |    "codemirror_mode": { 58 |     "name": "ipython", 59 |     "version": 2 60 |    }, 61 |    "file_extension": ".py", 62 |    "mimetype": "text/x-python", 63 |    "name": "python", 64 |    "nbconvert_exporter": "python", 65 |    "pygments_lexer": "ipython2", 66 |    "version": "2.7.6" 67 |   } 68 |  }, 69 |  "nbformat": 4, 70 |  "nbformat_minor": 0 71 | } -------------------------------------------------------------------------------- /examples/IRT/EM/IRT.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # 2021/5/2 @ liujiayu 3 | import logging 4 | import numpy as np 5 | import pandas as pd 6 | from EduCDM import EMIRT 7 | 8 | train_data = pd.read_csv("../../../data/a0910/train.csv") 9 | valid_data = pd.read_csv("../../../data/a0910/valid.csv")  # note: not used by this EM example 10 | test_data = pd.read_csv("../../../data/a0910/test.csv") 11 | 12 | stu_num = max(max(train_data['user_id']), max(test_data['user_id'])) 13 | prob_num = max(max(train_data['item_id']), max(test_data['item_id'])) 
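# Note (added for clarity): a0910 user/item IDs are 1-based, so the
# (user_id - 1, item_id - 1) indexing below maps them onto the 0-based
# response matrix R, in which -1 marks unobserved entries (skip_value).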
14 | 15 | R = -1 * np.ones(shape=(stu_num, prob_num)) 16 | R[train_data['user_id'] - 1, train_data['item_id'] - 1] = train_data['score'] 17 | 18 | test_set = [] 19 | for i in range(len(test_data)): 20 |     row = test_data.iloc[i] 21 |     test_set.append({'user_id': int(row['user_id']) - 1, 'item_id': int(row['item_id']) - 1, 'score': row['score']}) 22 | 23 | logging.getLogger().setLevel(logging.INFO) 24 | 25 | cdm = EMIRT(R, stu_num, prob_num, dim=1, skip_value=-1)  # IRT, dim > 1 is MIRT 26 | 27 | cdm.train(lr=1e-3, epoch=2) 28 | cdm.save("irt.params") 29 | 30 | cdm.load("irt.params") 31 | rmse, mae = cdm.eval(test_set) 32 | print("RMSE, MAE are %.6f, %.6f" % (rmse, mae)) 33 | 34 | # ---incremental training 35 | new_data = [{'user_id': 0, 'item_id': 2, 'score': 0.0}, {'user_id': 1, 'item_id': 1, 'score': 1.0}] 36 | cdm.inc_train(new_data, lr=1e-3, epoch=2) 37 | 38 | # ---evaluate user's state 39 | stu_rec = np.random.randint(-1, 2, size=prob_num) 40 | dia_state = cdm.transform(stu_rec) 41 | print("user's state is " + str(dia_state)) 42 | -------------------------------------------------------------------------------- /examples/IRT/EM/prepare_dataset.ipynb: -------------------------------------------------------------------------------- 1 | { 2 |  "cells": [ 3 |   { 4 |    "cell_type": "code", 5 |    "execution_count": 1, 6 |    "outputs": [ 7 |     { 8 |      "name": "stderr", 9 |      "output_type": "stream", 10 |      "text": [ 11 |       "downloader, INFO ..\\..\\..\\data\\a0910\\item.csv already exists. Send resume request after 258118 bytes\n", 12 |       "downloader, INFO http://base.ustc.edu.cn/data/cdbd/a0910/item.csv is saved as ..\\..\\..\\data\\a0910\\item.csv\n", 13 |       "downloader, WARNING Range not support. Redownloading...\n", 14 |       "downloader, INFO ..\\..\\..\\data\\a0910\\test.csv already exists. Send resume request after 810767 bytes\n", 15 |       "downloader, INFO http://base.ustc.edu.cn/data/cdbd/a0910/test.csv is saved as ..\\..\\..\\data\\a0910\\test.csv\n", 16 |       "downloader, WARNING Range not support. Redownloading...\n", 17 |       "downloader, INFO ..\\..\\..\\data\\a0910\\train.csv already exists. Send resume request after 2329161 bytes\n", 18 |       "downloader, INFO http://base.ustc.edu.cn/data/cdbd/a0910/train.csv is saved as ..\\..\\..\\data\\a0910\\train.csv\n", 19 |       "downloader, WARNING Range not support. Redownloading...\n", 20 |       "downloader, INFO ..\\..\\..\\data\\a0910\\valid.csv already exists. Send resume request after 371493 bytes\n", 21 |       "downloader, INFO http://base.ustc.edu.cn/data/cdbd/a0910/valid.csv is saved as ..\\..\\..\\data\\a0910\\valid.csv\n", 22 |       "downloader, WARNING Range not support. 
Redownloading...\n" 23 | ] 24 | }, 25 | { 26 | "name": "stdout", 27 | "output_type": "stream", 28 | "text": [ 29 | "Downloading 100.00% : 376832 | 37149361" 30 | ] 31 | }, 32 | { 33 | "data": { 34 | "text/plain": "'../../../data'" 35 | }, 36 | "execution_count": 1, 37 | "metadata": {}, 38 | "output_type": "execute_result" 39 | } 40 | ], 41 | "source": [ 42 | "# Download the Cognitive Diagnosis Benchmark Datasets (CDBD)\n", 43 | "from EduData import get_data\n", 44 | "\n", 45 | "get_data(\"cdbd-a0910\", \"../../../data\")\n" 46 | ], 47 | "metadata": { 48 | "collapsed": false, 49 | "pycharm": { 50 | "name": "#%%\n" 51 | } 52 | } 53 | } 54 | ], 55 | "metadata": { 56 | "kernelspec": { 57 | "display_name": "Python 3", 58 | "language": "python", 59 | "name": "python3" 60 | }, 61 | "language_info": { 62 | "codemirror_mode": { 63 | "name": "ipython", 64 | "version": 2 65 | }, 66 | "file_extension": ".py", 67 | "mimetype": "text/x-python", 68 | "name": "python", 69 | "nbconvert_exporter": "python", 70 | "pygments_lexer": "ipython2", 71 | "version": "2.7.6" 72 | } 73 | }, 74 | "nbformat": 4, 75 | "nbformat_minor": 0 76 | } -------------------------------------------------------------------------------- /examples/IRT/GD/IRT.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # 2021/3/23 @ tongshiwei 3 | import logging 4 | from EduCDM import GDIRT 5 | import torch 6 | from torch.utils.data import TensorDataset, DataLoader 7 | import pandas as pd 8 | 9 | train_data = pd.read_csv("../../../data/a0910/train.csv") 10 | valid_data = pd.read_csv("../../../data/a0910/valid.csv") 11 | test_data = pd.read_csv("../../../data/a0910/test.csv") 12 | 13 | batch_size = 256 14 | 15 | 16 | def transform(x, y, z, batch_size, **params): 17 | dataset = TensorDataset( 18 | torch.tensor(x, dtype=torch.int64), 19 | torch.tensor(y, dtype=torch.int64), 20 | torch.tensor(z, dtype=torch.float32) 21 | ) 22 | return DataLoader(dataset, batch_size=batch_size, **params) 23 | 24 | 25 | train, valid, test = [ 26 | transform(data["user_id"], data["item_id"], data["score"], batch_size) 27 | for data in [train_data, valid_data, test_data] 28 | ] 29 | 30 | logging.getLogger().setLevel(logging.INFO) 31 | 32 | cdm = GDIRT(4164, 17747) 33 | 34 | cdm.train(train, valid, epoch=2) 35 | cdm.save("irt.params") 36 | 37 | cdm.load("irt.params") 38 | auc, accuracy = cdm.eval(test) 39 | print("auc: %.6f, accuracy: %.6f" % (auc, accuracy)) 40 | -------------------------------------------------------------------------------- /examples/IRT/GD/prepare_dataset.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "outputs": [ 7 | { 8 | "name": "stderr", 9 | "output_type": "stream", 10 | "text": [ 11 | "downloader, INFO ..\\..\\..\\data\\a0910\\item.csv already exists. Send resume request after 258118 bytes\n", 12 | "downloader, INFO http://base.ustc.edu.cn/data/cdbd/a0910/item.csv is saved as ..\\..\\..\\data\\a0910\\item.csv\n", 13 | "downloader, WARNING Range not support. Redownloading...\n", 14 | "downloader, INFO ..\\..\\..\\data\\a0910\\test.csv already exists. Send resume request after 810767 bytes\n", 15 | "downloader, INFO http://base.ustc.edu.cn/data/cdbd/a0910/test.csv is saved as ..\\..\\..\\data\\a0910\\test.csv\n", 16 | "downloader, WARNING Range not support. Redownloading...\n", 17 | "downloader, INFO ..\\..\\..\\data\\a0910\\train.csv already exists. 
Send resume request after 2329161 bytes\n", 18 | "downloader, INFO http://base.ustc.edu.cn/data/cdbd/a0910/train.csv is saved as ..\\..\\..\\data\\a0910\\train.csv\n", 19 | "downloader, WARNING Range not support. Redownloading...\n", 20 | "downloader, INFO ..\\..\\..\\data\\a0910\\valid.csv already exists. Send resume request after 371493 bytes\n", 21 | "downloader, INFO http://base.ustc.edu.cn/data/cdbd/a0910/valid.csv is saved as ..\\..\\..\\data\\a0910\\valid.csv\n", 22 | "downloader, WARNING Range not support. Redownloading...\n" 23 | ] 24 | }, 25 | { 26 | "name": "stdout", 27 | "output_type": "stream", 28 | "text": [ 29 | "Downloading 100.00% : 376832 | 37149361" 30 | ] 31 | }, 32 | { 33 | "data": { 34 | "text/plain": "'../../../data'" 35 | }, 36 | "execution_count": 1, 37 | "metadata": {}, 38 | "output_type": "execute_result" 39 | } 40 | ], 41 | "source": [ 42 | "# Download the Cognitive Diagnosis Benchmark Datasets (CDBD)\n", 43 | "from EduData import get_data\n", 44 | "\n", 45 | "get_data(\"cdbd-a0910\", \"../../../data\")\n" 46 | ], 47 | "metadata": { 48 | "collapsed": false, 49 | "pycharm": { 50 | "name": "#%%\n" 51 | } 52 | } 53 | } 54 | ], 55 | "metadata": { 56 | "kernelspec": { 57 | "display_name": "Python 3", 58 | "language": "python", 59 | "name": "python3" 60 | }, 61 | "language_info": { 62 | "codemirror_mode": { 63 | "name": "ipython", 64 | "version": 2 65 | }, 66 | "file_extension": ".py", 67 | "mimetype": "text/x-python", 68 | "name": "python", 69 | "nbconvert_exporter": "python", 70 | "pygments_lexer": "ipython2", 71 | "version": "2.7.6" 72 | } 73 | }, 74 | "nbformat": 4, 75 | "nbformat_minor": 0 76 | } -------------------------------------------------------------------------------- /examples/KaNCD/KaNCD.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # 2023/3/7 @ WangFei 3 | import logging 4 | from EduCDM import KaNCD 5 | import torch 6 | from torch.utils.data import TensorDataset, DataLoader 7 | import pandas as pd 8 | import numpy as np 9 | 10 | 11 | train_data = pd.read_csv("../../data/a0910/train.csv") 12 | valid_data = pd.read_csv("../../data/a0910/valid.csv") 13 | test_data = pd.read_csv("../../data/a0910/test.csv") 14 | df_item = pd.read_csv("../../data/a0910/item.csv") 15 | item2knowledge = {} 16 | knowledge_set = set() 17 | for i, s in df_item.iterrows(): 18 | item_id, knowledge_codes = s['item_id'], list(set(eval(s['knowledge_code']))) 19 | item2knowledge[item_id] = knowledge_codes 20 | knowledge_set.update(knowledge_codes) 21 | 22 | batch_size = 32 23 | user_n = np.max(train_data['user_id']) 24 | item_n = np.max([np.max(train_data['item_id']), np.max(valid_data['item_id']), np.max(test_data['item_id'])]) 25 | knowledge_n = np.max(list(knowledge_set)) 26 | 27 | 28 | def transform(user, item, item2knowledge, score, batch_size): 29 | knowledge_emb = torch.zeros((len(item), knowledge_n)) 30 | for idx in range(len(item)): 31 | knowledge_emb[idx][np.array(item2knowledge[item[idx]]) - 1] = 1.0 32 | 33 | data_set = TensorDataset( 34 | torch.tensor(user, dtype=torch.int64) - 1, # (1, user_n) to (0, user_n-1) 35 | torch.tensor(item, dtype=torch.int64) - 1, # (1, item_n) to (0, item_n-1) 36 | knowledge_emb, 37 | torch.tensor(score, dtype=torch.float32) 38 | ) 39 | return DataLoader(data_set, batch_size=batch_size, shuffle=True) 40 | 41 | 42 | train_set, valid_set, test_set = [ 43 | transform(data["user_id"], data["item_id"], item2knowledge, data["score"], batch_size) 44 | for data in [train_data, valid_data, 
test_data] 45 | ] 46 | 47 | logging.getLogger().setLevel(logging.INFO) 48 | cdm = KaNCD(exer_n=item_n, student_n=user_n, knowledge_n=knowledge_n, mf_type='gmf', dim=20) 49 | cdm.train(train_set, valid_set, epoch_n=3, device="cuda", lr=0.002) 50 | cdm.save("kancd.snapshot") 51 | 52 | cdm.load("kancd.snapshot") 53 | auc, accuracy = cdm.eval(test_set, device="cuda") 54 | print("auc: %.6f, accuracy: %.6f" % (auc, accuracy)) 55 | 56 | 57 | -------------------------------------------------------------------------------- /examples/KaNCD/prepare_dataset.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "from EduData import get_data\n", 10 | "\n", 11 | "get_data(\"cdbd-a0910\", \"../../data\")" 12 | ] 13 | } 14 | ], 15 | "metadata": { 16 | "kernelspec": { 17 | "display_name": "Python 3", 18 | "language": "python", 19 | "name": "python3" 20 | }, 21 | "language_info": { 22 | "codemirror_mode": { 23 | "name": "ipython", 24 | "version": 3 25 | }, 26 | "file_extension": ".py", 27 | "mimetype": "text/x-python", 28 | "name": "python", 29 | "nbconvert_exporter": "python", 30 | "pygments_lexer": "ipython3", 31 | "version": "3.8.3" 32 | } 33 | }, 34 | "nbformat": 4, 35 | "nbformat_minor": 4 36 | } 37 | -------------------------------------------------------------------------------- /examples/MCD/MCD.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # 2021/3/23 @ tongshiwei 3 | import logging 4 | from EduCDM import MCD 5 | import torch 6 | from torch.utils.data import TensorDataset, DataLoader 7 | import pandas as pd 8 | 9 | train_data = pd.read_csv("../../data/a0910/train.csv") 10 | valid_data = pd.read_csv("../../data/a0910/valid.csv") 11 | test_data = pd.read_csv("../../data/a0910/test.csv") 12 | 13 | batch_size = 256 14 | 15 | 16 | def transform(x, y, z, batch_size, **params): 17 | dataset = TensorDataset( 18 | torch.tensor(x, dtype=torch.int64), 19 | torch.tensor(y, dtype=torch.int64), 20 | torch.tensor(z, dtype=torch.float32) 21 | ) 22 | return DataLoader(dataset, batch_size=batch_size, **params) 23 | 24 | 25 | train, valid, test = [ 26 | transform(data["user_id"], data["item_id"], data["score"], batch_size) 27 | for data in [train_data, valid_data, test_data] 28 | ] 29 | 30 | logging.getLogger().setLevel(logging.INFO) 31 | 32 | cdm = MCD(4164, 17747, 100) 33 | 34 | cdm.train(train, valid, epoch=2) 35 | cdm.save("mcd.params") 36 | 37 | cdm.load("mcd.params") 38 | auc, accuracy = cdm.eval(test) 39 | print("auc: %.6f, accuracy: %.6f" % (auc, accuracy)) 40 | -------------------------------------------------------------------------------- /examples/MCD/prepare_dataset.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 2, 6 | "outputs": [ 7 | { 8 | "name": "stderr", 9 | "output_type": "stream", 10 | "text": [ 11 | "downloader, INFO http://base.ustc.edu.cn/data/cdbd/a0910/item.csv is saved as ..\\..\\data\\a0910\\item.csv\n", 12 | "downloader, INFO http://base.ustc.edu.cn/data/cdbd/a0910/test.csv is saved as ..\\..\\data\\a0910\\test.csv\n", 13 | "downloader, INFO http://base.ustc.edu.cn/data/cdbd/a0910/train.csv is saved as ..\\..\\data\\a0910\\train.csv\n", 14 | "downloader, INFO http://base.ustc.edu.cn/data/cdbd/a0910/valid.csv is saved as 
..\\..\\data\\a0910\\valid.csv\n" 15 | ] 16 | }, 17 | { 18 | "name": "stdout", 19 | "output_type": "stream", 20 | "text": [ 21 | "Downloading ..\\..\\data\\a0910\\item.csv 100.00%: 258118 | 258118\n", 22 | "Downloading ..\\..\\data\\a0910\\test.csv 100.00%: 810767 | 810767\n", 23 | "Downloading ..\\..\\data\\a0910\\train.csv 100.00%: 2329161 | 2329161\n", 24 | "Downloading ..\\..\\data\\a0910\\valid.csv 100.00%: 371493 | 371493\n" 25 | ] 26 | }, 27 | { 28 | "data": { 29 | "text/plain": "'../../data'" 30 | }, 31 | "execution_count": 2, 32 | "metadata": {}, 33 | "output_type": "execute_result" 34 | } 35 | ], 36 | "source": [ 37 | "# Download the Cognitive Diagnosis Benchmark Datasets (CDBD)\n", 38 | "from EduData import get_data\n", 39 | "\n", 40 | "get_data(\"cdbd-a0910\", \"../../data\")\n" 41 | ], 42 | "metadata": { 43 | "collapsed": false, 44 | "pycharm": { 45 | "name": "#%%\n" 46 | } 47 | } 48 | } 49 | ], 50 | "metadata": { 51 | "kernelspec": { 52 | "display_name": "Python 3", 53 | "language": "python", 54 | "name": "python3" 55 | }, 56 | "language_info": { 57 | "codemirror_mode": { 58 | "name": "ipython", 59 | "version": 2 60 | }, 61 | "file_extension": ".py", 62 | "mimetype": "text/x-python", 63 | "name": "python", 64 | "nbconvert_exporter": "python", 65 | "pygments_lexer": "ipython2", 66 | "version": "2.7.6" 67 | } 68 | }, 69 | "nbformat": 4, 70 | "nbformat_minor": 0 71 | } -------------------------------------------------------------------------------- /examples/MIRT/MIRT.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # 2021/3/23 @ tongshiwei 3 | import logging 4 | from EduCDM import MIRT 5 | import torch 6 | from torch.utils.data import TensorDataset, DataLoader 7 | import pandas as pd 8 | 9 | train_data = pd.read_csv("../../data/a0910/train.csv") 10 | valid_data = pd.read_csv("../../data/a0910/valid.csv") 11 | test_data = pd.read_csv("../../data/a0910/test.csv") 12 | 13 | batch_size = 256 14 | 15 | 16 | def transform(x, y, z, batch_size, **params): 17 | dataset = TensorDataset( 18 | torch.tensor(x, dtype=torch.int64), 19 | torch.tensor(y, dtype=torch.int64), 20 | torch.tensor(z, dtype=torch.float32) 21 | ) 22 | return DataLoader(dataset, batch_size=batch_size, **params) 23 | 24 | 25 | train, valid, test = [ 26 | transform(data["user_id"], data["item_id"], data["score"], batch_size) 27 | for data in [train_data, valid_data, test_data] 28 | ] 29 | 30 | logging.getLogger().setLevel(logging.INFO) 31 | 32 | cdm = MIRT(4164, 17747, 123) 33 | 34 | cdm.train(train, valid, epoch=2) 35 | cdm.save("mirt.params") 36 | 37 | cdm.load("mirt.params") 38 | auc, accuracy = cdm.eval(test) 39 | print("auc: %.6f, accuracy: %.6f" % (auc, accuracy)) 40 | -------------------------------------------------------------------------------- /examples/MIRT/prepare_dataset.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 2, 6 | "outputs": [ 7 | { 8 | "name": "stderr", 9 | "output_type": "stream", 10 | "text": [ 11 | "downloader, INFO http://base.ustc.edu.cn/data/cdbd/a0910/item.csv is saved as ..\\..\\data\\a0910\\item.csv\n", 12 | "downloader, INFO http://base.ustc.edu.cn/data/cdbd/a0910/readme.txt is saved as ..\\..\\data\\a0910\\readme.txt\n", 13 | "downloader, INFO http://base.ustc.edu.cn/data/cdbd/a0910/test.csv is saved as ..\\..\\data\\a0910\\test.csv\n", 14 | "downloader, INFO http://base.ustc.edu.cn/data/cdbd/a0910/train.csv is 
saved as ..\\..\\data\\a0910\\train.csv\n", 15 | "downloader, INFO http://base.ustc.edu.cn/data/cdbd/a0910/valid.csv is saved as ..\\..\\data\\a0910\\valid.csv\n" 16 | ] 17 | }, 18 | { 19 | "name": "stdout", 20 | "output_type": "stream", 21 | "text": [ 22 | "Downloading ..\\..\\data\\a0910\\item.csv 100.00%: 258118 | 258118\n", 23 | "Downloading ..\\..\\data\\a0910\\readme.txt 100.00%: 86 | 86\n", 24 | "Downloading ..\\..\\data\\a0910\\test.csv 100.00%: 810767 | 810767\n", 25 | "Downloading ..\\..\\data\\a0910\\train.csv 100.00%: 2329161 | 2329161\n", 26 | "Downloading ..\\..\\data\\a0910\\valid.csv 100.00%: 371493 | 371493\n" 27 | ] 28 | }, 29 | { 30 | "data": { 31 | "text/plain": "'../../data'" 32 | }, 33 | "execution_count": 2, 34 | "metadata": {}, 35 | "output_type": "execute_result" 36 | } 37 | ], 38 | "source": [ 39 | "# Download the Cognitive Diagnosis Benchmark Datasets (CDBD)\n", 40 | "from EduData import get_data\n", 41 | "\n", 42 | "get_data(\"cdbd-a0910\", \"../../data\")\n" 43 | ], 44 | "metadata": { 45 | "collapsed": false, 46 | "pycharm": { 47 | "name": "#%%\n" 48 | } 49 | } 50 | } 51 | ], 52 | "metadata": { 53 | "kernelspec": { 54 | "display_name": "Python 3", 55 | "language": "python", 56 | "name": "python3" 57 | }, 58 | "language_info": { 59 | "codemirror_mode": { 60 | "name": "ipython", 61 | "version": 2 62 | }, 63 | "file_extension": ".py", 64 | "mimetype": "text/x-python", 65 | "name": "python", 66 | "nbconvert_exporter": "python", 67 | "pygments_lexer": "ipython2", 68 | "version": "2.7.6" 69 | } 70 | }, 71 | "nbformat": 4, 72 | "nbformat_minor": 0 73 | } -------------------------------------------------------------------------------- /examples/NCDM/NCDM.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # 2021/4/1 @ WangFei 3 | import logging 4 | from EduCDM import NCDM 5 | import torch 6 | from torch.utils.data import TensorDataset, DataLoader 7 | import pandas as pd 8 | import numpy as np 9 | 10 | 11 | train_data = pd.read_csv("../../data/a0910/train.csv") 12 | valid_data = pd.read_csv("../../data/a0910/valid.csv") 13 | test_data = pd.read_csv("../../data/a0910/test.csv") 14 | df_item = pd.read_csv("../../data/a0910/item.csv") 15 | item2knowledge = {} 16 | knowledge_set = set() 17 | for i, s in df_item.iterrows(): 18 | item_id, knowledge_codes = s['item_id'], list(set(eval(s['knowledge_code']))) 19 | item2knowledge[item_id] = knowledge_codes 20 | knowledge_set.update(knowledge_codes) 21 | 22 | batch_size = 32 23 | user_n = np.max(train_data['user_id']) 24 | item_n = np.max([np.max(train_data['item_id']), np.max(valid_data['item_id']), np.max(test_data['item_id'])]) 25 | knowledge_n = np.max(list(knowledge_set)) 26 | 27 | 28 | def transform(user, item, item2knowledge, score, batch_size): 29 | knowledge_emb = torch.zeros((len(item), knowledge_n)) 30 | for idx in range(len(item)): 31 | knowledge_emb[idx][np.array(item2knowledge[item[idx]]) - 1] = 1.0 32 | 33 | data_set = TensorDataset( 34 | torch.tensor(user, dtype=torch.int64) - 1, # (1, user_n) to (0, user_n-1) 35 | torch.tensor(item, dtype=torch.int64) - 1, # (1, item_n) to (0, item_n-1) 36 | knowledge_emb, 37 | torch.tensor(score, dtype=torch.float32) 38 | ) 39 | return DataLoader(data_set, batch_size=batch_size, shuffle=True) 40 | 41 | 42 | train_set, valid_set, test_set = [ 43 | transform(data["user_id"], data["item_id"], item2knowledge, data["score"], batch_size) 44 | for data in [train_data, valid_data, test_data] 45 | ] 46 | 47 | 
logging.getLogger().setLevel(logging.INFO) 48 | cdm = NCDM(knowledge_n, item_n, user_n) 49 | cdm.train(train_set, valid_set, epoch=3, device="cuda") 50 | cdm.save("ncdm.snapshot") 51 | 52 | cdm.load("ncdm.snapshot") 53 | auc, accuracy = cdm.eval(test_set) 54 | print("auc: %.6f, accuracy: %.6f" % (auc, accuracy)) 55 | 56 | 57 | -------------------------------------------------------------------------------- /examples/NCDM/prepare_dataset.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "from EduData import get_data\n", 10 | "\n", 11 | "get_data(\"cdbd-a0910\", \"../../data\")" 12 | ] 13 | } 14 | ], 15 | "metadata": { 16 | "kernelspec": { 17 | "display_name": "Python 3", 18 | "language": "python", 19 | "name": "python3" 20 | }, 21 | "language_info": { 22 | "codemirror_mode": { 23 | "name": "ipython", 24 | "version": 3 25 | }, 26 | "file_extension": ".py", 27 | "mimetype": "text/x-python", 28 | "name": "python", 29 | "nbconvert_exporter": "python", 30 | "pygments_lexer": "ipython3", 31 | "version": "3.8.3" 32 | } 33 | }, 34 | "nbformat": 4, 35 | "nbformat_minor": 4 36 | } 37 | -------------------------------------------------------------------------------- /pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | # For pytest usage, refer to https://hb4dsai.readthedocs.io/zh/latest/Architecture/Test.html 3 | norecursedirs = docs *build* trash dev examples 4 | 5 | # Register custom markers to avoid unknown-marker warnings 6 | markers = 7 | flake8: flake8 8 | 9 | # Enable line length testing with maximum line length of 120 10 | flake8-max-line-length = 120 11 | 12 | # Ignore module level import not at top of file (E402) 13 | # Others can be found in https://flake8.pycqa.org/en/latest/user/error-codes.html 14 | flake8-ignore = E402 F401 F403 E126 W504 W503 15 | 16 | # --doctest-modules is used for unit testing 17 | addopts = --doctest-modules --cov --cov-report=term-missing --flake8 18 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [coverage:run] 2 | source=EduCDM 3 | [coverage:report] 4 | exclude_lines = 5 | pragma: no cover 6 | pass 7 | raise NotImplementedError 8 | if __name__ == '__main__': 9 | if __name__ == "__main__": 10 | def __str__ 11 | def __repr__ 12 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | test_deps = [ 4 | 'pytest>=4', 5 | 'pytest-cov>=2.6.0', 6 | # 'pytest-flake8==4.0.1', 7 | 'pytest-flake8<1.1.2', 8 | 'flake8<5.0.0' 9 | ] 10 | 11 | setup( 12 | name='EduCDM', 13 | version='1.0.1', 14 | extras_require={ 15 | 'test': test_deps, 16 | }, 17 | packages=find_packages(), 18 | install_requires=[ 19 | "torch", "tqdm", "numpy>=1.16.5", "scikit-learn", "pandas", 20 | "longling>=1.3.33", "longling<=1.3.36", 'PyBaize>=0.0.7', 'fire' 21 | ], # and any other dependencies, as needed 22 | entry_points={}, 23 | ) 24 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # 2021/3/17 @ tongshiwei 3 | 4 | import random 5 | 6 | random.seed(10) 
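# fix the global random seed so the randomized fixtures across the test suite are reproducible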
7 | -------------------------------------------------------------------------------- /tests/dina/__init__.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # 2021/3/28 @ liujiayu 3 | -------------------------------------------------------------------------------- /tests/dina/em/__init__.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # 2021/6/21 @ tongshiwei 3 | -------------------------------------------------------------------------------- /tests/dina/em/conftest.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # 2021/3/28 @ liujiayu 3 | 4 | import random 5 | import numpy as np 6 | import pytest 7 | 8 | 9 | @pytest.fixture(scope="package") 10 | def conf(): 11 | user_num = 5 12 | item_num = 2 13 | know_num = 3 14 | return user_num, item_num, know_num 15 | 16 | 17 | @pytest.fixture(scope="package") 18 | def data(conf): 19 | user_num, item_num, know_num = conf 20 | q_m = np.zeros(shape=(item_num, know_num)) 21 | for i in range(item_num): 22 | for j in range(know_num): 23 | q_m[i, j] = random.randint(0, 1) 24 | 25 | R = -1 * np.ones(shape=(user_num, item_num)) 26 | for i in range(user_num): 27 | for j in range(item_num): 28 | R[i, j] = random.randint(-1, 1) 29 | 30 | new_data = [{'user_id': 1, 'item_id': 1, 'score': 1.0}] 31 | 32 | stu_rec = np.ones(item_num) 33 | for i in range(item_num): 34 | stu_rec[i] = random.randint(-1, 1) 35 | 36 | return user_num, item_num, know_num, R, q_m, new_data, stu_rec 37 | -------------------------------------------------------------------------------- /tests/dina/em/test_dina.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # 2021/3/28 @ liujiayu 3 | from EduCDM import EMDINA as DINA 4 | 5 | 6 | def test_train(data, tmp_path): 7 | stu_num, prob_num, know_num, R, q_m, new_data, stu_rec = data 8 | cdm = DINA(R, q_m, stu_num, prob_num, know_num, skip_value=-1) 9 | cdm.train(epoch=30, epsilon=1e-3) 10 | rmse, mae = cdm.eval([{'user_id': 0, 'item_id': 0, 'score': 1.0}]) 11 | filepath = tmp_path / "dina.params" 12 | cdm.save(filepath) 13 | cdm.load(filepath) 14 | cdm.inc_train(new_data, epoch=30, epsilon=1e-3) 15 | dia_id, dia_state = cdm.transform(stu_rec) 16 | -------------------------------------------------------------------------------- /tests/dina/gd/__init__.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # 2021/6/21 @ tongshiwei 3 | -------------------------------------------------------------------------------- /tests/dina/gd/conftest.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # 2021/3/23 @ tongshiwei 3 | 4 | import random 5 | import pytest 6 | import torch 7 | from torch.utils.data import TensorDataset, DataLoader 8 | 9 | 10 | @pytest.fixture(scope="package") 11 | def conf(): 12 | user_num = 5 13 | item_num = 2 14 | knowledge_num = 3 15 | return user_num, item_num, knowledge_num 16 | 17 | 18 | @pytest.fixture(scope="package") 19 | def data(conf): 20 | user_num, item_num, knowledge_num = conf 21 | log = [] 22 | for i in range(user_num): 23 | for j in range(item_num): 24 | k = [0] * knowledge_num 25 | k[random.randint(0, knowledge_num - 1)] = 1 26 | score = random.randint(0, 1) 27 | log.append((i, j, k, score)) 28 | 29 | user_id, item_id, knowledge, score = zip(*log) 30 | batch_size = 4 31 | 32 | 
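# pack the simulated logs into (user_id, item_id, knowledge one-hot, score) tensors for GDDINA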
dataset = TensorDataset( 33 | torch.tensor(user_id, dtype=torch.int64), 34 | torch.tensor(item_id, dtype=torch.int64), 35 | torch.tensor(knowledge, dtype=torch.float), 36 | torch.tensor(score, dtype=torch.float) 37 | ) 38 | return DataLoader(dataset, batch_size=batch_size) 39 | -------------------------------------------------------------------------------- /tests/dina/gd/test_gddina.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # 2021/4/23 @ tongshiwei 3 | 4 | import pytest 5 | from EduCDM import GDDINA 6 | 7 | 8 | @pytest.mark.parametrize("ste", [True, False]) 9 | def test_train(data, conf, tmp_path, ste): 10 | user_num, item_num, knowledge_num = conf 11 | cdm = GDDINA(user_num, item_num, knowledge_num, ste=ste) 12 | cdm.train(data, test_data=data, epoch=2) 13 | filepath = tmp_path / "dina.params" 14 | cdm.save(filepath) 15 | cdm.load(filepath) 16 | -------------------------------------------------------------------------------- /tests/fuzzycdf/__init__.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # 2021/3/28 @ liujiayu 3 | -------------------------------------------------------------------------------- /tests/fuzzycdf/conftest.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # 2021/3/28 @ liujiayu 3 | 4 | import random 5 | import numpy as np 6 | import pytest 7 | 8 | 9 | @pytest.fixture(scope="package") 10 | def conf(): 11 | user_num = 5 12 | item_num = 2 13 | know_num = 3 14 | return user_num, item_num, know_num 15 | 16 | 17 | @pytest.fixture(scope="package") 18 | def data(conf): 19 | user_num, item_num, know_num = conf 20 | q_m = np.zeros(shape=(item_num, know_num)) 21 | for i in range(item_num): 22 | for j in range(know_num): 23 | q_m[i, j] = random.randint(0, 1) 24 | 25 | R = -1 * np.ones(shape=(user_num, item_num)) 26 | for i in range(user_num): 27 | for j in range(item_num): 28 | R[i, j] = random.randint(-1, 1) 29 | 30 | index = random.randint(1, item_num - 1) 31 | obj_prob_index = np.arange(0, index) 32 | sub_prob_index = np.arange(index - 1, item_num) 33 | 34 | new_data = [{'user_id': 1, 'item_id': 1, 'score': 1.0}] 35 | 36 | return user_num, item_num, know_num, R, q_m, obj_prob_index, sub_prob_index, new_data 37 | -------------------------------------------------------------------------------- /tests/fuzzycdf/test_fuzzycdf.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # 2021/3/28 @ liujiayu 3 | from EduCDM import FuzzyCDF 4 | 5 | 6 | def test_train(data, tmp_path): 7 | stu_num, prob_num, know_num, R, q_m, obj_prob_index, sub_prob_index, new_data = data 8 | cdm = FuzzyCDF(R, q_m, stu_num, prob_num, know_num, obj_prob_index, sub_prob_index, skip_value=-1) 9 | cdm.train(epoch=10, burnin=5) 10 | rmse, mae = cdm.eval([{'user_id': 0, 'item_id': 0, 'score': 1.0}]) 11 | filepath = tmp_path / "fuzzycdf.params" 12 | cdm.save(filepath) 13 | cdm.load(filepath) 14 | cdm.inc_train(new_data, epoch=10, burnin=5) 15 | -------------------------------------------------------------------------------- /tests/icd/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigdata-ustc/EduCDM/c610e6ae7f45cfe8f1106a1c342d4ef5a357472e/tests/icd/__init__.py -------------------------------------------------------------------------------- /tests/icd/conftest.py: 
-------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # 2021/4/6 @ WangFei 3 | 4 | import random 5 | import pytest 6 | import pandas as pd 7 | from EduCDM.ICD.etl import inc_stream 8 | 9 | 10 | @pytest.fixture(scope="package") 11 | def conf(): 12 | user_num = 50 13 | item_num = 20 14 | knowledge_num = 4 15 | return user_num, item_num, knowledge_num 16 | 17 | 18 | @pytest.fixture(scope="package") 19 | def data(conf): 20 | user_num, item_num, knowledge_num = conf 21 | i2k = {} 22 | for i in range(item_num): 23 | i2k[i] = [random.randint(0, knowledge_num - 1)] 24 | log = [] 25 | for i in range(user_num): 26 | for j in range(item_num): 27 | score = random.randint(0, 1) 28 | log.append([i, j, score]) 29 | random.shuffle(log) 30 | df = pd.DataFrame(log, columns=['user_id', 'item_id', 'score']) 31 | inc_train_df_list = list(inc_stream(df, stream_size=int(len(df) // 50))) 32 | 33 | return inc_train_df_list, i2k 34 | -------------------------------------------------------------------------------- /tests/icd/test_mirt.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # 2021/4/23 @ tongshiwei 3 | 4 | from EduCDM.ICD.ICD import ICD 5 | # from EduCDM import ICD 6 | 7 | 8 | def test_train(data, conf, tmp_path): 9 | user_n, item_n, know_n = conf 10 | cdm = ICD('mirt', user_n, item_n, know_n) 11 | log, i2k = data 12 | cdm.train(log, i2k) 13 | cdm.save() 14 | cdm.load() 15 | 16 | 17 | def test_exception(data, conf, tmp_path): 18 | try: 19 | user_n, item_n, know_n = conf 20 | cdm = ICD('mirt', user_n, item_n, know_n) 21 | log, i2k = data 22 | cdm.train(log, i2k) 23 | cdm.save() 24 | cdm.load() 25 | except ValueError: 26 | print(ValueError) 27 | -------------------------------------------------------------------------------- /tests/icd/test_ncd.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # 2021/4/23 @ tongshiwei 3 | 4 | from EduCDM.ICD.ICD import ICD 5 | # from EduCDM import ICD 6 | 7 | 8 | def test_train(data, conf, tmp_path): 9 | user_n, item_n, know_n = conf 10 | cdm = ICD('ncd', user_n, item_n, know_n) 11 | log, i2k = data 12 | cdm.train(log, i2k) 13 | cdm.save() 14 | cdm.load() 15 | 16 | 17 | def test_exception(data, conf, tmp_path): 18 | try: 19 | user_n, item_n, know_n = conf 20 | cdm = ICD('ncd', user_n, item_n, know_n) 21 | log, i2k = data 22 | cdm.train(log, i2k) 23 | cdm.save() 24 | cdm.load() 25 | except ValueError: 26 | print(ValueError) 27 | -------------------------------------------------------------------------------- /tests/irr/__init__.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # 2021/6/19 @ tongshiwei 3 | -------------------------------------------------------------------------------- /tests/irr/conftest.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # 2021/3/23 @ tongshiwei 3 | 4 | import pandas as pd 5 | import random 6 | import pytest 7 | from longling.lib.structure import AttrDict 8 | from EduCDM.IRR import pair_etl, point_etl, extract_item 9 | import logging 10 | from copy import deepcopy 11 | 12 | 13 | @pytest.fixture(scope="package") 14 | def conf(): 15 | user_num = 5 16 | item_num = 2 17 | knowledge_num = 3 18 | return user_num, item_num, knowledge_num 19 | 20 | 21 | @pytest.fixture(scope="package") 22 | def params(conf): 23 | user_num, item_num, knowledge_num = conf 24 | return 
AttrDict( 25 | logger=logging, 26 | user_num=user_num, 27 | item_num=item_num, 28 | knowledge_num=knowledge_num, 29 | n_neg=1, 30 | n_imp=1, 31 | hyper_params={"user_num": user_num}, 32 | batch_size=4 33 | ) 34 | 35 | 36 | @pytest.fixture(scope="package") 37 | def source(tmpdir_factory, conf): 38 | user_num, item_num, knowledge_num = conf 39 | 40 | d = tmpdir_factory.mktemp("irr") 41 | log_path = d / "log.csv" 42 | item_path = d / "item.csv" 43 | 44 | knowledge = [] 45 | for j in range(item_num): 46 | knowledge.append([j, [random.randint(1, knowledge_num)]]) 47 | 48 | pd.DataFrame(knowledge, columns=["item_id", "knowledge_code"]).to_csv(item_path) 49 | 50 | log = [] 51 | for i in range(user_num): 52 | for j in range(item_num): 53 | score = random.randint(0, 1) 54 | log.append((i, j, score)) 55 | 56 | pd.DataFrame(log, columns=["user_id", "item_id", "score"]).to_csv(log_path) 57 | 58 | return log_path, item_path 59 | 60 | 61 | @pytest.fixture(scope="package") 62 | def knowledge(source, params): 63 | _, item_path = source 64 | return extract_item(item_path, params.knowledge_num, params) 65 | 66 | 67 | @pytest.fixture(scope="package") 68 | def train_data(source, knowledge, params): 69 | log_path, _ = source 70 | data, _ = pair_etl(log_path, knowledge, params) 71 | return data 72 | 73 | 74 | @pytest.fixture(scope="package") 75 | def zero_train_data(source, knowledge, params): 76 | log_path, _ = source 77 | params_0 = dict(params.items()) 78 | params_0["n_neg"] = 0 79 | params_0["n_imp"] = 0 80 | params_0 = AttrDict(**params_0) 81 | data, _ = pair_etl(log_path, knowledge, params_0) 82 | return data 83 | 84 | 85 | @pytest.fixture(scope="package") 86 | def test_data(source, knowledge, params): 87 | log_path, _ = source 88 | data, _ = point_etl(log_path, knowledge, params) 89 | return data 90 | -------------------------------------------------------------------------------- /tests/irr/test_dina.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # 2021/6/19 @ tongshiwei 3 | 4 | from EduCDM.IRR import DINA 5 | 6 | 7 | def test_irr_dina(train_data, test_data, params, tmp_path): 8 | cdm = DINA(params.user_num, params.item_num, params.knowledge_num) 9 | cdm.train(train_data, test_data=test_data, epoch=2) 10 | filepath = tmp_path / "irr.params" 11 | cdm.save(filepath) 12 | cdm.load(filepath) 13 | 14 | 15 | def test_irt(zero_train_data, test_data, params, tmp_path): 16 | cdm = DINA(params.user_num, params.item_num, params.knowledge_num, zeta=0) 17 | cdm.train(zero_train_data, test_data=test_data, epoch=2) 18 | filepath = tmp_path / "irr.params" 19 | cdm.save(filepath) 20 | cdm.load(filepath) 21 | -------------------------------------------------------------------------------- /tests/irr/test_irt.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # 2021/6/19 @ tongshiwei 3 | 4 | from EduCDM.IRR import IRT 5 | 6 | 7 | def test_irr_irt(train_data, test_data, params, tmp_path): 8 | cdm = IRT(params.user_num, params.item_num, params.knowledge_num) 9 | cdm.train(train_data, test_data=test_data, epoch=2) 10 | filepath = tmp_path / "irr.params" 11 | cdm.save(filepath) 12 | cdm.load(filepath) 13 | 14 | 15 | def test_irt(zero_train_data, test_data, params, tmp_path): 16 | cdm = IRT(params.user_num, params.item_num, params.knowledge_num, zeta=0) 17 | cdm.train(zero_train_data, test_data=test_data, epoch=2) 18 | filepath = tmp_path / "irr.params" 19 | cdm.save(filepath) 20 | cdm.load(filepath) 21 | 
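# zeta=0 is paired with the zero_train_data fixture (n_neg=0, n_imp=0), i.e. training without sampled negative/implicit pairs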
-------------------------------------------------------------------------------- /tests/irr/test_mirt.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # 2021/6/19 @ tongshiwei 3 | 4 | from EduCDM.IRR import MIRT 5 | 6 | 7 | def test_irr_irt(train_data, test_data, params, tmp_path): 8 | cdm = MIRT(params.user_num, params.item_num, params.knowledge_num) 9 | cdm.train(train_data, test_data=test_data, epoch=2) 10 | filepath = tmp_path / "irr.params" 11 | cdm.save(filepath) 12 | cdm.load(filepath) 13 | 14 | 15 | def test_irt(zero_train_data, test_data, params, tmp_path): 16 | cdm = MIRT(params.user_num, params.item_num, params.knowledge_num, zeta=0) 17 | cdm.train(zero_train_data, test_data=test_data, epoch=2) 18 | filepath = tmp_path / "irr.params" 19 | cdm.save(filepath) 20 | cdm.load(filepath) 21 | -------------------------------------------------------------------------------- /tests/irr/test_ncdm.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # 2021/7/1 @ tongshiwei 3 | 4 | from EduCDM.IRR import NCDM 5 | 6 | 7 | def test_irr_dina(train_data, test_data, params, tmp_path): 8 | cdm = NCDM(params.user_num, params.item_num, params.knowledge_num) 9 | cdm.train(train_data, test_data=test_data, epoch=2) 10 | filepath = tmp_path / "irr.params" 11 | cdm.save(filepath) 12 | cdm.load(filepath) 13 | 14 | 15 | def test_irt(zero_train_data, test_data, params, tmp_path): 16 | cdm = NCDM(params.user_num, params.item_num, params.knowledge_num, zeta=0) 17 | cdm.train(zero_train_data, test_data=test_data, epoch=2) 18 | filepath = tmp_path / "irr.params" 19 | cdm.save(filepath) 20 | cdm.load(filepath) 21 | -------------------------------------------------------------------------------- /tests/irt/__init__.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # 2021/6/21 @ tongshiwei 3 | -------------------------------------------------------------------------------- /tests/irt/em/__init__.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # 2021/4/23 @ tongshiwei 3 | -------------------------------------------------------------------------------- /tests/irt/em/conftest.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # 2021/5/2 @ liujiayu 3 | 4 | import random 5 | import numpy as np 6 | import pytest 7 | 8 | 9 | @pytest.fixture(scope="package") 10 | def conf(): 11 | user_num = 5 12 | item_num = 2 13 | return user_num, item_num 14 | 15 | 16 | @pytest.fixture(scope="package") 17 | def data(conf): 18 | user_num, item_num = conf 19 | 20 | R = -1 * np.ones(shape=(user_num, item_num)) 21 | for i in range(user_num): 22 | for j in range(item_num): 23 | R[i, j] = random.randint(-1, 1) 24 | 25 | new_data = [{'user_id': 1, 'item_id': 1, 'score': 1.0}] 26 | 27 | stu_rec = np.ones(item_num) 28 | for i in range(item_num): 29 | stu_rec[i] = random.randint(-1, 1) 30 | 31 | return user_num, item_num, R, new_data, stu_rec 32 | -------------------------------------------------------------------------------- /tests/irt/em/test_emirt.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # 2021/5/2 @ liujiayu 3 | 4 | from EduCDM import EMIRT 5 | 6 | 7 | def test_train(data, conf, tmp_path): 8 | stu_num, prob_num, R, new_data, stu_rec = data 9 | cdm = EMIRT(R, stu_num, prob_num, dim=1, 
skip_value=-1) 10 | cdm.train(lr=1e-3, epoch=30, epsilon=1e-1) 11 | rmse, mae = cdm.eval([{'user_id': 0, 'item_id': 0, 'score': 1.0}]) 12 | filepath = tmp_path / "irt.params" 13 | cdm.save(filepath) 14 | cdm.load(filepath) 15 | cdm.inc_train(new_data, lr=1e-3, epoch=10) 16 | cdm.transform(stu_rec) 17 | -------------------------------------------------------------------------------- /tests/irt/gd/__init__.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # 2021/4/23 @ tongshiwei 3 | -------------------------------------------------------------------------------- /tests/irt/gd/conftest.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # 2021/3/23 @ tongshiwei 3 | 4 | import random 5 | import pytest 6 | import torch 7 | from torch.utils.data import TensorDataset, DataLoader 8 | 9 | 10 | @pytest.fixture(scope="package") 11 | def conf(): 12 | user_num = 5 13 | item_num = 2 14 | return user_num, item_num 15 | 16 | 17 | @pytest.fixture(scope="package") 18 | def data(conf): 19 | user_num, item_num = conf 20 | log = [] 21 | for i in range(user_num): 22 | for j in range(item_num): 23 | score = random.randint(0, 1) 24 | log.append((i, j, score)) 25 | 26 | user_id, item_id, score = zip(*log) 27 | batch_size = 4 28 | 29 | dataset = TensorDataset( 30 | torch.tensor(user_id, dtype=torch.int64), 31 | torch.tensor(item_id, dtype=torch.int64), 32 | torch.tensor(score, dtype=torch.float) 33 | ) 34 | return DataLoader(dataset, batch_size=batch_size) 35 | -------------------------------------------------------------------------------- /tests/irt/gd/test_gdirt.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # 2021/4/23 @ tongshiwei 3 | 4 | from EduCDM import GDIRT 5 | import pytest 6 | 7 | 8 | def test_train(data, conf, tmp_path): 9 | user_num, item_num = conf 10 | cdm = GDIRT(user_num, item_num) 11 | cdm.train(data, test_data=data, epoch=2) 12 | filepath = tmp_path / "mcd.params" 13 | cdm.save(filepath) 14 | cdm.load(filepath) 15 | 16 | 17 | def test_exception(data, conf, tmp_path): 18 | try: 19 | user_num, item_num = conf 20 | cdm = GDIRT(user_num, item_num, value_range=10, a_range=100) 21 | cdm.train(data, test_data=data, epoch=2) 22 | filepath = tmp_path / "mcd.params" 23 | cdm.save(filepath) 24 | cdm.load(filepath) 25 | except ValueError: 26 | print(ValueError) 27 | -------------------------------------------------------------------------------- /tests/kancd/__init__.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # 2023/3/8 @ WangFei 3 | -------------------------------------------------------------------------------- /tests/kancd/conftest.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # 2023/3/8 @ WangFei 3 | 4 | import random 5 | import pytest 6 | import torch 7 | import numpy as np 8 | from torch.utils.data import TensorDataset, DataLoader 9 | 10 | 11 | @pytest.fixture(scope="package") 12 | def conf(): 13 | user_num = 5 14 | item_num = 2 15 | knowledge_num = 4 16 | return user_num, item_num, knowledge_num 17 | 18 | 19 | @pytest.fixture(scope="package") 20 | def data(conf): 21 | user_num, item_num, knowledge_num = conf 22 | knowledge_embs = np.zeros((item_num, knowledge_num)) 23 | for i in range(item_num): 24 | for j in range(knowledge_num): 25 | knowledge_embs[i][j] = random.randint(0, 1) 26 | log = [] 27 | 
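# one simulated response per (user, item) pair, tagged with the item's knowledge one-hot row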
for i in range(user_num): 28 | for j in range(item_num): 29 | score = random.randint(0, 1) 30 | log.append((i, j, knowledge_embs[j], score)) 31 | 32 | user_id, item_id, knowledge_emb, score = zip(*log) 33 | batch_size = 4 34 | 35 | dataset = TensorDataset( 36 | torch.tensor(user_id, dtype=torch.int64), 37 | torch.tensor(item_id, dtype=torch.int64), 38 | torch.tensor(knowledge_emb, dtype=torch.int64), 39 | torch.tensor(score, dtype=torch.float) 40 | ) 41 | return DataLoader(dataset, batch_size=batch_size) 42 | -------------------------------------------------------------------------------- /tests/kancd/test_kancd.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # 2023/3/8 @ WangFei 3 | from EduCDM import KaNCD 4 | 5 | 6 | def test_train(data, conf, tmp_path): 7 | user_num, item_num, knowledge_num = conf 8 | cdm = KaNCD(exer_n=item_num, student_n=user_num, knowledge_n=knowledge_num, mf_type='mf', dim=2) 9 | cdm.train(data, data, epoch_n=2) 10 | cdm = KaNCD(exer_n=item_num, student_n=user_num, knowledge_n=knowledge_num, mf_type='gmf', dim=2) 11 | cdm.train(data, data, epoch_n=2) 12 | cdm = KaNCD(exer_n=item_num, student_n=user_num, knowledge_n=knowledge_num, mf_type='ncf1', dim=2) 13 | cdm.train(data, data, epoch_n=2) 14 | cdm = KaNCD(exer_n=item_num, student_n=user_num, knowledge_n=knowledge_num, mf_type='ncf2', dim=2) 15 | cdm.train(data, data, epoch_n=2) 16 | filepath = tmp_path / "kancd.params" 17 | cdm.save(filepath) 18 | cdm.load(filepath) 19 | -------------------------------------------------------------------------------- /tests/mcd/__init__.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # 2021/3/23 @ tongshiwei 3 | -------------------------------------------------------------------------------- /tests/mcd/conftest.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # 2021/3/23 @ tongshiwei 3 | 4 | import random 5 | import pytest 6 | import torch 7 | from torch.utils.data import TensorDataset, DataLoader 8 | 9 | 10 | @pytest.fixture(scope="package") 11 | def conf(): 12 | user_num = 5 13 | item_num = 2 14 | return user_num, item_num 15 | 16 | 17 | @pytest.fixture(scope="package") 18 | def data(conf): 19 | user_num, item_num = conf 20 | log = [] 21 | for i in range(user_num): 22 | for j in range(item_num): 23 | score = random.randint(0, 1) 24 | log.append((i, j, score)) 25 | 26 | user_id, item_id, score = zip(*log) 27 | batch_size = 4 28 | 29 | dataset = TensorDataset( 30 | torch.tensor(user_id, dtype=torch.int64), 31 | torch.tensor(item_id, dtype=torch.int64), 32 | torch.tensor(score, dtype=torch.float) 33 | ) 34 | return DataLoader(dataset, batch_size=batch_size) 35 | -------------------------------------------------------------------------------- /tests/mcd/test_mcd.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # 2021/3/23 @ tongshiwei 3 | from EduCDM import MCD 4 | 5 | 6 | def test_train(data, conf, tmp_path): 7 | user_num, item_num = conf 8 | cdm = MCD(user_num, item_num, 10) 9 | cdm.train(data, test_data=data, epoch=2) 10 | filepath = tmp_path / "mcd.params" 11 | cdm.save(filepath) 12 | cdm.load(filepath) 13 | -------------------------------------------------------------------------------- /tests/mirt/__init__.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # 2021/7/1 @ tongshiwei 3 
| -------------------------------------------------------------------------------- /tests/mirt/conftest.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # 2021/3/23 @ tongshiwei 3 | 4 | import random 5 | import pytest 6 | import torch 7 | from torch.utils.data import TensorDataset, DataLoader 8 | 9 | 10 | @pytest.fixture(scope="package") 11 | def conf(): 12 | user_num = 5 13 | item_num = 2 14 | return user_num, item_num 15 | 16 | 17 | @pytest.fixture(scope="package") 18 | def data(conf): 19 | user_num, item_num = conf 20 | log = [] 21 | for i in range(user_num): 22 | for j in range(item_num): 23 | score = random.randint(0, 1) 24 | log.append((i, j, score)) 25 | 26 | user_id, item_id, score = zip(*log) 27 | batch_size = 4 28 | 29 | dataset = TensorDataset( 30 | torch.tensor(user_id, dtype=torch.int64), 31 | torch.tensor(item_id, dtype=torch.int64), 32 | torch.tensor(score, dtype=torch.float) 33 | ) 34 | return DataLoader(dataset, batch_size=batch_size) 35 | -------------------------------------------------------------------------------- /tests/mirt/test_mirt.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # 2021/4/23 @ tongshiwei 3 | 4 | from EduCDM import MIRT 5 | import pytest 6 | 7 | 8 | def test_train(data, conf, tmp_path): 9 | user_num, item_num = conf 10 | cdm = MIRT(user_num, item_num, 10) 11 | cdm.train(data, test_data=data, epoch=2) 12 | filepath = tmp_path / "mcd.params" 13 | cdm.save(filepath) 14 | cdm.load(filepath) 15 | 16 | 17 | def test_exception(data, conf, tmp_path): 18 | try: 19 | user_num, item_num = conf 20 | cdm = MIRT(user_num, item_num, 10, a_range=100) 21 | cdm.train(data, test_data=data, epoch=2) 22 | filepath = tmp_path / "mcd.params" 23 | cdm.save(filepath) 24 | cdm.load(filepath) 25 | except ValueError: 26 | print(ValueError) 27 | -------------------------------------------------------------------------------- /tests/ncdm/__init__.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # 2021/4/6 @ WangFei 3 | -------------------------------------------------------------------------------- /tests/ncdm/conftest.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # 2021/4/6 @ WangFei 3 | 4 | import random 5 | import pytest 6 | import torch 7 | import numpy as np 8 | from torch.utils.data import TensorDataset, DataLoader 9 | 10 | 11 | @pytest.fixture(scope="package") 12 | def conf(): 13 | user_num = 5 14 | item_num = 2 15 | knowledge_num = 4 16 | return user_num, item_num, knowledge_num 17 | 18 | 19 | @pytest.fixture(scope="package") 20 | def data(conf): 21 | user_num, item_num, knowledge_num = conf 22 | knowledge_embs = np.zeros((item_num, knowledge_num)) 23 | for i in range(item_num): 24 | for j in range(knowledge_num): 25 | knowledge_embs[i][j] = random.randint(0, 1) 26 | log = [] 27 | for i in range(user_num): 28 | for j in range(item_num): 29 | score = random.randint(0, 1) 30 | log.append((i, j, knowledge_embs[j], score)) 31 | 32 | user_id, item_id, knowledge_emb, score = zip(*log) 33 | batch_size = 4 34 | 35 | dataset = TensorDataset( 36 | torch.tensor(user_id, dtype=torch.int64), 37 | torch.tensor(item_id, dtype=torch.int64), 38 | torch.tensor(knowledge_emb, dtype=torch.int64), 39 | torch.tensor(score, dtype=torch.float) 40 | ) 41 | return DataLoader(dataset, batch_size=batch_size) 42 | 
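# the loader yields (user_id, item_id, knowledge_emb, score) batches, matching what NCDM.train consumes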
-------------------------------------------------------------------------------- /tests/ncdm/test_ncdm.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # 2021/4/6 @ WangFei 3 | from EduCDM import NCDM 4 | 5 | 6 | def test_train(data, conf, tmp_path): 7 | user_num, item_num, knowledge_num = conf 8 | cdm = NCDM(knowledge_num, item_num, user_num) 9 | cdm.train(data, test_data=data, epoch=2) 10 | filepath = tmp_path / "mcd.params" 11 | cdm.save(filepath) 12 | cdm.load(filepath) 13 | --------------------------------------------------------------------------------