├── .github ├── ISSUE_TEMPLATE │ ├── bug_report.md │ ├── documentation.md │ └── feature_request.md ├── PULL_REQUEST_TEMPLATE.md └── workflows │ ├── python-publish.yml │ └── python-test.yml ├── .gitignore ├── .readthedocs.yml ├── AUTHORS.md ├── CHANGE.txt ├── CONTRIBUTE.md ├── CONTRIBUTE_CH.md ├── EduNLP ├── Formula │ ├── Formula.py │ ├── README.md │ ├── __init__.py │ ├── ast │ │ ├── __init__.py │ │ ├── ast.py │ │ ├── katex.py │ │ └── readme.md │ └── viz │ │ ├── __init__.py │ │ ├── m_viz.py │ │ ├── tree_viz.py │ │ ├── utils.py │ │ └── viz.py ├── I2V │ ├── __init__.py │ └── i2v.py ├── ModelZoo │ ├── __init__.py │ ├── base_model.py │ ├── bert │ │ ├── __init__.py │ │ └── bert.py │ ├── disenqnet │ │ ├── __init__.py │ │ ├── disenqnet.py │ │ ├── modules.py │ │ └── utils.py │ ├── quesnet │ │ ├── __init__.py │ │ ├── modules.py │ │ ├── quesnet.py │ │ └── util.py │ ├── rnn │ │ ├── __init__.py │ │ ├── harnn.py │ │ └── rnn.py │ └── utils │ │ ├── __init__.py │ │ ├── data.py │ │ ├── device.py │ │ ├── downstream_output.py │ │ ├── masker.py │ │ ├── modules.py │ │ ├── padder.py │ │ └── torch_utils.py ├── Pipeline │ ├── __init__.py │ ├── base.py │ ├── components.py │ ├── knowledge_prediction.py │ ├── mappings.py │ └── property_prediction.py ├── Pretrain │ ├── __init__.py │ ├── bert_vec.py │ ├── disenqnet_vec.py │ ├── elmo_vec.py │ ├── gensim_vec.py │ ├── hugginface_utils.py │ ├── pretrian_utils.py │ └── quesnet_vec.py ├── SIF │ ├── __init__.py │ ├── constants.py │ ├── parser │ │ ├── __init__.py │ │ └── parser.py │ ├── segment │ │ ├── __init__.py │ │ └── segment.py │ ├── sif.py │ └── tokenization │ │ ├── __init__.py │ │ ├── formula │ │ ├── __init__.py │ │ ├── ast_token.py │ │ ├── formula.py │ │ └── linear_token.py │ │ ├── text │ │ ├── __init__.py │ │ ├── stopwords.py │ │ └── tokenization.py │ │ └── tokenization.py ├── Tokenizer │ ├── __init__.py │ └── tokenizer.py ├── Vector │ ├── __init__.py │ ├── bert_vec.py │ ├── const.py │ ├── disenqnet │ │ ├── __init__.py │ │ └── disenqnet.py │ ├── elmo_vec.py │ ├── embedding.py │ ├── gensim_vec.py │ ├── meta.py │ ├── quesnet │ │ ├── __init__.py │ │ └── quesnet.py │ ├── rnn │ │ ├── __init__.py │ │ └── rnn.py │ └── t2v.py ├── __init__.py ├── constant.py ├── main.py ├── meta_data │ └── sif_stopwords.txt └── utils │ ├── __init__.py │ ├── data.py │ ├── image.py │ ├── log.py │ └── path.py ├── LICENSE ├── MANIFEST.in ├── Makefile ├── README.md ├── asset └── _static │ ├── d2v.png │ ├── d2v_bow_tfidf.png │ ├── d2v_general.png │ ├── d2v_stem_tf.png │ ├── data.png │ ├── formula.png │ ├── i2v.png │ ├── item.png │ ├── item_figure.png │ ├── item_formula.png │ ├── parse.png │ ├── prepare_dataset.jpg │ ├── seg.png │ ├── sif.png │ ├── sif_addition.png │ ├── tokenizer.png │ ├── w2v_stem_text.png │ └── w2v_stem_tf.png ├── docs ├── EduNLP.png ├── Makefile ├── README.md ├── SIF4TI_CH.md ├── make.bat ├── requirements.txt ├── source │ ├── _static │ │ ├── EduNLP.png │ │ ├── formula.png │ │ ├── formulagroup.png │ │ ├── pipeline.png │ │ └── 流程图.png │ ├── api │ │ ├── ModelZoo.rst │ │ ├── formula.rst │ │ ├── i2v.rst │ │ ├── index.rst │ │ ├── pipeline.rst │ │ ├── pretrain.rst │ │ ├── sif.rst │ │ ├── tokenizer.rst │ │ ├── utils.rst │ │ └── vector.rst │ ├── conf.py │ ├── index.rst │ └── tutorial │ │ ├── en │ │ ├── index.rst │ │ ├── parse │ │ │ ├── FormulaSyntaxStructureParsing.rst │ │ │ └── TextSyntaxStructureParsing.rst │ │ ├── pipeline.rst │ │ ├── pretrain.rst │ │ ├── pretrain │ │ │ ├── loading.rst │ │ │ ├── pub.rst │ │ │ └── start.rst │ │ ├── seg.rst │ │ ├── seg │ │ │ ├── 
SemanticComponentSegmentation.rst │ │ │ └── StructuralComponentSegmentation.rst │ │ ├── sif.rst │ │ ├── tokenization.rst │ │ ├── tokenization │ │ │ ├── GensimSegTokenizer.rst │ │ │ ├── GensimWordTokenizer.rst │ │ │ ├── PureTextTokenizer.rst │ │ │ └── TextTokenizer.rst │ │ ├── tokenize.rst │ │ ├── tokenize │ │ │ ├── Sentence Segmentation.rst │ │ │ ├── Tokenization.rst │ │ │ └── WordSegmentation.rst │ │ ├── vectorization.rst │ │ └── vectorization │ │ │ ├── WithPre-trainedModel.rst │ │ │ └── WithoutPre-trainedModel.rst │ │ └── zh │ │ ├── formula.rst │ │ ├── index.rst │ │ ├── pipeline.rst │ │ ├── pretrain.rst │ │ ├── seg.rst │ │ ├── sif.rst │ │ ├── tokenization.rst │ │ ├── tokenize.rst │ │ └── vectorization.rst └── tutorial.ipynb ├── examples ├── downstream │ ├── difficulty_prediction │ │ ├── difficulty_prediction.ipynb │ │ └── utils.py │ ├── discrimination_prediction │ │ ├── discrimination_prediction.ipynb │ │ └── utils.py │ ├── knowledge_prediction │ │ ├── konwledge_prediction.ipynb │ │ └── utils.py │ ├── paper_segmentation │ │ ├── load_data.py │ │ ├── model.py │ │ ├── paper_segmentation.ipynb │ │ ├── samples │ │ │ └── train │ │ │ │ └── math │ │ │ │ └── paper_1.txt │ │ ├── trainer.py │ │ └── utils.py │ ├── quality_evaluation │ │ ├── quality_evaluation.ipynb │ │ └── train.py │ └── similarity_prediction │ │ └── similarity_prediction.ipynb ├── formula │ ├── formula.ipynb │ ├── formula.py │ └── tree.ipynb ├── i2v │ ├── get_pretrained_i2v.ipynb │ ├── get_pretrained_i2v_d2v_w2v.ipynb │ ├── i2v.ipynb │ ├── i2v_bert.ipynb │ ├── i2v_d2v.ipynb │ ├── i2v_disenq.ipynb │ ├── i2v_elmo.ipynb │ ├── i2v_quesnet.ipynb │ └── i2v_w2v.ipynb ├── pipeline │ └── pipeline.ipynb ├── pretrain │ ├── bert.ipynb │ ├── disenq.ipynb │ ├── elmo.ipynb │ ├── gensim │ │ ├── d2v_bow_tfidf.ipynb │ │ ├── d2v_general.ipynb │ │ ├── d2v_stem_tf.ipynb │ │ ├── w2v_stem_text.ipynb │ │ └── w2v_stem_tf.ipynb │ ├── hugginface_tokenizer.ipynb │ ├── prepare_dataset.ipynb │ ├── pretrained_tokenizer.ipynb │ ├── quesnet.ipynb │ ├── rnn │ │ └── rnn.py │ └── seg_token │ │ ├── d2v.ipynb │ │ ├── d2v_d1.ipynb │ │ └── d2v_d2.ipynb ├── sif │ ├── item.json │ ├── parse │ │ └── parse.ipynb │ ├── sci4sif.py │ ├── seg │ │ └── seg.ipynb │ ├── sif4sci.ipynb │ ├── sif_addition.ipynb │ ├── sif_check.ipynb │ └── tokenize │ │ └── tokenization.ipynb ├── t2v │ ├── get_pretrained_t2v.ipynb │ ├── t2v.ipynb │ ├── t2v_bert.ipynb │ ├── t2v_d2v.ipynb │ ├── t2v_disenq.ipynb │ ├── t2v_elmo.ipynb │ ├── t2v_quesnet.ipynb │ └── t2v_w2v.ipynb ├── test_model │ └── w2v │ │ └── gensim_luna_stem_t_sg_100.kv ├── tokenizer │ ├── all_tokenize.ipynb │ ├── test_stopwords.txt │ └── tokenizer.ipynb └── utils │ └── data.ipynb ├── pytest.ini ├── scripts └── extlib │ └── katex2python.py ├── setup.cfg ├── setup.py ├── static └── test_data │ ├── quesnet_img │ └── 000004d6-0479-11ec-829b-797d5eb43535.png │ └── standard_luna_data.json └── tests ├── __init__.py ├── test_ast.py ├── test_formula.py ├── test_i2v ├── __init__.py └── test_pretrained.py ├── test_main.py ├── test_model_zoo └── test_rnn.py ├── test_pipeline ├── conftest.py └── test_pipelines.py ├── test_pretrain ├── __init__.py ├── conftest.py ├── test_hugginface_utils.py ├── test_pretrain_utils.py ├── test_pretrained_bert.py ├── test_pretrained_disenqnet.py ├── test_pretrained_elmo.py └── test_pretrained_quesnet.py ├── test_sif ├── __init__.py ├── conftest.py ├── test_parser.py ├── test_segement.py ├── test_sif.py └── test_tokenization.py ├── test_tokenizer ├── __init__.py └── test_tokenizer.py ├── test_utils └── test_modules.py └── 
test_vec ├── __init__.py ├── conftest.py ├── test_t2v.py └── test_vec.py
/.github/ISSUE_TEMPLATE/bug_report.md:
--------------------------------------------------------------------------------
---
name: Bug report
about: Create a report to help us improve
title: ''
labels: 'Bug, needs triage'

---
## 🐛 Description
(A clear and concise description of what the bug is.)

### Error Message
(Paste the complete error message. Please also include the stack trace by setting the environment variable `DMLC_LOG_STACK_TRACE_DEPTH=100` before running your script.)

## To Reproduce
(If you developed your own code, please provide a short script that reproduces the error. For existing examples, please provide a link.)

### Steps to reproduce
(Paste the commands you ran that produced the error.)

1.
2.

## What have you tried to solve it?

1.
2.

## Environment

<details>
<summary>Environment Information</summary>

**Operating System:** ...

**Python Version:** (e.g., python3.6, anaconda/python3.7, venv/python3.8)

</details>
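(If helpful, a minimal snippet — purely illustrative, not part of the template — for collecting the details above:)

```python
import platform
import sys

print("Operating System:", platform.platform())
print("Python Version:", sys.version)
```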

## Additional context
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/documentation.md:
--------------------------------------------------------------------------------
---
name: 📚 Documentation
about: Update API documentation or add data analysis
---

## 📚 Documentation
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/feature_request.md:
--------------------------------------------------------------------------------
---
name: Feature request
about: Suggest an idea for this project
title: ''
labels: 'Feature request'

---

## Description
(A clear and concise description of what the feature is.)
- If the proposal is about a new dataset, provide a description of what the dataset is and
  attach a basic data analysis with it.
- If the proposal is about an API, provide mock examples if possible.

## References
- list references and related literature
- list known implementations
--------------------------------------------------------------------------------
/.github/PULL_REQUEST_TEMPLATE.md:
--------------------------------------------------------------------------------
Thanks for sending a pull request!
Please make sure you click the link above to view the [contribution guidelines](../CONTRIBUTE.md),
then fill out the blanks below.

## Description ##
(A brief description of what this PR is about)

### What does this implement/fix? Explain your changes.
...

#### Pull request type
- [ ] [DATASET] Add a new dataset
- [ ] [BUGFIX] Bugfix
- [ ] [FEATURE] New feature (non-breaking change which adds functionality)
- [ ] [BREAKING] Breaking change (fix or feature that would cause existing functionality to not work as expected)
- [ ] [STYLE] Code style update (formatting, renaming)
- [ ] [REFACTOR] Refactoring (no functional changes, no API changes)
- [ ] [BUILD] Build-related changes
- [ ] [DOC] Documentation content changes
- [ ] [OTHER] Other (please describe):


#### Changes
- Feature1, tests, (and when applicable, API doc)
- Feature2, tests, (and when applicable, API doc)

or

- Fix1, tests
- Fix2, tests

### Does this close any currently open issues?
...

### Any relevant logs, error output, etc?
...

## Checklist ##
Before you submit a pull request, please make sure you have done the following:

### Essentials ###
- [ ] PR's title starts with a category (e.g. [BUGFIX], [FEATURE], [BREAKING], [DOC], etc.)
- [ ] Changes are complete (i.e. I finished coding on this PR)
- [ ] All changes have test coverage and all tests pass
- [ ] Code is well-documented (extended the README / documentation, if necessary)
- [ ] If this PR is your first one, add your name and GitHub account to [AUTHORS.md](../AUTHORS.md)

## Comments ##
- If this change is backward incompatible, explain why it must be made.
- Interesting edge cases to note here
--------------------------------------------------------------------------------
/.github/workflows/python-publish.yml:
--------------------------------------------------------------------------------
# This workflow will upload a Python package using Twine when a release is created
# For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries

name: Upload Python Package

on:
  release:
    types: [created]

jobs:
  deploy:

    runs-on: ubuntu-latest

    steps:
    - uses: actions/checkout@v2
    - name: Set up Python
      uses: actions/setup-python@v2
      with:
        python-version: '3.x'
    - name: Install dependencies
      run: |
        python -m pip install --upgrade pip
        pip install setuptools wheel twine
    - name: Build and publish
      env:
        TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }}
        TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }}
      run: |
        python setup.py sdist bdist_wheel
        twine upload dist/*
--------------------------------------------------------------------------------
/.github/workflows/python-test.yml:
--------------------------------------------------------------------------------

name: test

on: [push, pull_request]

jobs:
  build:

    runs-on: ${{ matrix.os }}
    strategy:
      matrix:
        python-version: [3.6, 3.7, 3.8, 3.9]
        include:
          - os: "ubuntu-latest"
          - os: "ubuntu-20.04"
            python-version: "3.6"

    steps:
    - uses: actions/checkout@v2
    - name: Set up Python ${{ matrix.python-version }}
      uses: actions/setup-python@v2
      with:
        python-version: ${{ matrix.python-version }}
    - name: Install dependencies
      run: |
        pip install -e .[test,full]
        pip install codecov
    - name: Test with pytest
      run: |
        pytest
        codecov
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/
**/_build/
**/_build/*

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# IDE
.idea/
.vscode/
.DS_Store

# Pyre type checker
.pyre/

# User Definition
data/
deprecated/
tmp*/
jieba.cache
*.kv
*.zip
examples/test_model
--------------------------------------------------------------------------------
/.readthedocs.yml:
--------------------------------------------------------------------------------
# .readthedocs.yml
# Read the Docs configuration file
# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details

# Required
version: 2

# Build documentation in the docs/ directory with Sphinx
sphinx:
  configuration: docs/source/conf.py

# Build documentation with MkDocs
#mkdocs:
#  configuration: mkdocs.yml

# Optionally build your docs in additional formats such as PDF and ePub
formats: []

# Optionally set the version of Python and requirements
# required to build your docs
python:
  version: 3.7
  install:
    - requirements: docs/requirements.txt
    - method: pip
      path: .
      extra_requirements:
        - full
--------------------------------------------------------------------------------
/AUTHORS.md:
--------------------------------------------------------------------------------
# AUTHORS

[Shiwei Tong*](https://github.com/tswsxk)

[Rui Lv](https://github.com/karin0018)

[Fangzhou Yao](https://github.com/fannazya)

[Jinze Wu](https://github.com/hxwujinze)

[Xin Wang](https://github.com/WangXin1198)

[Longhu Qin](https://github.com/KenelmQLH)

[Pingzhi Li](https://github.com/pingzhiLi)

[Meikai Bao](https://github.com/BAOOOOOM)

[Yuting Ning](https://github.com/nnnyt)

[Jundong Wu](https://github.com/wintermelon008)

[Shangzi Xue](https://github.com/ShangziXue)

The starred contributors are the corresponding authors.
--------------------------------------------------------------------------------
/CHANGE.txt:
--------------------------------------------------------------------------------
v1.0.0
1. Support CUDA for I2V and T2V.
2. Add demos for downstream tasks including knowledge & difficulty & discrimination prediction, similarity prediction and paper segmentation.
3. Refactor quesnet for pretraining and vectorization.
4. Update documents about tutorials and API.

v0.0.9
1. Refactor tokenizers: Basic Tokenizer and Pretrained Tokenizer
2. Refactor model structures following huggingface styles for Elmo, BERT, DisenQNet and QuesNet
3. Add PreprocessingPipeline and Pipeline
4. Add downstream tasks: knowledge prediction and property prediction
5. Fix a bug in RNN which caused ELMo not to converge
6. Move all the test models to modelhub
7. Update test data files

v0.0.8
1. add Elmo
2. add DisenQNet
3. add QuesNet
4. add tal-edu-bert
5. add dynamic mapping table from modelhub
6. fix cuda error
7. update pretrained models

v0.0.7:
1. add BERT and pretrained model (luna_bert)
2. speed up the process in sif
3. handle OOV in word2vec
4. add English tutorials
5. add api docs and prettify tutorials
6. fix the np.error in gensim_vec.W2V.infer_vector
7. fix the parameters lost in tokenization

v0.0.6:
1. dev: add half-pretrained rnn model
2. important!!!: rename TextTokenizer to PureTextTokenizer, and add a new tokenizer named TextTokenizer (the two have similar but not the same behaviours).
3. sif: add $\textf{}$ syntax
4. add two pretrained w2v models: w2v_sci_300 and w2v_lit_300

v0.0.5:
1. fix the missing stopwords.txt when using pip install

v0.0.4:
1. fix the project errors

v0.0.3:
1. update formula ast: supporting more symbols and functions defined in katex
2. add token-to-vector tools, including word2vec and doc2vec using gensim
3. sci4sif supports tokenization grouped by segments
4. add special tokens: \SIFTag and \SIFSep
5. add item-to-vector tools
6. add an interface for getting pretrained models, where the supported model names can be accessed by `edunlp i2v` in the command console

v0.0.2:
1. fix potential ModuleNotFoundError

v0.0.1:
1. Add Formula class to parse latex formula, which will generate the abstract syntax tree.
2. Add SIF v0.0.2.
3. Add sif4sci function which serves as a preprocess function for downstream tasks.
--------------------------------------------------------------------------------
/CONTRIBUTE_CH.md:
--------------------------------------------------------------------------------
# Contribution Guidelines

[English version](CONTRIBUTE.md)

## Getting Started

First of all, thank you for your interest in EduNLP and for helping to make it better!
Before you start contributing, please note the following:
1. If you would like us to implement a new feature:
   - You can tell us what you want through an issue, and we will promptly discuss its design and implementation.
   - Once we agree that the plan looks good, you can expect the new feature to be available soon.
2. If you want to offer a solution or bug fix for an open issue:
   - First, search for your problem in the [EduNLP issue list](https://github.com/bigdata-ustc/EduNLP/issues).
   - Then pick a specific issue and comment on it to offer your solution or bug fix.
   - If you need more details about a specific issue, please ask us.

Once you have implemented and tested your idea or bug fix, please submit it to [EduNLP](https://github.com/bigdata-ustc/EduNLP) via a Pull Request:
1. First, fork this repository into your own branch
2. Make your changes. Note: we strongly recommend that you follow our [commit format conventions](CONTRIBUTE_CH.md#commit-format)
3. Pass the code tests with 100% test coverage; see [here](tests/test_sif) for an example
4. Submit a Pull Request to [EduNLP](https://github.com/bigdata-ustc/EduNLP). Note: we provide a standard PR template that you should fill in carefully; see [here](https://github.com/bigdata-ustc/EduNLP/pull/1) for a standard, well-formed PR

Below are some useful tips for different kinds of contributions:

### Adding a new dataset or data analysis

For new datasets or data analysis, please move to [EduData](https://github.com/bigdata-ustc/EduData).

#### Docstring style

Please use the Numpy docstring style:

```
What the function does

Parameters
----------
param_1: type, optional or not
    description
param_2: type, optional or not
    description
...

Returns
-------
variable: type
    description

See Also (optional)
--------
similar function: what the similar function does

Examples (optional)
--------
>>> how to use it
```

### Commit Format

#### commit format

```
[<type>](<scope>) <subject>
```

#### type
- `feat`: a new feature.
- `fix/to`: fixing a bug, either one found in Q&A or one you found in your own use.
  - `fix`: produces a diff and fixes the problem. **Suitable when a single commit fixes the problem directly.**
  - `to`: produces a diff but does not fully fix the problem. **Suitable for multiple commits.** Use `fix` for the final commit that resolves the problem.
- `docs`: documentation.
- `style`: formatting (changes that do not affect how the code runs).
- `refactor`: refactoring (code changes that neither add a feature nor fix a bug).
- `perf`: optimization, e.g., improving performance or user experience.
- `test`: adding tests.
- `chore`: changes to the build process or auxiliary tools.
- `revert`: rolling back to a previous version.
- `merge`: merging code.
- `sync`: syncing bugs from the main line or a branch.
- `arch`: changes to project files or tooling.

#### scope (optional)

The scope describes the area the commit affects, e.g., the data layer, the control layer, or the view layer, varying from project to project.

For example, in Angular it can be location, browser, compile, rootScope, ngHref, ngClick, ngView, etc. If your change affects more than one scope, you can use `*` instead.

#### subject (required)

The subject is a short description of the purpose of the commit, no more than 50 characters.

Do not end it with a period or any other punctuation.

#### Example

- **[docs] update the README.md**

```sh
git commit -m "[docs] update the README.md"
```

## FAQ

Q: I have tested my code carefully on my local machine and passed the code checks, but the CI step still reports an error?
A: This can happen for two reasons:
1. The online CI system differs from your own local system;
2. It may be caused by network problems; if so, you can check the CI log files.
--------------------------------------------------------------------------------
/EduNLP/Formula/README.md:
--------------------------------------------------------------------------------
0: no edge 1: self 2: younger sibling 2: elder sibling 3: child 4: parent 5: cross-tree
--------------------------------------------------------------------------------
/EduNLP/Formula/__init__.py:
--------------------------------------------------------------------------------
from .Formula import Formula, FormulaGroup, link_formulas
from .ast import link_variable
from .Formula import CONST_MATHORD
--------------------------------------------------------------------------------
/EduNLP/Formula/ast/__init__.py:
--------------------------------------------------------------------------------
# coding: utf-8
# 2021/5/20 @ tongshiwei

from .ast import str2ast, get_edges, ast, link_variable, katex_parse
--------------------------------------------------------------------------------
/EduNLP/Formula/ast/readme.md:
--------------------------------------------------------------------------------
katex version: 0.13.11
katex github: https://github.com/KaTeX/KaTeX
node types can be found in https://github.com/KaTeX/KaTeX/blob/master/src/parseNode.js
symbol types can be found in https://github.com/KaTeX/KaTeX/blob/master/src/symbols.js
--------------------------------------------------------------------------------
/EduNLP/Formula/viz/__init__.py:
--------------------------------------------------------------------------------
# coding: utf-8
# 2021/3/8 @ tongshiwei

import warnings
# warnings.warn("Do not use this package")
from .tree_viz import TreePlotter, ForestPlotter
--------------------------------------------------------------------------------
/EduNLP/Formula/viz/m_viz.py:
--------------------------------------------------------------------------------
# coding: utf-8
# 2021/3/8 @ tongshiwei

import matplotlib.pyplot as plt
from sklearn.tree._export import _MPLTreeExporter
from sklearn.tree._reingold_tilford import buchheim, Tree
from matplotlib.text import Annotation


class TreePlotter(_MPLTreeExporter):
    def recurse(self, node, ax, scale_x, scale_y, height, depth=0):
        kwargs = dict(bbox=self.bbox_args, ha='center', va='center',
                      zorder=100 - 10 * depth, xycoords='axes pixels')

        if self.fontsize is not None:
            kwargs['fontsize'] = self.fontsize

        # offset things by .5 to center them in plot
        xy = ((node.x + .5) * scale_x, height - (node.y + .5) * scale_y)

        if self.max_depth is None or depth <= self.max_depth:
            # if self.filled:
            #     kwargs['bbox']['fc'] = self.get_fill_color(tree,
            #                                                node.tree.node_id)
            if node.parent is None:
                # root
                ax.annotate(node.tree.label, xy, **kwargs)
            else:
                xy_parent = ((node.parent.x + .5) * scale_x,
                             height - (node.parent.y + .5) * scale_y)
                kwargs["arrowprops"] = self.arrow_args
                ax.annotate(node.tree.label, xy_parent, xy, **kwargs)
            for child in node.children:
                self.recurse(child, ax, scale_x, scale_y, height,
                             depth=depth + 1)

        else:
            xy_parent = ((node.parent.x + .5) * scale_x,
                         height - (node.parent.y + .5) * scale_y)
            kwargs["arrowprops"] = self.arrow_args
            kwargs['bbox']['fc'] = 'grey'
\n", xy_parent, xy, **kwargs) 43 | 44 | def _make_forest(self, ast): 45 | forest = [] 46 | for node in ast: 47 | if node["structure"]["father"] is None: 48 | return Tree() 49 | else: 50 | pass 51 | 52 | return Tree(name, node_id, *children) 53 | 54 | def export(self, formula_ast, ax=None): 55 | self.filled = False 56 | 57 | if ax is None: 58 | ax = plt.gca() 59 | ax.clear() 60 | ax.set_axis_off() 61 | # my_tree = self._make_tree(0, decision_tree.tree_, 62 | # decision_tree.criterion) 63 | my_tree = self._make_forest(formula_ast) 64 | draw_tree = buchheim(my_tree) 65 | 66 | # important to make sure we're still 67 | # inside the axis after drawing the box 68 | # this makes sense because the width of a box 69 | # is about the same as the distance between boxes 70 | max_x, max_y = draw_tree.max_extents() + 1 71 | ax_width = ax.get_window_extent().width 72 | ax_height = ax.get_window_extent().height 73 | 74 | scale_x = ax_width / max_x 75 | scale_y = ax_height / max_y 76 | 77 | self.recurse(draw_tree, ax, 78 | scale_x, scale_y, ax_height) 79 | 80 | anns = [ann for ann in ax.get_children() 81 | if isinstance(ann, Annotation)] 82 | 83 | # update sizes of all bboxes 84 | renderer = ax.figure.canvas.get_renderer() 85 | 86 | for ann in anns: 87 | ann.update_bbox_position_size(renderer) 88 | 89 | if self.fontsize is None: 90 | # get figure to data transform 91 | # adjust fontsize to avoid overlap 92 | # get max box width and height 93 | extents = [ann.get_bbox_patch().get_window_extent() 94 | for ann in anns] 95 | max_width = max([extent.width for extent in extents]) 96 | max_height = max([extent.height for extent in extents]) 97 | # width should be around scale_x in axis coordinates 98 | size = anns[0].get_fontsize() * min(scale_x / max_width, 99 | scale_y / max_height) 100 | for ann in anns: 101 | ann.set_fontsize(size) 102 | 103 | return anns 104 | -------------------------------------------------------------------------------- /EduNLP/I2V/__init__.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # 2021/8/1 @ tongshiwei 3 | 4 | from .i2v import I2V, get_pretrained_i2v 5 | from .i2v import D2V, W2V, Elmo, Bert, DisenQ, QuesNet 6 | -------------------------------------------------------------------------------- /EduNLP/ModelZoo/__init__.py: -------------------------------------------------------------------------------- 1 | from .utils import * 2 | from .bert import * 3 | from .rnn import * 4 | from .disenqnet import * 5 | from .quesnet import * 6 | -------------------------------------------------------------------------------- /EduNLP/ModelZoo/base_model.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | import json 3 | import os 4 | from pathlib import Path 5 | import torch 6 | from transformers import PretrainedConfig 7 | # import logging 8 | from ..utils import logger 9 | 10 | 11 | class BaseModel(nn.Module): 12 | base_model_prefix = '' 13 | 14 | def __init__(self): 15 | super(BaseModel, self).__init__() 16 | self.config = PretrainedConfig() 17 | 18 | def forward(self, *input): 19 | raise NotImplementedError 20 | 21 | def save_pretrained(self, output_dir): 22 | if not os.path.exists(output_dir): 23 | os.makedirs(output_dir, exist_ok=True) 24 | model_path = os.path.join(output_dir, 'pytorch_model.bin') 25 | model_path = Path(model_path) 26 | torch.save(self.state_dict(), model_path.open('wb')) 27 | self.save_config(output_dir) 28 | 29 | @classmethod 30 | def 
    def from_pretrained(cls, pretrained_model_path, *args, **kwargs):
        config_path = os.path.join(pretrained_model_path, "config.json")
        model_path = os.path.join(pretrained_model_path, "pytorch_model.bin")
        model = cls.from_config(config_path, *args, **kwargs)
        loaded_state_dict = torch.load(model_path, map_location=torch.device('cpu'))
        loaded_keys = loaded_state_dict.keys()
        expected_keys = model.state_dict().keys()

        prefix = cls.base_model_prefix

        if set(loaded_keys) == set(expected_keys):
            # same architecture
            model.load_state_dict(loaded_state_dict)
        else:
            has_prefix_module = any(s.startswith(prefix) for s in loaded_keys)
            expects_prefix_module = any(s.startswith(prefix) for s in expected_keys)

            new_loaded_state_dict = {}
            if expects_prefix_module and not has_prefix_module:
                # add prefix
                for key in loaded_keys:
                    new_loaded_state_dict['.'.join([prefix, key])] = loaded_state_dict[key]
            if has_prefix_module and not expects_prefix_module:
                # remove prefix
                for key in loaded_keys:
                    if key.startswith(prefix):
                        new_loaded_state_dict['.'.join(key.split('.')[1:])] = loaded_state_dict[key]
            if has_prefix_module and expects_prefix_module:
                # both have prefix, only load the base encoder
                for key in loaded_keys:
                    if key.startswith(prefix):
                        new_loaded_state_dict[key] = loaded_state_dict[key]
            loaded_state_dict = new_loaded_state_dict
            model.load_state_dict(loaded_state_dict, strict=False)
            loaded_keys = loaded_state_dict.keys()
            missing_keys = set(expected_keys) - set(loaded_keys)
            if len(missing_keys) == 0:
                logger.info(
                    f"All the weights of {model.__class__.__name__} were initialized from the model checkpoint at"
                    f" {pretrained_model_path}.\nIf your task is similar to the task the model of the checkpoint"
                    f" was trained on, you can already use {model.__class__.__name__} for predictions without further"
                    " training."
                )
            elif len(missing_keys) > 0:
                logger.warning(
                    f"Some weights of {model.__class__.__name__} were not initialized from the model checkpoint at"
                    f" {pretrained_model_path} and are newly initialized: {missing_keys}\nYou should probably"
                    " TRAIN this model on a down-stream task to be able to use it for predictions and inference."
                )
        return model

    def save_config(self, config_dir):
        config_path = os.path.join(config_dir, "config.json")
        with open(config_path, "w", encoding="utf-8") as wf:
            json.dump(self.config.to_dict(), wf, ensure_ascii=False, indent=2)

    @classmethod
    def from_config(cls, config_path, *args, **kwargs):
        raise NotImplementedError
--------------------------------------------------------------------------------
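The prefix-handling logic above lets a checkpoint saved from a bare encoder be loaded into a task head (and vice versa). A minimal sketch of the subclass contract — `TinyModel` and its config keys are hypothetical, not part of the repo:

```python
import json
import torch.nn as nn
from transformers import PretrainedConfig
from EduNLP.ModelZoo.base_model import BaseModel


class TinyModel(BaseModel):
    base_model_prefix = 'tiny'

    def __init__(self, in_dim=4, out_dim=2):
        super(TinyModel, self).__init__()
        self.linear = nn.Linear(in_dim, out_dim)
        # the config carries whatever from_config needs to rebuild the model
        self.config = PretrainedConfig(in_dim=in_dim, out_dim=out_dim)

    def forward(self, x):
        return self.linear(x)

    @classmethod
    def from_config(cls, config_path, *args, **kwargs):
        with open(config_path, encoding="utf-8") as f:
            cfg = json.load(f)
        return cls(in_dim=cfg["in_dim"], out_dim=cfg["out_dim"])


model = TinyModel()
model.save_pretrained("tmp_tiny_model")               # writes pytorch_model.bin + config.json
restored = TinyModel.from_pretrained("tmp_tiny_model")
```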
/EduNLP/ModelZoo/bert/__init__.py:
--------------------------------------------------------------------------------
from .bert import *
--------------------------------------------------------------------------------
/EduNLP/ModelZoo/disenqnet/__init__.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

from .disenqnet import *
--------------------------------------------------------------------------------
/EduNLP/ModelZoo/disenqnet/utils.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

import torch
from torch import nn
from torch.nn import functional as F


def get_mask(seq_len, lengths):
    device = lengths.device
    # batch_size
    batch_size = lengths.size(0)
    # seq_len
    pos_index = torch.arange(seq_len).to(device)
    # batch_size * seq_len
    mask = pos_index.unsqueeze(0).expand(batch_size, -1) >= lengths.unsqueeze(-1)
    return mask


def shuffle(real):
    # |0 1 2 3| => |1 2 3 0|
    device = real.device
    batch_size = real.size(0)
    shuffled_index = (torch.arange(batch_size) + 1) % batch_size
    shuffled_index = shuffled_index.to(device)
    shuffled = real.index_select(dim=0, index=shuffled_index)
    return shuffled


def spectral_norm(w, n_iteration=5):
    device = w.device
    # (o, i)
    # bias: (o,) -> (o, 1)
    if w.dim() == 1:
        w = w.unsqueeze(-1)
    out_dim, in_dim = w.size()
    # (i, o)
    wt = w.transpose(0, 1)
    # (1, i)
    u = torch.ones(1, in_dim).to(device)
    for _ in range(n_iteration):
        # (1, i) * (i, o) -> (1, o)
        v = torch.mm(u, wt)
        v = v / v.norm(p=2)
        # (1, o) * (o, i) -> (1, i)
        u = torch.mm(v, w)
        u = u / u.norm(p=2)
    # (1, i) * (i, o) * (o, 1) -> (1, 1)
    sn = torch.mm(torch.mm(u, wt), v.transpose(0, 1)).sum() ** 0.5
    return sn
--------------------------------------------------------------------------------
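A quick sketch exercising the helpers above (values illustrative): `get_mask` marks padding positions, and `spectral_norm` estimates a weight matrix's largest singular value by power iteration.

```python
import torch
from EduNLP.ModelZoo.disenqnet.utils import get_mask, spectral_norm

lengths = torch.tensor([2, 4])
mask = get_mask(seq_len=4, lengths=lengths)  # True where position >= length, i.e., padding
# tensor([[False, False,  True,  True],
#         [False, False, False, False]])

w = torch.randn(8, 16)
sn = spectral_norm(w)  # scalar tensor, roughly the largest singular value of w
```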
/EduNLP/ModelZoo/quesnet/__init__.py:
--------------------------------------------------------------------------------
from .quesnet import QuesNet, QuesNetForPreTraining
from .modules import AE, ImageAE, MetaAE
--------------------------------------------------------------------------------
/EduNLP/ModelZoo/quesnet/modules.py:
--------------------------------------------------------------------------------
import torch
import torch.nn as nn


class FeatureExtractor(nn.Module):
    def __init__(self, feat_size=512):
        super(FeatureExtractor, self).__init__()
        self.feat_size = feat_size

    def make_batch(self, data, device, pretrain=False):
        """Make batch from input data (python data / np arrays -> tensors)"""
        raise NotImplementedError

    def load_emb(self, emb):
        pass

    def forward(self, *input):
        raise NotImplementedError


class AE(nn.Module):
    factor = 1

    def enc(self, item, *args, **kwargs):
        return self.encoder(item, *args, **kwargs)

    def dec(self, item, *args, **kwargs):
        return self.decoder(item, *args, **kwargs)

    def loss(self, item, emb=None):
        if emb is None:
            emb = self(item)
            out = self.dec(emb)
        else:
            out = self.dec(emb)

        return self.recons_loss(out, item)

    def forward(self, item):
        return self.enc(item)


class ImageAE(AE):
    def __init__(self, emb_size):
        super().__init__()
        self.emb_size = emb_size
        self.recons_loss = nn.MSELoss()
        self._encoder = nn.Sequential(
            nn.Conv2d(1, 16, 3, stride=3),
            nn.ReLU(True),
            nn.MaxPool2d(2, stride=2),
            nn.Conv2d(16, 32, 3, stride=2),
            nn.ReLU(True),
            nn.MaxPool2d(2, stride=1),
            nn.Conv2d(32, emb_size, 3, stride=2)
        )
        self._decoder = nn.Sequential(
            nn.ConvTranspose2d(emb_size // self.factor, 32, 3, stride=2),
            nn.ReLU(True),
            nn.ConvTranspose2d(32, 16, 5, stride=3, padding=1),
            nn.ReLU(True),
            nn.ConvTranspose2d(16, 8, 5, stride=3),
            nn.ReLU(True),
            nn.ConvTranspose2d(8, 1, 2, stride=2, padding=1),
            nn.Sigmoid()
        )

    def encoder(self, item, detach_tensor=False):
        return self._encoder(item).detach().view(item.size(0), -1) if detach_tensor else self._encoder(item).view(
            item.size(0), -1)

    def decoder(self, emb, detach_tensor=False):
        return self._decoder(emb[:, :, None, None]).detach() if detach_tensor else self._decoder(emb[:, :, None, None])


class MetaAE(AE):
    def __init__(self, meta_size, emb_size):
        super().__init__()
        self.emb_size = emb_size
        self.meta_size = meta_size
        self.recons_loss = nn.BCEWithLogitsLoss()
        self.encoder = nn.Sequential(nn.Linear(meta_size, emb_size),
                                     nn.ReLU(True))
        # error: inplace
        # nn.Linear(emb_size, emb_size)
        self.decoder = nn.Sequential(nn.Linear(emb_size // self.factor,
                                               emb_size),
                                     nn.ReLU(True),
                                     nn.Linear(emb_size, meta_size))
--------------------------------------------------------------------------------
/EduNLP/ModelZoo/quesnet/util.py:
--------------------------------------------------------------------------------
import torch
from torch.nn.utils.rnn import pack_padded_sequence


def argsort(seq):
    return sorted(range(len(seq)), key=seq.__getitem__)


class SeqBatch:
    def __init__(self, seqs, dtype=None, device=None):
        self.dtype = dtype
        self.device = device
        self.seqs = seqs

        if not seqs:
            self.lens = [0]
        else:
            self.lens = [len(x) for x in seqs]

        self.ind = argsort(self.lens)[::-1]
        self.inv = argsort(self.ind)
        self.lens.sort(reverse=True)
        self._prefix = [0]
        self._index = {}
        c = 0

        for i in range(self.lens[0]):
            for j in range(len(self.lens)):
                if self.lens[j] <= i:
                    break
                self._index[i, j] = c
                c += 1

    def packed(self):
        ind = torch.tensor(self.ind, dtype=torch.long, device=self.device)
        if not ind.numel() or ind.max() >= self.padded()[0].size(1):
            return None, None
        padded = self.padded()[0].index_select(1, ind)
        return pack_padded_sequence(padded, torch.tensor(self.lens))

    def padded(self, max_len=None, batch_first=False):
        if not self.seqs:
            return torch.empty((0, 0), dtype=self.dtype, device=self.device), \
                torch.empty((0, 0), dtype=torch.bool, device=self.device)

        seqs = [torch.tensor(s, dtype=self.dtype, device=self.device)
                if not isinstance(s, torch.Tensor) else s
                for s in self.seqs]
        if max_len is None:
            max_len = self.lens[0]
        seqs = [s[:max_len] for s in seqs]
        mask = [[1] * len(s) + [0] * (max_len - len(s)) for s in seqs]

        trailing_dims = seqs[0].size()[1:]
        if batch_first:
            out_dims = (len(seqs), max_len) + trailing_dims
        else:
            out_dims = (max_len, len(seqs)) + trailing_dims

        padded = seqs[0].new(*out_dims).fill_(0)
        for i, tensor in enumerate(seqs):
            length = tensor.size(0)
            # use index notation to prevent duplicate references to the tensor
            if batch_first:
                padded[i, :length, ...] = tensor
            else:
                padded[:length, i, ...] = tensor
        return padded, torch.tensor(mask).byte().to(self.device)

    def index(self, item):
        return self._index[item[0], self.inv[item[1]]]

    def invert(self, batch, dim=0):
        return batch.index_select(dim, torch.tensor(self.inv, device=self.device))
--------------------------------------------------------------------------------
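A small usage sketch of `SeqBatch` (shapes illustrative): it pads in the original order but packs in length-sorted order, and `invert` restores the original order.

```python
import torch
from EduNLP.ModelZoo.quesnet.util import SeqBatch

batch = SeqBatch([[1, 2, 3], [4, 5]], dtype=torch.long)
padded, mask = batch.padded(batch_first=True)  # padded and mask both have shape (2, 3)
packed = batch.packed()                        # PackedSequence, longest sequence first
```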
/EduNLP/ModelZoo/rnn/__init__.py:
--------------------------------------------------------------------------------
# coding: utf-8
# 2021/7/12 @ tongshiwei

from .rnn import *
from .harnn import *
--------------------------------------------------------------------------------
/EduNLP/ModelZoo/utils/__init__.py:
--------------------------------------------------------------------------------
# coding: utf-8
# 2021/7/12 @ tongshiwei

from .padder import PadSequence, pad_sequence
from .device import set_device
from .masker import Masker
from .data import load_items
from .modules import MLP, TextCNN
from .torch_utils import *
from .downstream_output import *
--------------------------------------------------------------------------------
/EduNLP/ModelZoo/utils/data.py:
--------------------------------------------------------------------------------
import json


def load_items(data_path):
    _data = []
    with open(data_path, encoding="utf-8") as f:
        for line in f.readlines():
            _data.append(json.loads(line))
    return _data
--------------------------------------------------------------------------------
/EduNLP/ModelZoo/utils/device.py:
--------------------------------------------------------------------------------
# coding: utf-8
# 2021/8/2 @ tongshiwei
import logging
import torch
from torch.nn import DataParallel


def set_device(_net, ctx, *args, **kwargs):  # pragma: no cover
    """code from longling v1.3.26"""
    if ctx == "cpu":
        return _net.cpu()
    elif any(map(lambda x: x in ctx, ["cuda", "gpu"])):
        if not torch.cuda.is_available():
            try:
                torch.ones((1,), device=torch.device("cuda:0"))
            except AssertionError as e:
                raise TypeError("no cuda detected, only cpu is supported, the detailed error msg: %s" % str(e))
        if torch.cuda.device_count() >= 1:
            if ":" in ctx:
                ctx_name, device_ids = ctx.split(":")
                assert ctx_name in ["cuda", "gpu"], "the device should be 'cpu', 'cuda' or 'gpu', now is %s" % ctx
                device_ids = [int(i) for i in device_ids.strip().split(",")]
                try:
                    if not isinstance(_net, DataParallel):
                        return DataParallel(_net, device_ids).cuda()
                    return _net.cuda(device_ids)
                except AssertionError as e:
                    logging.error(device_ids)
                    raise e
            elif ctx in ["cuda", "gpu"]:
                if not isinstance(_net, DataParallel):
                    _net = DataParallel(_net)
                return _net.cuda()
            else:
                raise TypeError("the device should be 'cpu', 'cuda' or 'gpu', now is %s" % ctx)
        else:
            logging.error(torch.cuda.device_count())
            raise TypeError("no gpu can be used, use cpu")
    else:
        if not isinstance(_net, DataParallel):
            return DataParallel(_net, device_ids=ctx).cuda()
        return _net.cuda(ctx)
--------------------------------------------------------------------------------
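A hedged usage sketch for `set_device`, using the device strings the function parses above:

```python
import torch.nn as nn
from EduNLP.ModelZoo.utils import set_device

net = nn.Linear(4, 2)
net = set_device(net, "cpu")         # returns the module on CPU
# net = set_device(net, "cuda:0")    # single GPU: wrapped in DataParallel, moved to cuda
# net = set_device(net, "cuda:0,1")  # multiple GPUs
```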
/EduNLP/ModelZoo/utils/downstream_output.py:
--------------------------------------------------------------------------------
import torch
from transformers.modeling_outputs import ModelOutput


class PropertyPredictionOutput(ModelOutput):
    loss: torch.FloatTensor = None
    logits: torch.FloatTensor = None


class KnowledgePredictionOutput(ModelOutput):
    loss: torch.FloatTensor = None
    logits: torch.FloatTensor = None
--------------------------------------------------------------------------------
/EduNLP/ModelZoo/utils/masker.py:
--------------------------------------------------------------------------------
# coding: utf-8
# 2021/8/3 @ tongshiwei

from copy import deepcopy
import numpy as np


class Masker(object):
    """

    Parameters
    ----------
    mask: int or str
        the token used to replace a masked position, e.g., 0 or "[MASK]"
    per: float
        the proportion of each sequence to mask
    seed:
        the random seed for the underlying numpy generator

    Examples
    ---------
    >>> masker = Masker(per=0.5, seed=10)
    >>> items = [[1, 1, 3, 4, 6], [2], [5, 9, 1, 4]]
    >>> masked_seq, mask_label = masker(items)
    >>> masked_seq
    [[1, 1, 0, 0, 6], [2], [0, 9, 0, 4]]
    >>> mask_label
    [[0, 0, 1, 1, 0], [0], [1, 0, 1, 0]]
    >>> items = [[1, 2, 3], [1, 1, 0], [2, 0, 0]]
    >>> masked_seq, mask_label = masker(items, [3, 2, 1])
    >>> masked_seq
    [[1, 0, 3], [0, 1, 0], [2, 0, 0]]
    >>> mask_label
    [[0, 1, 0], [1, 0, 0], [0, 0, 0]]
    >>> masker = Masker(mask="[MASK]", per=0.5, seed=10)
    >>> items = [["a", "b", "c"], ["d", "[PAD]", "[PAD]"], ["hello", "world", "[PAD]"]]
    >>> masked_seq, mask_label = masker(items, length=[3, 1, 2])
    >>> masked_seq
    [['a', '[MASK]', 'c'], ['d', '[PAD]', '[PAD]'], ['hello', '[MASK]', '[PAD]']]
    >>> mask_label
    [[0, 1, 0], [0, 0, 0], [0, 1, 0]]

    Returns
    ----------
    tuple
        the masked sequences and the corresponding 0/1 mask labels
    """
    def __init__(self, mask: (int, str, ...) = 0, per=0.2, seed=None):
        self.seed = np.random.default_rng(seed)
        self.per = per
        self.mask = mask

    def __call__(self, seqs, length=None, *args, **kwargs) -> tuple:
        seqs = deepcopy(seqs)
        masked_list = []
        if length is None:
            length = [len(seq) for seq in seqs]
        for seq, _length in zip(seqs, length):
            masked = self.seed.choice(len(seq) - 1, size=int(_length * self.per), replace=False)
            _masked_list = [0] * len(seq)
            for _masked in masked:
                seq[_masked] = self.mask
                _masked_list[_masked] = 1
            masked_list.append(_masked_list)
        return seqs, masked_list
--------------------------------------------------------------------------------
/EduNLP/ModelZoo/utils/modules.py:
--------------------------------------------------------------------------------
import torch
from torch import nn
from torch.nn import functional as F


class MLP(nn.Module):
    def __init__(self, in_dim, n_classes, hidden_dim, dropout, n_layers=2, act=F.leaky_relu):
        super(MLP, self).__init__()
        self.l_in = nn.Linear(in_dim, hidden_dim)
        self.l_hs = nn.ModuleList(nn.Linear(hidden_dim, hidden_dim) for _ in range(n_layers - 2))
        self.l_out = nn.Linear(hidden_dim, n_classes)
        self.dropout = nn.Dropout(p=dropout)
        self.act = act

    def forward(self, input):
        hidden = self.act(self.l_in(self.dropout(input)))
        for l_h in self.l_hs:
            hidden = self.act(l_h(self.dropout(hidden)))
        output = self.l_out(self.dropout(hidden))
        return output


class TextCNN(nn.Module):
    def __init__(self, embed_dim, hidden_dim):
        super(TextCNN, self).__init__()
        kernel_sizes = [2, 3, 4, 5]
        channel_dim = hidden_dim // len(kernel_sizes)
        self.min_seq_len = max(kernel_sizes)
        self.convs = nn.ModuleList([nn.Conv1d(embed_dim, channel_dim, k_size) for k_size in kernel_sizes])

    def forward(self, embed):
        if embed.size(1) < self.min_seq_len:
            device = embed.device
            pad = torch.zeros(embed.size(0), self.min_seq_len - embed.size(1), embed.size(-1)).to(device)
            embed = torch.cat((embed, pad), dim=1)
        # (b, s, d) => (b, d, s) => (b, d', s') => (b, d', 1) => (b, d')
        # batch_size * dim * seq_len
        hidden = [F.leaky_relu(conv(embed.transpose(1, 2))) for conv in self.convs]
        # batch_size * dim
        hidden = [F.max_pool1d(h, kernel_size=h.size(2)).squeeze(-1) for h in hidden]
        hidden = torch.cat(hidden, dim=-1)
        return hidden
--------------------------------------------------------------------------------
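Neither module above ships with a usage example; a minimal sketch (dimensions illustrative):

```python
import torch
from EduNLP.ModelZoo.utils import MLP, TextCNN

mlp = MLP(in_dim=16, n_classes=3, hidden_dim=32, dropout=0.1)
logits = mlp(torch.randn(8, 16))     # (batch, n_classes) -> (8, 3)

cnn = TextCNN(embed_dim=16, hidden_dim=32)
hidden = cnn(torch.randn(8, 2, 16))  # sequences shorter than the widest kernel are zero-padded; (8, 32)
```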
/EduNLP/ModelZoo/utils/padder.py:
--------------------------------------------------------------------------------
# coding: utf-8
# 2021/7/12 @ tongshiwei

__all__ = ["PadSequence", "pad_sequence"]


class PadSequence(object):
    """
    Pad the sequence.

    Pad the sequence to the given `length` by inserting `pad_val`. If `clip` is set,
    a sequence that is longer than `length` will be clipped.

    Parameters
    ----------
    length : int
        The maximum length to pad/clip the sequence
    pad_val : number
        The pad value. Default 0
    clip : bool
        Whether to clip a sequence that is longer than `length`

    Returns
    -------
    ret : list of number
        the padded (and possibly clipped) sequence
    """
    def __init__(self, length, pad_val=0, clip=True):
        self._length = length
        self._pad_val = pad_val
        self._clip = clip

    def __call__(self, sample: list):
        sample_length = len(sample)
        if sample_length >= self._length:
            if self._clip and sample_length > self._length:
                return sample[:self._length]
            else:
                return sample
        else:
            return sample + [
                self._pad_val for _ in range(self._length - sample_length)
            ]


def pad_sequence(sequence: list, max_length=None, pad_val=0, clip=True):
    """

    Parameters
    ----------
    sequence
    max_length
    pad_val
    clip

    Returns
    -------
    Modified list: list
        padding the sequences to the same size.

    Examples
    --------
    >>> seq = [[4, 3, 3], [2], [3, 3, 2]]
    >>> pad_sequence(seq)
    [[4, 3, 3], [2, 0, 0], [3, 3, 2]]
    >>> pad_sequence(seq, pad_val=1)
    [[4, 3, 3], [2, 1, 1], [3, 3, 2]]
    >>> pad_sequence(seq, max_length=2)
    [[4, 3], [2, 0], [3, 3]]
    >>> pad_sequence(seq, max_length=2, clip=False)
    [[4, 3, 3], [2, 0], [3, 3, 2]]
    """
    padder = PadSequence(max([len(seq) for seq in sequence]) if max_length is None else max_length, pad_val, clip)
    return [padder(seq) for seq in sequence]
--------------------------------------------------------------------------------
/EduNLP/ModelZoo/utils/torch_utils.py:
--------------------------------------------------------------------------------
import torch


def sequence_mask(lengths, max_len=None):
    """Same as tf.sequence_mask: returns a mask tensor representing the first N positions of each cell.

    Parameters
    ----------
    lengths : torch.Tensor
        integer tensor, all its values <= max_len.
    max_len : int, optional
        size of the last dimension of the returned tensor. Default is the maximum value in lengths.

    Returns
    -------
    torch.Tensor
        A boolean mask tensor of shape lengths.shape + (max_len,)

    Examples:
    ---------
    >>> sequence_mask(torch.tensor([1, 3, 2]), 5)
    tensor([[ True, False, False, False, False],
            [ True,  True,  True, False, False],
            [ True,  True, False, False, False]])
    >>> sequence_mask(torch.tensor([[1, 3],[2,0]]))
    tensor([[[ True, False, False],
             [ True,  True,  True]],
    <BLANKLINE>
            [[ True,  True, False],
             [False, False, False]]])
    """

    lengths_shape = lengths.shape  # torch.size() is a tuple
    lengths = lengths.reshape(-1)

    batch_size = lengths.numel()
    max_len = max_len or int(lengths.max())
    lengths_shape += (max_len,)

    return (torch.arange(0, max_len, device=lengths.device)
            .type_as(lengths)
            .unsqueeze(0).expand(batch_size, max_len)
            .lt(lengths.unsqueeze(1))).reshape(lengths_shape)


def gather_nd(params, indices):
    """Gather slices from params according to indices, like tf.gather_nd.

    Parameters
    ----------
    params : torch.Tensor
        the source tensor to gather from
    indices : torch.Tensor
        an integer tensor whose last dimension indexes into params

    Returns
    -------
    torch.Tensor
        the gathered tensor of shape indices.shape[:-1] + params.shape[indices.shape[-1]:]

    Examples:
    ---------
    >>> gather_nd(
    ...     params=torch.tensor([[1, 2, 3],
    ...                          [4, 5, 6]]),
    ...     indices=torch.tensor([[1],
    ...                           [0]]))
    tensor([[4, 5, 6],
            [1, 2, 3]])
    """
    newshape = indices.shape[:-1] + params.shape[indices.shape[-1]:]
    indices = indices.view(-1, indices.shape[-1]).tolist()
    out = torch.cat([params.__getitem__(tuple(i)) for i in indices])
    return out.reshape(newshape)
--------------------------------------------------------------------------------
/EduNLP/Pipeline/__init__.py:
--------------------------------------------------------------------------------
from .base import Pipeline, PreProcessingPipeline
from .mappings import TASK_MAPPING, TOKENIZER_MAPPING_NAMES
from .property_prediction import PropertyPredictionPipeline
from .knowledge_prediction import KnowledgePredictionPipeline
from ..Pretrain import PretrainedEduTokenizer
from ..ModelZoo.base_model import BaseModel
from ..Vector.t2v import get_pretrained_model_info
from ..constant import MODEL_DIR
from EduData import get_data
from typing import Optional, Union, List

__all__ = ["pipeline"]

SUPPORTED_TASKS = {
    "pre-process": {
        "impl": Pipeline,
        "default": None
    },
    "property-prediction": {
        "impl": PropertyPredictionPipeline,
        "default": "elmo_for_property_prediction_test_256"
    },
    "knowledge-prediction": {
        "impl": KnowledgePredictionPipeline,
        "default": "elmo_for_knowledge_prediction_test_256"
    }
}


def pipeline(
    task: str = None,
    model: Optional[Union[BaseModel, str]] = None,
    tokenizer: Optional[PretrainedEduTokenizer] = None,
    pipeline_class: Optional[Pipeline] = None,
    preprocess: Optional[List] = None,
    **kwargs
):
    """
    Parameters
    ----------
    task: str, required
        the task name, e.g., "pre-process", "property-prediction" or "knowledge-prediction"
    model: BaseModel or str, optional
        a model instance, or the name of a pretrained model
    tokenizer: PretrainedEduTokenizer, optional
        the tokenizer matching the model
    pipeline_class: Pipeline, optional
        to specify the Pipeline class
    preprocess: list, optional
        a list of names of pre-process pipes

    Examples
    ----------
    >>> processor = pipeline(task="property-prediction")  # doctest: +SKIP
    >>> item = "如图所示,则三角形ABC的面积是_。"
    >>> processor(item)  # doctest: +SKIP
    """
    if preprocess is None and task is None and model is None:
        raise RuntimeError("Please specify at least the model to use or the task to do!")
    elif model is None and tokenizer is not None:
        raise RuntimeError("Specifying a tokenizer without a model is not allowed!")
    elif task is None and model is not None:
        raise RuntimeError("Please specify the task.")
    elif task is None:
        task = "pre-process"

    if task == "pre-process":
        return PreProcessingPipeline(pipe_names=preprocess)

    if task in SUPPORTED_TASKS:
        targeted_task = SUPPORTED_TASKS[task]
    else:
        raise KeyError(f"Unknown task {task}")
    if pipeline_class is None:
        pipeline_class = targeted_task["impl"]
    if model is None or isinstance(model, str):
        # TODO: 1. waiting for ModelHub and TEST
        #       2. Check if the specified model and task are matched
        # pretrained_name = targeted_task["default"] if model is None else model
        # model_url, model_name, *args = get_pretrained_model_info(pretrained_name)
        # model_path = get_data(model_url, MODEL_DIR)
        # model = TASK_MAPPING[task][model_name].from_pretrained(model_path)
        # tokenizer = TOKENIZER_MAPPING_NAMES[model_name].from_pretrained(model_path)
        pass
    elif isinstance(model, BaseModel) and isinstance(tokenizer, PretrainedEduTokenizer):
        model, tokenizer = model, tokenizer
    elif model is not None and tokenizer is not None:
        raise KeyError(f"Unknown model and tokenizer: {model} and {tokenizer}")

    return pipeline_class(model=model, task=task, tokenizer=tokenizer, preproc_pipe_names=preprocess, **kwargs)
--------------------------------------------------------------------------------
/EduNLP/Pipeline/components.py:
--------------------------------------------------------------------------------
from ..utils import dict2str4sif
from ..SIF import is_sif, to_sif, sif4sci
from ..SIF.segment import seg, SegmentList
from ..Tokenizer import PureTextTokenizer
from ..SIF.tokenization.text import tokenize


class BasePipe:
    def __init__(self, *args, **kwargs):
        self.args = args
        self.kwargs = kwargs

    def __call__(self, input_):
        raise NotImplementedError


class IsSifPipe(BasePipe):
    def __init__(self, *args, **kwargs):
        super(IsSifPipe, self).__init__(*args, **kwargs)

    def __call__(self, input_):
        print(is_sif(input_, *self.args, **self.kwargs))
        return input_


class ToSifPipe(BasePipe):
    def __init__(self, *args, **kwargs):
        super(ToSifPipe, self).__init__(*args, **kwargs)

    def __call__(self, input_):
        return to_sif(input_, *self.args, **self.kwargs)


class Dict2Str4SifPipe(BasePipe):
    def __init__(self, *args, **kwargs):
        super(Dict2Str4SifPipe, self).__init__(*args, **kwargs)

    def __call__(self, input_):
        return dict2str4sif(input_, *self.args, **self.kwargs)


class Sif4SciPipe(BasePipe):
    def __init__(self, *args, **kwargs):
        super(Sif4SciPipe, self).__init__(*args, **kwargs)

    def __call__(self, input_):
        return sif4sci(input_, *self.args, **self.kwargs)


class SegPipe(BasePipe):
    def __init__(self, *args, **kwargs):
        super(SegPipe, self).__init__(*args, **kwargs)

    def __call__(self, input_):
        return seg(input_, *self.args, **self.kwargs)


class SegDescribePipe(BasePipe):
    def __init__(self, *args, **kwargs):
        super(SegDescribePipe, self).__init__(*args, **kwargs)

    def __call__(self, input_: SegmentList):
        print(input_.describe())
        return input_


class SegFilterPipe(BasePipe):
    def __init__(self, *args, **kwargs):
        super(SegFilterPipe, self).__init__(*args, **kwargs)

    def __call__(self, input_: SegmentList):
        input_.filter(*self.args, **self.kwargs)
        return input_


class TokenizePipe(BasePipe):
    def __init__(self, *args, **kwargs):
        super(TokenizePipe, self).__init__(*args, **kwargs)

    def __call__(self, input_):
        return tokenize(input_, *self.args, **self.kwargs)


PREPROCESSING_PIPES = {
    'dict2str4sif': Dict2Str4SifPipe,
    'is_sif': IsSifPipe,
    'to_sif': ToSifPipe,
    'sif4sci': Sif4SciPipe,
    'seg': SegPipe,
    'seg_describe': SegDescribePipe,
    'seg_filter': SegFilterPipe,
    'tokenize': TokenizePipe,
}
--------------------------------------------------------------------------------
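A small illustrative sketch of the registry above — instantiating a pipe by name and applying it to a raw item (the item string is made up):

```python
from EduNLP.Pipeline.components import PREPROCESSING_PIPES

pipe = PREPROCESSING_PIPES["to_sif"]()  # equivalent to constructing ToSifPipe()
item = pipe("the raw item text")        # calls to_sif(...) under the hood
```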
'seg_describe': SegDescribePipe, 91 | 'seg_filter': SegFilterPipe, 92 | 'tokenize': TokenizePipe, 93 | } 94 | -------------------------------------------------------------------------------- /EduNLP/Pipeline/knowledge_prediction.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from .base import Pipeline, GenericTensor 4 | from typing import Dict, Optional, Union 5 | from torch import sigmoid 6 | 7 | 8 | class KnowledgePredictionPipeline(Pipeline): 9 | def __init__(self, **kwargs): 10 | super(KnowledgePredictionPipeline, self).__init__(**kwargs) 11 | 12 | def _sanitize_parameters(self, **pipeline_parameters): 13 | tokenize_params, forward_params, postprocess_params = pipeline_parameters, {}, {} 14 | return tokenize_params, forward_params, postprocess_params 15 | 16 | def _tokenize(self, input_, **tokenize_parameters) -> Dict[str, GenericTensor]: 17 | return self.tokenizer(input_, **tokenize_parameters) 18 | 19 | def _forward(self, model_inputs, **forward_params): 20 | return self.model(**model_inputs) 21 | 22 | def postprocess(self, model_outputs, **postprocess_params): 23 | if 'num_classes_list' not in dir(self.model) or 'num_total_classes' not in dir(self.model): 24 | raise ValueError('model is not for knowledge prediction: ', self.model) 25 | outputs = model_outputs["logits"][0] 26 | start_idx = 0 27 | knowledge_list = [] 28 | for num_classes in self.model.num_classes_list: 29 | level_prediction = torch.argmax(outputs[start_idx:start_idx + num_classes]) + start_idx 30 | knowledge_list.append(level_prediction) 31 | start_idx += num_classes 32 | outputs = outputs.detach().numpy() 33 | dict_knowledge = { 34 | "knowledge_list": knowledge_list, 35 | "knowledge_scores": outputs.tolist(), 36 | } 37 | return dict_knowledge 38 | -------------------------------------------------------------------------------- /EduNLP/Pipeline/mappings.py: -------------------------------------------------------------------------------- 1 | from collections import OrderedDict 2 | from ..Pretrain import ElmoTokenizer, BertTokenizer, QuesNetTokenizer, DisenQTokenizer 3 | from ..ModelZoo.rnn import ElmoLMForPropertyPrediction, ElmoLMForKnowledgePrediction 4 | from ..ModelZoo.bert import BertForPropertyPrediction, BertForKnowledgePrediction 5 | 6 | TOKENIZER_MAPPING_NAMES = OrderedDict( 7 | [ 8 | ("elmo", ElmoTokenizer), 9 | ("bert", BertTokenizer), 10 | ("quesnet", QuesNetTokenizer), 11 | ("disenq", DisenQTokenizer) 12 | ] 13 | ) 14 | 15 | MODEL_FOR_PROPERTY_PREDICTION_MAPPING_NAMES = OrderedDict( 16 | [ 17 | ("elmo", ElmoLMForPropertyPrediction), 18 | ("bert", BertForPropertyPrediction), 19 | ] 20 | ) 21 | 22 | MODEL_FOR_KNOWLEDGE_PREDICTION_MAPPING_NAMES = OrderedDict( 23 | [ 24 | ("elmo", ElmoLMForKnowledgePrediction), 25 | ("bert", BertForKnowledgePrediction) 26 | ] 27 | ) 28 | 29 | TASK_MAPPING = { 30 | "property-prediction": MODEL_FOR_PROPERTY_PREDICTION_MAPPING_NAMES, 31 | "knowledge-prediction": MODEL_FOR_KNOWLEDGE_PREDICTION_MAPPING_NAMES 32 | } 33 | -------------------------------------------------------------------------------- /EduNLP/Pipeline/property_prediction.py: -------------------------------------------------------------------------------- 1 | from .base import Pipeline, GenericTensor 2 | from typing import Dict, Optional, Union 3 | 4 | 5 | class PropertyPredictionPipeline(Pipeline): 6 | def __init__(self, **kwargs): 7 | super(PropertyPredictionPipeline, self).__init__(**kwargs) 8 | 9 | def _sanitize_parameters(self, **pipeline_parameters): 
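        # route all user-supplied kwargs to the tokenize step; _forward and postprocess take no extra parameters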
10 | tokenize_params, forward_params, postprocess_params = pipeline_parameters, {}, {} 11 | return tokenize_params, forward_params, postprocess_params 12 | 13 | def _tokenize(self, input_, **tokenize_parameters) -> Dict[str, GenericTensor]: 14 | return self.tokenizer(input_, **tokenize_parameters) 15 | 16 | def _forward(self, model_inputs, **forward_params): 17 | return self.model(**model_inputs) 18 | 19 | def postprocess(self, model_outputs, **postprocess_params): 20 | outputs = model_outputs["logits"] 21 | outputs = outputs.detach().numpy() 22 | dict_property = {"property": outputs.item()} 23 | return dict_property 24 | -------------------------------------------------------------------------------- /EduNLP/Pretrain/__init__.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # 2021/5/29 @ tongshiwei 3 | 4 | from .gensim_vec import train_vector, GensimWordTokenizer, GensimSegTokenizer 5 | from .elmo_vec import * 6 | from .bert_vec import * 7 | from .quesnet_vec import QuesNetTokenizer, pretrain_quesnet, Question 8 | from .disenqnet_vec import * 9 | from .pretrian_utils import * 10 | from .hugginface_utils import * 11 | -------------------------------------------------------------------------------- /EduNLP/SIF/__init__.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # 2021/5/16 @ tongshiwei 3 | 4 | from .sif import is_sif, to_sif, sif4sci 5 | from .tokenization import link_formulas 6 | from .constants import * 7 | -------------------------------------------------------------------------------- /EduNLP/SIF/constants.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # 2021/5/18 @ tongshiwei 3 | 4 | TEXT_SYMBOL = "[TEXT]" 5 | FORMULA_SYMBOL = "[FORMULA]" 6 | FIGURE_SYMBOL = "[FIGURE]" 7 | QUES_MARK_SYMBOL = "[MARK]" 8 | TAG_SYMBOL = "[TAG]" 9 | SEP_SYMBOL = "[SEP]" 10 | TEXT_BEGIN = r"[TEXT_BEGIN]" 11 | TEXT_END = r"[TEXT_END]" 12 | FORMULA_BEGIN = r"[FORMULA_BEGIN]" 13 | FORMULA_END = r"[FORMULA_END]" 14 | 15 | EDU_SPYMBOLS = [ 16 | TEXT_SYMBOL, FORMULA_SYMBOL, FIGURE_SYMBOL, 17 | QUES_MARK_SYMBOL, TAG_SYMBOL, SEP_SYMBOL, 18 | TEXT_BEGIN, TEXT_END, 19 | FORMULA_BEGIN, FORMULA_END 20 | ] 21 | 22 | 23 | class Symbol(str): 24 | pass 25 | -------------------------------------------------------------------------------- /EduNLP/SIF/parser/__init__.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # 2021/6/02 @ fannazya 3 | 4 | from .parser import (Parser) 5 | -------------------------------------------------------------------------------- /EduNLP/SIF/segment/__init__.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # 2021/5/18 @ tongshiwei 3 | 4 | from .segment import (SegmentList, TextSegment, FigureFormulaSegment, LatexFormulaSegment, FigureSegment, 5 | QuesMarkSegment, Figure, TagSegment, SepSegment, seg) 6 | -------------------------------------------------------------------------------- /EduNLP/SIF/tokenization/__init__.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # 2021/5/18 @ tongshiwei 3 | 4 | from .tokenization import tokenize, link_formulas 5 | -------------------------------------------------------------------------------- /EduNLP/SIF/tokenization/formula/__init__.py: 
-------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # 2021/5/18 @ tongshiwei 3 | 4 | from .formula import tokenize 5 | from .ast_token import traversal_formula 6 | -------------------------------------------------------------------------------- /EduNLP/SIF/tokenization/formula/ast_token.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # 2021/5/20 @ tongshiwei 3 | import networkx as nx 4 | from EduNLP.Formula import Formula 5 | 6 | 7 | # def inorder_traversal(ast: nx.DiGraph): 8 | # visit = set() 9 | # nodes = [] 10 | # 11 | # def _inorder_traversal(_node): 12 | # if _node in visit: 13 | # return 14 | # successors = list(ast.successors(_node)) 15 | # if successors: 16 | # if len(successors) <= 2: 17 | # _inorder_traversal(successors[0]) 18 | # nodes.append(_node) 19 | # visit.add(_node) 20 | # if len(successors) == 2: 21 | # _inorder_traversal(successors[1]) 22 | # else: 23 | # nodes.append(_node) 24 | # for successor in successors: 25 | # if successor in visit: 26 | # continue 27 | # _inorder_traversal(successor) 28 | # else: 29 | # nodes.append(_node) 30 | # 31 | # for node in ast.nodes: 32 | # if node in visit or list(ast.predecessors(node)): 33 | # continue 34 | # _inorder_traversal(node) 35 | # return nodes 36 | 37 | def traversal_formula(ast, ord2token=False, var_numbering=False, strategy="post", *args, **kwargs): 38 | """ 39 | The part will run only when the return type is list. And it provides two strategy: post and linear. 40 | Besides, tokens list will append node follow its type. 41 | """ 42 | tokens = [] 43 | if strategy == "post": 44 | order = nx.dfs_postorder_nodes(ast) 45 | elif strategy == "linear": # pragma: no cover 46 | order = ast.nodes 47 | else: # pragma: no cover 48 | raise ValueError("Unknown traversal strategy: %s" % strategy) 49 | for i in order: 50 | node = ast.nodes[i] 51 | if node.get("type", "ignore") == "ignore": 52 | continue 53 | if ord2token is True and node["type"] in ["mathord", "textord", "text"]: 54 | if var_numbering is True and node["type"] == "mathord": 55 | tokens.append("%s_%s" % (node["type"], node.get("var", "con"))) 56 | else: 57 | tokens.append(node["type"]) 58 | else: 59 | tokens.append(node["text"]) 60 | return tokens 61 | 62 | 63 | def ast_tokenize(formula, ord2token=False, var_numbering=False, return_type="formula", *args, **kwargs): 64 | """ 65 | According to return type, tokenizing formula by different methods. 
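    Supported return_type values are "formula" (the default, a Formula object), "ast" (the AST graph)
    and "list" (a token sequence); ord2token and var_numbering only take effect when return_type is "list".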
66 | 67 | Parameters 68 | ---------- 69 | formula 70 | ord2token 71 | var_numbering 72 | return_type 73 | args 74 | kwargs 75 | 76 | Returns 77 | ------- 78 | 79 | Examples 80 | -------- 81 | >>> ast_tokenize(r"{x + y}^\\frac{\\pi}{2} + 1 = x", return_type="list") 82 | ['x', '+', 'y', '{ }', '\\\\pi', '{ }', '2', '{ }', '\\\\frac', '\\\\supsub', '+', '1', '=', 'x'] 83 | >>> ast_tokenize(r"{x + y}^\\frac{\\pi}{2} + 1 = x", return_type="list", ord2token=True) 84 | ['mathord', '+', 'mathord', '{ }', 'mathord', '{ }', 'textord', '{ }', '\\\\frac', '\\\\supsub', '+', 'textord', \ 85 | '=', 'mathord'] 86 | >>> ast_tokenize(r"{x + y}^\\frac{\\pi}{2} + 1 = x", return_type="list", ord2token=True, var_numbering=True) 87 | ['mathord_0', '+', 'mathord_1', '{ }', 'mathord_con', '{ }', 'textord', '{ }', '\\\\frac', '\\\\supsub', \ 88 | '+', 'textord', '=', 'mathord_0'] 89 | >>> len(ast_tokenize(r"{x + y}^\\frac{\\pi}{2} + 1 = x", return_type="ast").nodes) 90 | 14 91 | >>> ast_tokenize(r"{x + y}^\\frac{\\pi}{2} + 1 = x") 92 | 93 | """ 94 | if return_type == "list": 95 | ast = Formula(formula, variable_standardization=True).ast_graph 96 | return traversal_formula(ast, ord2token=ord2token, var_numbering=var_numbering) 97 | elif return_type == "formula": 98 | return Formula(formula) 99 | elif return_type == "ast": 100 | return Formula(formula).ast_graph 101 | else: 102 | raise ValueError() 103 | 104 | 105 | if __name__ == '__main__': 106 | print(ast_tokenize(r"{x + y}^\frac{\pi}{2} + 1 = x", return_type="list", ord2token=True, var_numbering=True)) 107 | -------------------------------------------------------------------------------- /EduNLP/SIF/tokenization/formula/formula.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # 2021/5/18 @ tongshiwei 3 | 4 | import warnings 5 | 6 | from .linear_token import linear_tokenize 7 | from .ast_token import ast_tokenize 8 | 9 | 10 | def tokenize(formula, method="linear", errors="raise", **kwargs): 11 | """ 12 | The total function to tokenize formula by linear or ast. 
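    Any extra keyword arguments are forwarded unchanged to linear_tokenize or ast_tokenize.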
13 | 
14 |     Parameters
15 |     ----------
16 |     formula: str, the latex formula to tokenize
17 |     method: str, "linear" (default) or "ast"
18 |     errors: how to handle an exception raised during ast tokenization
19 |         "coerce": fall back to linear_tokenize
20 |         "raise": raise the exception
21 |     kwargs
22 | 
23 |     Returns
24 |     -------
25 | 
26 |     Examples
27 |     --------
28 |     >>> tokenize(r"\\frac{\\pi}{x + y} + 1 = x")
29 |     ['\\\\frac', '{', '\\\\pi', '}', '{', 'x', '+', 'y', '}', '+', '1', '=', 'x']
30 |     >>> tokenize(r"\\frac{\\pi}{x + y} + 1 = x", method="ast", ord2token=True)
31 | 
32 |     >>> tokenize(r"\\frac{\\pi}{x + y} + 1 = x", method="ast", ord2token=True, return_type="list")
33 |     ['mathord', '{ }', 'mathord', '+', 'mathord', '{ }', '\\\\frac', '+', 'textord', '=', 'mathord']
34 |     """
35 |     if method == "linear":
36 |         return linear_tokenize(formula, **kwargs)
37 |     elif method == "ast":
38 |         try:
39 |             return ast_tokenize(formula, **kwargs)
40 |         except TypeError as e:  # pragma: no cover
41 |             if errors == "coerce":
42 |                 warnings.warn("A type error is detected, linear tokenize is applied")
43 |                 return linear_tokenize(formula)
44 |             else:
45 |                 raise e
46 |     else:
47 |         raise TypeError("Unknown method type: %s" % method)
48 | 
--------------------------------------------------------------------------------
/EduNLP/SIF/tokenization/text/__init__.py:
--------------------------------------------------------------------------------
 1 | # coding: utf-8
 2 | # 2021/5/18 @ tongshiwei
 3 | from .tokenization import tokenize
 4 | 
--------------------------------------------------------------------------------
/EduNLP/SIF/tokenization/text/stopwords.py:
--------------------------------------------------------------------------------
 1 | # coding: utf-8
 2 | # 2021/5/18 @ tongshiwei
 3 | 
 4 | import os
 5 | from EduNLP.utils import abs_current_dir, path_append
 6 | 
 7 | DEFAULT_FILEPATH = os.path.abspath(
 8 |     path_append(abs_current_dir(__file__), "..", "..", "..", "meta_data", "sif_stopwords.txt")
 9 | )
10 | 
11 | 
12 | def get_stopwords(filepath=DEFAULT_FILEPATH):
13 |     _stopwords = set()
14 |     with open(filepath, encoding="utf-8") as f:
15 |         for line in f:
16 |             _stopwords.add(line.strip())
17 | 
18 |     return _stopwords
19 | 
20 | 
21 | DEFAULT_STOPWORDS = get_stopwords()
22 | 
--------------------------------------------------------------------------------
/EduNLP/SIF/tokenization/text/tokenization.py:
--------------------------------------------------------------------------------
 1 | # coding: utf-8
 2 | # 2021/5/18 @ tongshiwei
 3 | import logging
 4 | import jieba
 5 | from .stopwords import DEFAULT_STOPWORDS
 6 | 
 7 | jieba.setLogLevel(logging.INFO)
 8 | 
 9 | 
10 | def is_chinese(word):
11 |     """Check whether a char or string consists entirely of Chinese characters."""
12 |     for char in word:
13 |         if char < u'\u4e00' or char > u'\u9fa5':
14 |             return False
15 |     return True
16 | 
17 | 
18 | def tokenize(text, granularity="word", stopwords="default"):
19 |     """
20 |     Using the jieba library to tokenize an item by word or char.
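    Tokens appearing in the stopword set are removed in both modes; pass stopwords=None to keep every token.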
21 | 
22 |     Parameters
23 |     ----------
24 |     text: str
25 |     granularity: str, "word" or "char"
26 |     stopwords: str, None or set
27 | 
28 |     Returns
29 |     -------
30 | 
31 |     Examples
32 |     --------
33 |     >>> tokenize("三角函数是基本初等函数之一")
34 |     ['三角函数', '初等', '函数']
35 |     >>> tokenize("三角函数是基本初等函数之一", granularity="char")
36 |     ['三', '角', '函', '数', '初', '等', '函', '数']
37 |     """
38 |     stopwords = DEFAULT_STOPWORDS if stopwords == "default" else stopwords
39 |     stopwords = stopwords if stopwords is not None else {}
40 |     if granularity == "word":
41 |         return [token for token in jieba.cut(text) if token not in stopwords and token.strip()]
42 |     elif granularity == "char":
43 |         jieba_tokens = [token for token in jieba.cut(text) if token not in stopwords and token.strip()]
44 |         # Use jieba_tokens to handle sentences that mix Chinese and English.
45 |         split_tokens = []
46 |         for token in jieba_tokens:
47 |             if is_chinese(token):
48 |                 split_tokens.extend(list(token))
49 |             else:
50 |                 split_tokens.append(token)
51 |         return split_tokens
52 |     else:
53 |         raise TypeError("Unknown granularity %s" % granularity)
54 | 
--------------------------------------------------------------------------------
/EduNLP/Tokenizer/__init__.py:
--------------------------------------------------------------------------------
 1 | # coding: utf-8
 2 | # 2021/8/1 @ tongshiwei
 3 | 
 4 | from .tokenizer import *
 5 | 
--------------------------------------------------------------------------------
/EduNLP/Vector/__init__.py:
--------------------------------------------------------------------------------
 1 | # coding: utf-8
 2 | # 2021/5/29 @ tongshiwei
 3 | 
 4 | from .gensim_vec import W2V, D2V, BowLoader, TfidfLoader
 5 | from .const import *
 6 | from .rnn import RNNModel
 7 | from .t2v import T2V, get_pretrained_t2v, get_pretrained_model_info, get_all_pretrained_models
 8 | from .embedding import Embedding
 9 | from .bert_vec import BertModel
10 | from .quesnet import QuesNetModel
11 | from .disenqnet import DisenQModel
12 | from .elmo_vec import ElmoModel
13 | 
--------------------------------------------------------------------------------
/EduNLP/Vector/bert_vec.py:
--------------------------------------------------------------------------------
 1 | # from transformers import BertModel as HFBertModel
 2 | from transformers import AutoModel
 3 | from .meta import Vector
 4 | import torch
 5 | 
 6 | 
 7 | class BertModel(Vector):
 8 |     """
 9 |     Examples
10 |     --------
11 |     >>> from EduNLP.Pretrain import BertTokenizer
12 |     >>> tokenizer = BertTokenizer("bert-base-chinese", add_special_tokens=False)
13 |     >>> model = BertModel("bert-base-chinese")
14 |     >>> item = ["有公式$\\FormFigureID{wrong1?}$,如图$\\FigureID{088f15ea-xxx}$,若$x,y$满足约束",
15 |     ... 
"有公式$\\FormFigureID{wrong1?}$,如图$\\FigureID{088f15ea-xxx}$,若$x,y$满足约束"] 16 | >>> inputs = tokenizer(item, return_tensors='pt') 17 | >>> output = model(inputs) 18 | >>> output.shape 19 | torch.Size([2, 14, 768]) 20 | >>> tokens = model.infer_tokens(inputs) 21 | >>> tokens.shape 22 | torch.Size([2, 12, 768]) 23 | >>> tokens = model.infer_tokens(inputs, return_special_tokens=True) 24 | >>> tokens.shape 25 | torch.Size([2, 14, 768]) 26 | >>> item = model.infer_vector(inputs) 27 | >>> item.shape 28 | torch.Size([2, 768]) 29 | """ 30 | 31 | def __init__(self, pretrained_dir, device="cpu"): 32 | self.device = device 33 | self.model = AutoModel.from_pretrained(pretrained_dir).to(self.device) 34 | self.model.eval() 35 | 36 | def __call__(self, items: dict): 37 | self.cuda_tensor(items) 38 | tokens = self.model(**items).last_hidden_state 39 | return tokens 40 | 41 | def infer_vector(self, items: dict, pooling_strategy='CLS', **kwargs) -> torch.Tensor: 42 | vector = self(items) 43 | if pooling_strategy == 'CLS': 44 | return vector[:, 0, :] 45 | elif pooling_strategy == 'average': 46 | # the average of word embedding of the last layer 47 | # batch_size, sent_len, embedding_dim 48 | mask = items['attention_mask'].unsqueeze(-1).expand(vector.size()) 49 | mul_mask = vector * mask 50 | # batch_size, embedding_dim 51 | return mul_mask.sum(1) / (mask.sum(1) + 1e-10) 52 | 53 | def infer_tokens(self, items: dict, return_special_tokens=False, **kwargs) -> torch.Tensor: 54 | tokens = self(items) 55 | if return_special_tokens: 56 | # include embedding of [CLS] and [SEP] 57 | return tokens 58 | else: 59 | # ignore embedding of [CLS] and [SEP] 60 | return tokens[:, 1:-1, :] 61 | 62 | @property 63 | def vector_size(self): 64 | return self.model.config.hidden_size 65 | -------------------------------------------------------------------------------- /EduNLP/Vector/const.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # 2021/7/12 @ tongshiwei 3 | 4 | UNK = "[UNK]" 5 | PAD = "[PAD]" 6 | -------------------------------------------------------------------------------- /EduNLP/Vector/disenqnet/__init__.py: -------------------------------------------------------------------------------- 1 | from .disenqnet import DisenQModel 2 | -------------------------------------------------------------------------------- /EduNLP/Vector/disenqnet/disenqnet.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from EduNLP.ModelZoo.disenqnet.disenqnet import DisenQNet 3 | from EduNLP.Vector.meta import Vector 4 | 5 | 6 | class DisenQModel(Vector): 7 | def __init__(self, pretrained_dir, device="cpu"): 8 | """ 9 | Parameters 10 | ---------- 11 | pretrained_dir: str 12 | the dirname to pretrained model 13 | device: str 14 | cpu or cuda, default is cpu 15 | """ 16 | self.device = device 17 | self.model = DisenQNet.from_pretrained(pretrained_dir).to(self.device) 18 | self.model.eval() 19 | 20 | def __call__(self, items: dict): 21 | self.cuda_tensor(items) 22 | outputs = self.model(**items) 23 | return outputs.embeded, outputs.k_hidden, outputs.i_hidden 24 | 25 | def infer_vector(self, items: dict, vector_type=None, **kwargs) -> torch.Tensor: 26 | """ 27 | Parameters 28 | ---------- 29 | vector_type: str 30 | choose the type of items tensor to return. 
31 |             Default is None, which means return both (k_hidden, i_hidden);
32 |             when vector_type="k", return k_hidden;
33 |             when vector_type="i", return i_hidden.
34 |         """
35 |         _, k_hidden, i_hidden = self(items)
36 |         if vector_type is None:
37 |             return k_hidden, i_hidden
38 |         elif vector_type == "k":
39 |             return k_hidden
40 |         elif vector_type == "i":
41 |             return i_hidden
42 |         else:
43 |             raise KeyError("vector_type must be one of (None, 'k', 'i')")
44 | 
45 |     def infer_tokens(self, items: dict, **kwargs) -> torch.Tensor:
46 |         """
47 |         get tokens embedding with DisenQModel
48 |         Parameters
49 |         ----------
50 |         items: dict
51 |             {'content_idx': tensor(),'content_len': tensor()}, the tokens about question after tokenizer processing
52 | 
53 |         Returns:
54 |             torch.Tensor: token embedding
55 |         """
56 |         embeded, _, _ = self(items)
57 |         return embeded
58 | 
59 |     @property
60 |     def vector_size(self):
61 |         return self.model.hidden_size
62 | 
--------------------------------------------------------------------------------
/EduNLP/Vector/elmo_vec.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | from EduNLP.ModelZoo.rnn import ElmoLM
 3 | from .meta import Vector
 4 | 
 5 | 
 6 | class ElmoModel(Vector):
 7 |     def __init__(self, pretrained_dir: str, device="cpu"):
 8 |         """
 9 |         Parameters
10 |         ----------
11 |         pretrained_dir: str
12 |         """
13 |         super(ElmoModel, self).__init__()
14 |         self.device = device
15 |         self.model = ElmoLM.from_pretrained(pretrained_dir).to(device)
16 |         self.model.eval()
17 | 
18 |     def __call__(self, items: dict):
19 |         self.cuda_tensor(items)
20 |         outputs = self.model(**items)
21 |         return outputs
22 | 
23 |     def infer_vector(self, items: dict, **kwargs) -> torch.Tensor:
24 |         """
25 |         get sentence vector embedding with ElmoModel
26 |         Parameters
27 |         ----------
28 |         items: dict, {'seq_idx': tensor(),'seq_len':tensor()}, the tokens about question after tokenizer processing
29 | 
30 |         Returns:
31 |             torch.Tensor: sentence embedding
32 |         """
33 |         outputs = self(items)
34 |         item_embeds = torch.cat(
35 |             (outputs.forward_output[torch.arange(len(items["seq_len"])), torch.tensor(items["seq_len"]) - 1],
36 |              outputs.backward_output[torch.arange(len(items["seq_len"])), 0]),
37 |             dim=-1)
38 |         return item_embeds
39 | 
40 |     def infer_tokens(self, items, **kwargs) -> torch.Tensor:
41 |         """
42 |         get tokens embedding with ElmoModel
43 |         Parameters
44 |         ----------
45 |         items: dict, {'seq_idx': tensor()}, the tokens about question after tokenizer processing
46 | 
47 |         Returns:
48 |             torch.Tensor: token embedding
49 |         """
50 |         outputs = self(items)
51 |         forward_hiddens = outputs.forward_output
52 |         backward_hiddens = outputs.backward_output
53 |         return torch.cat((forward_hiddens, backward_hiddens), dim=-1)
54 | 
55 |     @property
56 |     def vector_size(self):
57 |         return 2 * self.model.hidden_size
58 | 
--------------------------------------------------------------------------------
/EduNLP/Vector/embedding.py:
--------------------------------------------------------------------------------
 1 | # coding: utf-8
 2 | # 2021/7/12 @ tongshiwei
 3 | 
 4 | from typing import List
 5 | import torch
 6 | from .gensim_vec import W2V
 7 | from .const import PAD
 8 | from EduNLP.ModelZoo import pad_sequence, set_device
 9 | 
10 | 
11 | class Embedding(object):
12 |     def __init__(self, w2v: (W2V, tuple, list, dict, None), freeze=True, device=None, **kwargs):
13 |         if w2v is None:
14 |             self.w2v = None
15 |         elif isinstance(w2v, (tuple, list)):
16 |             self.w2v = W2V(*w2v)
17 |         elif isinstance(w2v, dict):
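            # unpack the dict entries as keyword arguments to W2V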
18 | self.w2v = W2V(**w2v) 19 | elif isinstance(w2v, W2V): 20 | self.w2v = w2v 21 | else: 22 | raise TypeError("w2v argument must be one of W2V, tuple, list, dict or None, now is %s" % type(w2v)) 23 | 24 | if self.w2v is not None: 25 | self.vocab_size = len(self.w2v) 26 | self.embedding_dim = self.w2v.vector_size 27 | else: 28 | self.vocab_size = kwargs["vocab_size"] 29 | self.embedding_dim = kwargs["embedding_dim"] 30 | 31 | self.embedding = torch.nn.Embedding(self.vocab_size, self.embedding_dim) 32 | 33 | self.pad_val = 0 34 | if self.w2v is not None: 35 | self.embedding.from_pretrained(torch.Tensor(self.w2v.vectors), freeze) 36 | self.pad_val = self.w2v.constants[PAD] 37 | self.key_to_index = self.w2v.key_to_index if w2v is not None else lambda x: x 38 | 39 | if device is not None: 40 | self.set_device(device) 41 | 42 | def __call__(self, items: List[List[str]], indexing=True, padding=True, vectorization=True, *args, 43 | **kwargs) -> tuple: 44 | 45 | items, item_len = self.indexing(items, padding=padding, indexing=indexing) 46 | items = self.infer_token_vector(items, indexing=False)[0] if vectorization else items 47 | return items, item_len 48 | 49 | def infer_token_vector(self, items: List[List[str]], indexing=True) -> tuple: 50 | items, item_len = self.indexing(items, padding=True, indexing=indexing) 51 | item_embedding = self.embedding(torch.LongTensor(items)) 52 | return item_embedding, item_len 53 | 54 | def indexing(self, items: List[List[str]], padding=False, indexing=True) -> tuple: 55 | """ 56 | 57 | Parameters 58 | ---------- 59 | items: list of list of str(word/token) 60 | padding: bool 61 | whether padding the returned list with default pad_val to make all item in items have the same length 62 | indexing: bool 63 | 64 | Returns 65 | ------- 66 | token_idx: list of list of int 67 | the list of the tokens of each item 68 | token_len: list of int 69 | the list of the length of tokens of each item 70 | """ 71 | items_idx = [[self.key_to_index(word) for word in item] for item in items] if indexing else items 72 | item_len = [len(_idx) for _idx in items_idx] 73 | padded_items_idx = pad_sequence(items_idx, pad_val=self.pad_val) if padding is True else items_idx 74 | return padded_items_idx, item_len 75 | 76 | def set_device(self, device): 77 | self.embedding = set_device(self.embedding, device) 78 | return self 79 | -------------------------------------------------------------------------------- /EduNLP/Vector/meta.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # 2021/7/13 @ tongshiwei 3 | import torch 4 | 5 | 6 | class Vector(object): 7 | def infer_vector(self, items, *args, **kwargs) -> ...: 8 | pass 9 | 10 | def infer_tokens(self, items, *args, **kwargs) -> ...: 11 | pass 12 | 13 | @property 14 | def vector_size(self): 15 | raise NotImplementedError 16 | 17 | @property 18 | def is_frozen(self): # pragma: no cover 19 | return True 20 | 21 | def freeze(self, *args, **kwargs): # pragma: no cover 22 | pass 23 | 24 | def cuda_tensor(self, items: dict): 25 | for k, v in items.items(): 26 | if isinstance(v, torch.Tensor): 27 | items[k] = v.to(self.device) 28 | -------------------------------------------------------------------------------- /EduNLP/Vector/quesnet/__init__.py: -------------------------------------------------------------------------------- 1 | from .quesnet import QuesNetModel 2 | -------------------------------------------------------------------------------- /EduNLP/Vector/quesnet/quesnet.py: 
-------------------------------------------------------------------------------- 1 | import torch 2 | from typing import Union 3 | from EduNLP.ModelZoo.quesnet import QuesNet 4 | from EduNLP.Pretrain import Question, QuesNetTokenizer 5 | from EduNLP.Vector.meta import Vector 6 | 7 | 8 | class QuesNetModel(Vector): 9 | def __init__(self, pretrained_dir, device="cpu", **kwargs): 10 | """ 11 | Parameters 12 | ---------- 13 | pretrained_dir: str 14 | the dirname to pretrained model 15 | device: str 16 | cpu or cuda, default is cpu 17 | img_dir: str 18 | image dir 19 | """ 20 | self.device = torch.device(device) 21 | self.model = QuesNet.from_pretrained(pretrained_dir).to(self.device) 22 | self.model.eval() 23 | 24 | def __call__(self, items: dict): 25 | """ get question embedding with quesnet 26 | 27 | Parameters 28 | ---------- 29 | items: 30 | encodes from tokenizer 31 | """ 32 | qs = [Question("", items['seq_idx'][i], 33 | [0], [[0], [0], [0]], items['meta_idx'][i]) for i in range(len(items['seq_idx']))] 34 | outputs = self.model(self.model.make_batch(qs, device=self.device)) 35 | return outputs.hidden, outputs.embeded 36 | 37 | def infer_vector(self, items: Union[dict, list], **kwargs) -> torch.Tensor: 38 | """ get question embedding with quesnet 39 | 40 | Parameters 41 | ---------- 42 | items: 43 | encodes from tokenizer 44 | """ 45 | return self(items)[0] 46 | 47 | def infer_tokens(self, items: Union[dict, list], **kwargs) -> torch.Tensor: 48 | """ get token embeddings with quesnet 49 | 50 | Parameters 51 | ---------- 52 | items: 53 | encodes from tokenizer 54 | Returns 55 | ------- 56 | torch.Tensor 57 | word_embs + meta_emb 58 | """ 59 | vector = self(items)[1] 60 | """ Please note that output vector is like 0 0 seq_idx(text with image) 0 meta_idx 0 0""" 61 | return vector[:, 2:-2, :] 62 | 63 | @property 64 | def vector_size(self): 65 | return self.model.feat_size 66 | -------------------------------------------------------------------------------- /EduNLP/Vector/rnn/__init__.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # 2021/7/12 @ tongshiwei 3 | 4 | from .rnn import RNNModel 5 | -------------------------------------------------------------------------------- /EduNLP/Vector/rnn/rnn.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # 2021/7/12 @ tongshiwei 3 | 4 | import torch 5 | from ..gensim_vec import W2V 6 | from ..embedding import Embedding 7 | from ..meta import Vector 8 | from EduNLP.ModelZoo import rnn, set_device 9 | from baize.torch import save_params 10 | 11 | 12 | class RNNModel(Vector): 13 | """ 14 | Examples 15 | -------- 16 | >>> model = RNNModel("BiLSTM", None, 2, vocab_size=4, embedding_dim=3) 17 | >>> seq_idx = [[1, 2, 3], [1, 2, 0], [3, 0, 0]] 18 | >>> output, hn = model(seq_idx, indexing=False, padding=False) 19 | >>> seq_idx = [[1, 2, 3], [1, 2], [3]] 20 | >>> output, hn = model(seq_idx, indexing=False, padding=True) 21 | >>> output.shape 22 | torch.Size([3, 3, 4]) 23 | >>> hn.shape 24 | torch.Size([2, 3, 2]) 25 | >>> tokens = model.infer_tokens(seq_idx, indexing=False) 26 | >>> tokens.shape 27 | torch.Size([3, 3, 4]) 28 | >>> tokens = model.infer_tokens(seq_idx, agg="mean", indexing=False) 29 | >>> tokens.shape 30 | torch.Size([3, 4]) 31 | >>> item = model.infer_vector(seq_idx, indexing=False) 32 | >>> item.shape 33 | torch.Size([3, 4]) 34 | >>> item = model.infer_vector(seq_idx, agg="mean", indexing=False) 35 | >>> item.shape 36 | 
torch.Size([3, 2]) 37 | >>> item = model.infer_vector(seq_idx, agg=None, indexing=False) 38 | >>> item.shape 39 | torch.Size([2, 3, 2]) 40 | """ 41 | 42 | def __init__(self, rnn_type, w2v: (W2V, tuple, list, dict, None), hidden_size, 43 | freeze_pretrained=True, model_params=None, device=None, 44 | **kwargs): 45 | self.embedding = Embedding(w2v, freeze_pretrained, **kwargs) 46 | for key in ["vocab_size", "embedding_dim"]: 47 | if key in kwargs: 48 | kwargs.pop(key) 49 | self.rnn = rnn.LM( 50 | rnn_type, 51 | self.embedding.vocab_size, 52 | self.embedding.embedding_dim, 53 | hidden_size=hidden_size, 54 | embedding=self.embedding.embedding, 55 | model_params=model_params, 56 | **kwargs 57 | ) 58 | self.bidirectional = self.rnn.rnn.bidirectional 59 | self.hidden_size = self.rnn.hidden_size 60 | self.freeze_pretrained = freeze_pretrained 61 | if device is not None: 62 | self.set_device(device) 63 | 64 | def __call__(self, items, indexing=True, padding=True, **kwargs): 65 | seq_idx, seq_len = self.embedding(items, indexing=indexing, padding=padding, vectorization=False) 66 | 67 | tokens, item = self.rnn(torch.LongTensor(seq_idx), torch.LongTensor(seq_len)) 68 | 69 | return tokens, item 70 | 71 | def infer_vector(self, items, agg: (int, str, None) = -1, indexing=True, padding=True, *args, 72 | **kwargs) -> torch.Tensor: 73 | vector = self(items, indexing=indexing, padding=padding, **kwargs)[1] 74 | if agg is not None: 75 | if agg == -1: 76 | return torch.reshape(vector, (vector.shape[1], -1)) 77 | return eval("torch.%s" % agg)(vector, dim=0) 78 | return vector 79 | 80 | def infer_tokens(self, items, agg=None, *args, **kwargs) -> torch.Tensor: 81 | tokens = self(items, **kwargs)[0] 82 | if agg is not None: 83 | return eval("torch.%s" % agg)(tokens, dim=1) 84 | return tokens 85 | 86 | @property 87 | def vector_size(self) -> int: 88 | return self.hidden_size * (1 if self.bidirectional is False else 2) 89 | 90 | def set_device(self, device): 91 | self.rnn = set_device(self.rnn, device) 92 | 93 | def save(self, filepath, save_embedding=False): 94 | save_params(filepath, self.rnn, select=None if save_embedding is True else '^(?!.*embedding)') 95 | return filepath 96 | 97 | def freeze(self, *args, **kwargs): 98 | return self.eval() 99 | 100 | @property 101 | def is_frozen(self): 102 | return not self.rnn.training 103 | 104 | def eval(self): 105 | self.rnn.eval() 106 | return self 107 | 108 | def train(self, mode=True): 109 | self.rnn.train(mode) 110 | return self 111 | -------------------------------------------------------------------------------- /EduNLP/__init__.py: -------------------------------------------------------------------------------- 1 | from .utils import logger 2 | from .I2V import get_pretrained_i2v 3 | -------------------------------------------------------------------------------- /EduNLP/constant.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # 2021/8/1 @ tongshiwei 3 | 4 | import os 5 | from os.path import expanduser, join 6 | 7 | ROOT = os.environ.get("EDUNLPPATH", join(expanduser("~"), ".EduNLP")) 8 | MODEL_DIR = os.environ.get("EDUNLPMODELPATH", join(ROOT, "model")) 9 | -------------------------------------------------------------------------------- /EduNLP/main.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # 2021/8/2 @ tongshiwei 3 | 4 | import fire 5 | 6 | 7 | from EduNLP.Vector.t2v import get_all_pretrained_models 8 | 9 | 10 | def list_i2v(): 11 | 
print("\n".join(get_all_pretrained_models())) 12 | 13 | 14 | def cli(): # pragma: no cover 15 | fire.Fire({"i2v": list_i2v}) 16 | -------------------------------------------------------------------------------- /EduNLP/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # 2021/5/20 @ tongshiwei 3 | 4 | from .path import abs_current_dir, path_append 5 | from .image import image2base64 6 | from .log import logger 7 | from .data import dict2str4sif 8 | -------------------------------------------------------------------------------- /EduNLP/utils/data.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # 2021/5/30 @ tongshiwei 3 | 4 | from contextlib import contextmanager 5 | 6 | ann_format = r"$\SIFTag{{{}}}$" 7 | ann_begin_format = r"$\SIFTag{{{}_begin}}$" 8 | ann_end_format = r"$\SIFTag{{{}_end}}$" 9 | ann_list_no_format = r"$\SIFTag{{list_{}}}$" 10 | 11 | 12 | @contextmanager 13 | def add_annotation(key, tag_mode, tar: list, key_as_tag=True): 14 | """add tag""" 15 | if key_as_tag is True: 16 | if tag_mode == "delimiter": 17 | tar.append(ann_begin_format.format(key)) 18 | elif tag_mode == "head": 19 | tar.append(ann_format.format(key)) 20 | yield 21 | if key_as_tag is True: 22 | if tag_mode == "delimiter": 23 | tar.append(ann_end_format.format(key)) 24 | elif tag_mode == "tail": 25 | tar.append(ann_format.format(key)) 26 | 27 | 28 | def dict2str4sif(obj: dict, key_as_tag=True, tag_mode="delimiter", add_list_no_tag=True, keys=None) -> str: 29 | r""" 30 | The function aims to transfer dictionary format item to string format item. 31 | 32 | Parameters 33 | ---------- 34 | obj 35 | key_as_tag 36 | tag_mode 37 | delimiter: add $\SIFTag{key_begin}$ in the head and add $\SIFTag{key_end}$ at the end 38 | head: add $\SIFTag{key}$ in the head 39 | tail: add $\SIFTag{key}$ at the end 40 | add_list_no_tag 41 | keys 42 | 43 | Examples 44 | ------- 45 | >>> item = { 46 | ... "stem": r"若复数$z=1+2 i+i^{3}$,则$|z|=$", 47 | ... "options": ['0', '1', r'$\sqrt{2}$', '2'], 48 | ... } 49 | >>> item 50 | {'stem': '若复数$z=1+2 i+i^{3}$,则$|z|=$', 'options': ['0', '1', '$\\sqrt{2}$', '2']} 51 | >>> dict2str4sif(item) # doctest: +ELLIPSIS 52 | '$\\SIFTag{stem_begin}$...$\\SIFTag{stem_end}$$\\SIFTag{options_begin}$...$\\SIFTag{options_end}$' 53 | >>> dict2str4sif(item, add_list_no_tag=True) # doctest: +ELLIPSIS 54 | '...$\\SIFTag{options_begin}$$\\SIFTag{list_0}$0$\\SIFTag{list_1}$1...$\\SIFTag{options_end}$' 55 | >>> dict2str4sif(item, tag_mode="head") # doctest: +ELLIPSIS 56 | '$\\SIFTag{stem}$...$\\SIFTag{options}$...' 
57 | >>> dict2str4sif(item, tag_mode="tail") # doctest: +ELLIPSIS 58 | '若复数$z=1+2 i+i^{3}$,则$|z|=$$\\SIFTag{stem}$...2$\\SIFTag{options}$' 59 | >>> dict2str4sif(item, add_list_no_tag=False) # doctest: +ELLIPSIS 60 | '...$\\SIFTag{options_begin}$0$\\SIFSep$1$\\SIFSep$...$\\SIFTag{options_end}$' 61 | >>> dict2str4sif(item, key_as_tag=False) 62 | '若复数$z=1+2 i+i^{3}$,则$|z|=$0$\\SIFSep$1$\\SIFSep$$\\sqrt{2}$$\\SIFSep$2' 63 | """ 64 | ret = [] 65 | keys = obj.keys() if keys is None else keys 66 | for key in keys: 67 | _obj = [] 68 | value = obj[key] 69 | with add_annotation(key, tag_mode, _obj, key_as_tag): 70 | if isinstance(value, str): 71 | _obj.append(value) 72 | elif isinstance(value, (list, dict)): 73 | value = value.values() if isinstance(value, dict) else value 74 | for i, v in enumerate(value): 75 | v = str(v) 76 | if key_as_tag is True and add_list_no_tag is True: 77 | _obj.append(ann_list_no_format.format(i)) 78 | else: 79 | if i > 0: 80 | _obj.append(r"$\SIFSep$") 81 | _obj.append(v) 82 | else: # pragma: no cover 83 | raise TypeError("Cannot handle %s" % type(value)) 84 | ret.append("".join(_obj)) 85 | return str("".join(ret)) 86 | -------------------------------------------------------------------------------- /EduNLP/utils/image.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # 2021/5/20 @ tongshiwei 3 | 4 | import base64 5 | from io import BytesIO 6 | 7 | 8 | def image2base64(img): 9 | buffered = BytesIO() 10 | img.save(buffered, format="png") 11 | img_str = base64.b64encode(buffered.getvalue()) 12 | return img_str.decode("utf-8") 13 | -------------------------------------------------------------------------------- /EduNLP/utils/log.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # 2021/5/29 @ tongshiwei 3 | import logging 4 | 5 | 6 | def get_logger(): 7 | _logger = logging.getLogger("EduNLP") 8 | _logger.setLevel(logging.INFO) 9 | _logger.propagate = False 10 | ch = logging.StreamHandler() 11 | ch.setFormatter(logging.Formatter('[%(name)s, %(levelname)s] %(message)s')) 12 | ch.setLevel(logging.INFO) 13 | _logger.addHandler(ch) 14 | return _logger 15 | 16 | 17 | logger = get_logger() 18 | -------------------------------------------------------------------------------- /EduNLP/utils/path.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # 2021/5/20 @ tongshiwei 3 | 4 | import os 5 | from pathlib import PurePath 6 | 7 | 8 | def abs_current_dir(filepath): 9 | """ 10 | 获取文件所在目录的绝对路径 11 | 12 | Example 13 | ------- 14 | .. code :: 15 | 16 | abs_current_dir(__file__) 17 | 18 | """ 19 | return os.path.abspath(os.path.dirname(filepath)) 20 | 21 | 22 | def path_append(path, *addition, to_str=False): 23 | """ 24 | 路径合并函数 25 | 26 | Examples 27 | -------- 28 | .. 
code-block:: python 29 | 30 | path_append("../", "../data", "../dataset1/", "train", to_str=True) 31 | '../../data/../dataset1/train' 32 | 33 | Parameters 34 | ---------- 35 | path: str or PurePath 36 | addition: list(str or PurePath) 37 | to_str: bool 38 | Convert the new path to str 39 | Returns 40 | ------- 41 | 42 | """ 43 | path = PurePath(path) 44 | if addition: 45 | for a in addition: 46 | path = path / a 47 | if to_str: 48 | return str(path) 49 | return path 50 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | recursive-include EduNLP/meta_data * -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | VERSION=`ls dist/*.tar.gz | sed "s/dist\/EduNLP-\(.*\)\.tar\.gz/\1/g"` 2 | 3 | ifdef ENVPIP 4 | PIP = $(ENVPIP) 5 | else 6 | PIP = pip3 7 | endif 8 | 9 | ifdef ENVPYTHON 10 | PYTHON = $(ENVPYTHON) 11 | else 12 | PYTHON = python3 13 | endif 14 | 15 | ifdef ENVPYTEST 16 | PYTEST = $(ENVPYTEST) 17 | else 18 | PYTEST = pytest 19 | endif 20 | 21 | help: 22 | 23 | @echo "install install EduNLP" 24 | @echo "test run test" 25 | @echo "release publish to PyPI and release in github" 26 | @echo "release_test publish to TestPyPI" 27 | @echo "clean remove all build, test, coverage and Python artifacts" 28 | @echo "clean-build remove build artifacts" 29 | @echo "clean-pyc remove Python file artifacts" 30 | @echo "clean-test remove test and coverage artifacts" 31 | 32 | .PHONY: install, test, build, release, release_test, version, .test, .build, clean 33 | 34 | install: 35 | @echo "install EduNLP" 36 | $(PIP) install -e . --user 37 | 38 | test: 39 | @echo "run test" 40 | $(PYTEST) 41 | 42 | build: test, clean 43 | $(PYTHON) setup.py bdist_wheel sdist 44 | 45 | .test: 46 | $(PYTEST) > /dev/null 47 | 48 | .build: clean 49 | $(PYTHON) setup.py bdist_wheel sdist > /dev/null 50 | 51 | version: .build 52 | @echo $(VERSION) 53 | 54 | release: test, build 55 | @echo "publish to pypi and release in github" 56 | @echo "version $(VERSION)" 57 | 58 | -@twine upload dist/* && git tag "v$(VERSION)" 59 | git push && git push --tags 60 | 61 | release_test: test, build 62 | @echo "publish to test pypi" 63 | @echo "version $(VERSION)" 64 | 65 | -@twine upload --repository test dist/* 66 | 67 | clean: clean-build clean-pyc clean-test 68 | 69 | clean-build: 70 | rm -rf build/* 71 | rm -rf dist/* 72 | rm -rf .eggs/* 73 | find . -name '*.egg-info' -exec rm -fr {} + 74 | find . -name '*.egg' -exec rm -f {} + 75 | 76 | clean-pyc: 77 | find . -name '*.pyc' -exec rm -f {} + 78 | find . -name '*.pyo' -exec rm -f {} + 79 | find . -name '*~' -exec rm -f {} + 80 | find . -name '__pycache__' -exec rm -rf {} + 81 | 82 | clean-test: 83 | rm -f .coverage -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |

 2 | 
 3 | 
 4 | 
 5 | # EduNLP
 6 | 
 7 | [![VERSION](https://img.shields.io/pypi/pyversions/EduNLP)](https://pypi.python.org/pypi/EduNLP)
 8 | [![PyPI](https://img.shields.io/pypi/v/EduNLP.svg)](https://pypi.python.org/pypi/EduNLP)
 9 | [![test](https://github.com/bigdata-ustc/EduNLP/actions/workflows/python-test.yml/badge.svg?branch=master)](https://github.com/bigdata-ustc/EduNLP/actions/workflows/python-test.yml)
10 | [![codecov](https://codecov.io/gh/bigdata-ustc/EduNLP/branch/master/graph/badge.svg?token=B7gscOGQLD)](https://codecov.io/gh/bigdata-ustc/EduNLP)
11 | [![Documentation Status](https://readthedocs.org/projects/edunlp/badge/?version=latest)](https://edunlp.readthedocs.io/en/latest/?badge=latest)
12 | [![Download](https://img.shields.io/pypi/dm/EduNLP.svg?style=flat)](https://pypi.python.org/pypi/EduNLP)
13 | [![License](https://img.shields.io/github/license/bigdata-ustc/EduNLP)](LICENSE)
14 | [![DOI](https://zenodo.org/badge/332661206.svg)](https://zenodo.org/badge/latestdoi/332661206)
15 | 
16 | 
17 | EduNLP is a library for advanced Natural Language Processing in Python and is one of the projects in the [EduX](https://github.com/bigdata-ustc/EduX) plan of [BDAA](https://github.com/bigdata-ustc). It's built on the very latest research, and was designed from day one to be used in real educational products.
18 | 
19 | EduNLP now comes with pretrained pipelines and currently supports segmentation, tokenization and vectorization. It supports a variety of preprocessing methods for NLP in educational scenarios, such as formula parsing and multi-modal segmentation.
20 | 
21 | EduNLP is commercial open-source software, released under the [Apache-2.0 license](LICENSE).
22 | 
23 | ## Quickstart
24 | 
25 | ### Installation
26 | 
27 | Clone the repository and install with pip:
28 | ``` sh
29 | # basic installation
30 | pip install .
31 | 
32 | # full installation
33 | pip install .[full]
34 | ```
35 | or install from PyPI:
36 | ```
37 | # basic installation
38 | pip install EduNLP
39 | 
40 | # full installation
41 | pip install EduNLP[full]
42 | ```
43 | 
44 | ### Usage
45 | 
46 | ```python
47 | from EduNLP import get_pretrained_i2v
48 | i2v = get_pretrained_i2v("d2v_all_300", "./model")
49 | item_vector, token_vector = i2v(["the content of item 1", "the content of item 2"])
50 | ```
51 | 
52 | ### Tutorial
53 | 
54 | For more details, please refer to the full documentation ([latest](https://edunlp.readthedocs.io/en/latest) | [stable](https://edunlp.readthedocs.io/en/stable)).
55 | 
56 | ### Resource
57 | We will continuously publish new datasets in [Standard Item Format (SIF)](https://github.com/bigdata-ustc/EduNLP/blob/master/docs/SIF4TI_CH.md) to encourage relevant research. The data resources can be accessed via another EduX project, [EduData](https://github.com/bigdata-ustc/EduData).
58 | 
59 | ## Contribute
60 | 
61 | EduNLP is still under development. More algorithms and features are going to be added and we always welcome contributions to help make EduNLP better. If you would like to contribute, please follow this [guideline](CONTRIBUTE.md) ([Chinese version](CONTRIBUTE_CH.md)).
62 | 63 | ## Citation 64 | 65 | If this repository is helpful for you, please cite our work 66 | 67 | ``` 68 | @misc{bigdata2021edunlp, 69 | title={EduNLP}, 70 | author={bigdata-ustc}, 71 | publisher = {GitHub}, 72 | journal = {GitHub repository}, 73 | year = {2021}, 74 | howpublished = {\url{https://github.com/bigdata-ustc/EduNLP}}, 75 | } 76 | ``` 77 | -------------------------------------------------------------------------------- /asset/_static/d2v.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigdata-ustc/EduNLP/95cd9fc71a1bdd6156c42af7434aee0a8fc0a82e/asset/_static/d2v.png -------------------------------------------------------------------------------- /asset/_static/d2v_bow_tfidf.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigdata-ustc/EduNLP/95cd9fc71a1bdd6156c42af7434aee0a8fc0a82e/asset/_static/d2v_bow_tfidf.png -------------------------------------------------------------------------------- /asset/_static/d2v_general.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigdata-ustc/EduNLP/95cd9fc71a1bdd6156c42af7434aee0a8fc0a82e/asset/_static/d2v_general.png -------------------------------------------------------------------------------- /asset/_static/d2v_stem_tf.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigdata-ustc/EduNLP/95cd9fc71a1bdd6156c42af7434aee0a8fc0a82e/asset/_static/d2v_stem_tf.png -------------------------------------------------------------------------------- /asset/_static/data.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigdata-ustc/EduNLP/95cd9fc71a1bdd6156c42af7434aee0a8fc0a82e/asset/_static/data.png -------------------------------------------------------------------------------- /asset/_static/formula.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigdata-ustc/EduNLP/95cd9fc71a1bdd6156c42af7434aee0a8fc0a82e/asset/_static/formula.png -------------------------------------------------------------------------------- /asset/_static/i2v.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigdata-ustc/EduNLP/95cd9fc71a1bdd6156c42af7434aee0a8fc0a82e/asset/_static/i2v.png -------------------------------------------------------------------------------- /asset/_static/item.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigdata-ustc/EduNLP/95cd9fc71a1bdd6156c42af7434aee0a8fc0a82e/asset/_static/item.png -------------------------------------------------------------------------------- /asset/_static/item_figure.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigdata-ustc/EduNLP/95cd9fc71a1bdd6156c42af7434aee0a8fc0a82e/asset/_static/item_figure.png -------------------------------------------------------------------------------- /asset/_static/item_formula.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigdata-ustc/EduNLP/95cd9fc71a1bdd6156c42af7434aee0a8fc0a82e/asset/_static/item_formula.png 
-------------------------------------------------------------------------------- /asset/_static/parse.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigdata-ustc/EduNLP/95cd9fc71a1bdd6156c42af7434aee0a8fc0a82e/asset/_static/parse.png -------------------------------------------------------------------------------- /asset/_static/prepare_dataset.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigdata-ustc/EduNLP/95cd9fc71a1bdd6156c42af7434aee0a8fc0a82e/asset/_static/prepare_dataset.jpg -------------------------------------------------------------------------------- /asset/_static/seg.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigdata-ustc/EduNLP/95cd9fc71a1bdd6156c42af7434aee0a8fc0a82e/asset/_static/seg.png -------------------------------------------------------------------------------- /asset/_static/sif.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigdata-ustc/EduNLP/95cd9fc71a1bdd6156c42af7434aee0a8fc0a82e/asset/_static/sif.png -------------------------------------------------------------------------------- /asset/_static/sif_addition.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigdata-ustc/EduNLP/95cd9fc71a1bdd6156c42af7434aee0a8fc0a82e/asset/_static/sif_addition.png -------------------------------------------------------------------------------- /asset/_static/tokenizer.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigdata-ustc/EduNLP/95cd9fc71a1bdd6156c42af7434aee0a8fc0a82e/asset/_static/tokenizer.png -------------------------------------------------------------------------------- /asset/_static/w2v_stem_text.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigdata-ustc/EduNLP/95cd9fc71a1bdd6156c42af7434aee0a8fc0a82e/asset/_static/w2v_stem_text.png -------------------------------------------------------------------------------- /asset/_static/w2v_stem_tf.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigdata-ustc/EduNLP/95cd9fc71a1bdd6156c42af7434aee0a8fc0a82e/asset/_static/w2v_stem_tf.png -------------------------------------------------------------------------------- /docs/EduNLP.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigdata-ustc/EduNLP/95cd9fc71a1bdd6156c42af7434aee0a8fc0a82e/docs/EduNLP.png -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = source 9 | BUILDDIR = build 10 | 11 | # Put it first so that "make" without argument is like "make help". 
12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/README.md: -------------------------------------------------------------------------------- 1 | EduNLP document and tutorial folder 2 | =================================== 3 | 4 | Requirements 5 | ------------ 6 | 7 | See the requirements `docs_deps` in `setup.py`: 8 | 9 | ```sh 10 | pip install -e .[doc] 11 | ``` 12 | 13 | Build documents 14 | --------------- 15 | 16 | First, clean up existing files: 17 | 18 | ``` 19 | make clean 20 | ``` 21 | 22 | Then build: 23 | 24 | ``` 25 | make html 26 | ``` 27 | 28 | Render locally 29 | -------------- 30 | 31 | ``` 32 | cd build/html 33 | python3 -m http.server 8000 34 | ``` 35 | -------------------------------------------------------------------------------- /docs/SIF4TI_CH.md: -------------------------------------------------------------------------------- 1 | # 标准项目格式 2 | 3 | version: 0.2 4 | 5 | 为了后续研究和使用的方便,我们需要一个统一的试题语法标准。 6 | 7 | ## 语法规则 8 | 1. 题目文本中只允许出现中文字符、中英文标点和换行符。 9 | 2. 使用 \$\SIFBlank\$ 替换横线,对于选择题中的括号使用 \$\SIFChoice\$ 替换。 10 | 3. 图片 ID 以公式的形式嵌入文本中:`$\FigureID{ uuid }$` 或用 base64 编码表示,特别的,内容为公式的图片用`$\FormFigureID{ uuid }$`表示。 11 | 4. 文本标注格式:统一用 `$\textf{item,CHAR_EN}$` 表示,目前定义的有:b-加粗,i-斜体,u-下划线,w-下划波浪线,d-加点,t-标题。标注可以混用,按字母顺序排序,例如:$\textf{EduNLP, biu}$ 表示 ***EduNLP*** 12 | 5. 其余诸如,英文字母、罗马字符、数字等数学符号一律需要使用 latex 格式表示,即嵌在 `$$` 之中。 13 | 6. 分子式的录入标准暂且参考 [INCHI](https://zh.wikipedia.org/wiki/%E5%9B%BD%E9%99%85%E5%8C%96%E5%90%88%E7%89%A9%E6%A0%87%E8%AF%86) 14 | 7. 目前对 latex 内部语法没有要求。 15 | 16 | ``` 17 | 1. Item -> CHARACTER|EN_PUN_LIST|CH_PUN_LIST|FORMULA|QUES_MARK 18 | 2. EN_PUN_LIST -> [',', '.', '?', '!', ':', ';', '\'', '\"', '(', ')', ' ','_','/','|','\\','<','>','[',']','-'] 19 | 3. CH_PUN_LIST -> [',', '。', '!', '?', ':',';', '‘', '’', '“', '”', '(', ')', ' ', '、','《','》','—','.'] 20 | 4. FORMULA -> $latex formula$ | $\FormFigureID{UUID}$ | $\FormFigureBase64{BASE64}$ 21 | 5. FIGURE -> $\FigureID{UUID}$ | $\FigureBase64{BASE64}$ 22 | 6. UUID -> [a-zA-Z\-0-9]+ 23 | 7. CHARACTER -> CHAR_EN | CHAR_CH 24 | 8. CHAR_EN -> [a-zA-Z]+ 25 | 9. CHAR_CH -> [\u4e00-\u9fa5]+ 26 | 10. DIGITAL -> [0-9]+ 27 | 11. QUES_MARK -> $\SIFBlank$ | $\SIFChoice$ 28 | ``` 29 | 30 | ### 注意事项 31 | 1. 保留字符与转义 32 | 2. 数字 33 | 3. 选空与填空 34 | 4. 对于单个的数字或字符也需要添加 `$$`(目前能实现自动校验) 35 | 5. latex 公式中尽量不出现中文:(`\text{这里出现中文}`) 36 | 6. MySql 数据库导入数据时会自动忽略一个 `\`,所以录入的公式需要进一步处理为 `\\` 37 | 38 | ## 示例 39 | 40 | 标准形式: 41 | 42 | 1. `若$x,y$满足约束条件$\\left\\{\\begin{array}{c}2 x+y-2 \\leq 0 \\\\ x-y-1 \\geq 0 \\\\ y+1 \\geq 0\\end{array}\\right.$,则$z=x+7 y$的最大值$\\SIFUnderline$'` 43 | 44 | 2. `已知函数$f(x)=|3 x+1|-2|x|$画出$y=f(x)$的图像求不等式$f(x)>f(x+1)$的解集$\\PictureID{3bf2ddf4-8af1-11eb-b750-b46bfc50aa29}$$\\PictureID{59b8bd14-8af1-11eb-93a5-b46bfc50aa29}$$\\PictureID{63118b3a-8b75-11eb-a5c0-b46bfc50aa29}$$\\PictureID{6a006179-8b76-11eb-b386-b46bfc50aa29}$$\\PictureID{088f15eb-8b7c-11eb-a86f-b46bfc50aa29}$` 45 | 46 | 非标准形式: 47 | 48 | 1. 字母、数字和数学符号连续混合出现: 49 | 例如: 50 | `完成下面的2x2列联表,` 51 | `(单位:m3)` 52 | `则输出的n=` 53 | 54 | 2. 特殊的数学符号没有用 latex 公式表示: 55 | 例如: 56 | `命题中真命题的序号是 ①` 57 | `AB是⊙O的直径,AC是⊙O的切线,BC交⊙O于点E.若D为AC的中点` 58 | 59 | 3. 
出现以 unicode 编码写成的字符 60 | 例如:`则$a$的取值范围是(\u3000\u3000)` 61 | 62 | 63 | ## Change Log 64 | 65 | 2021-05-18 66 | 67 | 修改: 68 | 1. 原用 \$\SIFUnderline\$ 和 \$\SIFBracket\$ 来替换填空题中的横线和选择题中的括号,现分别用 \$\SIFBlank\$ 和 \$\SIFChoice\$ 替换。 69 | 2. 原统一用`$\PictureID{ uuid }$`表示图片,现使用`$\FigureID{ uuid }$`,其中对于数据公式,用`$\FormFigureID{ uuid }$`来表示。 70 | 71 | 2021-06-28 72 | 73 | 添加: 74 | 1. 注明 `$$` 之中不能出现换行符。 75 | 2. 添加文本标注格式说明。 76 | 77 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=source 11 | set BUILDDIR=build 12 | 13 | if "%1" == "" goto help 14 | 15 | %SPHINXBUILD% >NUL 2>NUL 16 | if errorlevel 9009 ( 17 | echo. 18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 19 | echo.installed, then set the SPHINXBUILD environment variable to point 20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 21 | echo.may add the Sphinx directory to PATH. 22 | echo. 23 | echo.If you don't have Sphinx installed, grab it from 24 | echo.http://sphinx-doc.org/ 25 | exit /b 1 26 | ) 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | sphinx 2 | sphinx_rtd_theme 3 | sphinx_toggleprompt 4 | sphinx-gallery>=0.6 5 | nbsphinx 6 | m2r2 7 | -------------------------------------------------------------------------------- /docs/source/_static/EduNLP.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigdata-ustc/EduNLP/95cd9fc71a1bdd6156c42af7434aee0a8fc0a82e/docs/source/_static/EduNLP.png -------------------------------------------------------------------------------- /docs/source/_static/formula.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigdata-ustc/EduNLP/95cd9fc71a1bdd6156c42af7434aee0a8fc0a82e/docs/source/_static/formula.png -------------------------------------------------------------------------------- /docs/source/_static/formulagroup.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigdata-ustc/EduNLP/95cd9fc71a1bdd6156c42af7434aee0a8fc0a82e/docs/source/_static/formulagroup.png -------------------------------------------------------------------------------- /docs/source/_static/pipeline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigdata-ustc/EduNLP/95cd9fc71a1bdd6156c42af7434aee0a8fc0a82e/docs/source/_static/pipeline.png -------------------------------------------------------------------------------- /docs/source/_static/流程图.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigdata-ustc/EduNLP/95cd9fc71a1bdd6156c42af7434aee0a8fc0a82e/docs/source/_static/流程图.png -------------------------------------------------------------------------------- /docs/source/api/ModelZoo.rst: 
-------------------------------------------------------------------------------- 1 | EduNLP.ModelZoo 2 | ================== 3 | 4 | base_model 5 | ----------- 6 | 7 | .. automodule:: EduNLP.ModelZoo.base_model 8 | :members: 9 | 10 | :: 11 | Parameter notes for the methods above: 12 | 13 | save_pretrained(output_dir): 14 | output_dir: str 15 | The path where you want to save your model 16 | 17 | classmethod from_pretrained(pretrained_model_path, *args, **kwargs): 18 | pretrained_model_path: str 19 | The path to load your checkpoint from 20 | 21 | save_config(config_dir): 22 | config_dir: str 23 | The path where you want to save the config file 24 | 25 | @classmethod 26 | from_config(config_path, *args, **kwargs): 27 | config_path: str 28 | The path to load the config file from 29 | 30 | 31 | 32 | rnn 33 | ----------- 34 | 35 | .. automodule:: EduNLP.ModelZoo.rnn 36 | :members: 37 | :imported-members: 38 | 39 | :: 40 | Additional parameter notes: 41 | @classmethod from_config(config_path, **kwargs): 42 | config_path: str 43 | The path to load the config file from 44 | 45 | 46 | 47 | disenqnet 48 | ----------- 49 | 50 | .. automodule:: EduNLP.ModelZoo.disenqnet 51 | :members: 52 | :imported-members: 53 | 54 | :: 55 | Additional parameter notes: 56 | @classmethod from_config(config_path, **kwargs): 57 | config_path: str 58 | The path to load the config file from 59 | 60 | quesnet 61 | ----------- 62 | 63 | .. automodule:: EduNLP.ModelZoo.quesnet 64 | :members: 65 | :imported-members: 66 | 67 | utils 68 | ----------- 69 | 70 | .. automodule:: EduNLP.ModelZoo.utils 71 | :members: 72 | :imported-members: 73 | -------------------------------------------------------------------------------- /docs/source/api/formula.rst: -------------------------------------------------------------------------------- 1 | EduNLP.Formula 2 | ======================= 3 | 4 | .. automodule:: EduNLP.Formula.Formula 5 | :members: 6 | :imported-members: 7 | 8 | .. automodule:: EduNLP.Formula.ast 9 | :members: 10 | :imported-members: 11 | -------------------------------------------------------------------------------- /docs/source/api/i2v.rst: -------------------------------------------------------------------------------- 1 | EduNLP.I2V 2 | ============ 3 | 4 | .. automodule:: EduNLP.I2V.i2v 5 | :members: 6 | -------------------------------------------------------------------------------- /docs/source/api/index.rst: -------------------------------------------------------------------------------- 1 | EduNLP 2 | ====== 3 | 4 | SIF 5 | ---------------------- 6 | .. automodule:: EduNLP.SIF.sif 7 | :members: 8 | :imported-members: 9 | 10 | EduNLP.Formula 11 | --------------------- 12 | 13 | .. automodule:: EduNLP.Formula.ast 14 | :members: 15 | :imported-members: 16 | 17 | EduNLP.I2V 18 | ----------------- 19 | 20 | .. automodule:: EduNLP.I2V.i2v 21 | :members: 22 | :imported-members: 23 | 24 | EduNLP.Pretrain 25 | ------------------- 26 | 27 | .. automodule:: EduNLP.Pretrain 28 | :members: 29 | :imported-members: 30 | 31 | EduNLP.Tokenizer 32 | ---------------------- 33 | 34 | .. automodule:: EduNLP.Tokenizer 35 | :members: 36 | :imported-members: 37 | 38 | Vector 39 | --------------- 40 | 41 | .. automodule:: EduNLP.Vector 42 | :members: 43 | :imported-members: 44 | 45 | 46 | Pipeline 47 | --------------- 48 | 49 | ..
automodule:: EduNLP.Pipeline 50 | :members: 51 | :imported-members: 52 | -------------------------------------------------------------------------------- /docs/source/api/pipeline.rst: -------------------------------------------------------------------------------- 1 | EduNLP.Pipeline 2 | ================== 3 | 4 | Pipeline 5 | ---------------------------------------------------------- 6 | 7 | .. automodule:: EduNLP.Pipeline.base 8 | :members: 9 | 10 | 11 | Components 12 | ---------------------------------------------------------- 13 | 14 | .. automodule:: EduNLP.Pipeline.components 15 | :members: 16 | 17 | 18 | Property prediction 19 | ---------------------------------------------------------- 20 | 21 | .. automodule:: EduNLP.Pipeline.property_prediction 22 | :members: 23 | 24 | Knowledge prediction 25 | ---------------------------------------------------------- 26 | 27 | .. automodule:: EduNLP.Pipeline.knowledge_prediction 28 | :members: 29 | -------------------------------------------------------------------------------- /docs/source/api/pretrain.rst: -------------------------------------------------------------------------------- 1 | EduNLP.Pretrain 2 | ================== 3 | 4 | EduNLP.Pretrain.pretrian_utils 5 | --------------------------------------------------------------- 6 | .. automodule:: EduNLP.Pretrain.pretrian_utils 7 | :members: 8 | 9 | 10 | EduNLP.Pretrain.hugginface_utils 11 | --------------------------------------------------------------- 12 | 13 | .. automodule:: EduNLP.Pretrain.hugginface_utils 14 | :members: 15 | 16 | 17 | EduNLP.Pretrain.gensim_vec 18 | --------------------------------------------------------------- 19 | 20 | .. automodule:: EduNLP.Pretrain.gensim_vec 21 | :members: 22 | 23 | EduNLP.Pretrain.elmo_vec 24 | --------------------------------------------------------------- 25 | 26 | .. automodule:: EduNLP.Pretrain.elmo_vec 27 | :members: 28 | 29 | EduNLP.Pretrain.bert_vec 30 | --------------------------------------------------------------- 31 | 32 | .. automodule:: EduNLP.Pretrain.bert_vec 33 | :members: 34 | 35 | EduNLP.Pretrain.disenqnet_vec 36 | --------------------------------------------------------------- 37 | 38 | .. automodule:: EduNLP.Pretrain.disenqnet_vec 39 | :members: 40 | 41 | EduNLP.Pretrain.quesnet_vec 42 | --------------------------------------------------------------- 43 | 44 | .. automodule:: EduNLP.Pretrain.quesnet_vec 45 | :members: -------------------------------------------------------------------------------- /docs/source/api/sif.rst: -------------------------------------------------------------------------------- 1 | EduNLP.SIF 2 | ============== 3 | 4 | SIF 5 | ---------- 6 | .. automodule:: EduNLP.SIF.sif 7 | :members: 8 | :imported-members: 9 | 10 | 11 | Parser 12 | -------- 13 | .. automodule:: EduNLP.SIF.parser 14 | :members: 15 | :imported-members: 16 | 17 | Segment 18 | ---------- 19 | .. automodule:: EduNLP.SIF.segment.segment 20 | :members: 21 | :imported-members: 22 | 23 | 24 | Tokenization 25 | --------------- 26 | 27 | tokenize 28 | ^^^^^^^^^^ 29 | .. automodule:: EduNLP.SIF.tokenization.tokenization 30 | :members: 31 | :imported-members: 32 | 33 | text 34 | ^^^^^^ 35 | .. automodule:: EduNLP.SIF.tokenization.text 36 | :members: 37 | :imported-members: 38 | 39 | 40 | formula 41 | ^^^^^^^^^ 42 | .. automodule:: EduNLP.SIF.tokenization.formula.formula 43 | :members: 44 | :imported-members: 45 | 46 | .. automodule:: EduNLP.SIF.tokenization.formula.ast_token 47 | :members: 48 | :imported-members: 49 | 50 | .. 
automodule:: EduNLP.SIF.tokenization.formula.linear_token 51 | :members: 52 | :imported-members: 53 | -------------------------------------------------------------------------------- /docs/source/api/tokenizer.rst: -------------------------------------------------------------------------------- 1 | EduNLP.Tokenizer 2 | ===================================== 3 | 4 | .. automodule:: EduNLP.Tokenizer 5 | :members: 6 | :imported-members: 7 | 8 | AstFormulaTokenizer parameters 9 | ####################################### 10 | 11 | :: 12 | Parameters 13 | ---------- 14 | symbol : str, optional 15 | Elements to symbolize before tokenization, by default "gmas" 16 | figures : _type_, optional 17 | Info for figures in items, by default None 18 | 19 | 20 | CharTokenizer parameters 21 | ####################################### 22 | 23 | :: 24 | Tokenize text char by char, e.g. "题目内容" -> ["题", "目", "内", "容"] 25 | 26 | Parameters 27 | ---------- 28 | stop_words : str, optional 29 | stop words to skip, by default "default" 30 | 31 | 32 | CustomTokenizer parameters 33 | ####################################### 34 | 35 | :: 36 | Tokenize SIF items by customized configuration 37 | 38 | Parameters 39 | ---------- 40 | symbol : str, optional 41 | Elements to symbolize before tokenization, by default "gmas" 42 | figures : _type_, optional 43 | Info for figures in items, by default None 44 | kwargs: additional configuration for SIF items, 45 | including text_params, formula_params and figure_params; more details can be found in `EduNLP.SIF.sif4sci` 46 | 47 | 48 | PureTextTokenizer parameters 49 | ####################################### 50 | 51 | :: 52 | 53 | Treat all elements in a SIF item as pure text. Specifically, formulas are also tokenized as text. 54 | 55 | Parameters 56 | ---------- 57 | handle_figure_formula : str, optional 58 | whether to skip or symbolize special formulas ($\\FormFigureID{…}$ and $\\FormFigureBase64{…}$), 59 | by default skip 60 | 61 | SpaceTokenizer parameters 62 | ####################################### 63 | 64 | :: 65 | 66 | Tokenize text by space, e.g. "题目 内容" -> ["题目", "内容"] 67 | 68 | Parameters 69 | ---------- 70 | stop_words : str, optional 71 | stop words to skip, by default "default" 72 | 73 | 74 | EduNLP.Tokenizer.get_tokenizer parameters 75 | ########################################## 76 | 77 | :: 78 | Parameters 79 | ---------- 80 | name: str 81 | the name of the tokenizer, e.g. text, pure_text. 82 | args: 83 | positional arguments passed to the tokenizer 84 | kwargs: 85 | keyword arguments passed to the tokenizer 86 | Returns 87 | ------- 88 | tokenizer: Tokenizer -------------------------------------------------------------------------------- /docs/source/api/utils.rst: -------------------------------------------------------------------------------- 1 | EduNLP.utils 2 | ==================== 3 | 4 | .. automodule:: EduNLP.utils 5 | :members: 6 | :imported-members: 7 | -------------------------------------------------------------------------------- /docs/source/api/vector.rst: -------------------------------------------------------------------------------- 1 | EduNLP.Vector 2 | ========================== 3 | 4 | 5 | EduNLP.Vector.t2v 6 | ------------------------- 7 | 8 | .. automodule:: EduNLP.Vector.t2v 9 | :members: 10 | 11 | 12 | EduNLP.Vector.disenqnet 13 | ------------------------- 14 | 15 | .. automodule:: EduNLP.Vector.disenqnet.disenqnet 16 | :members: 17 | 18 | EduNLP.Vector.quesnet 19 | ------------------------- 20 | 21 | ..
automodule:: EduNLP.Vector.quesnet.quesnet 22 | :members: 23 | 24 | EduNLP.Vector.elmo_vec 25 | ------------------------- 26 | 27 | .. automodule:: EduNLP.Vector.elmo_vec 28 | :members: 29 | 30 | 31 | EduNLP.Vector.gensim_vec 32 | ------------------------- 33 | 34 | .. automodule:: EduNLP.Vector.gensim_vec 35 | :members: 36 | 37 | 38 | EduNLP.Vector.embedding 39 | ------------------------- 40 | 41 | .. automodule:: EduNLP.Vector.embedding 42 | :members: 43 | -------------------------------------------------------------------------------- /docs/source/tutorial/en/index.rst: -------------------------------------------------------------------------------- 1 | Get Started 2 | =============== 3 | 4 | * `Standard Item Format `_ 5 | 6 | * `Syntax Parsing `_ 7 | 8 | * `Component Segmentation `_ 9 | 10 | * `Tokenization `_ 11 | 12 | * `Pre-training `_ 13 | 14 | * `Vectorization `_ 15 | 16 | * `Pipeline `_ 17 | 18 | Main process 19 | --------------- 20 | 21 | .. figure:: ../../_static/pipeline.png 22 | 23 | * `Component Segmentation `_ : Segment items in SIF format according to their types, so that elements of different types (text, formulas, pictures, etc.) can be tokenized separately. 24 | 25 | * `Syntax Parsing `_ : Parse different components in different ways (formula parsing, text parsing, etc.), serving the later tokenization step. 26 | 27 | * `Tokenization `_: Further process the results of component segmentation and syntax parsing to obtain the multi-modal token sequence of the item. 28 | 29 | * `Vectorization `_: Feed the list of tokenized items into pre-trained models to get the corresponding item vectors. 30 | 31 | * **Downstream**: Apply the obtained vectors to downstream tasks. 32 | 33 | Examples 34 | --------- 35 | 36 | To help you quickly understand the functions of this project, this section only shows the usage of the common function interfaces. Intermediate function modules (such as parse, formula, segment, etc.) and more fine-grained interface methods are not shown; for further study, please refer to the relevant documentation. 37 | 38 | ------------------------------------------------------------ 39 | 40 | .. nbgallery:: 41 | :caption: This is a thumbnail gallery: 42 | :name: start_galler 43 | :glob: 44 | 45 | Tokenization <../../build/blitz/sif/sif4sci.ipynb> 46 | 47 | Vectorization <../../build/blitz/i2v/get_pretrained_i2v.ipynb> 48 | 49 | Pipeline <../../build/blitz/pipeline/pipeline.ipynb> 50 | -------------------------------------------------------------------------------- /docs/source/tutorial/en/parse/TextSyntaxStructureParsing.rst: -------------------------------------------------------------------------------- 1 | Text syntax structure parsing 2 | -------------------------------- 3 | 4 | This section is mainly implemented by the EduNLP.SIF.parser module. Its main function is to extract letters and numbers in the text and convert them into standard format. 5 | 6 | This module is mainly used as an *intermediate module* to parse the input text. In general, users do not call this module directly. 7 | 8 | Introduction of Main Content 9 | +++++++++++++++++++++++++++++++++++++ 10 | 11 | 1. Determine the type of the incoming text, in the following order: 12 | 13 | * is_chinese: its function is to match Chinese characters [\u4e00-\u9fa5]. 14 | 15 | * is_alphabet: its function is to match letters outside formulas.
Only letters between two Chinese characters will be corrected (wrapped with $$); all other cases are regarded as formulas that do not conform to latex syntax. 16 | 17 | * is_number: its function is to match numbers outside formulas. Only numbers between two Chinese characters will be corrected; all other cases are regarded as formulas that do not conform to latex syntax. 18 | 19 | 2. Match latex formulas 20 | 21 | * If Chinese characters appear in latex, print a warning only once. 22 | 23 | * Use the _is_formula_legal function to check the completeness and parsability of latex formulas, and report an error for formulas that do not conform to latex syntax. 24 | 25 | Input 26 | >>>>>>> 27 | 28 | Type: str 29 | 30 | Content: question text 31 | 32 | :: 33 | 34 | >>> text1 = '生产某种零件的A工厂25名工人的日加工零件数_ _' 35 | >>> text2 = 'X的分布列为( )' 36 | >>> text3 = '① AB是⊙O的直径,AC是⊙O的切线,BC交⊙O于点E.AC的中点为D' 37 | >>> text4 = '支持公式如$\\frac{y}{x}$,$\\SIFBlank$,$\\FigureID{1}$,不支持公式如$\\frac{ \\dddot y}{x}$' 38 | 39 | Parsing 40 | >>>>>>>>>>>>>>>>>>>> 41 | 42 | :: 43 | 44 | >>> text_parser1 = Parser(text1) 45 | >>> text_parser2 = Parser(text2) 46 | >>> text_parser3 = Parser(text3) 47 | >>> text_parser4 = Parser(text4) 48 | 49 | Parsing results 50 | >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> 51 | 52 | - Try to convert the text to standard format 53 | 54 | :: 55 | 56 | >>> text_parser1.description_list() 57 | >>> print('text_parser1.text:',text_parser1.text) 58 | text_parser1.text: 生产某种零件的$A$工厂$25$名工人的日加工零件数$\SIFBlank$ 59 | >>> text_parser2.description_list() 60 | >>> print('text_parser2.text:',text_parser2.text) 61 | text_parser2.text: $X$的分布列为$\SIFChoice$ 62 | 63 | - Determine whether the text has syntax errors 64 | 65 | :: 66 | 67 | >>> text_parser3.description_list() 68 | >>> print('text_parser3.error_flag: ',text_parser3.error_flag) 69 | text_parser3.error_flag: 1 70 | >>> text_parser4.description_list() 71 | >>> print('text_parser4.fomula_illegal_flag: ',text_parser4.fomula_illegal_flag) 72 | text_parser4.fomula_illegal_flag: 1 73 | -------------------------------------------------------------------------------- /docs/source/tutorial/en/pipeline.rst: -------------------------------------------------------------------------------- 1 | ======== 2 | Pipeline 3 | ======== 4 | 5 | .. nbgallery:: 6 | :caption: This is a thumbnail gallery: 7 | :name: pipeline_gallery 8 | :glob: 9 | 10 | Pipeline <../../build/blitz/pipeline/pipeline.ipynb> 11 | -------------------------------------------------------------------------------- /docs/source/tutorial/en/pretrain/loading.rst: -------------------------------------------------------------------------------- 1 | Load models 2 | ---------------- 3 | 4 | Pass the path of the obtained model to the I2V module to load it.
5 | 6 | Examples: 7 | 8 | :: 9 | 10 | >>> model_path = "../test_model/d2v/test_gensim_luna_stem_tf_d2v_256.bin" 11 | >>> i2v = D2V("text","d2v",filepath=model_path, pretrained_t2v = False) 12 | -------------------------------------------------------------------------------- /docs/source/tutorial/en/pretrain/pub.rst: -------------------------------------------------------------------------------- 1 | The overview of our public model 2 | ------------------------------------ 3 | 4 | 5 | Version Description 6 | ######################### 7 | 8 | First level version: 9 | 10 | * Public version 1 (luna_pub): college entrance examination 11 | * Public version 2 (luna_pub_large): college entrance examination + regional examination 12 | 13 | Second level version: 14 | 15 | * Minor subjects(Chinese,Math,English,History,Geography,Politics,Biology,Physics,Chemistry) 16 | * Major subjects(science, arts and all subject) 17 | 18 | Third level version【to be finished】: 19 | 20 | * Don't use third-party initializers 21 | * Use third-party initializers 22 | 23 | Description of train data in models 24 | ####################################### 25 | 26 | * Currently, the data used in w2v and d2v models are the subjects of senior high school. 27 | * test data:`[OpenLUNA.json] `_ 28 | 29 | At present, the following models are provided. More models of different subjects and question types are being trained. Please look forward to it. 30 | "d2v_all_300" (all subject), "d2v_science_300" (Science), "d2v_english_300" (English),"d2v_literal_300" (Arts) 31 | 32 | Examples of model training 33 | ---------------------------- 34 | 35 | Get the dataset 36 | #################### 37 | 38 | .. toctree:: 39 | :maxdepth: 1 40 | :titlesonly: 41 | 42 | prepare_dataset <../../../build/blitz/pretrain/prepare_dataset.ipynb> 43 | 44 | An example of d2v in gensim model 45 | #################################### 46 | 47 | .. toctree:: 48 | :maxdepth: 1 49 | :titlesonly: 50 | 51 | d2v_bow_tfidf <../../../build/blitz/pretrain/gensim/d2v_bow_tfidf.ipynb> 52 | d2v_general <../../../build/blitz/pretrain/gensim/d2v_general.ipynb> 53 | d2v_stem_tf <../../../build/blitz/pretrain/gensim/d2v_stem_tf.ipynb> 54 | 55 | An example of w2v in gensim model 56 | #################################### 57 | 58 | .. toctree:: 59 | :maxdepth: 1 60 | :titlesonly: 61 | 62 | w2v_stem_text <../../../build/blitz/pretrain/gensim/w2v_stem_text.ipynb> 63 | w2v_stem_tf <../../../build/blitz/pretrain/gensim/w2v_stem_tf.ipynb> 64 | 65 | An example of seg_token 66 | ############################ 67 | 68 | .. toctree:: 69 | :maxdepth: 1 70 | :titlesonly: 71 | 72 | d2v.ipynb <../../../build/blitz/pretrain/seg_token/d2v.ipynb> 73 | d2v_d1 <../../../build/blitz/pretrain/seg_token/d2v_d1.ipynb> 74 | d2v_d2 <../../../build/blitz/pretrain/seg_token/d2v_d2.ipynb> 75 | -------------------------------------------------------------------------------- /docs/source/tutorial/en/pretrain/start.rst: -------------------------------------------------------------------------------- 1 | Train the model 2 | ------------------ 3 | 4 | Call train_Vector function interface directly to make the training model easier. This section calls the relevant training models in the gensim library. At present, the training methods of "sg"、 "cbow"、 "fastext"、 "d2v"、 "bow"、 "tfidf" are provided. Parameter embedding_dim is also provided for users to determine vector dimension according to their needs. 
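For instance, a minimal sketch of a training call (the corpus variable token_items, an iterable of token lists produced by one of the tokenizers above, and the output path prefix are placeholders; the exact keyword signature may differ slightly across versions): :: >>> from EduNLP.Pretrain import train_vector >>> # token_items: a pre-tokenized corpus (hypothetical variable) >>> train_vector(token_items, "../../data/w2v/gensim_test_", embedding_dim=100, method="sg")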
5 | 6 | Basic Steps 7 | ################## 8 | 9 | 1. Determine the type of model and select the appropriate tokenizer (GensimWordTokenizer, GensimSegTokenizer) to finish tokenization. 10 | 11 | 2. Call the train_vector function to get the required pre-trained model. 12 | 13 | Examples: 14 | 15 | :: 16 | 17 | >>> tokenizer = GensimWordTokenizer(symbol="gmas", general=True) 18 | >>> token_item = tokenizer("有公式$\\FormFigureID{wrong1?}$,如图$\\FigureID{088f15ea-xxx}$,\ 19 | ... 若$x,y$满足约束条件公式$\\FormFigureBase64{wrong2?}$,$\\SIFSep$,则$z=x+7 y$的最大值为$\\SIFBlank$") 20 | >>> print(token_item.tokens[:10]) 21 | ['公式', '[FORMULA]', '如图', '[FIGURE]', 'x', ',', 'y', '约束条件', '公式', '[FORMULA]'] 22 | 23 | # train 10-dimensional vectors with the d2v method 24 | train_vector(sif_items, "../../../data/w2v/gensim_luna_stem_tf_", 10, method="d2v") 25 | -------------------------------------------------------------------------------- /docs/source/tutorial/en/seg/SemanticComponentSegmentation.rst: -------------------------------------------------------------------------------- 1 | Semantic Component Segmentation 2 | ------------------------------------ 3 | 4 | Because multiple-choice questions are given as dicts, they need to be converted into text format while retaining the relationship between their fields. This can be done with the dict2str4sif function, which converts a multiple-choice question item into string format and marks the question stem and the options. 5 | 6 | 7 | Basic Usage 8 | ++++++++++++++++++ 9 | 10 | :: 11 | 12 | >>> item = { 13 | ... "stem": r"若复数$z=1+2 i+i^{3}$,则$|z|=$", 14 | ... "options": ['0', '1', r'$\sqrt{2}$', '2'], 15 | ... } 16 | >>> dict2str4sif(item) # doctest: +ELLIPSIS 17 | '$\\SIFTag{stem_begin}$若复数$z=1+2 i+i^{3}$,则$|z|=$$\\SIFTag{stem_end}$$\\SIFTag{options_begin}$$\\SIFTag{list_0}$0$\\SIFTag{list_1}$1$\\SIFTag{list_2}$$\\sqrt{2}$$\\SIFTag{list_3}$2$\\SIFTag{options_end}$' 18 | 19 | Optional additional parameters / interfaces 20 | ++++++++++++++++++++++++++++++++++++++++++++++++ 21 | 22 | 1. add_list_no_tag: if True, each option is prefixed with a numbered tag ($\SIFTag{list_0}$, $\SIFTag{list_1}$, ...); if False, the options are joined with $\SIFSep$ instead. 23 | 24 | :: 25 | 26 | >>> dict2str4sif(item, add_list_no_tag=True) # doctest: +ELLIPSIS 27 | '$\\SIFTag{stem_begin}$若复数$z=1+2 i+i^{3}$,则$|z|=$$\\SIFTag{stem_end}$$\\SIFTag{options_begin}$$\\SIFTag{list_0}$0$\\SIFTag{list_1}$1$\\SIFTag{list_2}$$\\sqrt{2}$$\\SIFTag{list_3}$2$\\SIFTag{options_end}$' 28 | 29 | >>> dict2str4sif(item, add_list_no_tag=False) # doctest: +ELLIPSIS 30 | '$\\SIFTag{stem_begin}$若复数$z=1+2 i+i^{3}$,则$|z|=$$\\SIFTag{stem_end}$$\\SIFTag{options_begin}$0$\\SIFSep$1$\\SIFSep$$\\sqrt{2}$$\\SIFSep$2$\\SIFTag{options_end}$' 31 | 32 | 2. tag_mode: selects where the labels are placed: 'delimiter' labels both the beginning and the end, 'head' labels only the head, and 'tail' labels only the tail. 33 | 34 | :: 35 | 36 | >>> dict2str4sif(item, tag_mode="head") # doctest: +ELLIPSIS 37 | '$\\SIFTag{stem}$若复数$z=1+2 i+i^{3}$,则$|z|=$$\\SIFTag{options}$$\\SIFTag{list_0}$0$\\SIFTag{list_1}$1$\\SIFTag{list_2}$$\\sqrt{2}$$\\SIFTag{list_3}$2' 38 | 39 | >>> dict2str4sif(item, tag_mode="tail") # doctest: +ELLIPSIS 40 | '若复数$z=1+2 i+i^{3}$,则$|z|=$$\\SIFTag{stem}$$\\SIFTag{list_0}$0$\\SIFTag{list_1}$1$\\SIFTag{list_2}$$\\sqrt{2}$$\\SIFTag{list_3}$2$\\SIFTag{options}$' 41 | 42 | 3. key_as_tag: if False, the process only adds $\SIFSep$ between the options, without distinguishing segment types by tags.
43 | 44 | :: 45 | 46 | >>> dict2str4sif(item, key_as_tag=False) 47 | '若复数$z=1+2 i+i^{3}$,则$|z|=$0$\\SIFSep$1$\\SIFSep$$\\sqrt{2}$$\\SIFSep$2' -------------------------------------------------------------------------------- /docs/source/tutorial/en/seg/StructuralComponentSegmentation.rst: -------------------------------------------------------------------------------- 1 | Structural Component Segmentation 2 | ------------------------------------ 3 | 4 | This step segments the sliced item. A depth option is available: you can segment at all positions or only at certain labels (such as \SIFSep and \SIFTag), and you can also choose where labels are added: at both the head and the tail, or only at one of them. 5 | 6 | 7 | There are two modes: 8 | 9 | * linear mode: used for text processing (word segmentation with the jieba library); 10 | 11 | * ast mode: used to parse formulas. 12 | 13 | Basic Usage 14 | ++++++++++++++++++ 15 | 16 | :: 17 | 18 | >>> test_item = r"如图所示,则$\bigtriangleup ABC$的面积是$\SIFBlank$。$\FigureID{1}$" 19 | >>> s = seg(test_item) 20 | >>> s 21 | ['如图所示,则', '\\bigtriangleup ABC', '的面积是', '\\SIFBlank', '。', \FigureID{1}] 22 | 23 | Optional additional parameters/interfaces 24 | +++++++++++++++++++++++++++++++++++++++++++++ 25 | 26 | 1. describe: count the number of elements of each type 27 | 28 | :: 29 | 30 | >>> s.describe() 31 | {'t': 3, 'f': 1, 'g': 1, 'm': 1} 32 | 33 | 2. filter: this interface can filter out one or more types of elements, or keep only the given types. 34 | 35 | Pass one or more type characters directly to remove those elements, or use the "keep" parameter to specify the types to retain. 36 | 37 | Element types represented by the symbols: 38 | "t": text 39 | "f": formula 40 | "g": figure 41 | "m": question mark 42 | "a": tag 43 | "s": sep tag 44 | 45 | :: 46 | 47 | >>> with s.filter("f"): 48 | ... s 49 | ['如图所示,则', '的面积是', '\\SIFBlank', '。', \FigureID{1}] 50 | >>> with s.filter(keep="t"): 51 | ... s 52 | ['如图所示,则', '的面积是', '。'] 53 | 54 | 3. symbol: this interface can convert some types of elements into special symbols 55 | 56 | Element types represented by the symbols: 57 | 58 | - "t": text 59 | - "f": formula 60 | - "g": figure 61 | - "m": question mark 62 | 63 | :: 64 | 65 | >>> seg(test_item, symbol="fgm") 66 | ['如图所示,则', '[FORMULA]', '的面积是', '[MARK]', '。', '[FIGURE]'] 67 | >>> seg(test_item, symbol="tfgm") 68 | ['[TEXT]', '[FORMULA]', '[TEXT]', '[MARK]', '[TEXT]', '[FIGURE]'] 69 | -------------------------------------------------------------------------------- /docs/source/tutorial/en/tokenization/GensimSegTokenizer.rst: -------------------------------------------------------------------------------- 1 | GensimSegTokenizer 2 | ===================== 3 | 4 | By default, the pictures, separators and blanks in the question text, as well as other parts of the incoming item, are converted into special characters for data security and for the tokenization of text, formulas and labels. The tokenizer uses the linear analysis method for text and the abstract syntax tree method for formulas. 5 | 6 | Compared to GensimWordTokenizer, the main differences are: 7 | 8 | * It provides the depth option for the segmentation position, such as \SIFSep and \SIFTag. 9 | * By default, labels are inserted at the head of item components (such as text and formula).
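10 | 11 | Examples 12 | ---------- 13 | 14 | A minimal usage sketch (the constructor arguments and the printed shape below are assumptions that mirror the ``GensimWordTokenizer`` example on the next page; the exact segments you get depend on the ``depth`` setting and the EduNLP version): 15 | 16 | :: 17 | 18 | >>> tokenizer = GensimSegTokenizer(symbol="gmas", depth=None) 19 | >>> token_item = tokenizer("有公式$\\FormFigureID{wrong1?}$,如图$\\FigureID{088f15ea-xxx}$,\ 20 | ... 若$x,y$满足约束条件公式$\\FormFigureBase64{wrong2?}$,$\\SIFSep$,则$z=x+7 y$的最大值为$\\SIFBlank$") 21 | >>> print(len(token_item)) # number of segments produced by label-based splitting 22 |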
-------------------------------------------------------------------------------- /docs/source/tutorial/en/tokenization/GensimWordTokenizer.rst: -------------------------------------------------------------------------------- 1 | GensimWordTokenizer 2 | ===================== 3 | 4 | By default, the pictures and blanks in the question text, as well as other parts of the incoming item, are converted into special characters for data security and for the tokenization of text, formulas, labels and separators. The tokenizer uses the linear analysis method for text and the abstract syntax tree method for formulas; you can choose between them for formulas with the ``general`` parameter: 5 | 6 | - True: the incoming item conforms to SIF, and the linear analysis method should be used. 7 | - False: the incoming item does not conform to SIF, and the abstract syntax tree method should be used. 8 | 9 | Examples 10 | ---------- 11 | 12 | :: 13 | 14 | >>> tokenizer = GensimWordTokenizer(symbol="gmas", general=True) 15 | >>> token_item = tokenizer("有公式$\\FormFigureID{wrong1?}$,如图$\\FigureID{088f15ea-xxx}$,\ 16 | ... 若$x,y$满足约束条件公式$\\FormFigureBase64{wrong2?}$,$\\SIFSep$,则$z=x+7 y$的最大值为$\\SIFBlank$") 17 | >>> print(token_item.tokens[:10]) 18 | ['公式', '[FORMULA]', '如图', '[FIGURE]', 'x', ',', 'y', '约束条件', '公式', '[FORMULA]'] 19 | >>> tokenizer = GensimWordTokenizer(symbol="fgmas", general=False) 20 | >>> token_item = tokenizer("有公式$\\FormFigureID{wrong1?}$,如图$\\FigureID{088f15ea-xxx}$,\ 21 | ... 若$x,y$满足约束条件公式$\\FormFigureBase64{wrong2?}$,$\\SIFSep$,则$z=x+7 y$的最大值为$\\SIFBlank$") 22 | >>> print(token_item.tokens[:10]) 23 | ['公式', '[FORMULA]', '如图', '[FIGURE]', '[FORMULA]', '约束条件', '公式', '[FORMULA]', '[SEP]', '[FORMULA]'] 24 | -------------------------------------------------------------------------------- /docs/source/tutorial/en/tokenization/PureTextTokenizer.rst: -------------------------------------------------------------------------------- 1 | PureTextTokenizer 2 | =================== 3 | 4 | By default, the pictures, labels, separators and blanks in the question text, as well as other parts of the incoming item, are converted into special characters for data security. At the same time, special formulas such as $\\FormFigureID{...}$ and $\\FormFigureBase64{...}$ are filtered out to facilitate the tokenization of text and plain-text formulas. The tokenizer uses the linear analysis method for text and formulas; the ``key`` parameter can be used to preprocess the incoming item (e.g. to select a field from a dict), and will be improved based on users' requirements in the future. 5 | 6 | Examples 7 | ---------- 8 | 9 | :: 10 | 11 | >>> tokenizer = PureTextTokenizer() 12 | >>> items = ["有公式$\\FormFigureID{wrong1?}$,如图$\\FigureID{088f15ea-xxx}$,\ 13 | ... 若$x,y$满足约束条件公式$\\FormFigureBase64{wrong2?}$,$\\SIFSep$,则$z=x+7 y$的最大值为$\\SIFBlank$"] 14 | >>> tokens = tokenizer(items) 15 | >>> next(tokens)[:10] 16 | ['公式', '如图', '[FIGURE]', 'x', ',', 'y', '约束条件', '公式', '[SEP]', 'z'] 17 | >>> items = ["已知集合$A=\\left\\{x \\mid x^{2}-3 x-4<0\\right\\}, \\quad B=\\{-4,1,3,5\\}, \\quad$ 则 $A \\cap B=$"] 18 | >>> tokens = tokenizer(items) 19 | >>> next(tokens) # doctest: +NORMALIZE_WHITESPACE 20 | ['已知', '集合', 'A', '=', '\\left', '\\{', 'x', '\\mid', 'x', '^', '{', '2', '}', '-', '3', 'x', '-', '4', '<', 21 | '0', '\\right', '\\}', ',', '\\quad', 'B', '=', '\\{', '-', '4', ',', '1', ',', '3', ',', '5', '\\}', ',', 22 | '\\quad', 'A', '\\cap', 'B', '='] 23 | >>> items = [{ 24 | ... 
"stem": "已知集合$A=\\left\\{x \\mid x^{2}-3 x-4<0\\right\\}, \\quad B=\\{-4,1,3,5\\}, \\quad$ 则 $A \\cap B=$", 25 | ... "options": ["1", "2"] 26 | ... }] 27 | >>> tokens = tokenizer(items, key=lambda x: x["stem"]) 28 | >>> next(tokens) # doctest: +NORMALIZE_WHITESPACE 29 | ['已知', '集合', 'A', '=', '\\left', '\\{', 'x', '\\mid', 'x', '^', '{', '2', '}', '-', '3', 'x', '-', '4', '<', 30 | '0', '\\right', '\\}', ',', '\\quad', 'B', '=', '\\{', '-', '4', ',', '1', ',', '3', ',', '5', '\\}', ',', 31 | '\\quad', 'A', '\\cap', 'B', '='] 32 | -------------------------------------------------------------------------------- /docs/source/tutorial/en/tokenization/TextTokenizer.rst: -------------------------------------------------------------------------------- 1 | TextTokenizer 2 | ================ 3 | 4 | By default, the pictures, labels, separators, blanks in the question text and other parts of the incoming item are converted into special characters for data security and tokenization of text and formulas. Also, the tokenizer uses linear analysis method for text and formulas, and the ``key`` parameter provided is used to preprocess the incoming item, which will be improved based on users' requirements in the future. 5 | 6 | 7 | Examples 8 | ---------- 9 | 10 | :: 11 | 12 | >>> items = ["已知集合$A=\\left\\{x \\mid x^{2}-3 x-4<0\\right\\}, \\quad B=\\{-4,1,3,5\\}, \\quad$ 则 $A \\cap B=$"] 13 | >>> tokenizer = TextTokenizer() 14 | >>> tokens = tokenizer(items) 15 | >>> next(tokens) # doctest: +NORMALIZE_WHITESPACE 16 | ['已知', '集合', 'A', '=', '\\left', '\\{', 'x', '\\mid', 'x', '^', '{', '2', '}', '-', '3', 'x', '-', '4', '<', 17 | '0', '\\right', '\\}', ',', '\\quad', 'B', '=', '\\{', '-', '4', ',', '1', ',', '3', ',', '5', '\\}', ',', 18 | '\\quad', 'A', '\\cap', 'B', '='] 19 | >>> items = [{ 20 | ... "stem": "已知集合$A=\\left\\{x \\mid x^{2}-3 x-4<0\\right\\}, \\quad B=\\{-4,1,3,5\\}, \\quad$ 则 $A \\cap B=$", 21 | ... "options": ["1", "2"] 22 | ... }] 23 | >>> tokens = tokenizer(items, key=lambda x: x["stem"]) 24 | >>> next(tokens) # doctest: +NORMALIZE_WHITESPACE 25 | ['已知', '集合', 'A', '=', '\\left', '\\{', 'x', '\\mid', 'x', '^', '{', '2', '}', '-', '3', 'x', '-', '4', '<', 26 | '0', '\\right', '\\}', ',', '\\quad', 'B', '=', '\\{', '-', '4', ',', '1', ',', '3', ',', '5', '\\}', ',', 27 | '\\quad', 'A', '\\cap', 'B', '='] 28 | -------------------------------------------------------------------------------- /docs/source/tutorial/en/tokenize/Sentence Segmentation.rst: -------------------------------------------------------------------------------- 1 | Sentence Segmentation 2 | ------------------------- 3 | During the process of sentence segmentation, a long document is divided into several sentences. Each sentence is a "token" (to be realized). 4 | -------------------------------------------------------------------------------- /docs/source/tutorial/en/tokenize/Tokenization.rst: -------------------------------------------------------------------------------- 1 | Tokenization 2 | -------------- 3 | Tokenization is comprehensive analysis. In this process, sentences with formulas are segmented into several markers. Each marker is a "token". 4 | We provide some encapsulated tokenizers for users to call them conveniently. The following is a complete list of tokenizers. 
5 | 6 | Examples 7 | 8 | :: 9 | 10 | >>> items = ["已知集合$A=\\left\\{x \\mid x^{2}-3 x-4<0\\right\\}, \\quad B=\\{-4,1,3,5\\}, \\quad$ 则 $A \\cap B=$"] 11 | >>> tokenizer = TextTokenizer() 12 | >>> tokens = tokenizer(items) 13 | >>> next(tokens) # doctest: +NORMALIZE_WHITESPACE 14 | ['已知', '集合', 'A', '=', '\\left', '\\{', 'x', '\\mid', 'x', '^', '{', '2', '}', '-', '3', 'x', '-', '4', '<', 15 | '0', '\\right', '\\}', ',', '\\quad', 'B', '=', '\\{', '-', '4', ',', '1', ',', '3', ',', '5', '\\}', ',', 16 | '\\quad', 'A', '\\cap', 'B', '='] 17 | 18 | 19 | 20 | You can view ``./EduNLP/Tokenizer/tokenizer.py`` and ``./EduNLP/Pretrain/gensim_vec.py`` for more tokenizers. The following is a complete list of tokenizers: 21 | 22 | .. toctree:: 23 | :maxdepth: 1 24 | :titlesonly: 25 | 26 | ../tokenization/TextTokenizer 27 | ../tokenization/PureTextTokenizer 28 | ../tokenization/GensimSegTokenizer 29 | ../tokenization/GensimWordTokenizer 30 | -------------------------------------------------------------------------------- /docs/source/tutorial/en/tokenize/WordSegmentation.rst: -------------------------------------------------------------------------------- 1 | Word segmentation 2 | --------------------- 3 | 4 | Text-tokenization: A sentence (without formulas) consists of several "words" in order. The process of dividing a sentence into several words is called "Text-tokenization". According to the granularity of "words", it can be subdivided into "Word-tokenization" and "Char-tokenization". 5 | 6 | :: 7 | 8 | - Word-tokenization: each phrase is a token. 9 | 10 | - Char-tokenization: each character is a token. 11 | 12 | 13 | Text-tokenization is divided into two main steps: 14 | 15 | 1. Text-tokenization: 16 | 17 | - Word-tokenization: use the word segmentation tool to segment and extract words from the question text. Our project supports `jieba`. 18 | 19 | - Char-tokenization: process the text character by character. 20 | 21 | 2. Filter: filter out the specified stopwords. 22 | 23 | The default stopwords used in this project: `[stopwords] `_ 24 | You can also use your own stopwords. The following examples demonstrate how. 25 | 26 | Examples: 27 | 28 | :: 29 | 30 | >>> text = "三角函数是基本初等函数之一" 31 | >>> tokenize(text, granularity="word") 32 | ['三角函数', '初等', '函数'] 33 | 34 | >>> tokenize(text, granularity="char") 35 | ['三', '角', '函', '数', '基', '初', '函', '数'] 36 | 37 | -------------------------------------------------------------------------------- /docs/source/tutorial/en/vectorization/WithPre-trainedModel.rst: -------------------------------------------------------------------------------- 1 | Use the pre-trained models: call get_pretrained_i2v directly 2 | -------------------------------------------------------------------- 3 | 4 | Use a pre-trained model provided by EduNLP to convert the given question text into vectors. 5 | 6 | * Advantages: Simple and convenient. 7 | 8 | * Disadvantages: Only the models published with the project can be used, which is a significant limitation. 9 | 10 | * Call this function to obtain the corresponding pre-trained model. At present, the following pre-trained models are provided: d2v_all_300, d2v_science_300, d2v_english_300 and d2v_literal_300 (see the sketch below).
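A minimal end-to-end sketch (the model name must be one of the published names above, the save directory is a placeholder, and since D2V models produce item vectors only, the second return value is ignored): :: >>> from EduNLP.I2V import get_pretrained_i2v >>> i2v = get_pretrained_i2v("d2v_all_300", model_dir="./models") # downloads the model on first use >>> item_vector, _ = i2v(["若$x,y$满足约束条件,则$z=x+7 y$的最大值为$\\SIFBlank$"])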
11 | 12 | Selection and use of models 13 | #################################### 14 | 15 | Select the pre-trained model according to the subject: 16 | 17 | +---------------------------+----------------------------------+ 18 | | Pre-training model name   | Subject of model training data   | 19 | +===========================+==================================+ 20 | | d2v_all_300               | all subjects                     | 21 | +---------------------------+----------------------------------+ 22 | | d2v_science_300           | Science                          | 23 | +---------------------------+----------------------------------+ 24 | | d2v_literal_300           | Arts                             | 25 | +---------------------------+----------------------------------+ 26 | | d2v_english_300           | English                          | 27 | +---------------------------+----------------------------------+ 28 | 29 | Processing steps 30 | #################################### 31 | 32 | 1. Download the corresponding pre-trained model; 33 | 34 | 2. Wrap the obtained model in a D2V instance and use it for vectorization. 35 | 36 | 37 | Examples: 38 | 39 | :: 40 | 41 | >>> i2v = get_pretrained_i2v("d2v_science_300") 42 | >>> i2v(item) # item: a question text in SIF format, or a list of them 43 | -------------------------------------------------------------------------------- /docs/source/tutorial/en/vectorization/WithoutPre-trainedModel.rst: -------------------------------------------------------------------------------- 1 | Without the public pre-trained models: load your own model directly 2 | --------------------------------------------------------------------- 3 | 4 | You can use any pre-trained model provided by yourself (just give the storage path of the model) to convert the given question text into vectors. 5 | 6 | * Advantages: flexible; you can use your own model and adjust its parameters freely. 7 | 8 | Processing steps 9 | +++++++++++++++++++++++++++++++++++ 10 | 11 | 1. Call the get_tokenizer function to get the tokenized result; 12 | 13 | 2. Select the vectorization class (e.g. D2V) matching the model type. 14 | 15 | Examples: 16 | 17 | :: 18 | 19 | >>> model_path = "../test_model/d2v/test_gensim_luna_stem_tf_d2v_256.bin" 20 | >>> i2v = D2V("text","d2v",filepath=model_path, pretrained_t2v = False) 21 | >>> i2v(item) # item: a question text in SIF format, or a list of them 22 | -------------------------------------------------------------------------------- /docs/source/tutorial/zh/index.rst: -------------------------------------------------------------------------------- 1 | 入门 2 | ===== 3 | 4 | * `SIF标准格式 `_ 5 | 6 | * `成分分解 `_ 7 | 8 | * `语法解析 `_ 9 | 10 | * `令牌化 `_ 11 | 12 | * `预训练 `_ 13 | 14 | * `向量化 `_ 15 | 16 | * `流水线 `_ 17 | 18 | 主要流程 19 | ---------- 20 | 21 | .. figure:: ../../_static/流程图.png 22 | 23 | * `成分分解 `_ :对符合SIF标准的试题进行分解,识别出题目中不同的成分(如文本、公式、图片等)。 24 | 25 | * `语法解析 `_ :对不同的成分进行个性化解析,包括公式解析、文本解析等,从而服务于后面的令牌化环节。 26 | 27 | * `令牌化 `_:根据成分分解和语法解析的结果,获取试题不同成分的令牌化序列,最终得到试题的多模态令牌序列。 28 | 29 | * `向量化 `_:将令牌序列送入预训练模型,得到试题相应的表征向量。 30 | 31 | * **下游模型**:将预训练模型得到的试题表征应用于各种下游任务(如难度预测、知识点预测、相似题检索等)。 32 | 33 | 示例 34 | -------- 35 | 36 | 为使您快速了解此项目的功能,此部分仅展示常用的函数接口使用方法(如得到令牌化序列、获取向量化表征等),对于其中间函数模块(如parse、segment、tokenize、formula等)以及更细分的接口方法不做展示,如需深入学习,请查看相关部分的文档。 37 | 38 | ..
nbgallery:: 39 | :caption: This is a thumbnail gallery: 40 | :name: start_galler 41 | :glob: 42 | 43 | 令牌化 <../../build/blitz/sif/sif4sci.ipynb> 44 | 45 | 向量化 <../../build/blitz/i2v/get_pretrained_i2v.ipynb> 46 | 47 | 流水线 <../../build/blitz/pipeline/pipeline.ipynb> 48 | -------------------------------------------------------------------------------- /docs/source/tutorial/zh/pipeline.rst: -------------------------------------------------------------------------------- 1 | ======= 2 | 流水线 3 | ======= 4 | 5 | .. nbinfo:: 6 | notebook: 7 | 8 | `流水线 <../../build/blitz/pipeline/pipeline.ipynb>`_ 9 | -------------------------------------------------------------------------------- /docs/source/tutorial/zh/sif.rst: -------------------------------------------------------------------------------- 1 | SIF标准格式 2 | ============================== 3 | 4 | 标准规范 5 | ---------------------------------------- 6 | 7 | version: 0.2 8 | 9 | 为了后续研究和使用的方便,我们需要一个统一的试题语法标准。 10 | 11 | 语法规则 12 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 13 | 14 | 1. 题目文本中只允许出现中文字符、中英文标点和换行符。 15 | 16 | 2. 使用 \$\SIFBlank\$ 替换横线,对于选择题中的括号使用 \$\SIFChoice\$ 替换。 17 | 18 | 3. 图片 ID 以公式的形式嵌入文本中:``$\FigureID{ uuid }$`` 或用 base64 编码表示,特别的,内容为公式的图片用 ``$\FormFigureID{ uuid }$`` 表示。 19 | 20 | 4. 文本标注格式:统一用 ``$\textf{item,CHAR_EN}$`` 表示,目前定义的有:b-加粗,i-斜体,u-下划线,w-下划波浪线,d-加点,t-标题。标注可以混用,按字母顺序排序,例如:$\textf{EduNLP, b}$ 表示 **EduNLP** 21 | 22 | 5. 其余诸如,英文字母、罗马字符、数字等数学符号一律需要使用 latex 格式表示,即嵌在 ``$$`` 之中。 23 | 24 | 6. 分子式的录入标准暂且参考 `INCHI `_ 25 | 26 | 7. 目前对 latex 内部语法没有要求。 27 | 28 | :: 29 | 30 | 1. Item -> CHARACTER|EN_PUN_LIST|CH_PUN_LIST|FORMULA|QUES_MARK 31 | 2. EN_PUN_LIST -> [',', '.', '?', '!', ':', ';', '\'', '\"', '(', ')', ' ','_','/','|','\\','<','>','[',']','-'] 32 | 3. CH_PUN_LIST -> [',', '。', '!', '?', ':',';', '‘', '’', '“', '”', '(', ')', ' ', '、','《','》','—','.'] 33 | 4. FORMULA -> $latex formula$ | $\FormFigureID{UUID}$ | $\FormFigureBase64{BASE64}$ 34 | 5. FIGURE -> $\FigureID{UUID}$ | $\FigureBase64{BASE64}$ 35 | 6. UUID -> [a-zA-Z\-0-9]+ 36 | 7. CHARACTER -> CHAR_EN | CHAR_CH 37 | 8. CHAR_EN -> [a-zA-Z]+ 38 | 9. CHAR_CH -> [\u4e00-\u9fa5]+ 39 | 10. DIGITAL -> [0-9]+ 40 | 11. QUES_MARK -> $\SIFBlank$ | $\SIFChoice$ 41 | 42 | 43 | 注意事项 44 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 45 | 46 | 1. 保留字符与转义 47 | 48 | 2. 数字 49 | 50 | 3. 选空与填空 51 | 52 | 4. 对于单个的数字或字符也需要添加 ``$$`` (目前能实现自动校验) 53 | 54 | 5. latex 公式中尽量不出现中文:(``\text{这里出现中文}``) 55 | 56 | 6. MySql 数据库导入数据时会自动忽略一个 ``\``,所以录入的公式需要进一步处理为 ``\\`` 57 | 58 | 示例 59 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 60 | 61 | 标准形式: 62 | 63 | :: 64 | 65 | 1. 若$x,y$满足约束条件$\\left\\{\\begin{array}{c}2 x+y-2 \\leq 0 \\\\ x-y-1 \\geq 0 \\\\ y+1 \\geq 0\\end{array}\\right.$,则$z=x+7 y$的最大值$\\SIFUnderline$' 66 | 67 | 2. 已知函数$f(x)=|3 x+1|-2|x|$画出$y=f(x)$的图像求不等式$f(x)>f(x+1)$的解集$\\PictureID{3bf2ddf4-8af1-11eb-b750-b46bfc50aa29}$$\\PictureID{59b8bd14-8af1-11eb-93a5-b46bfc50aa29}$$\\PictureID{63118b3a-8b75-11eb-a5c0-b46bfc50aa29}$$\\PictureID{6a006179-8b76-11eb-b386-b46bfc50aa29}$$\\PictureID{088f15eb-8b7c-11eb-a86f-b46bfc50aa29}$ 68 | 69 | 非标准形式: 70 | 71 | 1. 字母、数字和数学符号连续混合出现: 72 | 73 | 例如: 74 | 75 | ``完成下面的2x2列联表,`` 76 | 77 | ``(单位:m3)`` 78 | 79 | ``则输出的n=`` 80 | 81 | 2. 特殊的数学符号没有用 latex 公式表示: 82 | 83 | 例如: 84 | 85 | ``命题中真命题的序号是 ①`` 86 | 87 | ``AB是⊙O的直径,AC是⊙O的切线,BC交⊙O于点E.若D为AC的中点`` 88 | 89 | 3. 
出现以 unicode 编码写成的字符 90 | 91 | 例如:``则$a$的取值范围是(\u3000\u3000)`` 92 | 93 | 94 | 标准化检验 95 | --------------------- 96 | 97 | 调用库 98 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 99 | :: 100 | 101 | from EduNLP.SIF import is_sif, to_sif 102 | 103 | is_sif 104 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 105 | 106 | 判断题目是否为SIF标准格式 107 | 108 | :: 109 | 110 | >>> text1 = '若$x,y$满足约束条件' 111 | >>> text2 = '$\\left\\{\\begin{array}{c}2 x+y-2 \\leq 0 \\\\ x-y-1 \\geq 0 \\\\ y+1 \\geq 0\\end{array}\\right.$,' 112 | >>> text3 = '则$z=x+7 y$的最大值$\\SIFUnderline$' 113 | >>> text4 = '某校一个课外学习小组为研究某作物的发芽率y和温度x(单位...' 114 | >>> is_sif(text1) 115 | True 116 | >>> is_sif(text2) 117 | True 118 | >>> is_sif(text3) 119 | True 120 | >>> is_sif(text4) 121 | False 122 | 123 | to_sif 124 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 125 | 126 | 将题目转换为SIF标准格式 127 | 128 | :: 129 | 130 | >>> text = '某校一个课外学习小组为研究某作物的发芽率y和温度x(单位...' 131 | >>> to_sif(text) 132 | '某校一个课外学习小组为研究某作物的发芽率$y$和温度$x$(单位...' 133 | -------------------------------------------------------------------------------- /docs/source/tutorial/zh/tokenize.rst: -------------------------------------------------------------------------------- 1 | 语法解析 2 | ========= 3 | 4 | 在教育资源中,文本、公式都具有内在的隐式或显式的语法结构,提取这种结构对表征学习是大有裨益的: 5 | 6 | * 文本语法结构解析 7 | 8 | * 公式语法结构解析 9 | 10 | 文本语法结构解析 11 | -------------------- 12 | 13 | 根据题目文本切分粒度的大小,文本解析又分为 **“句解析”** 和 **“词解析”**。 14 | 15 | 16 | 句解析(sentence-tokenization) 17 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 18 | 19 | 将较长的文档切分成若干句子的过程称为“分句”。每个句子为一个“令牌”(token)。(待实现) 20 | 21 | 22 | 词解析(text-tokenization) 23 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 24 | 25 | 一个句子(不含公式)是由若干“词”按顺序构成的,将一个句子切分为若干词的过程称为“词解析”。根据词的粒度大小,又可细分为“词组解析”和"单字解析"。 26 | 27 | 主要步骤 28 | """"""""""""""""""""""""" 29 | 30 | (1)分词 31 | 32 | - 词组解析:使用分词工具切分并提取题目文本中的词。 33 | 本项目目前支持的分词工具有:`jieba` 34 | - 单字解析:按字符划分。 35 | 36 | (2) 过滤停用词 37 | 38 | - 本项目默认使用的停用词表:`stopwords `_ 39 | - 你也可以使用自己的停用词表,具体使用方法见下面的示例。 40 | 41 | 42 | 示例 43 | """"""""""""""""""""""""" 44 | 45 | 导入模块 46 | 47 | :: 48 | 49 | from EduNLP.SIF.tokenization.text import tokenize 50 | 51 | 52 | 输入 53 | 54 | :: 55 | 56 | text = "三角函数是基本初等函数之一" 57 | 58 | 59 | 词组解析 60 | 61 | :: 62 | 63 | # 输出:默认使用 EduNLP 项目提供的停用词表 64 | >>> tokenize(text, granularity="word") 65 | ['三角函数', '初等', '函数'] 66 | 67 | 68 | 单字解析 69 | 70 | :: 71 | 72 | # 输出:默认使用 EduNLP 项目提供的停用词表 73 | >>> tokenize(text, granularity="char") 74 | ['三', '角', '函', '数', '基', '初', '函', '数'] 75 | 76 | 77 | 使用自己的停用词表 78 | 79 | :: 80 | 81 | >>> spath = "test_stopwords.txt" 82 | >>> from EduNLP.SIF.tokenization.text.stopwords import get_stopwords 83 | >>> stopwords = get_stopwords(spath) 84 | >>> stopwords 85 | {'一旦', '一时', '一来', '一样', '一次', '一片', '一番', '一直', '一致'} 86 | >>> tokenize(text, granularity="word", stopwords=stopwords) 87 | ['三角函数', '是', '基本', '初等', '函数', '之一'] 88 | 89 | 90 | 公式语法结构解析 91 | -------------------- 92 | 93 | 公式解析(formula-tokenization):理科类文本中常常含有公式。将一个符合 latex 语法的公式解析为标记字符列表或抽象语法树的过程称为“公式解析”。 94 | 95 | 包括两种方案 96 | 97 | - 公式线性解析 98 | - 公式AST解析 99 | 100 | .. 
note:: 101 | 102 | 本小节主要介绍如何获取不同格式的公式解析结果。公式解析的底层实现请参考:`EduNLP.Formula` 部分。 103 | 104 | 105 | (1)公式线性解析 106 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 107 | 108 | 如果您想按 latex 语法标记拆分公式的各个部分,并得到顺序序列结果,输出方法可以选择:`linear` 109 | 110 | :: 111 | >>> tokenize(formula, method="linear") 112 | ['\\frac', '{', '\\pi', '}', '{', 'x', '+', 'y', '}', '+', '1', '=', 'x'] 113 | 114 | 115 | (2) 公式AST解析 116 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 117 | 118 | 如果您想得到公式解析出的语法分析树序列,输出方法可以选择:`ast` 119 | 120 | > 抽象语法分析树,简称语法树(Syntax tree),是源代码语法结构的一种抽象表示。它以树状的形式表现编程语言的语法结构,树上的每个节点都表示源代码中的一种结构。 121 | > 因此,ast 可以看做是公式的语法结构表征。 122 | 123 | :: 124 | >>> tokenize(formula, method="ast", return_type="list", ord2token=False) 125 | ['\\pi', '{ }', 'x', '+', 'y', '{ }', '\\frac', '+', '1', '=', 'x'] 126 | 127 | 128 | (3)公式AST解析+变量符号化 129 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 130 | 131 | 如果您只是关心公式的结构和类型,并不关心变量具体是什么,比如二元二次方程 `x^2 + y = 1` ,它从公式结构和类型上来说,和 `w^2 + z = 1` 没有区别。 132 | 此时,您可以设置如下参数:`ord2token = True`,将公式变量名转换成 token 133 | 134 | :: 135 | >>> tokenize(formula, method="ast", return_type="list", ord2token=True) 136 | ['mathord', '{ }', 'mathord', '+', 'mathord', '{ }', '\\frac', '+', 'textord', '=', 'mathord'] 137 | 138 | 139 | (4) 公式AST解析+变量标准化 140 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 141 | 142 | 如果您除了 (3) 中提供的功能之外,还需要区分不同的变量。此时可以另外设置参数:`var_numbering=True` 143 | 144 | :: 145 | >>> tokenize(formula, method="ast", return_type="list", ord2token=True, var_numbering=True) 146 | ['mathord_con', '{ }', 'mathord_0', '+', 'mathord_1', '{ }', '\\frac', '+', 'textord', '=', 'mathord_0'] 147 | 148 | -------------------------------------------------------------------------------- /examples/downstream/difficulty_prediction/utils.py: -------------------------------------------------------------------------------- 1 | import json 2 | import pandas as pd 3 | 4 | 5 | def load_json(open_path): 6 | print("[load_json] start : {}".format(open_path)) 7 | with open(open_path, "r", encoding='utf-8') as f: 8 | load_q = json.load(f) 9 | print("[load_json] num = {}, open_path = {}".format(len(load_q), open_path)) 10 | return load_q 11 | 12 | 13 | def get_train(train): 14 | train_data = [] 15 | for item in train: 16 | dic = {} 17 | dic["content"] = item["content"] 18 | dic["labels"] = float(item["difficulty"]) 19 | train_data.append(dic) 20 | return train_data 21 | 22 | 23 | def get_val(val): 24 | test_data, test_gap = [], [] 25 | start, end = 0, 0 26 | for batch in val: 27 | end += len(batch['questions']) 28 | for item in batch['questions']: 29 | dic = {} 30 | dic['content'] = item["stem"] 31 | dic['labels'] = item['diff'] 32 | # dic["labels"] = dic.pop("difficulty") 33 | test_data.append(dic) 34 | test_gap.append([start, end]) 35 | start = end 36 | return test_data, test_gap 37 | -------------------------------------------------------------------------------- /examples/downstream/discrimination_prediction/utils.py: -------------------------------------------------------------------------------- 1 | import json 2 | import pandas as pd 3 | 4 | 5 | def pre_disc(csv_path): 6 | items = pd.read_csv(csv_path) 7 | stem = items["stem"].tolist() 8 | disc = items["disc"].tolist() 9 | data = [] 10 | for i in range(len(stem)): 11 | dic = {} 12 | dic["content"] = stem[i] 13 | dic["labels"] = disc[i] 14 | data.append(dic) 15 | return data 16 | -------------------------------------------------------------------------------- /examples/downstream/paper_segmentation/samples/train/math/paper_1.txt: 
-------------------------------------------------------------------------------- 1 | ================= 2 | 2017年云南省临沧市临翔区民族中学高考数学三模试卷(文科) 3 | 选择题 4 | ================= 5 | 1. 已知集合,,则 \ ( ) $ 6 | A. B. C. D. 7 | ================= 8 | 2. 已知复数,则复数的模为 \ ( ) $ 9 | A. B. C. D. 10 | ================= 11 | 3. 已知点,,向量,若,则为 \ ( ) $ 12 | A. B. C. D. 13 | ================= 14 | 4. 已知函数满足,且当时,成立,若,,,则,,的大小关系是 \ ( ) $ 15 | A. B. C. D. 16 | ================= 17 | 5.如图的程序框图的算法思路源于数学名著几何原本中的“辗转相除法”,执行该程序框图图中“”表示除以的余数\ ( ) ab485270b=( 18 | A. B. C. D. 19 | ================= 20 | 6. 某三棱锥的三视图如图所示,则该三棱锥的表面积为 \ ( ) $ 21 | A. B. C. D. 22 | ================= 23 | 7. 曲线在点处的切线与轴、轴围成的封闭图形的面积为 \ ( ) $ 24 | A. B. C. D. 25 | ================= 26 | 8. 已知,则 \ ( ) $ 27 | A. B. C. D. 28 | ================= 29 | 9. 下列说法正确的个数是 \ ( ) $ 30 | 若为奇函数,则; 31 | “在中,若,则”的逆命题是假命题; 32 | “三个数,,成等比数列”是“”的既不充分也不必要条件; 33 | 命题“,”的否定是“,”. 34 | A. B. C. D. 35 | ================= 36 | 10. 将函数的图象向右平移个单位后得到的图象的一个对称轴是 \ ( ) $ 37 | A. B. C. D. 38 | ================= 39 | 11. 已知等差数列的公差,且,,成等比数列,若,是数列的前项和,则的最小值为 \ ( ) $ 40 | A. B. C. D. 41 | ================= 42 | 12. 已知焦点为的抛物线上有一点,以为圆心,为半径的圆被轴截得的弦长为,则 \ ( ) $ 43 | A. B. C. D. 44 | ================= 45 | 填空题 46 | ================= 47 | 13. 点是不等式组表示的平面区域内的一动点,且不等式恒成立,则的取值范围是 _____ . 48 | ================= 49 | 14. 已知的内角,,所对的边分别为,,,且,,,则的值为 _____ . 50 | ================= 51 | 15. 已知正四面体的棱长为,为棱的中点,过作其外接球的截面,则截面面积的最小值为 _____ . 52 | ================= 53 | 16. 设函数的图象与的图象关于直线对称,且,则 _____ . 54 | ================= 55 | 简答题 56 | ================= 57 | 17. 已知数列的前项和 58 | Ⅰ求数列的通项公式; 59 | Ⅱ若,求数列的前项和. 60 | ================= 61 | 18. 62 | ================= 63 | 19. 如图,在直角梯形中,,,,是中点,将沿折起,使得面; 64 | Ⅰ求证:平面平面; 65 | Ⅱ若是的中点求三棱锥的体积. 66 | ================= 67 | 20. 已知椭圆:的离心率为,过的左焦点的直线:,直线被圆:截得的弦长为. 68 | Ⅰ求椭圆的方程; 69 | Ⅱ设的右焦点为,在圆上是否存在点,满足,若存在,指出有几个这样的点不必求出点的坐标;若不存在,说明理由. 70 | ================= 71 | 21. 已知函数为常数 72 | 当时,求函数的单调区间; 73 | 求时,不等式恒成立,求实数的取值范围. 74 | ================= 75 | 22. 在直角坐标系中,曲线为参数,,其中,在以为极点,轴正半轴为极轴的极坐标系中,曲线:,曲线. 76 | Ⅰ求与交点的直角坐标系; 77 | Ⅱ若与相交于点,与相交于点,求的最大值. 78 | ================= 79 | 23. 设函数 80 | Ⅰ解不等式; 81 | Ⅱ当时,,求实数的取值范围. 
82 | ================= 83 | -------------------------------------------------------------------------------- /examples/downstream/paper_segmentation/utils.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | import os 4 | import logging 5 | from datetime import datetime 6 | 7 | ROOT_DIR = os.path.dirname(os.path.dirname(__file__)) 8 | 9 | def get_logger(logfile): 10 | os.makedirs(os.path.dirname(logfile), exist_ok=True) 11 | 12 | logger = logging.getLogger(name="QuesQuality") 13 | logger.setLevel(logging.INFO) 14 | 15 | handler = logging.FileHandler(filename=logfile, encoding="utf-8", mode="w") 16 | handler.setLevel(logging.INFO) 17 | formatter = logging.Formatter(fmt="%(asctime)s - %(levelname)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S") 18 | handler.setFormatter(formatter) 19 | 20 | consolehandler = logging.StreamHandler() 21 | consolehandler.setFormatter(formatter) 22 | 23 | logger.addHandler(handler) 24 | logger.addHandler(consolehandler) # log to file and print to console 25 | return logger 26 | 27 | 28 | def get_pk(y_pred, y, k): 29 | tag_num = len(y) 30 | count = 0 31 | for i in range(0, tag_num-k): 32 | seg_count_y_pred = 0 33 | seg_count_y = 0 34 | for j in range(i, i+k): 35 | seg_count_y_pred += y_pred[j] 36 | seg_count_y += y[j] 37 | if seg_count_y_pred != seg_count_y: 38 | count += 1 39 | return count 40 | -------------------------------------------------------------------------------- /examples/formula/formula.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "source": [ 6 | "# Formula\r\n", 7 | "\r\n", 8 | "## 概述\r\n", 9 | "\r\n", 10 | "Formula 首先在分词功能中对原始文本的公式做切分处理,另外提供多种功能使之能够适应多种用户需求,例如 [公式解析树] 功能,可以将数学公式的抽象语法分析树用文本或图片的形式表示出来;又如[公式变量标准化]的功能,能判断几个子公式内的‘x’为同一变量。\r\n", 11 | "\r\n", 12 | "由于本部分常作为中间模块,故仅展示基本调用方法,如需更进一步学习模块相关参数请参见对应文档。" 13 | ], 14 | "metadata": {} 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 1, 19 | "source": [ 20 | "import matplotlib.pyplot as plt\r\n", 21 | "from EduNLP.Formula import Formula\r\n", 22 | "from EduNLP.Formula import FormulaGroup\r\n", 23 | "from EduNLP.Formula.viz import ForestPlotter" 24 | ], 25 | "outputs": [], 26 | "metadata": {} 27 | }, 28 | { 29 | "cell_type": "markdown", 30 | "source": [ 31 | "## 公式语法结构分析\n", 32 | "\n", 33 | "### 初始化实例\n", 34 | "\n", 35 | "- item 类型:`str or List[Dict]` \n", 36 | "- item 内容:latex 公式 或 公式经解析后产生的抽象语法分析树(abstracted syntax tree)" 37 | ], 38 | "metadata": {} 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": 2, 43 | "source": [ 44 | "f = Formula(\"x^2 + x+1 = y\")\r\n", 45 | "f " 46 | ], 47 | "outputs": [ 48 | { 49 | "output_type": "execute_result", 50 | "data": { 51 | "text/plain": [ 52 | "" 53 | ] 54 | }, 55 | "metadata": {}, 56 | "execution_count": 2 57 | } 58 | ], 59 | "metadata": { 60 | "collapsed": true 61 | } 62 | }, 63 | { 64 | "cell_type": "markdown", 65 | "source": [ 66 | "## 方程组结构解析\n", 67 | "\n", 68 | "调用 `FormulaGroup` 类解析公式方程组,相关的属性和函数方法同上。" 69 | ], 70 | "metadata": {} 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": 21, 75 | "source": [ 76 | "fs = FormulaGroup([\r\n", 77 | " \"x^2 = y\",\r\n", 78 | " \"x^3 = y^2\",\r\n", 79 | " \"x + y = \\pi\"\r\n", 80 | "])\r\n", 81 | "fs" 82 | ], 83 | "outputs": [ 84 | { 85 | "output_type": "execute_result", 86 | "data": { 87 | "text/plain": [ 88 | ";;>" 89 | ] 90 | }, 91 | "metadata": {}, 92 | "execution_count": 21 93 | } 94 | ], 95 | "metadata": { 96 | "collapsed": 
false, 97 | "pycharm": { 98 | "name": "#%%\n" 99 | } 100 | } 101 | } 102 | ], 103 | "metadata": { 104 | "kernelspec": { 105 | "name": "python3", 106 | "display_name": "Python 3.8.5 64-bit" 107 | }, 108 | "language_info": { 109 | "name": "python", 110 | "version": "3.8.5", 111 | "mimetype": "text/x-python", 112 | "codemirror_mode": { 113 | "name": "ipython", 114 | "version": 3 115 | }, 116 | "pygments_lexer": "ipython3", 117 | "nbconvert_exporter": "python", 118 | "file_extension": ".py" 119 | }, 120 | "interpreter": { 121 | "hash": "e7370f93d1d0cde622a1f8e1c04877d8463912d04d973331ad4851f04de6915a" 122 | } 123 | }, 124 | "nbformat": 4, 125 | "nbformat_minor": 2 126 | } -------------------------------------------------------------------------------- /examples/formula/formula.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # 2021/3/8 @ tongshiwei 3 | # 4 | from EduNLP.Formula import Formula, FormulaGroup, link_formulas 5 | # 6 | # f1 = Formula(r"x + y", variable_standardization=True) 7 | # f2 = Formula(r"y + x", variable_standardization=True) 8 | # f3 = Formula(r"z + y", variable_standardization=True) 9 | # 10 | # print(f1.element) 11 | # print(f2.element) 12 | # print(f3.element) 13 | # 14 | # print("-----------------------") 15 | # 16 | # link_formulas(f1, f2, f3) 17 | # 18 | # print("------------------------") 19 | # 20 | # print(f1.element) 21 | # print(f2.element) 22 | # print(f3.element) 23 | # 24 | # print("---------------------") 25 | # 26 | # fg = FormulaGroup( 27 | # [r"x + y", r"y + x", r"y + z"] 28 | # ) 29 | # for f in fg: 30 | # print(f.element) 31 | 32 | # fg = FormulaGroup(["x", "y", "x"]) 33 | # print(fg.elements) 34 | 35 | fg = FormulaGroup(["x", Formula("y"), "x"]) 36 | print(fg.elements) -------------------------------------------------------------------------------- /examples/formula/tree.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "source": [ 7 | "import networkx as nx\n", 8 | "\n", 9 | "g = nx.DiGraph()\n", 10 | "g.add_node(0, value=1, id=0)\n", 11 | "g.add_node(1, value=2, id=1)\n", 12 | "g.add_node(2, id=2)\n", 13 | "g.add_edge(0, 1)\n", 14 | "g.add_edge(0, 2)\n", 15 | "g.nodes[0]" 16 | ], 17 | "outputs": [ 18 | { 19 | "output_type": "execute_result", 20 | "data": { 21 | "text/plain": [ 22 | "{'value': 1, 'id': 0}" 23 | ] 24 | }, 25 | "metadata": {}, 26 | "execution_count": 1 27 | } 28 | ], 29 | "metadata": { 30 | "collapsed": true 31 | } 32 | } 33 | ], 34 | "metadata": { 35 | "kernelspec": { 36 | "name": "python3", 37 | "display_name": "Python 3.8.5 64-bit" 38 | }, 39 | "language_info": { 40 | "name": "python", 41 | "version": "3.8.5", 42 | "mimetype": "text/x-python", 43 | "codemirror_mode": { 44 | "name": "ipython", 45 | "version": 3 46 | }, 47 | "pygments_lexer": "ipython3", 48 | "nbconvert_exporter": "python", 49 | "file_extension": ".py" 50 | }, 51 | "interpreter": { 52 | "hash": "e7370f93d1d0cde622a1f8e1c04877d8463912d04d973331ad4851f04de6915a" 53 | } 54 | }, 55 | "nbformat": 4, 56 | "nbformat_minor": 2 57 | } -------------------------------------------------------------------------------- /examples/i2v/i2v.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# I2V 向量化容器\n", 8 | "\n", 9 | "向量化过程是将原始题目(item)转成向量(vector)的过程,它包括两个步骤:\n", 10 | "- 使用 
`Tokenizer` 将原始题目(item)转化为令牌化序列(tokens);\n", 11 | "- 使用 `T2V` 向量化容器 将令牌化序列(tokens)转成向量(vector)。\n", 12 | "\n", 13 | "为了使用户能直接使用本地的(或公开的)预训练模型,我们提供了`I2V向量化容器`, 将令牌化、向量化操作同时封装起来。" 14 | ] 15 | }, 16 | { 17 | "cell_type": "markdown", 18 | "metadata": {}, 19 | "source": [ 20 | "## 概述\n", 21 | "\n", 22 | "使用EduNLP的开源预训练模型将给定的题目转成向量。\n", 23 | "\n", 24 | "- 优点:用户不需要研究令牌化和模型加载的细节。令牌化和向量化的参数已由预训练模型的参数文件定义好。\n", 25 | "- 缺点:不适合修改预训练的模型参数或令牌化容器参数" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": 1, 31 | "metadata": {}, 32 | "outputs": [], 33 | "source": [ 34 | "import os\n", 35 | "\n", 36 | "items = [\n", 37 | " r\"题目一:如图几何图形.此图由三个半圆构成,三个半圆的直径分别为直角三角形$ABC$的斜边$BC$, 直角边$AB$, $AC$.$\\bigtriangleup ABC$的三边所围成的区域记为$I$,黑色部分记为$II$, 其余部分记为$III$.在整个图形中随机取一点,此点取自$I,II,III$的概率分别记为$p_1,p_2,p_3$,则$\\SIFChoice$$\\FigureID{1}$\",\n", 38 | " r\"题目二: 如图来自古希腊数学家希波克拉底所研究的几何图形.此图由三个半圆构成,三个半圆的直径分别为直角三角形$ABC$的斜边$BC$, 直角边$AB$, $AC$.$\\bigtriangleup ABC$的三边所围成的区域记为$I$,黑色部分记为$II$, 其余部分记为$III$.在整个图形中随机取一点,此点取自$I,II,III$的概率分别记为$p_1,p_2,p_3$,则$\\SIFChoice$$\\FigureID{1}$\"\n", 39 | "]\n", 40 | "\n", 41 | "model_dir = \"../test_model\"" 42 | ] 43 | }, 44 | { 45 | "cell_type": "markdown", 46 | "metadata": {}, 47 | "source": [ 48 | "# 示例:使用 W2V 加载本地模型\n", 49 | "## W2V" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": 2, 55 | "metadata": {}, 56 | "outputs": [ 57 | { 58 | "name": "stderr", 59 | "output_type": "stream", 60 | "text": [ 61 | "E:\\dev_env\\anaconda\\envs\\data\\lib\\site-packages\\gensim\\similarities\\__init__.py:15: UserWarning: The gensim.similarities.levenshtein submodule is disabled, because the optional Levenshtein package is unavailable. Install Levenhstein (e.g. `pip install python-Levenshtein`) to suppress this warning.\n", 62 | " warnings.warn(msg)\n" 63 | ] 64 | }, 65 | { 66 | "name": "stdout", 67 | "output_type": "stream", 68 | "text": [ 69 | "2 256\n", 70 | "2 56 256\n" 71 | ] 72 | } 73 | ], 74 | "source": [ 75 | "from EduNLP.I2V import W2V\n", 76 | "\n", 77 | "pretrained_path = os.path.join(model_dir, \"w2v/w2v_test_256/w2v_test_256.kv\")\n", 78 | "i2v = W2V(\"pure_text\", \"w2v\", pretrained_path)\n", 79 | "\n", 80 | "item_vector, token_vector = i2v(items)\n", 81 | "# or\n", 82 | "item_vector, token_vector = i2v.infer_vector(items)\n", 83 | "# or\n", 84 | "item_vector = i2v.infer_item_vector(items)\n", 85 | "token_vector = i2v.infer_token_vector(items)\n", 86 | "\n", 87 | "print(len(item_vector), len(item_vector[0])) \n", 88 | "print(len(token_vector), len(token_vector[0]), len(token_vector[0][0]))" 89 | ] 90 | } 91 | ], 92 | "metadata": { 93 | "interpreter": { 94 | "hash": "776957673adb719a00031a24ed5efd2fa5ce8a13405e5193f8d278edd3805d55" 95 | }, 96 | "kernelspec": { 97 | "display_name": "Python 3.6.13 64-bit ('data': conda)", 98 | "name": "python3" 99 | }, 100 | "language_info": { 101 | "codemirror_mode": { 102 | "name": "ipython", 103 | "version": 3 104 | }, 105 | "file_extension": ".py", 106 | "mimetype": "text/x-python", 107 | "name": "python", 108 | "nbconvert_exporter": "python", 109 | "pygments_lexer": "ipython3", 110 | "version": "3.6.2" 111 | }, 112 | "orig_nbformat": 4 113 | }, 114 | "nbformat": 4, 115 | "nbformat_minor": 2 116 | } 117 | -------------------------------------------------------------------------------- /examples/i2v/i2v_d2v.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# 使用 D2V 向量化容器\n", 8 | "## 
导入功能块" 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": 1, 14 | "metadata": {}, 15 | "outputs": [ 16 | { 17 | "name": "stderr", 18 | "output_type": "stream", 19 | "text": [ 20 | "d:\\MySoftwares\\Anaconda\\envs\\data\\lib\\site-packages\\gensim\\similarities\\__init__.py:15: UserWarning: The gensim.similarities.levenshtein submodule is disabled, because the optional Levenshtein package is unavailable. Install Levenhstein (e.g. `pip install python-Levenshtein`) to suppress this warning.\n", 21 | " warnings.warn(msg)\n" 22 | ] 23 | } 24 | ], 25 | "source": [ 26 | "from EduNLP.I2V import I2V, D2V, get_pretrained_i2v" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": 2, 32 | "metadata": {}, 33 | "outputs": [], 34 | "source": [ 35 | "items = [\n", 36 | " r\"题目一:如图几何图形.此图由三个半圆构成,三个半圆的直径分别为直角三角形$ABC$的斜边$BC$, 直角边$AB$, $AC$.$\\bigtriangleup ABC$的三边所围成的区域记为$I$,黑色部分记为$II$, 其余部分记为$III$.在整个图形中随机取一点,此点取自$I,II,III$的概率分别记为$p_1,p_2,p_3$,则$\\SIFChoice$$\\FigureID{1}$\",\n", 37 | " r\"题目二: 如图来自古希腊数学家希波克拉底所研究的几何图形.此图由三个半圆构成,三个半圆的直径分别为直角三角形$ABC$的斜边$BC$, 直角边$AB$, $AC$.$\\bigtriangleup ABC$的三边所围成的区域记为$I$,黑色部分记为$II$, 其余部分记为$III$.在整个图形中随机取一点,此点取自$I,II,III$的概率分别记为$p_1,p_2,p_3$,则$\\SIFChoice$$\\FigureID{1}$\"\n", 38 | "]" 39 | ] 40 | }, 41 | { 42 | "cell_type": "markdown", 43 | "metadata": {}, 44 | "source": [ 45 | "## 向量化\n", 46 | "### 使用EduNLP中公开的预训练模型\n", 47 | "> - D2V没有实现token向量化,只能获得 item(题目)的表征" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": 4, 53 | "metadata": {}, 54 | "outputs": [ 55 | { 56 | "name": "stderr", 57 | "output_type": "stream", 58 | "text": [ 59 | "EduNLP, INFO Use pretrained t2v model d2v_test_256\n", 60 | "downloader, INFO http://base.ustc.edu.cn/data/model_zoo/modelhub/doc2vec_pub/1/d2v_test_256.zip is saved as ..\\test_model\\d2v\\d2v_test_256.zip\n", 61 | "downloader, INFO file existed, skipped\n" 62 | ] 63 | }, 64 | { 65 | "name": "stdout", 66 | "output_type": "stream", 67 | "text": [ 68 | "2 256\n" 69 | ] 70 | } 71 | ], 72 | "source": [ 73 | "save_dir = \"../test_model/d2v\"\n", 74 | "i2v = get_pretrained_i2v(\"d2v_test_256\", model_dir=save_dir)\n", 75 | "\n", 76 | "item_vector, _ = i2v.infer_vector(items)\n", 77 | "# or\n", 78 | "item_vector = i2v.infer_item_vector(items)\n", 79 | "\n", 80 | "print(len(item_vector), len(item_vector[0])) " 81 | ] 82 | }, 83 | { 84 | "cell_type": "markdown", 85 | "metadata": {}, 86 | "source": [ 87 | "### 使用本地模型" 88 | ] 89 | }, 90 | { 91 | "cell_type": "code", 92 | "execution_count": 5, 93 | "metadata": {}, 94 | "outputs": [ 95 | { 96 | "name": "stdout", 97 | "output_type": "stream", 98 | "text": [ 99 | "2 256\n" 100 | ] 101 | } 102 | ], 103 | "source": [ 104 | "pretrained_path = \"../test_model/d2v/d2v_test_256/d2v_test_256.bin\"\n", 105 | "i2v = D2V(\"pure_text\", \"d2v\", pretrained_path)\n", 106 | "\n", 107 | "item_vector, _ = i2v(items)\n", 108 | "# or\n", 109 | "item_vector, _ = i2v.infer_vector(items)\n", 110 | "# or\n", 111 | "item_vector = i2v.infer_item_vector(items)\n", 112 | "\n", 113 | "print(len(item_vector), len(item_vector[0])) " 114 | ] 115 | } 116 | ], 117 | "metadata": { 118 | "interpreter": { 119 | "hash": "2a09bcfc86f5d80d5adfb774779878f28f4d48d5a6d6c0020bcfd8afaf909ec6" 120 | }, 121 | "kernelspec": { 122 | "display_name": "Python 3.6.13 ('data')", 123 | "language": "python", 124 | "name": "python3" 125 | }, 126 | "language_info": { 127 | "codemirror_mode": { 128 | "name": "ipython", 129 | "version": 3 130 | }, 131 | "file_extension": ".py", 132 | "mimetype": 
"text/x-python", 133 | "name": "python", 134 | "nbconvert_exporter": "python", 135 | "pygments_lexer": "ipython3", 136 | "version": "3.6.13" 137 | }, 138 | "orig_nbformat": 4 139 | }, 140 | "nbformat": 4, 141 | "nbformat_minor": 2 142 | } 143 | -------------------------------------------------------------------------------- /examples/pretrain/prepare_dataset.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Download Data by EduData" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": {}, 14 | "outputs": [ 15 | { 16 | "name": "stderr", 17 | "output_type": "stream", 18 | "text": [ 19 | "downloader, INFO http://base.ustc.edu.cn/data/OpenLUNA/OpenLUNA.json is saved as ..\\..\\data\\OpenLUNA.json\n" 20 | ] 21 | }, 22 | { 23 | "name": "stdout", 24 | "output_type": "stream", 25 | "text": [ 26 | "Downloading ..\\..\\data\\OpenLUNA.json 100.00%: 269KB | 269KB\n" 27 | ] 28 | }, 29 | { 30 | "data": { 31 | "text/plain": [ 32 | "'..\\\\..\\\\data\\\\OpenLUNA.json'" 33 | ] 34 | }, 35 | "execution_count": 1, 36 | "metadata": {}, 37 | "output_type": "execute_result" 38 | } 39 | ], 40 | "source": [ 41 | "from EduData import get_data\n", 42 | "\n", 43 | "get_data(\"open-luna\", \"../../data/\")" 44 | ] 45 | } 46 | ], 47 | "metadata": { 48 | "kernelspec": { 49 | "display_name": "Python 3.10.4 64-bit", 50 | "language": "python", 51 | "name": "python3" 52 | }, 53 | "language_info": { 54 | "codemirror_mode": { 55 | "name": "ipython", 56 | "version": 3 57 | }, 58 | "file_extension": ".py", 59 | "mimetype": "text/x-python", 60 | "name": "python", 61 | "nbconvert_exporter": "python", 62 | "pygments_lexer": "ipython3", 63 | "version": "3.10.4" 64 | }, 65 | "vscode": { 66 | "interpreter": { 67 | "hash": "2469a70536e4d2335a2ea8907942d0699c37342a371ac185bdb5b0aa6f073890" 68 | } 69 | } 70 | }, 71 | "nbformat": 4, 72 | "nbformat_minor": 2 73 | } 74 | -------------------------------------------------------------------------------- /examples/pretrain/rnn/rnn.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # 2021/8/3 @ tongshiwei 3 | 4 | from longling import load_jsonl 5 | from EduNLP.Tokenizer import get_tokenizer 6 | from EduNLP.Pretrain import train_vector 7 | from EduNLP.Vector import W2V, RNNModel 8 | 9 | 10 | def etl(): 11 | tokenizer = get_tokenizer("pure_text") 12 | return tokenizer([item["stem"] for item in load_jsonl("../../../data/OpenLUNA.json")]) 13 | 14 | 15 | items = list(etl()) 16 | model_path = train_vector(items, "./w2v", 10, "sg") 17 | 18 | w2v = W2V(model_path, "sg") 19 | rnn = RNNModel("lstm", w2v, 5, device="cpu") 20 | saved_params = rnn.save("./lstm.params", save_embedding=True) 21 | 22 | rnn1 = RNNModel("lstm", w2v, 5, model_params=saved_params) 23 | -------------------------------------------------------------------------------- /examples/sif/item.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigdata-ustc/EduNLP/95cd9fc71a1bdd6156c42af7434aee0a8fc0a82e/examples/sif/item.json -------------------------------------------------------------------------------- /examples/sif/parse/parse.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# parse\n", 8 | "\n", 9 | 
"主要功能为将文本中的字母、数字等进行提取,将其转换为标准格式。\n", 10 | "\n", 11 | "\n", 12 | "## 概述\n", 13 | "\n", 14 | "1、将选择题中的括号,填空题中的下划线用特殊标识替换掉,并将字符、公式用$$包裹起来,使item能通过$符号准确的按照类型切割开;\n", 15 | "\n", 16 | "2、判断当前item是否合法,并报出错误类型。\n", 17 | "\n", 18 | "## 具体处理内容\n", 19 | "\n", 20 | "1.匹配公式之外的英文字母、数字,只对两个汉字之间的字母、数字做修正,其余匹配到的情况视为不合 latex 语法录入的公式\n", 21 | "\n", 22 | "2.匹配“( )”型括号(包含英文格式和中文格式),即括号内无内容或为空格的括号,将括号替换 ``$\\\\SIFChoice$`` \n", 23 | "\n", 24 | "3.匹配下划线,替换连续的下划线或下划线中夹杂空格的情况,将其替换为 ``$\\\\SIFBlank$`` \n", 25 | "\n", 26 | "4.匹配latex公式,主要检查latex公式的完整性和可解析性,对latex 中出现中文字符发出警告" 27 | ] 28 | }, 29 | { 30 | "cell_type": "markdown", 31 | "metadata": {}, 32 | "source": [ 33 | "## 导入类" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": 2, 39 | "metadata": {}, 40 | "outputs": [], 41 | "source": [ 42 | "from EduNLP.Formula.ast import str2ast, katex_parse\n", 43 | "from EduNLP.SIF.parser import Parser" 44 | ] 45 | }, 46 | { 47 | "cell_type": "markdown", 48 | "metadata": {}, 49 | "source": [ 50 | "## 输入\n", 51 | "\n", 52 | "类型:str \n", 53 | "内容:题目文本 (text)" 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": 3, 59 | "metadata": {}, 60 | "outputs": [], 61 | "source": [ 62 | "text1 = '生产某种零件的A工厂25名工人的日加工零件数_ _'\n", 63 | "text2 = 'X的分布列为( )'\n", 64 | "text3 = '① AB是⊙O的直径,AC是⊙O的切线,BC交⊙O于点E.AC的中点为D'\n", 65 | "text4 = '支持公式如$\\\\frac{y}{x}$,$\\\\SIFBlank$,$\\\\FigureID{1}$,不支持公式如$\\\\frac{ \\\\dddot y}{x}$'" 66 | ] 67 | }, 68 | { 69 | "cell_type": "markdown", 70 | "metadata": {}, 71 | "source": [ 72 | "## 输出" 73 | ] 74 | }, 75 | { 76 | "cell_type": "markdown", 77 | "metadata": {}, 78 | "source": [ 79 | "### 尝试转换为标准形式" 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": 6, 85 | "metadata": {}, 86 | "outputs": [ 87 | { 88 | "name": "stdout", 89 | "output_type": "stream", 90 | "text": [ 91 | "text_parser1.text: 生产某种零件的$A$工厂$25$名工人的日加工零件数$\\SIFBlank$\n", 92 | "text_parser2.text: $X$的分布列为$\\SIFChoice$\n" 93 | ] 94 | } 95 | ], 96 | "source": [ 97 | "text_parser1 = Parser(text1)\n", 98 | "text_parser1.description_list()\n", 99 | "print('text_parser1.text:',text_parser1.text)\n", 100 | "\n", 101 | "\n", 102 | "text_parser2 = Parser(text2)\n", 103 | "text_parser2.description_list()\n", 104 | "print('text_parser2.text:',text_parser2.text)\n" 105 | ] 106 | }, 107 | { 108 | "cell_type": "markdown", 109 | "metadata": {}, 110 | "source": [ 111 | "### 判断是否有语法问题" 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": 7, 117 | "metadata": {}, 118 | "outputs": [ 119 | { 120 | "name": "stdout", 121 | "output_type": "stream", 122 | "text": [ 123 | "text_parser3.error_flag: 1\n", 124 | "text_parser4.fomula_illegal_flag: 1\n" 125 | ] 126 | } 127 | ], 128 | "source": [ 129 | "text_parser3 = Parser(text3)\n", 130 | "text_parser3.description_list()\n", 131 | "print('text_parser3.error_flag: ',text_parser3.error_flag)\n", 132 | "\n", 133 | "\n", 134 | "text_parser4 = Parser(text4)\n", 135 | "text_parser4.description_list()\n", 136 | "print('text_parser4.fomula_illegal_flag: ',text_parser4.fomula_illegal_flag)\n" 137 | ] 138 | } 139 | ], 140 | "metadata": { 141 | "interpreter": { 142 | "hash": "6f23ddf1f0697a8f0c43dd2435bdb82528077c79e9967f824fba6a3b52b05faf" 143 | }, 144 | "kernelspec": { 145 | "display_name": "Python 3.6.3 64-bit", 146 | "name": "python3" 147 | }, 148 | "language_info": { 149 | "codemirror_mode": { 150 | "name": "ipython", 151 | "version": 3 152 | }, 153 | "file_extension": ".py", 154 | "mimetype": "text/x-python", 155 | "name": "python", 156 | 
"nbconvert_exporter": "python", 157 | "pygments_lexer": "ipython3", 158 | "version": "3.6.3" 159 | }, 160 | "orig_nbformat": 4 161 | }, 162 | "nbformat": 4, 163 | "nbformat_minor": 2 164 | } 165 | -------------------------------------------------------------------------------- /examples/sif/sci4sif.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # 2021/5/18 @ tongshiwei 3 | 4 | from EduNLP.SIF import sif4sci, link_formulas 5 | 6 | # item = r"若集合$A=\{x \in R | |x - 2| \leq 5\}$,则$A$中最小整数位是$\SIFChoice$" 7 | # print(item) 8 | # print(sif4sci(item, symbol="fgm", tokenization=False)) 9 | # print(sif4sci(item, symbol="fgm", tokenization=True)) 10 | # print(sif4sci(item, symbol="t")) 11 | # print(sif4sci(item, symbol="fgm", tokenization=False)) 12 | # print(sif4sci(item, symbol="fgm")) 13 | # print(sif4sci(item, symbol="gm", tokenization_params={"formula_params": {"method": "ast"}})) 14 | # print(sif4sci(item, symbol="gm", tokenization_params={"formula_params": {"method": "linear"}})) 15 | # print(sif4sci(item, tokenization_params={"formula_params": {"method": "ast", "ord2token": True}})) 16 | # print( 17 | # sif4sci(item, tokenization_params={"formula_params": {"method": "ast", "ord2token": True, "var_numbering": True}})) 18 | # print(sif4sci(item, tokenization_params={"formula_params": {"method": "ast", "return_type": "list"}})) 19 | # print( 20 | # sif4sci(item, tokenization_params={"formula_params": {"method": "ast", "ord2token": True, "return_type": "list"}}).formula_tokens 21 | # ) 22 | # print( 23 | # sif4sci(item, tokenization_params={ 24 | # "formula_params": {"method": "ast", "ord2token": True, "var_numbering": True, "return_type": "list"}}) 25 | # ) 26 | # print(sif4sci(item, tokenization_params={"formula_params": {"method": "ast", "return_type": "ast"}})) 27 | # print(sif4sci(item, tokenization_params={"formula_params": {"method": "ast", "return_type": "formula"}})) 28 | 29 | # e = r"$x$ 是 $y$ 那么 $y$ 和 $z$ 是什么 $x,y,z$" 30 | # print(sif4sci(e, symbol="gm", 31 | # tokenization_params={ 32 | # "formula_params": { 33 | # "method": "ast", "return_type": "list", "ord2token": True, "var_numbering": True, 34 | # } 35 | # })) 36 | # 37 | # test_item_1 = [r"$x < y$", r"$y = x$", r"$y < x$"] 38 | # tls = [ 39 | # sif4sci(e, symbol="gm", 40 | # tokenization_params={ 41 | # "formula_params": { 42 | # "method": "ast", "return_type": "list", "ord2token": True, "var_numbering": True, 43 | # } 44 | # }) 45 | # for e in test_item_1 46 | # ] 47 | # link_formulas(*tls) 48 | # print(tls) 49 | # seg = sif4sci(e, tokenization=False) 50 | # with seg.filter(keep="t"): 51 | # print(seg) 52 | # e = r'某校一个课外学习小组为研究某作物的发芽率y和温度x(单位:$^{\circ} \mathrm{C}$)的关系,在20个不同温度条件下进行种子发芽实验,由实验数据$\left(x_{i}, y_{i}\right)(i=1,2, \cdots, 20)$得到下面的散点图:由此散点图,在10$^{\circ} \mathrm{C}$至40$^{\circ} \mathrm{C}$之间,下面四个回归方程类型中最适宜作为发芽率y和温度x的回归方程类型的是$\FigureID{3bf20b91-8af1-11eb-86ff-b46bfc50aa29}$$\FigureID{59b851d3-8af1-11eb-bd45-b46bfc50aa29}$$\FigureID{6310d375-8b75-11eb-bf70-b46bfc50aa29}$$\FigureID{6a006175-8b76-11eb-aa57-b46bfc50aa29}$$\FigureID{088f15e7-8b7c-11eb-a8aa-b46bfc50aa29}$' 53 | # # e = r"$x$ 是 $y$ 那么 $y$ 和 $z$ 是什么 $x,y,z$" 54 | 55 | # e = r'已知集合$A=\left\{x \mid x^{2}-3 x-4<0\right\}, \quad B=\{-4,1,3,5\}, \quad$ 则 $A \cap B=$' 56 | 57 | from EduNLP.utils import dict2str4sif 58 | 59 | test_item_1 = { 60 | "stem": "已知集合$A=\\left\\{x \\mid x^{2}-3 x-4<0\\right\\}, \\quad B=\\{-4,1,3,5\\}, \\quad$ 则 $A \\cap B=$", 61 | "options": ['\\{-4,1\\}', '\\{1,5\\}', 
'\\{3,5\\}', '\\{1,3\\}'], 62 | } 63 | e = dict2str4sif(test_item_1, tag_mode="head", add_list_no_tag=False) 64 | seg = sif4sci( 65 | e, 66 | symbol="tfgmas", 67 | tokenization_params={ 68 | "formula_params": { 69 | "method": "ast", "return_type": "list", "ord2token": True 70 | } 71 | }, 72 | errors="raise" 73 | ) 74 | print(seg.tokens) 75 | # print(seg.get_segments()) 76 | # 77 | # import json 78 | # from tqdm import tqdm 79 | # 80 | # 81 | # def load_items(): 82 | # with open("../../data/OpenLUNA.json", encoding="utf-8") as f: 83 | # for line in f: 84 | # yield json.loads(line) 85 | # 86 | # 87 | # from EduNLP.SIF import sif4sci 88 | # 89 | # sif_items = [] 90 | # for i, item in tqdm(enumerate(load_items()), "sifing"): 91 | # if i > 100: 92 | # break 93 | # sif_item = sif4sci( 94 | # item["stem"], 95 | # symbol="gm", 96 | # tokenization_params={"formula_params": { 97 | # "method": "ast", 98 | # "return_type": "list", 99 | # "ord2token": True, 100 | # }} 101 | # ) 102 | # if sif_item: 103 | # sif_items.append(sif_item.tokens) 104 | -------------------------------------------------------------------------------- /examples/test_model/w2v/gensim_luna_stem_t_sg_100.kv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigdata-ustc/EduNLP/95cd9fc71a1bdd6156c42af7434aee0a8fc0a82e/examples/test_model/w2v/gensim_luna_stem_t_sg_100.kv -------------------------------------------------------------------------------- /examples/tokenizer/test_stopwords.txt: -------------------------------------------------------------------------------- 1 | 一旦 2 | 一时 3 | 一来 4 | 一样 5 | 一次 6 | 一片 7 | 一番 8 | 一直 9 | 一致 -------------------------------------------------------------------------------- /pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | # For pytest usage, refer to https://hb4dsai.readthedocs.io/zh/latest/Architecture/Test.html 3 | norecursedirs = docs *build* trash dev examples EduNLP/Formula/viz EduNLP/Formula/ast scripts data 4 | 5 | # Deal with marker warnings 6 | markers = 7 | flake8: flake8 8 | 9 | # Enable line length testing with maximum line length of 120 10 | flake8-max-line-length = 120 11 | 12 | # Ignore module level import not at top of file (E402) 13 | # Others can be found in https://flake8.pycqa.org/en/latest/user/error-codes.html 14 | flake8-ignore = E402 F401 F403 15 | 16 | # --doctest-modules is used for unittest 17 | addopts = --doctest-modules --cov --cov-report=term-missing --flake8 18 | -------------------------------------------------------------------------------- /scripts/extlib/katex2python.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # 2021/5/30 @ tongshiwei 3 | from pathlib import PurePath 4 | from fire import Fire 5 | import requests 6 | import js2py 7 | import tempfile 8 | 9 | 10 | def get_katex_from_url(version, tar): 11 | katex_version = version 12 | url = "https://cdn.jsdelivr.net/npm/katex@{}/dist/katex.js".format(katex_version) 13 | ret = requests.get(url, allow_redirects=True) 14 | assert ret.status_code == 200, ret.status_code 15 | content = ret.content # reuse the response body instead of downloading the file twice 16 | tar.write(content) 17 | return url 18 | 19 | 20 | def update_katex_py(src=None, tar="katex.py"): 21 | ''' 22 | Notes 23 | ---------- 24 | Because some formulas cannot be parsed by katex.py due to js2py errors, 25 | a few lines have to be commented out manually after katex.py is built, 26 | e.g. 1. Array.fill() error : 27 | # var.get('res').put('cols', var.get('Array').create(var.get('numCols')).callprop('fill', Js({'type':Js('align'),'align':var.get('colAlign')}))) 28 | ''' 29 | src = "katex.js" if src is None else src 30 | if PurePath(src).suffix == ".js": 31 | print("%s -> %s" % (src, tar)) 32 | js2py.translate_file(src, tar) # translate the given .js file, not a hard-coded name 33 | else: 34 | with tempfile.NamedTemporaryFile() as tmp_tar: 35 | print("katex version: %s" % src) 36 | url = get_katex_from_url(src, tmp_tar) 37 | src = tmp_tar.name 38 | print("%s -> %s" % (url, tar)) 39 | js2py.translate_file(src, tar) 40 | 41 | 42 | if __name__ == '__main__': 43 | Fire(update_katex_py) 44 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [coverage:run] 2 | source=EduNLP 3 | omit=EduNLP/Formula/ast/*,EduNLP/Formula/viz/*,EduNLP/utils/path.py 4 | [coverage:report] 5 | exclude_lines = 6 | pragma: no cover 7 | pass 8 | raise NotImplementedError 9 | if __name__ == '__main__': 10 | if __name__ == "__main__": 11 | def __str__ 12 | def __repr__ 13 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from setuptools import setup, find_packages 3 | 4 | tutor_deps = [ 5 | "pillow", 6 | "tqdm", 7 | "ipython" 8 | ] 9 | test_deps = [ 10 | 'pytest>=4', 11 | 'pytest-cov>=2.6.0', 12 | 'pytest-flake8', 13 | 'flake8<5.0.0' 14 | ] 15 | docs_deps = [ 16 | 'sphinx', 17 | 'sphinx_rtd_theme', 18 | 'sphinx_toggleprompt', 19 | 'sphinx-gallery>=0.6', 20 | 'nbsphinx', 21 | 'm2r2' 22 | ] 23 | 24 | dev_deps = ["requests"] + docs_deps + test_deps 25 | 26 | try: 27 | import torch 28 | 29 | ml_pytorch_deps = [] 30 | except ModuleNotFoundError: 31 | import sys 32 | 33 | if 5 <= sys.version_info[1]: 34 | ml_pytorch_deps = ["torch<=1.12.1"] 35 | else: 36 | ml_pytorch_deps = [] 37 | logging.warning("Current python version %s is not supported by pytorch", str(sys.version_info[:2])) 38 | 39 | vec_deps = [ 40 | 'gensim', 41 | 'transformers<4.29.0', 42 | 'torchvision', 43 | 'datasets'] + ml_pytorch_deps 44 | 45 | setup( 46 | name='EduNLP', 47 | version='0.0.9', 48 | extras_require={ 49 | 'test': test_deps, 50 | 'doc': docs_deps, 51 | 'tutor': tutor_deps, 52 | 'dev': dev_deps, 53 | 'vec': vec_deps, 54 | 'full': vec_deps + tutor_deps 55 | }, 56 | packages=find_packages(), 57 | include_package_data=True, 58 | install_requires=[ 59 | 'networkx', 60 | 'numpy>=1.17.0', 61 | 'jieba', 62 | 'js2py', 63 | 'EduData>=0.0.16', 64 | 'PyBaize>=0.0.3' 65 | ], # and any other dependencies EduNLP needs 66 | entry_points={ 67 | "console_scripts": [ 68 | "edunlp = EduNLP.main:cli", 69 | ], 70 | }, 71 | classifiers=[ 72 | 'Programming Language :: Python :: 3.6', 73 | 'Programming Language :: Python :: 3.7', 74 | 'Programming Language :: Python :: 3.8', 75 | 'Programming Language :: Python :: 3.9', 76 | "Environment :: Other Environment", 77 | "Intended Audience :: Developers", 78 | "License :: OSI Approved :: Apache Software License", 79 | "Operating System :: OS Independent", 80 | "Topic :: Software Development :: Libraries :: Python Modules", 81 | ], 82 | ) 83 | -------------------------------------------------------------------------------- /static/test_data/quesnet_img/000004d6-0479-11ec-829b-797d5eb43535.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/bigdata-ustc/EduNLP/95cd9fc71a1bdd6156c42af7434aee0a8fc0a82e/static/test_data/quesnet_img/000004d6-0479-11ec-829b-797d5eb43535.png -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # 2021/5/20 @ tongshiwei 3 | -------------------------------------------------------------------------------- /tests/test_ast.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from EduNLP.Formula.ast import str2ast 4 | 5 | 6 | def test_ast(): 7 | ast_str_list = [] 8 | # normal examples 9 | ast_str_list.append(r"{x + y}^\frac{\pi}{2} + 1 = x") 10 | ast_str_list.append(r"\color{#0FF} x = y") 11 | ast_str_list.append(r"x^2 + 1 = y") 12 | ast_str_list.append(r"\verb!x^2!") 13 | ast_str_list.append(r"\utilde{AB}") 14 | ast_str_list.append(r"\mathrm{Ab0}") 15 | ast_str_list.append(r"{1,2,3}") 16 | ast_str_list.append(r"\huge AB") 17 | ast_str_list.append(r"\underline{AB}") 18 | ast_str_list.append(r"\sqrt{\smash[b]{y}}") 19 | ast_str_list.append(r"\hbox{AA BB}") 20 | ast_str_list.append(r"abc\llap{abcdefghi}") 21 | ast_str_list.append(r"\raisebox{3em}{hi}") 22 | ast_str_list.append(r"\textcolor{#228B22}{F=ma}") 23 | ast_str_list.append(r"\displaystyle\sum_{i=1}^n") 24 | ast_str_list.append(r"\def\foo{x^2} \foo + \foo") 25 | ast_str_list.append(r"thank \hphantom{xyz} you") 26 | ast_str_list.append(r"\mathchoice{D}{T}{S}{SS}") 27 | ast_str_list.append(r"\bigotimes") 28 | ast_str_list.append(r"{AB}_b^c") 29 | ast_str_list.append(r"\left\{\begin{array}{c}2 x+y-2 \leq 0 \\ x-y-1 \geq 0 \\ y+1 \geq 0\end{array}\right.") 30 | ast_str_list.append(r"\cancel{5}") 31 | 32 | # work only when katex is in 'display' mode : 33 | ast_str_list.append(r"\begin{matrix} a & b \\ c & d \end{matrix}") 34 | ast_str_list.append(r"\begin{pmatrix} a&b\\c&d \end{pmatrix}") 35 | ast_str_list.append(r"\begin{matrix}k个\\ \overbrace{(-1)^{k-1}k,\cdots,(-1)^{k-1}k}\end{matrix}") 36 | 37 | # work only when 'trust' katex html func: 38 | ast_str_list.append(r"\href{https://katex.org}{katex}") 39 | ast_str_list.append(r"\htmlStyle{color: red;}{x}") 40 | ast_str_list.append(r"\url{www.baidu.com}") 41 | ast_str_list.append(r"\htmlId{bar}{x}") 42 | ast_str_list.append(r"\htmlClass{foo}{x}") 43 | ast_str_list.append("\\includegraphics[height=0.8em, totalheight=0.9em, \ 44 | width=0.9em, alt=KA logo]{https://katex.org/img/khan-academy.png}") 45 | 46 | for ast_str in ast_str_list: 47 | str2ast(ast_str) 48 | -------------------------------------------------------------------------------- /tests/test_formula.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # 2021/5/20 @ tongshiwei 3 | import pytest 4 | from EduNLP.Formula import Formula, FormulaGroup 5 | 6 | 7 | def test_formula(): 8 | formula = r"x + x" 9 | f = Formula(formula) 10 | f.variable_standardization(inplace=False) 11 | f.variable_standardization(inplace=True) 12 | assert len(f.ast_graph.nodes) == len(f.ast) 13 | f.to_str() 14 | 15 | formula = r"\frac{\pi}{2}" 16 | f = Formula(formula, variable_standardization=True) 17 | assert repr(f) == r"<Formula: \frac{\pi}{2}>" 18 | 19 | f = Formula(f.ast) 20 | assert f.resetable is False 21 | with pytest.raises(TypeError): 22 | f.reset_ast() 23 | 24 | fg = FormulaGroup([r"x + x", r"x + \frac{\pi}{2}"], variable_standardization=True) 25 | for f in fg: 26 | assert f in fg 27 | assert 
len(fg[0].ast) == 3 28 | fg.to_str() 29 | 30 | fg = FormulaGroup(["x", "y", "x"]) 31 | assert len(fg.ast) == 3 32 | 33 | with pytest.raises(TypeError): 34 | FormulaGroup([{}]) 35 | -------------------------------------------------------------------------------- /tests/test_i2v/__init__.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # 2021/8/1 @ tongshiwei 3 | -------------------------------------------------------------------------------- /tests/test_i2v/test_pretrained.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # 2021/8/2 @ tongshiwei 3 | import pytest 4 | from EduNLP import get_pretrained_i2v 5 | # from EduNLP.I2V.i2v import MODELS 6 | from EduNLP.I2V import D2V, W2V 7 | from EduNLP.Vector import get_pretrained_model_info, get_all_pretrained_models 8 | 9 | 10 | def test_pretrained_i2v(tmp_path): 11 | 12 | d = tmp_path / "model" 13 | d.mkdir() 14 | 15 | url, t2v_name = get_pretrained_model_info("d2v_test_256") 16 | assert url != "" 17 | assert t2v_name == "d2v" 18 | model_names = get_all_pretrained_models() 19 | assert "d2v_test_256" in model_names 20 | 21 | get_pretrained_i2v("d2v_test_256", d) 22 | 23 | with pytest.raises(KeyError): 24 | get_pretrained_i2v("error") 25 | 26 | get_pretrained_i2v("w2v_test_256", d) 27 | 28 | # get_pretrained_i2v("quesnet_test_256", d) 29 | 30 | # get_pretrained_i2v("elmo_test", d) 31 | 32 | # # get_pretrained_i2v("tal_edu_bert", d) 33 | -------------------------------------------------------------------------------- /tests/test_main.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # 2021/8/2 @ tongshiwei 3 | 4 | from EduNLP.main import list_i2v 5 | 6 | 7 | def test_list_i2v(): 8 | list_i2v() 9 | -------------------------------------------------------------------------------- /tests/test_model_zoo/test_rnn.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from EduNLP.ModelZoo.rnn import LM 3 | 4 | 5 | idxs = torch.tensor([ 6 | [1, 2, 3, 4, 0, 0], 7 | [1, 2, 0, 0, 0, 0], 8 | [1, 0, 0, 0, 0, 0], 9 | [1, 2, 0, 0, 0, 0] 10 | ]) 11 | 12 | lens = torch.tensor([4, 2, 1, 2]) 13 | 14 | rnn = LM(rnn_type="lstm", vocab_size=20, embedding_dim=5, hidden_size=10) 15 | output, hn = rnn(idxs, lens) 16 | 17 | print("[output]", output) 18 | print("[hn]", hn) 19 | -------------------------------------------------------------------------------- /tests/test_pipeline/conftest.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from PIL import Image 3 | from EduNLP.utils import abs_current_dir, path_append 4 | from EduNLP.Vector import get_pretrained_model_info 5 | from EduData import get_data 6 | 7 | 8 | @pytest.fixture(scope="module") 9 | def pretrained_elmo_for_property_prediction_dir(): 10 | model_dir = path_append(abs_current_dir(__file__), "../../examples/test_model/elmo", to_str=True) 11 | url, _ = get_pretrained_model_info('elmo_pp_test') 12 | path = get_data(url, model_dir) 13 | return path 14 | 15 | 16 | @pytest.fixture(scope="module") 17 | def pretrained_elmo_for_knowledge_prediction_dir(): 18 | model_dir = path_append(abs_current_dir(__file__), "../../examples/test_model/elmo", to_str=True) 19 | url, _ = get_pretrained_model_info('elmo_kp_test') 20 | path = get_data(url, model_dir) 21 | return path 22 | 
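23 | # Both fixtures above resolve to a local directory containing the downloaded checkpoint; 24 | # get_data skips the download when the archive already exists (cf. the "file existed, skipped" log in the notebooks above). 25 | # Hypothetical usage: pass the returned path to the corresponding I2V / Pipeline loader inside a test.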
-------------------------------------------------------------------------------- /tests/test_pretrain/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigdata-ustc/EduNLP/95cd9fc71a1bdd6156c42af7434aee0a8fc0a82e/tests/test_pretrain/__init__.py -------------------------------------------------------------------------------- /tests/test_pretrain/conftest.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # 2021/5/30 @ tongshiwei 3 | import torch 4 | import pytest 5 | import os 6 | from EduNLP.utils import abs_current_dir, path_append 7 | from EduNLP.ModelZoo import load_items 8 | 9 | # TEST_GPU = torch.cuda.is_available() 10 | 11 | 12 | @pytest.fixture(scope="module") 13 | def standard_luna_data(): 14 | data_path = path_append(abs_current_dir(__file__), "../../static/test_data/standard_luna_data.json", to_str=True) 15 | _data = load_items(data_path)[:10] 16 | return _data 17 | 18 | 19 | @pytest.fixture(scope="module") 20 | def pretrained_tokenizer_dir(tmp_path_factory): 21 | return str(tmp_path_factory.mktemp("pretrained_tokenizer_dir")) 22 | 23 | 24 | @pytest.fixture(scope="module") 25 | def pretrained_model_dir(tmp_path_factory): 26 | return str(tmp_path_factory.mktemp("pretrained_model_dir")) 27 | 28 | 29 | @pytest.fixture(scope="module") 30 | def pretrained_pp_dir(tmp_path_factory): 31 | return str(tmp_path_factory.mktemp("pretrained_pp_dir")) 32 | 33 | 34 | @pytest.fixture(scope="module") 35 | def pretrained_kp_dir(tmp_path_factory): 36 | return str(tmp_path_factory.mktemp("pretrained_kp_dir")) 37 | -------------------------------------------------------------------------------- /tests/test_pretrain/test_hugginface_utils.py: -------------------------------------------------------------------------------- 1 | from EduNLP.Pretrain.hugginface_utils import TokenizerForHuggingface 2 | from transformers import AutoTokenizer 3 | import os 4 | os.environ["WANDB_DISABLED"] = "true" 5 | 6 | 7 | class TestPretrainUtils: 8 | def test_hf_tokenzier(self, pretrained_tokenizer_dir): 9 | tokenizer = TokenizerForHuggingface(tokenize_method=None) 10 | tokenizer = TokenizerForHuggingface(add_special_tokens=True) 11 | assert isinstance(tokenizer.vocab_size, int) 12 | item = 'This is a test.' 13 | res = tokenizer.decode(tokenizer.encode(item)) 14 | right_ans = '[CLS] [UNK] is a test. [SEP]' 15 | assert res == right_ans, res 16 | items = ['This is a test.', 'This is a test 2.'] 17 | res = tokenizer.decode(tokenizer.encode(items)) 18 | right_ans = ['[CLS] [UNK] is a test. [SEP]', '[CLS] [UNK] is a test 2. 
[SEP]'] 19 | assert res == right_ans, res 20 | 21 | tokenizer_hf = AutoTokenizer.from_pretrained("bert-base-chinese") 22 | tokenizer_hf.save_pretrained(pretrained_tokenizer_dir) 23 | 24 | tokenizer_hf = TokenizerForHuggingface.from_pretrained(pretrained_tokenizer_dir) 25 | -------------------------------------------------------------------------------- /tests/test_pretrain/test_pretrain_utils.py: -------------------------------------------------------------------------------- 1 | from EduNLP.Pretrain.pretrian_utils import EduVocab, PretrainedEduTokenizer, EduDataset 2 | import pytest 3 | import os 4 | 5 | 6 | class TestPretrainUtils: 7 | def test_eduvocab(self): 8 | test = EduVocab(specials=['token1']) 9 | assert len(test) == 5 10 | token_list = ['An', 'apple', 'a', 'day', 'keeps', 'doctors', 'away'] 11 | test.add_tokens(token_list) 12 | right_ans = ['[PAD]', '[UNK]', '[BOS]', '[EOS]', 'token1', 13 | 'An', 'apple', 'a', 'day', 'keeps', 'doctors', 'away'] 14 | assert test.tokens == right_ans 15 | assert test.vocab_size == len(right_ans) 16 | test_token_list = ['An', 'banana', 'is', 'a', 'kind', 'of', 'fruit'] 17 | res = test.convert_sequence_to_token(test.convert_sequence_to_idx(test_token_list, bos=True, eos=True)) 18 | right_ans = ['[BOS]', 'An', '[UNK]', '[UNK]', 'a', '[UNK]', '[UNK]', '[UNK]', '[EOS]'] 19 | assert res == right_ans 20 | test.add_specials(['token2', 'token3']) 21 | right_ans = ['[PAD]', '[UNK]', '[BOS]', '[EOS]', 'token1', 'token2', 'token3'] 22 | assert test.special_tokens == right_ans 23 | test = EduVocab(corpus_items=[token_list]) 24 | 25 | def test_edu_tokenizer(self, pretrained_tokenizer_dir): 26 | test = EduVocab() 27 | token_list = ['An', 'apple', 'a', 'day', 'keeps', 'doctors', 'away'] 28 | test.add_tokens(token_list) 29 | vocab_path = os.path.join(pretrained_tokenizer_dir, 'vocab.txt') 30 | test.save_vocab(vocab_path) 31 | test = EduVocab(vocab_path=vocab_path) 32 | 33 | text = 'An apple a day keeps doctors away' 34 | tokenizer = PretrainedEduTokenizer(vocab_path=vocab_path, max_length=100) 35 | res = tokenizer(text, padding='max_length') 36 | assert res['seq_idx'].shape[0] == 100 37 | res = tokenizer(text, padding='longest') 38 | assert res['seq_idx'].shape[0] == res['seq_len'] 39 | res = tokenizer(text, padding='do_not_pad') 40 | assert res['seq_idx'].shape[0] == res['seq_len'] 41 | with pytest.raises(ValueError): 42 | res = tokenizer(text, padding='wrong_pad') 43 | tokenizer.add_tokens("[token]") 44 | tokenizer.add_specials("[special]") 45 | res = tokenizer.decode(tokenizer.encode({'content': 'An banana'}, key=lambda x: x['content'])) 46 | right_ans = ['An', '[UNK]'] 47 | print(res) 48 | assert res == right_ans, res 49 | 50 | res = tokenizer.decode(tokenizer.encode(['An banana'])) 51 | assert res == [['An', '[UNK]']] 52 | tokenizer.save_pretrained(f"{pretrained_tokenizer_dir}/save_dir") 53 | 54 | def test_edu_dateset(self, standard_luna_data, pretrained_tokenizer_dir): 55 | tokenizer = PretrainedEduTokenizer() 56 | tokenizer.set_vocab(standard_luna_data, key=lambda x: x["ques_content"]) 57 | dataset = EduDataset(tokenizer, 58 | items=standard_luna_data, 59 | stem_key="ques_content") 60 | assert "seq_idx" in dataset[0].keys() and "seq_len" in dataset[0].keys() 61 | dataset.to_disk(f"{pretrained_tokenizer_dir}/dataset") 62 | 63 | local_dataset = EduDataset(tokenizer, f"{pretrained_tokenizer_dir}/dataset") 64 | assert local_dataset[0] == dataset[0] 65 | -------------------------------------------------------------------------------- /tests/test_sif/__init__.py: 
-------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # 2021/5/20 @ tongshiwei 3 | -------------------------------------------------------------------------------- /tests/test_sif/conftest.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # 2021/5/20 @ tongshiwei 3 | import os 4 | import pytest 5 | from PIL import Image 6 | from EduNLP.utils import abs_current_dir, path_append, image2base64 7 | 8 | 9 | @pytest.fixture(scope="module") 10 | def img_dir(): 11 | return os.path.abspath(path_append(abs_current_dir(__file__), "..", "..", "asset", "_static")) 12 | 13 | 14 | @pytest.fixture(scope="module") 15 | def figure0(img_dir): 16 | return Image.open(path_append(img_dir, "item_formula.png", to_str=True)) 17 | 18 | 19 | @pytest.fixture(scope="module") 20 | def figure1(img_dir): 21 | return Image.open(path_append(img_dir, "item_figure.png", to_str=True)) 22 | 23 | 24 | @pytest.fixture(scope="module") 25 | def figure0_base64(figure0): 26 | return image2base64(figure0) 27 | 28 | 29 | @pytest.fixture(scope="module") 30 | def figure1_base64(figure1): 31 | return image2base64(figure1) 32 | -------------------------------------------------------------------------------- /tests/test_sif/test_parser.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from EduNLP.SIF.parser.parser import Parser 3 | 4 | 5 | def test_parser(): 6 | text = '' 7 | text_parser = Parser(text) 8 | text_parser.description_list() 9 | 10 | text = '随机$text{观测}$生产某种零件的A工厂25名工人的日加工零件数_ _' 11 | text_parser = Parser(text) 12 | text_parser.description_list() 13 | 14 | text = 'X的分布列为( )' 15 | text_parser = Parser(text) 16 | text_parser.description_list() 17 | 18 | text = '由题意得( )' 19 | text_parser = Parser(text) 20 | text_parser.description_list() 21 | assert text_parser.error_flag == 0 22 | 23 | text = '1.命题中真命题的序号是\n ① AB是⊙O的直径,AC是⊙O的切线,BC交⊙O于点E.AC的中点为D' 24 | text_parser = Parser(text) 25 | text_parser.description_list() 26 | assert text_parser.error_flag == 1 27 | 28 | text = r"公式两侧的匹配符号需要完整,如不允许$\frac{y}{x}" 29 | text_parser = Parser(text) 30 | text_parser.description_list() 31 | assert text_parser.error_flag == 1 32 | 33 | text = r"支持公式如$\frac{y}{x}$,$\SIFBlank$,$\FigureID{1}$,不支持公式如$\frac{ \dddot y}{x}$" 34 | text_parser = Parser(text) 35 | text_parser.description_list() 36 | assert text_parser.fomula_illegal_flag == 1 37 | -------------------------------------------------------------------------------- /tests/test_sif/test_segement.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # 2021/5/20 @ tongshiwei 3 | 4 | import pytest 5 | 6 | from EduNLP.SIF.segment import seg 7 | from EduNLP.utils import image2base64 8 | 9 | 10 | def test_segment(figure0, figure1, figure0_base64, figure1_base64): 11 | seg( 12 | r"如图所示,则$\FormFigureID{0}$的面积是$\SIFBlank$。$\FigureID{1}$", 13 | figures={ 14 | "0": figure0, 15 | "1": figure1 16 | } 17 | ) 18 | s = seg( 19 | r"如图所示,则$\FormFigureBase64{%s}$的面积是$\SIFBlank$。$\FigureBase64{%s}$" % (figure0_base64, figure1_base64), 20 | figures=True 21 | ) 22 | with pytest.raises(TypeError): 23 | s.append("123") 24 | seg_test_text = seg( 25 | r"如图所示,有三组$\textf{机器人,bu}$在踢$\textf{足球,b}$", 26 | figures=True 27 | ) 28 | assert seg_test_text.text_segments == ['如图所示,有三组机器人在踢足球'] 29 | -------------------------------------------------------------------------------- /tests/test_sif/test_sif.py: 
-------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # 2021/5/20 @ tongshiwei 3 | 4 | from EduNLP.SIF import is_sif 5 | from EduNLP.SIF import to_sif 6 | from EduNLP.SIF import sif4sci 7 | import pytest 8 | 9 | 10 | def test_is_sif(): 11 | text = '若$x,y$满足约束条件' \ 12 | '$\\left\\{\\begin{array}{c}2 x+y-2 \\leq 0 \\\\ x-y-1 \\geq 0 \\\\ y+1 \\geq 0\\end{array}\\right.$,' \ 13 | '则$z=x+7 y$的最大值$\\SIFUnderline$' 14 | assert is_sif(text) == 1 15 | 16 | text = '公式需要满足完整性,完整的公式如' \ 17 | '$\\begin{matrix} a & b \\\\ c & d \\end{matrix}$' \ 18 | ',不完整的公式如$\\begin{matrix} a & b \\\\ c & d$' 19 | with pytest.raises(ValueError): 20 | is_sif(text) 21 | 22 | text = '公式需要满足符合katex的支持性,可支持的公式如' \ 23 | '$\\begin{matrix} a & b \\\\ c & d \\end{matrix}$' \ 24 | ',不可支持的公式如$\\frac{ \\dddot y }{ x }$' 25 | with pytest.raises(ValueError): 26 | is_sif(text) 27 | 28 | 29 | def test_to_sif(): 30 | text = '某校一个课外学习小组为研究某作物的发芽率y和温度x(单位...' 31 | siftext = to_sif(text) 32 | print(siftext) 33 | 34 | ret = is_sif(text, return_parser=True) 35 | assert ret[0] == 0 36 | if ret[0] is not True: 37 | siftext = to_sif(text, parser=ret[1]) 38 | print(siftext) 39 | 40 | 41 | def test_sci4sif(figure0, figure1, figure0_base64, figure1_base64): 42 | repr(sif4sci( 43 | r"如图所示,则$\bigtriangleup ABC$的面积是$\SIFBlank$。$\FigureID{1}$", 44 | tokenization_params={ 45 | "formula_params": { 46 | "method": "ast", 47 | "return_type": "ast" 48 | } 49 | } 50 | )) 51 | repr(sif4sci( 52 | r"如图所示,则$\FormFigureID{0}$的面积是$\SIFBlank$。$\FigureID{1}$", 53 | figures={ 54 | "0": figure0, 55 | "1": figure1 56 | }, 57 | )) 58 | repr(sif4sci( 59 | item=r"如图所示,则$\FormFigureBase64{%s}$的面积是$\SIFBlank$。$\FigureBase64{%s}$" % ( 60 | figure0_base64, figure1_base64 61 | ), 62 | tokenization_params={ 63 | "figure_params": {"figure_instance": True} 64 | } 65 | )) 66 | repr(sif4sci( 67 | r"如图所示,则$\bigtriangleup ABC$的面积是$\SIFBlank$。$\FigureID{1}$", mode=0 68 | )) 69 | repr(sif4sci( 70 | r"如图所示,则$\bigtriangleup ABC$的面积是$\SIFBlank$。$\FigureID{1}$", mode=1 71 | )) 72 | repr(sif4sci( 73 | r"如图所示,则$\bigtriangleup ABC$的面积是$\SIFBlank$。$\FigureID{1}$", mode=2 74 | )) 75 | 76 | with pytest.raises(KeyError): 77 | repr(sif4sci( 78 | r"如图所示,则$\bigtriangleup ABC$的面积是$\SIFBlank$。$\FigureID{1}$", mode=3 79 | )) 80 | -------------------------------------------------------------------------------- /tests/test_sif/test_tokenization.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # 2021/5/20 @ tongshiwei 3 | 4 | import pytest 5 | from EduNLP.SIF.constants import Symbol 6 | from EduNLP.SIF.segment.segment import SegmentList, LatexFormulaSegment 7 | from EduNLP.SIF.tokenization import text 8 | from EduNLP.SIF.tokenization import formula 9 | from EduNLP.SIF.tokenization.tokenization import TokenList 10 | 11 | 12 | def test_text_tokenization(): 13 | with pytest.raises(TypeError): 14 | text.tokenize("12345", "alpha") 15 | 16 | 17 | def test_formula_tokenization(): 18 | with pytest.raises(ValueError): 19 | formula.ast_token.ast_tokenize("1 + 1", return_type="graph") 20 | 21 | with pytest.raises(TypeError): 22 | formula.tokenize("1 + 1", method="plain") 23 | 24 | # with pytest.raises(TypeError): 25 | # formula.tokenize(r"\phantom{=}56+4", method="ast") 26 | 27 | 28 | def test_tokenization(): 29 | tl = TokenList(SegmentList("")) 30 | with pytest.raises(TypeError): 31 | tl.append(Symbol("[Unknown]")) 32 | 33 | with pytest.raises(TypeError): 34 | tl.append("[Unknown]") 35 | 36 | 
tl.append(LatexFormulaSegment('x+y'), False) 37 | -------------------------------------------------------------------------------- /tests/test_tokenizer/__init__.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # 2021/8/1 @ tongshiwei 3 | -------------------------------------------------------------------------------- /tests/test_tokenizer/test_tokenizer.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # 2021/8/1 @ tongshiwei 3 | 4 | import pytest 5 | from EduNLP.Tokenizer import get_tokenizer 6 | from EduNLP.Pretrain import DisenQTokenizer 7 | 8 | 9 | def test_tokenizer(): 10 | with pytest.raises(KeyError): 11 | get_tokenizer("error") 12 | 13 | 14 | def test_disenQTokenizer(): 15 | tokenizer = DisenQTokenizer(max_length=10, tokenize_method="space") 16 | # with pytest.raises(RuntimeError): 17 | # tokenizer("10 米 的 (2/5) = () 米 的 (1/2) .") 18 | 19 | test_items = [ 20 | "10 米 的 (2/5) = () 米 的 (1/2) . 多 余 的 字", 21 | "-1 - 1", 22 | "5 % 2 + 3.14", 23 | "3.x", 24 | ".", 25 | "", 26 | "-1/2", 27 | "/", 28 | "1.2%", 29 | ] 30 | tokenizer.set_vocab(test_items) 31 | print(tokenizer.vocab_size) 32 | for item in test_items: 33 | token_item = tokenizer(item) 34 | print(token_item) 35 | 36 | test_item = tokenizer(test_items[0], padding=True) 37 | assert test_item["seq_idx"].shape[-1] == 10 38 | 39 | 40 | def test_CharTokenizer(): 41 | items = [{ 42 | "stem": "文具店有 $600$ 本练习本,卖出一些后,还剩 $4$ 包,每包 $25$ 本,卖出多少本?", 43 | "options": ["1", "2"] 44 | }] 45 | tokenizer = get_tokenizer("char", stop_words=set(",?")) 46 | tokens = tokenizer(items, key=lambda x: x['stem']) 47 | ret = next(tokens) 48 | ans = ['文', '具', '店', '有', '$', '600', '$', '本', '练', '习', '本', '卖', '出', '一', 49 | '些', '后', '还', '剩', '$', '4', '$', '包', '每', '包', '$', '25', '$', '本', '卖', '出', '多', '少', '本'] 50 | assert ret == ans 51 | 52 | 53 | def test_SpaceTokenizer(): 54 | items = ['文具店有 $600$ 本练习本,卖出一些后,还剩 $4$ 包,每包 $25$ 本,卖出多少本?'] 55 | tokenizer = get_tokenizer("space", stop_words=[]) 56 | tokens = tokenizer(items) 57 | ret = next(tokens) 58 | ans = ['文具店有', '$600$', '本练习本,卖出一些后,还剩', '$4$', '包,每包', '$25$', '本,卖出多少本?'] 59 | assert ret == ans 60 | 61 | 62 | def test_AstformulaTokenizer(): 63 | items = ['文具店有 $600$ 本练习本,卖出一些后,还剩 $4$ 包,每包 $25$ 本,卖出多少本?'] 64 | tokenizer = get_tokenizer("ast_formula") 65 | tokens = tokenizer(items) 66 | ret = next(tokens) 67 | ans = ['文具店', 'textord', 'textord', 'textord', '练习本', '卖出', '剩', 'textord', '包', '每包', 'textord', 'textord', '卖出'] 68 | assert ret == ans 69 | 70 | 71 | def test_PuretextTokenizer(): 72 | items = ['文具店有 $600$ 本练习本,卖出一些后,还剩 $4$ 包,每包 $25$ 本,卖出多少本?'] 73 | tokenizer = get_tokenizer("pure_text", stop_words=[]) 74 | tokens = tokenizer(items) 75 | ret = next(tokens) 76 | ans = ['文具店', '600', '练习本', '卖出', '剩', '4', '包', '每包', '25', '卖出'] 77 | assert ret == ans 78 | tokenizer = get_tokenizer("pure_text", stop_words=[], handle_figure_formula=None) 79 | tokens = tokenizer(items) 80 | ret = next(tokens) 81 | assert ret == ans 82 | tokenizer = get_tokenizer("pure_text", stop_words=[], handle_figure_formula='symbolize') 83 | tokens = tokenizer(items) 84 | ret = next(tokens) 85 | assert ret == ans 86 | with pytest.raises(ValueError): 87 | tokenizer = get_tokenizer("pure_text", stop_words=[], handle_figure_formula='wrong') 88 | 89 | 90 | def test_CustomTokenizer(): 91 | items = [{ 92 | "stem": "文具店有 $600$ 本练习本,卖出一些后,还剩 $4$ 包,每包 $25$ 本,卖出多少本?", 93 | "options": ["1", "2"] 94 | }] 95 | 
}] 95 | tokenizer = get_tokenizer("custom", symbol='f') 96 | tokens = tokenizer(items, key=lambda x: x['stem']) 97 | ret = next(tokens) 98 | ans = ['文具店', '[FORMULA]', '练习本', '卖出', '剩', '[FORMULA]', '包', '每包', '[FORMULA]', '卖出'] 99 | assert ret == ans 100 | items = [{ 101 | "stem": "有公式$\\FormFigureID{1}$,如图$\\FigureID{088f15ea-xxx}$,若$x,y$满足约束条件公式$\\F\ 102 | ormFigureBase64{2}$,$\\SIFSep$,则$z=x+7 y$的最大值为$\\SIFBlank$", 103 | "options": ["1", "2"] 104 | }] 105 | tokenizer = get_tokenizer("custom", symbol='f', handle_figure_formula="symbolize") 106 | tokens = tokenizer(items, key=lambda x: x['stem']) 107 | ret = next(tokens) 108 | ret.pop(3) 109 | ans = ['公式', '[FORMULA]', '如图', '\\FigureID{088f15ea-xxx}', '[FORMULA]', '约束条件', '公式', '[FORMULA]', 110 | '\\SIFSep', '[FORMULA]', '最大值', '\\SIFBlank'] 111 | ans.pop(3) 112 | assert ret == ans 113 | -------------------------------------------------------------------------------- /tests/test_utils/test_modules.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import torch 3 | from EduNLP.ModelZoo.utils import MLP, TextCNN 4 | 5 | 6 | def test_modules(): 7 | encoder = TextCNN(256, 128) 8 | 9 | input_embeds1 = torch.rand(4, 16, 256) 10 | hidden_embeds1 = encoder(input_embeds1) 11 | assert hidden_embeds1.shape == torch.Size([4, 128]) 12 | input_embeds2 = torch.rand(4, 1, 256) 13 | hidden_embeds2 = encoder(input_embeds2) 14 | assert hidden_embeds2.shape == torch.Size([4, 128]) 15 | 16 | classifier = MLP(128, 10, 64, 0.5, n_layers=4) 17 | logits = classifier(hidden_embeds1) 18 | assert logits.shape == torch.Size([4, 10]) 19 | -------------------------------------------------------------------------------- /tests/test_vec/__init__.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # 2021/5/30 @ tongshiwei 3 | -------------------------------------------------------------------------------- /tests/test_vec/conftest.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # 2021/5/30 @ tongshiwei 3 | 4 | import codecs 5 | import json 6 | import pytest 7 | from EduNLP.utils import abs_current_dir, path_append 8 | 9 | 10 | @pytest.fixture(scope="module") 11 | def data(): 12 | _data = [] 13 | data_path = path_append(abs_current_dir(__file__), "../../static/test_data/standard_luna_data.json", to_str=True) 14 | with codecs.open(data_path, encoding="utf-8") as f: 15 | for line in f.readlines(): 16 | _data.append(json.loads(line)) 17 | return _data 18 | -------------------------------------------------------------------------------- /tests/test_vec/test_t2v.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # 2021/8/2 @ tongshiwei 3 | 4 | import pytest 5 | from EduNLP.Vector import get_pretrained_t2v 6 | 7 | 8 | def test_t2v(): 9 | with pytest.raises(KeyError): 10 | get_pretrained_t2v("error") 11 | --------------------------------------------------------------------------------
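
For reference, the i2v examples above reduce to two loading routes. The sketch below recombines only calls that appear verbatim in this repository's notebooks and tests (get_pretrained_i2v, the I2V W2V container, infer_item_vector / infer_token_vector); the model name, paths, and sample item are the same test fixtures used above and stand in for a real checkpoint and corpus.

# coding: utf-8
# A minimal sketch, assuming the examples' directory layout and the test checkpoints above.
from EduNLP.I2V import W2V, get_pretrained_i2v

items = ["文具店有 $600$ 本练习本,卖出一些后,还剩 $4$ 包,每包 $25$ 本,卖出多少本?"]

# Route 1: fetch a public pretrained model by name; the download is skipped
# when the archive already exists under the given model directory.
i2v = get_pretrained_i2v("d2v_test_256", "../test_model/d2v")
item_vector = i2v.infer_item_vector(items)  # D2V exposes item-level vectors only

# Route 2: load local weights directly (tokenizer name, t2v name, weights path).
i2v = W2V("pure_text", "w2v", "../test_model/w2v/w2v_test_256/w2v_test_256.kv")
item_vector = i2v.infer_item_vector(items)    # one 256-d vector per item
token_vector = i2v.infer_token_vector(items)  # one 256-d vector per token per item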